internal/process: improve windows shutdown behaviour (#808)
Add Windows specific shutdown code paths so stopping of child processes is more reliable: - stopping llama-swap won't leave behind any child processes it created - uses Job Objects in Windows so the whole llama-swap tree is closed by the os - add procCtx to baseRouter. It replaces shutdownCtx as a signal for managing lifetime state. - shutdownCtx is only used by the router to stop handling new requests during shutdown - improve debug logging to make it easier to trace source of issues Fixes #804 Updates #807
This commit is contained in:
+30
-2
@@ -19,6 +19,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/event"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/perf"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/server"
|
||||
"github.com/mostlygeek/llama-swap/internal/shared"
|
||||
"github.com/mostlygeek/llama-swap/internal/watcher"
|
||||
@@ -122,6 +123,13 @@ func main() {
|
||||
applyLogSettings(cfg)
|
||||
proxyLog.Debugf("PID: %d", os.Getpid())
|
||||
|
||||
// On Windows, bind the process tree to a Job Object so every upstream
|
||||
// process is reaped when llama-swap exits — even on a forced kill. No-op
|
||||
// elsewhere. Non-fatal: a failure just falls back to per-process teardown.
|
||||
if err := process.SetupTreeCleanup(); err != nil {
|
||||
proxyLog.Warnf("failed to set up process tree cleanup: %v", err)
|
||||
}
|
||||
|
||||
// perfMon outlives config reloads; its config is updated in place.
|
||||
var perfMon *perf.Monitor
|
||||
if !cfg.Performance.Disabled {
|
||||
@@ -267,6 +275,16 @@ func main() {
|
||||
proxyLog.Infof("received signal %v, shutting down", sig)
|
||||
watcherCancel()
|
||||
|
||||
// Backstop against a stalled shutdown: force the process to
|
||||
// exit once the whole graceful sequence has had its full budget.
|
||||
// On Windows the Job Object reaps upstream processes on exit, so
|
||||
// a forced exit still cleans up rather than orphaning children.
|
||||
go func() {
|
||||
time.Sleep(shutdownTimeout + 5*time.Second)
|
||||
proxyLog.Warnf("graceful shutdown exceeded %v, forcing exit", shutdownTimeout)
|
||||
os.Exit(1)
|
||||
}()
|
||||
|
||||
activeMu.RLock()
|
||||
srv := activeSrv
|
||||
activeMu.RUnlock()
|
||||
@@ -275,13 +293,23 @@ func main() {
|
||||
// drain without blocking on them for the full timeout.
|
||||
srv.CloseStreams()
|
||||
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
|
||||
// Both phases share a single deadline so total shutdown is
|
||||
// bounded by shutdownTimeout rather than 2x it.
|
||||
deadline := time.Now().Add(shutdownTimeout)
|
||||
shutdownCtx, cancel := context.WithDeadline(context.Background(), deadline)
|
||||
defer cancel()
|
||||
if err := httpServer.Shutdown(shutdownCtx); err != nil {
|
||||
proxyLog.Warnf("http server shutdown error: %v", err)
|
||||
}
|
||||
|
||||
if err := srv.Shutdown(shutdownTimeout); err != nil {
|
||||
// Clamp the remaining budget to a small positive value: a
|
||||
// non-positive timeout makes the router fall back to its own
|
||||
// healthCheckTimeout, which would defeat the shared deadline.
|
||||
remaining := time.Until(deadline)
|
||||
if remaining <= 0 {
|
||||
remaining = time.Millisecond
|
||||
}
|
||||
if err := srv.Shutdown(remaining); err != nil {
|
||||
proxyLog.Warnf("router shutdown error: %v", err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user