internal/process: improve windows shutdown behaviour (#808)

Add Windows specific shutdown code paths so stopping of child processes
is more reliable:

- stopping llama-swap won't leave behind any child processes it created
- uses Job Objects in Windows so the whole llama-swap tree is closed by
the os
- add procCtx to baseRouter. It replaces shutdownCtx as a signal for
managing lifetime state.
- shutdownCtx is only used by the router to stop handling new requests
during shutdown
- improve debug logging to make it easier to trace source of issues

Fixes #804
Updates #807
This commit is contained in:
Benson Wong
2026-06-01 00:45:30 -07:00
committed by GitHub
parent 6ea551362e
commit 9be9a87fa0
9 changed files with 184 additions and 27 deletions
+30 -2
View File
@@ -19,6 +19,7 @@ import (
"github.com/mostlygeek/llama-swap/internal/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/perf"
"github.com/mostlygeek/llama-swap/internal/process"
"github.com/mostlygeek/llama-swap/internal/server"
"github.com/mostlygeek/llama-swap/internal/shared"
"github.com/mostlygeek/llama-swap/internal/watcher"
@@ -122,6 +123,13 @@ func main() {
applyLogSettings(cfg)
proxyLog.Debugf("PID: %d", os.Getpid())
// On Windows, bind the process tree to a Job Object so every upstream
// process is reaped when llama-swap exits — even on a forced kill. No-op
// elsewhere. Non-fatal: a failure just falls back to per-process teardown.
if err := process.SetupTreeCleanup(); err != nil {
proxyLog.Warnf("failed to set up process tree cleanup: %v", err)
}
// perfMon outlives config reloads; its config is updated in place.
var perfMon *perf.Monitor
if !cfg.Performance.Disabled {
@@ -267,6 +275,16 @@ func main() {
proxyLog.Infof("received signal %v, shutting down", sig)
watcherCancel()
// Backstop against a stalled shutdown: force the process to
// exit once the whole graceful sequence has had its full budget.
// On Windows the Job Object reaps upstream processes on exit, so
// a forced exit still cleans up rather than orphaning children.
go func() {
time.Sleep(shutdownTimeout + 5*time.Second)
proxyLog.Warnf("graceful shutdown exceeded %v, forcing exit", shutdownTimeout)
os.Exit(1)
}()
activeMu.RLock()
srv := activeSrv
activeMu.RUnlock()
@@ -275,13 +293,23 @@ func main() {
// drain without blocking on them for the full timeout.
srv.CloseStreams()
shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
// Both phases share a single deadline so total shutdown is
// bounded by shutdownTimeout rather than 2x it.
deadline := time.Now().Add(shutdownTimeout)
shutdownCtx, cancel := context.WithDeadline(context.Background(), deadline)
defer cancel()
if err := httpServer.Shutdown(shutdownCtx); err != nil {
proxyLog.Warnf("http server shutdown error: %v", err)
}
if err := srv.Shutdown(shutdownTimeout); err != nil {
// Clamp the remaining budget to a small positive value: a
// non-positive timeout makes the router fall back to its own
// healthCheckTimeout, which would defeat the shared deadline.
remaining := time.Until(deadline)
if remaining <= 0 {
remaining = time.Millisecond
}
if err := srv.Shutdown(remaining); err != nil {
proxyLog.Warnf("router shutdown error: %v", err)
}