Compare commits

...

2 Commits

Author SHA1 Message Date
Benson Wong 29d3d9ba20 perf: add macOS GPU monitoring via mactop and ioreg (#816)
Implement performance monitoring on OSX for Apple Silicon hardware. 

The implementation uses a combination of mactop and ioreg. If mactop is
installed (`brew install mactop`) it is used in a headless cli mode to
stream usage metrics. mactop hooks into unpublished(?) C based APIs in
OSX. Rather than introduce a cgo dependency into llama-swap's build
chain only for darwin I opted to go the external process route.

ioreg, which comes bundled with OSX is used as the fallback. It does not
provide temperature and power usage data but is able to show accurate
GPU and memory utilization.

Updates #771, #814
2026-06-03 21:51:03 -07:00
Benson Wong 9be9a87fa0 internal/process: improve windows shutdown behaviour (#808)
Add Windows specific shutdown code paths so stopping of child processes
is more reliable:

- stopping llama-swap won't leave behind any child processes it created
- uses Job Objects in Windows so the whole llama-swap tree is closed by
the os
- add procCtx to baseRouter. It replaces shutdownCtx as a signal for
managing lifetime state.
- shutdownCtx is only used by the router to stop handling new requests
during shutdown
- improve debug logging to make it easier to trace source of issues

Fixes #804
Updates #807
2026-06-01 00:45:30 -07:00
13 changed files with 555 additions and 29 deletions
+1 -1
View File
@@ -15,6 +15,7 @@ require (
github.com/tidwall/gjson v1.18.0
github.com/tidwall/sjson v1.2.5
golang.org/x/sync v0.20.0
golang.org/x/sys v0.41.0
gopkg.in/yaml.v3 v3.0.1
)
@@ -70,7 +71,6 @@ require (
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.45.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/sys v0.41.0 // indirect
golang.org/x/text v0.31.0 // indirect
google.golang.org/protobuf v1.34.1 // indirect
)
+169
View File
@@ -1,6 +1,10 @@
package perf
import (
"encoding/json"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"time"
@@ -43,3 +47,168 @@ func ParseNvidiaSmiLine(line string) *GpuStat {
PowerDrawW: powerDraw,
}
}
// mactopOutput maps the subset of mactop's headless JSON output that is
// relevant to GpuStat. Note that mactop's memory object is whole-system memory,
// not GPU-attributed; the darwin monitor overlays ioreg's GPU-attributed
// unified memory (see overlayIoregMem) so both backends report consistent
// memory figures.
type mactopOutput struct {
SocMetrics struct {
GPUPower float64 `json:"gpu_power"`
GPUFreq int `json:"gpu_freq_mhz"`
GPUTemp float64 `json:"gpu_temp"`
} `json:"soc_metrics"`
Memory struct {
Total uint64 `json:"total"`
Used uint64 `json:"used"`
} `json:"memory"`
GPUUsage float64 `json:"gpu_usage"`
SystemInfo struct {
Name string `json:"name"`
GPUCoreCount int `json:"gpu_core_count"`
} `json:"system_info"`
Fans []struct {
RPM int `json:"rpm"`
MinRPM int `json:"min_rpm"`
MaxRPM int `json:"max_rpm"`
} `json:"fans"`
Temperatures []struct {
Group string `json:"group"`
Avg float64 `json:"avg_celsius"`
} `json:"temperatures"`
}
// ioreg output uses ` = ` (with spaces) for top-level device properties and
// `=` (no spaces) for values inside nested dictionaries such as
// PerformanceStatistics.
var (
reIoregModel = regexp.MustCompile(`"model"\s*=\s*"([^"]+)"`)
reIoregCoreCount = regexp.MustCompile(`"gpu-core-count"\s*=\s*(\d+)`)
reIoregUtil = regexp.MustCompile(`"Device Utilization %"=(\d+)`)
reIoregMemUsed = regexp.MustCompile(`"In use system memory"=(\d+)`)
)
// ParseIoregOutput parses `ioreg -r -c IOGPU -d 1 -f` output into a GpuStat for
// the Apple Silicon integrated GPU. This is a fallback for when mactop is not
// installed: utilization and used memory are available, but power, temperature,
// and fan speed are not exposed by ioreg. memTotalMB is the unified memory size
// supplied by the caller, since Apple Silicon shares memory between CPU and GPU.
// Returns nil if no GPU device is found in the output.
func ParseIoregOutput(out []byte, memTotalMB int) *GpuStat {
utilMatch := reIoregUtil.FindSubmatch(out)
memMatch := reIoregMemUsed.FindSubmatch(out)
if utilMatch == nil && memMatch == nil {
return nil
}
var gpuUtil float64
if utilMatch != nil {
gpuUtil, _ = strconv.ParseFloat(string(utilMatch[1]), 64)
}
const toMB = 1024 * 1024
var memUsedMB int
if memMatch != nil {
memUsedBytes, _ := strconv.ParseInt(string(memMatch[1]), 10, 64)
memUsedMB = int(memUsedBytes / toMB)
}
var memUtil float64
if memTotalMB > 0 {
memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
}
name := "Apple GPU"
if m := reIoregModel.FindSubmatch(out); m != nil {
name = string(m[1])
}
if m := reIoregCoreCount.FindSubmatch(out); m != nil {
if cores, err := strconv.Atoi(string(m[1])); err == nil && cores > 0 {
name = fmt.Sprintf("%s (%d-core GPU)", name, cores)
}
}
return &GpuStat{
Timestamp: time.Now(),
ID: 0,
Name: name,
GpuUtilPct: gpuUtil,
MemUtilPct: memUtil,
MemUsedMB: memUsedMB,
MemTotalMB: memTotalMB,
}
}
// ParseMactopLine parses a single line of mactop headless JSON output into a
// GpuStat for the Apple Silicon integrated GPU. Returns nil if the line cannot
// be parsed.
func ParseMactopLine(line string) *GpuStat {
line = strings.TrimSpace(line)
if line == "" {
return nil
}
var out mactopOutput
if err := json.Unmarshal([]byte(line), &out); err != nil {
return nil
}
const toMB = 1024 * 1024
memUsedMB := int(out.Memory.Used / toMB)
memTotalMB := int(out.Memory.Total / toMB)
var memUtil float64
if memTotalMB > 0 {
memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
}
name := out.SystemInfo.Name
if name == "" {
name = "Apple GPU"
}
if out.SystemInfo.GPUCoreCount > 0 {
name = fmt.Sprintf("%s (%d-core GPU)", name, out.SystemInfo.GPUCoreCount)
}
// Unified memory has no dedicated VRAM sensor; use the memory temperature
// group when mactop exposes it.
var vramTempC int
for _, t := range out.Temperatures {
if strings.EqualFold(t.Group, "Memory") {
vramTempC = int(math.Round(t.Avg))
break
}
}
// Average fan load across all fans as a percentage of their RPM range.
var fanSpeed float64
var fanCount int
for _, f := range out.Fans {
if f.MaxRPM > f.MinRPM {
pct := float64(f.RPM-f.MinRPM) / float64(f.MaxRPM-f.MinRPM) * 100
if pct < 0 {
pct = 0
}
fanSpeed += pct
fanCount++
}
}
if fanCount > 0 {
fanSpeed /= float64(fanCount)
}
return &GpuStat{
Timestamp: time.Now(),
ID: 0,
Name: name,
TempC: int(math.Round(out.SocMetrics.GPUTemp)),
VramTempC: vramTempC,
GpuUtilPct: out.GPUUsage,
MemUtilPct: memUtil,
MemUsedMB: memUsedMB,
MemTotalMB: memTotalMB,
FanSpeedPct: fanSpeed,
PowerDrawW: out.SocMetrics.GPUPower,
}
}
+154 -1
View File
@@ -1,7 +1,11 @@
package perf
import (
"bufio"
"context"
"fmt"
"os/exec"
"strings"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
@@ -11,7 +15,156 @@ import (
)
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
return nil, ErrNotImplemented
if ch, err := tryMactop(ctx, every, logger); err == nil {
logger.Info("using mactop for GPU monitoring")
return ch, nil
} else {
logger.Debugf("mactop: %s", err.Error())
}
if ch, err := tryIoreg(ctx, every, logger); err == nil {
logger.Info("using ioreg for GPU monitoring")
return ch, nil
} else {
logger.Debugf("ioreg: %s", err.Error())
}
return nil, ErrNoGpuTool
}
// tryIoreg polls `ioreg -r -c IOGPU -d 1 -f` for Apple Silicon GPU stats. It is
// a fallback for when mactop is not installed. ioreg exposes GPU utilization and
// used memory but not power, temperature, or fan speed.
func tryIoreg(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
if _, err := exec.LookPath("ioreg"); err != nil {
return nil, ErrNoGpuTool
}
// Verify ioreg actually reports a GPU device before committing to it, so we
// can fall through to ErrNoGpuTool otherwise.
if stat := sampleIoreg(ctx); stat == nil {
return nil, fmt.Errorf("ioreg reported no GPU device")
}
if every < time.Second {
every = time.Second
}
ch := make(chan []GpuStat, 1)
go func() {
defer close(ch)
ticker := time.NewTicker(every)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
stat := sampleIoreg(ctx)
if stat == nil {
continue
}
select {
case ch <- []GpuStat{*stat}:
default:
}
}
}
}()
return ch, nil
}
// sampleIoreg runs ioreg once and parses a single GpuStat, or returns nil.
func sampleIoreg(ctx context.Context) *GpuStat {
out, err := exec.CommandContext(ctx, "ioreg", "-r", "-c", "IOGPU", "-d", "1", "-f").Output()
if err != nil {
return nil
}
var memTotalMB int
if vmStat, err := mem.VirtualMemory(); err == nil {
memTotalMB = int(vmStat.Total / (1024 * 1024))
}
return ParseIoregOutput(out, memTotalMB)
}
// overlayIoregMem replaces a GpuStat's memory fields with the GPU-attributed
// unified memory reported by ioreg. mactop only exposes whole-system memory, so
// without this the mactop and ioreg backends would report different memory
// semantics. It is a no-op when ioreg is unavailable or reports no GPU memory,
// leaving the mactop-supplied values in place.
func overlayIoregMem(ctx context.Context, stat *GpuStat) {
ioStat := sampleIoreg(ctx)
if ioStat == nil {
return
}
stat.MemUsedMB = ioStat.MemUsedMB
stat.MemTotalMB = ioStat.MemTotalMB
stat.MemUtilPct = ioStat.MemUtilPct
}
// tryMactop streams Apple Silicon GPU stats from mactop's headless mode.
// See https://github.com/metaspartan/mactop. mactop emits one JSON object per
// sample to stdout, which we parse into GpuStat.
func tryMactop(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
if _, err := exec.LookPath("mactop"); err != nil {
return nil, ErrNoGpuTool
}
// mactop samples power over the interval, so give it at least a second.
intervalMs := int(every.Milliseconds())
if intervalMs < 1000 {
intervalMs = 1000
}
cmd := exec.CommandContext(ctx, "mactop",
"--headless",
"--format", "json",
"--interval", fmt.Sprintf("%d", intervalMs),
)
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("mactop stdout pipe failed: %w", err)
}
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("mactop start failed: %w", err)
}
ch := make(chan []GpuStat, 1)
go func() {
defer close(ch)
scanner := bufio.NewScanner(stdout)
// mactop's JSON objects can be large; allow generous line lengths.
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" {
continue
}
stat := ParseMactopLine(line)
if stat != nil {
// mactop only reports whole-system memory; overlay ioreg's
// GPU-attributed unified memory so both backends are consistent.
overlayIoregMem(ctx, stat)
select {
case ch <- []GpuStat{*stat}:
default:
}
}
}
cmd.Wait()
}()
return ch, nil
}
func readSysStats() (SysStat, error) {
+47
View File
@@ -264,3 +264,50 @@ func TestParseNvidiaSmiLine_ZeroMemoryTotal(t *testing.T) {
require.NotNil(t, stat)
assert.Equal(t, 0.0, stat.MemUtilPct)
}
const ioregSample = `+-o AGXAcceleratorG13X <class AGXAcceleratorG13X, id 0x1000009a1, registered, matched, active, busy 0 (39191 ms), retain 108>
{
"model" = "Apple M1 Pro"
"gpu-core-count" = 16
"PerformanceStatistics" = {"In use system memory (driver)"=0,"Alloc system memory"=14511046656,"Tiler Utilization %"=34,"recoveryCount"=0,"Renderer Utilization %"=34,"Device Utilization %"=34,"In use system memory"=7688503296}
"IOClass" = "AGXAcceleratorG13X"
}`
func TestParseIoregOutput_ValidOutput(t *testing.T) {
const memTotalMB = 32768
stat := ParseIoregOutput([]byte(ioregSample), memTotalMB)
require.NotNil(t, stat)
assert.Equal(t, 0, stat.ID)
assert.Equal(t, "Apple M1 Pro (16-core GPU)", stat.Name)
assert.Equal(t, 34.0, stat.GpuUtilPct)
assert.Equal(t, 7688503296/(1024*1024), stat.MemUsedMB)
assert.Equal(t, memTotalMB, stat.MemTotalMB)
assert.InDelta(t, float64(stat.MemUsedMB)/memTotalMB*100, stat.MemUtilPct, 0.01)
// Not exposed by ioreg.
assert.Equal(t, 0, stat.TempC)
assert.Equal(t, 0.0, stat.PowerDrawW)
assert.Equal(t, 0.0, stat.FanSpeedPct)
}
func TestParseIoregOutput_NoGpuDevice(t *testing.T) {
stat := ParseIoregOutput([]byte("no gpu here"), 32768)
assert.Nil(t, stat)
}
func TestParseIoregOutput_ZeroMemTotal(t *testing.T) {
stat := ParseIoregOutput([]byte(ioregSample), 0)
require.NotNil(t, stat)
assert.Equal(t, 0.0, stat.MemUtilPct)
}
func TestParseIoregOutput_MissingModel(t *testing.T) {
const out = `"Device Utilization %"=50,"In use system memory"=1048576`
stat := ParseIoregOutput([]byte(out), 1024)
require.NotNil(t, stat)
assert.Equal(t, "Apple GPU", stat.Name)
assert.Equal(t, 50.0, stat.GpuUtilPct)
assert.Equal(t, 1, stat.MemUsedMB)
}
+41 -10
View File
@@ -30,6 +30,13 @@ var ErrStartAborted = fmt.Errorf("aborted")
// the stop request, and stays independent of the caller's graceful timeout.
const cmdWaitDelay = 10 * time.Second
// parentCancelGraceTimeout is the graceful timeout used when the process is
// torn down because parentCtx was cancelled (final router teardown or app
// shutdown). In the normal flow the process has already been stopped via
// Stop() by this point, so killProcess is a no-op kill; the short grace just
// bounds the rare case where a process is still alive when its context is cut.
const parentCancelGraceTimeout = time.Second
type runReq struct {
timeout time.Duration
respond chan error
@@ -180,7 +187,7 @@ func (p *ProcessCommand) run() {
setState(StateShutdown)
if cmd != nil {
p.handler.Store(nil)
p.killProcess(cmd, cmdCancel, cmdDone, 100*time.Millisecond)
p.killProcess(cmd, cmdCancel, cmdDone, parentCancelGraceTimeout)
cmd = nil
cmdDone = nil
cmdCancel = nil
@@ -315,7 +322,7 @@ func (p *ProcessCommand) run() {
setState(StateShutdown)
res := <-resultCh
if res.cmd != nil {
p.killProcess(res.cmd, res.cancel, res.cmdDone, 100*time.Millisecond)
p.killProcess(res.cmd, res.cancel, res.cmdDone, parentCancelGraceTimeout)
}
notifyWaiters(fmt.Errorf("[%s] shutdown", p.id))
respondRun(fmt.Errorf("[%s] shutdown", p.id))
@@ -425,12 +432,20 @@ func (p *ProcessCommand) doStart(startCtx context.Context, healthCheckTimeout ti
go func() {
waitErr := cmd.Wait()
if exitErr, ok := waitErr.(*exec.ExitError); ok {
p.proxyLogger.Debugf("<%s> process exited: code=%d, err=%v", p.id, exitErr.ExitCode(), waitErr)
} else if waitErr != nil {
p.proxyLogger.Debugf("<%s> process exited with error: %v", p.id, waitErr)
} else {
switch st := p.State(); {
case waitErr == nil:
p.proxyLogger.Debugf("<%s> process exited cleanly", p.id)
case st == StateStopping || st == StateShutdown:
// Expected: we force-terminated the process. A forced kill exits
// the child with a non-zero code (e.g. taskkill /f on Windows
// yields exit status 1), so this is not an error.
p.proxyLogger.Debugf("<%s> process stopped by llama-swap: %v", p.id, waitErr)
default:
if exitErr, ok := waitErr.(*exec.ExitError); ok {
p.proxyLogger.Debugf("<%s> process exited: code=%d, err=%v", p.id, exitErr.ExitCode(), waitErr)
} else {
p.proxyLogger.Debugf("<%s> process exited with error: %v", p.id, waitErr)
}
}
close(cmdDone)
}()
@@ -503,24 +518,40 @@ func (p *ProcessCommand) doStart(startCtx context.Context, healthCheckTimeout ti
// cmd's context is cancelled.
func (p *ProcessCommand) sendStopSignal(cmd *exec.Cmd) error {
if cmd == nil || cmd.Process == nil {
p.processLogger.Debugf("<%s> sendStopSignal() called with nil cmd or process, nothing to stop", p.id)
return nil
}
pid := cmd.Process.Pid
if p.config.CmdStop != "" {
p.processLogger.Debugf("<%s> sendStopSignal() using CmdStop %q for pid %d", p.id, p.config.CmdStop, pid)
stopArgs, err := config.SanitizeCommand(
strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", cmd.Process.Pid)),
strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", pid)),
)
if err == nil {
p.processLogger.Debugf("<%s> sendStopSignal() running stop command: %s", p.id, strings.Join(stopArgs, " "))
stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
stopCmd.Env = cmd.Env
setProcAttributes(stopCmd)
return stopCmd.Run()
runErr := stopCmd.Run()
if runErr != nil {
p.processLogger.Errorf("<%s> sendStopSignal() stop command failed: %v", p.id, runErr)
} else {
p.processLogger.Debugf("<%s> sendStopSignal() stop command completed for pid %d", p.id, pid)
}
return runErr
}
// fall through to SIGTERM if sanitize failed
p.processLogger.Errorf("<%s> sendStopSignal() failed to sanitize CmdStop %q: %v, falling back to terminateProcessTree", p.id, p.config.CmdStop, err)
}
// On Unix this SIGTERMs the whole process group so a forked grandchild
// (e.g. a shell wrapper that backgrounds the real binary) is taken down
// with the parent rather than orphaned.
return terminateProcessTree(cmd)
p.processLogger.Debugf("<%s> sendStopSignal() no CmdStop configured, calling terminateProcessTree for pid %d", p.id, pid)
termErr := terminateProcessTree(cmd)
if termErr != nil {
p.processLogger.Errorf("<%s> sendStopSignal() terminateProcessTree failed for pid %d: %v", p.id, pid, termErr)
}
return termErr
}
// killProcess terminates the upstream process. The flow:
+29 -12
View File
@@ -3,11 +3,13 @@
package process
import (
"fmt"
"os/exec"
"syscall"
)
// setProcAttributes sets platform-specific process attributes
// setProcAttributes sets platform-specific process attributes. CREATE_NO_WINDOW
// keeps the upstream from spawning its own console window.
func setProcAttributes(cmd *exec.Cmd) {
cmd.SysProcAttr = &syscall.SysProcAttr{
HideWindow: true,
@@ -15,22 +17,37 @@ func setProcAttributes(cmd *exec.Cmd) {
}
}
// terminateProcessTree asks the upstream process to stop. Windows has no
// process-group signalling here — process-tree teardown is handled by the
// configured CmdStop, which defaults to `taskkill /f /t` — so this preserves
// the previous single-process SIGTERM behaviour.
// terminateProcessTree requests a graceful shutdown of the whole process tree
// rooted at cmd.Process. Windows has no SIGTERM or process-group signalling, so
// we shell out to `taskkill /t`, which walks the child tree by PID — the
// equivalent of signalling a Unix process group. Without /f, taskkill asks the
// processes to close rather than force-killing them.
func terminateProcessTree(cmd *exec.Cmd) error {
if cmd == nil || cmd.Process == nil {
return nil
}
return cmd.Process.Signal(syscall.SIGTERM)
return taskkillProcessTree(cmd, false)
}
// killProcessTree force-terminates the upstream process. Tree teardown on
// Windows relies on CmdStop (taskkill /t); this kills the launched process.
// killProcessTree force-terminates the whole process tree rooted at cmd.Process
// via `taskkill /f /t`, so any descendant that ignored or outlived the graceful
// request is killed alongside the parent rather than leaked as an orphan.
func killProcessTree(cmd *exec.Cmd) error {
return taskkillProcessTree(cmd, true)
}
// taskkillProcessTree runs taskkill against cmd.Process.Pid. The /t flag
// terminates the process together with any child processes it started, which is
// the Windows analogue of signalling a Unix process group via its negative PID.
// When force is true the /f flag force-kills; otherwise taskkill requests a
// graceful close.
func taskkillProcessTree(cmd *exec.Cmd, force bool) error {
if cmd == nil || cmd.Process == nil {
return nil
}
return cmd.Process.Kill()
args := make([]string, 0, 4)
if force {
args = append(args, "/f")
}
args = append(args, "/t", "/pid", fmt.Sprintf("%d", cmd.Process.Pid))
kill := exec.Command("taskkill", args...)
setProcAttributes(kill)
return kill.Run()
}
+7
View File
@@ -0,0 +1,7 @@
//go:build !windows
package process
// SetupTreeCleanup is a no-op on non-Windows platforms, where upstream process
// teardown is handled via process-group signalling (see runtime_unix.go).
func SetupTreeCleanup() error { return nil }
+50
View File
@@ -0,0 +1,50 @@
//go:build windows
package process
import (
"fmt"
"unsafe"
"golang.org/x/sys/windows"
)
// SetupTreeCleanup assigns the current process to a Windows Job Object
// configured with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. Upstream processes
// spawned afterwards are associated with the same job, so when llama-swap exits
// for any reason — graceful shutdown, a forced second Ctrl+C, or a crash — the
// OS terminates the whole job and reaps every child instead of leaving orphans
// behind. It is the parent-side complement to the per-process teardown in
// runtime_windows.go.
//
// The job handle is intentionally leaked for the lifetime of the process: the
// kill-on-close behaviour fires when the last handle is released, which the OS
// does when the process exits.
func SetupTreeCleanup() error {
job, err := windows.CreateJobObject(nil, nil)
if err != nil {
return fmt.Errorf("CreateJobObject: %w", err)
}
info := windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION{
BasicLimitInformation: windows.JOBOBJECT_BASIC_LIMIT_INFORMATION{
LimitFlags: windows.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
},
}
if _, err := windows.SetInformationJobObject(
job,
windows.JobObjectExtendedLimitInformation,
uintptr(unsafe.Pointer(&info)),
uint32(unsafe.Sizeof(info)),
); err != nil {
windows.CloseHandle(job)
return fmt.Errorf("SetInformationJobObject: %w", err)
}
if err := windows.AssignProcessToJobObject(job, windows.CurrentProcess()); err != nil {
windows.CloseHandle(job)
return fmt.Errorf("AssignProcessToJobObject: %w", err)
}
return nil
}
+21
View File
@@ -75,10 +75,21 @@ type baseRouter struct {
logger *logmon.Monitor
planner swapPlanner
// shutdownCtx governs the request machinery: cancelling it tells grant()
// and ServeHTTP to stop granting and reject callers. It is deliberately
// separate from procCtx — see procCtx below.
shutdownCtx context.Context
shutdownFn context.CancelFunc
shuttingDown atomic.Bool
// procCtx is the parent context for every managed process and governs
// process lifetime only. handleShutdown stops processes gracefully via
// Stop() and cancels procCtx afterwards, so teardown is never a context
// cancel racing the graceful path (which collapsed the grace to 100ms and
// let the caller return before children were reaped — see process run loop).
procCtx context.Context
procCancel context.CancelFunc
handlerCh chan handlerReq
shutdownCh chan shutdownReq
unloadCh chan unloadReq
@@ -97,6 +108,7 @@ type baseRouter struct {
func newBaseRouter(name string, conf config.Config, processes map[string]process.Process, planner swapPlanner, logger *logmon.Monitor) *baseRouter {
shutdownCtx, shutdownFn := context.WithCancel(context.Background())
procCtx, procCancel := context.WithCancel(context.Background())
return &baseRouter{
name: name,
config: conf,
@@ -105,6 +117,8 @@ func newBaseRouter(name string, conf config.Config, processes map[string]process
planner: planner,
shutdownCtx: shutdownCtx,
shutdownFn: shutdownFn,
procCtx: procCtx,
procCancel: procCancel,
handlerCh: make(chan handlerReq),
shutdownCh: make(chan shutdownReq),
unloadCh: make(chan unloadReq),
@@ -492,6 +506,8 @@ func (b *baseRouter) handleShutdown(req shutdownReq, active map[string]*activeSw
// The grant calls below then either land (waiter happened to receive
// before noticing shutdown) or fall through immediately via grant's
// shutdownCtx case — either way the waiter sees a non-OK response.
// This does NOT touch processes: their lifetime is procCtx, cancelled
// only after the graceful Stop() calls below have reaped them.
b.shutdownFn()
for _, s := range active {
@@ -535,6 +551,11 @@ func (b *baseRouter) handleShutdown(req shutdownReq, active map[string]*activeSw
<-done
}
// Every process is stopped (children reaped via Stop()). Cancel procCtx so
// the process run-loop goroutines exit; they are already StateStopped, so
// this is a clean no-op kill rather than a forced teardown.
b.procCancel()
req.respond <- nil
}
+3 -1
View File
@@ -36,12 +36,14 @@ func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group
modelCfg, _, ok := conf.FindConfig(mid)
if !ok {
base.shutdownFn()
base.procCancel()
return nil, fmt.Errorf("no model config for %q", mid)
}
procLog := logmon.NewWriter(upstreamlog)
p, err := process.New(base.shutdownCtx, mid, modelCfg, procLog, proxylog)
p, err := process.New(base.procCtx, mid, modelCfg, procLog, proxylog)
if err != nil {
base.shutdownFn()
base.procCancel()
return nil, fmt.Errorf("creating process for %q: %w", mid, err)
}
processes[mid] = p
+2 -1
View File
@@ -31,9 +31,10 @@ func NewMatrix(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Matr
for mid, modelCfg := range conf.Models {
procLog := logmon.NewWriter(upstreamlog)
p, err := process.New(base.shutdownCtx, mid, modelCfg, procLog, proxylog)
p, err := process.New(base.procCtx, mid, modelCfg, procLog, proxylog)
if err != nil {
base.shutdownFn()
base.procCancel()
return nil, fmt.Errorf("creating process for %q: %w", mid, err)
}
processes[mid] = p
+30 -2
View File
@@ -19,6 +19,7 @@ import (
"github.com/mostlygeek/llama-swap/internal/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/perf"
"github.com/mostlygeek/llama-swap/internal/process"
"github.com/mostlygeek/llama-swap/internal/server"
"github.com/mostlygeek/llama-swap/internal/shared"
"github.com/mostlygeek/llama-swap/internal/watcher"
@@ -122,6 +123,13 @@ func main() {
applyLogSettings(cfg)
proxyLog.Debugf("PID: %d", os.Getpid())
// On Windows, bind the process tree to a Job Object so every upstream
// process is reaped when llama-swap exits — even on a forced kill. No-op
// elsewhere. Non-fatal: a failure just falls back to per-process teardown.
if err := process.SetupTreeCleanup(); err != nil {
proxyLog.Warnf("failed to set up process tree cleanup: %v", err)
}
// perfMon outlives config reloads; its config is updated in place.
var perfMon *perf.Monitor
if !cfg.Performance.Disabled {
@@ -267,6 +275,16 @@ func main() {
proxyLog.Infof("received signal %v, shutting down", sig)
watcherCancel()
// Backstop against a stalled shutdown: force the process to
// exit once the whole graceful sequence has had its full budget.
// On Windows the Job Object reaps upstream processes on exit, so
// a forced exit still cleans up rather than orphaning children.
go func() {
time.Sleep(shutdownTimeout + 5*time.Second)
proxyLog.Warnf("graceful shutdown exceeded %v, forcing exit", shutdownTimeout)
os.Exit(1)
}()
activeMu.RLock()
srv := activeSrv
activeMu.RUnlock()
@@ -275,13 +293,23 @@ func main() {
// drain without blocking on them for the full timeout.
srv.CloseStreams()
shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
// Both phases share a single deadline so total shutdown is
// bounded by shutdownTimeout rather than 2x it.
deadline := time.Now().Add(shutdownTimeout)
shutdownCtx, cancel := context.WithDeadline(context.Background(), deadline)
defer cancel()
if err := httpServer.Shutdown(shutdownCtx); err != nil {
proxyLog.Warnf("http server shutdown error: %v", err)
}
if err := srv.Shutdown(shutdownTimeout); err != nil {
// Clamp the remaining budget to a small positive value: a
// non-positive timeout makes the router fall back to its own
// healthCheckTimeout, which would defeat the shared deadline.
remaining := time.Until(deadline)
if remaining <= 0 {
remaining = time.Millisecond
}
if err := srv.Shutdown(remaining); err != nil {
proxyLog.Warnf("router shutdown error: %v", err)
}
+1 -1
View File
@@ -402,7 +402,7 @@
<p class="text-sm text-txtsecondary">
This is an experimental feature. Please use <a
class="underline hover:text-txtmain"
href="https://github.com/mostlygeek/llama-swap/discussions/771">discussion #711</a
href="https://github.com/mostlygeek/llama-swap/discussions/771">discussion #771</a
> for instructions and to share feedback.
</p>