remove noisy debug print message

add guard to avoid unnecessary logic in Process.Shutdown
Add linux install and uninstall shell scripts (#139 )
2025-05-19 15:36:15 -07:00 · 2025-05-19 15:34:30 -07:00 · 2025-05-19 12:03:33 -07:00 · 2025-05-16 19:54:44 -07:00 · 2025-05-16 13:52:04 -07:00 · 2025-05-16 13:48:42 -07:00
22 changed files with 735 additions and 101 deletions
@@ -0,0 +1,37 @@
+---
+name: Bug Report
+about: Something is not working as expected...
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Expected behaviour**
+A clear and concise description of what you expected to happen.
+
+**Operating system and version**
+
+- OS: (linux, osx, windows, freebsd, etc)
+- GPUs: (list architecture)
+
+**My Configuration**
+
+```yaml
+# copy / paste your configuration here
+```
+
+**Proxy Logs**
+
+```
+# copy / paste from /logs
+```
+
+**Upstream Logs**
+
+```
+# copy/paste from /logs
+```
@@ -0,0 +1,50 @@
+name: Windows CI
+
+on:
+  push:
+    branches: [ "main" ]
+
+  pull_request:
+    branches: [ "main" ]
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+
+jobs:
+
+  run-tests:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.23'
+
+    # cache simple-responder to save the build time
+    - name: Restore Simple Responder
+      id: restore-simple-responder
+      uses: actions/cache/restore@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
+    # necessary for testing proxy/Process swapping
+    - name: Create simple-responder
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      shell: bash
+      run: make simple-responder-windows
+
+    - name: Save Simple Responder
+      # nothing new to save ... skip this step
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      id: save-simple-responder
+      uses: actions/cache/save@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
+    - name: Test all
+      shell: bash
+      run: make test-all
@@ -1,6 +1,4 @@
-# This workflow will build a golang project
-
-name: CI
+name: Linux CI

 on:
  push:
@@ -24,9 +22,26 @@ jobs:
      with:
        go-version: '1.23'

+    # cache simple-responder to save the build time
+    - name: Restore Simple Responder
+      id: restore-simple-responder
+      uses: actions/cache/restore@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
    # necessary for testing proxy/Process swapping
    - name: Create simple-responder
      run: make simple-responder

+    - name: Save Simple Responder
+      # nothing new to save ... skip this step
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      id: save-simple-responder
+      uses: actions/cache/save@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
    - name: Test all
      run: make test-all
@@ -20,10 +20,10 @@ clean:
 	rm -rf $(BUILD_DIR)

 test:
-	go test -short -v ./proxy
+	go test -short -v -count=1 ./proxy

 test-all:
-	go test -v ./proxy
+	go test -v -count=1 ./proxy

 # Build OSX binary
 mac:
@@ -46,6 +46,10 @@ simple-responder:
 	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 misc/simple-responder/simple-responder.go
 	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder_linux_amd64 misc/simple-responder/simple-responder.go

+# Build the simple-responder test helper as a Windows amd64 executable,
+# written to $(BUILD_DIR)/simple-responder.exe
+simple-responder-windows:
+	@echo "Building simple responder for windows"
+	GOOS=windows GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder.exe misc/simple-responder/simple-responder.go
+
 # Ensure build directory exists
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -46,14 +46,14 @@ llama-swap's configuration is purposefully simple.
 models:
  "qwen2.5":
    proxy: "http://127.0.0.1:9999"
-    cmd: >
+    cmd: |
      /app/llama-server
      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
      --port 9999

  "smollm2":
    proxy: "http://127.0.0.1:9999"
-    cmd: >
+    cmd: |
      /app/llama-server
      -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
      --port 9999
@@ -82,7 +82,7 @@ startPort: 10001
 models:
  "llama":
    # multiline for readability
-    cmd: >
+    cmd: |
      llama-server --port 8999
      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf

@@ -123,12 +123,18 @@ models:
  # Docker Support (v26.1.4+ required!)
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
-    cmd: >
+    cmd: |
      docker run --name dockertest
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

+    # use a custom command to stop the model when swapping. By default
+    # this is SIGTERM on POSIX systems, and taskkill on Windows systems
+    # the ${PID} variable can be used in cmdStop, it will be automatically replaced
+    # with the PID of the running model
+    cmdStop: docker stop dockertest
+
 # Groups provide advanced controls over model swapping behaviour. Using groups
 # some models can be kept loaded indefinitely, while others are swapped out.
 #
@@ -247,11 +253,11 @@ Pre-built binaries are available for Linux, FreeBSD and Darwin (OSX). These are
 1. Create a configuration file, see [config.example.yaml](config.example.yaml)
 1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
 1. Run the binary with `llama-swap --config path/to/config.yaml`.
-  Available flags:
-    - `--config`: Path to the configuration file (default: `config.yaml`).
-    - `--listen`: Address and port to listen on (default: `:8080`).
-    - `--version`: Show version information and exit.
-    - `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).
+   Available flags:
+   - `--config`: Path to the configuration file (default: `config.yaml`).
+   - `--listen`: Address and port to listen on (default: `:8080`).
+   - `--version`: Show version information and exit.
+   - `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).

 ### Building from source

@@ -15,7 +15,7 @@ groups:

 models:
  "llama":
-    cmd: >
+    cmd: |
      models/llama-server-osx
      --port ${PORT}
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
@@ -38,7 +38,7 @@ models:
  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
-    cmd: >
+    cmd: |
      models/llama-server-osx --port ${PORT}
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
@@ -51,7 +51,7 @@ models:
  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
-    cmd: >
+    cmd: |
      models/llama-server-osx --port ${PORT}
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
@@ -59,7 +59,7 @@ models:

  # Docker Support (v26.1.4+ required!)
  "dockertest":
-    cmd: >
+    cmd: |
      docker run --name dockertest
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
@@ -5,7 +5,6 @@ go 1.23.0
 require (
 	github.com/fsnotify/fsnotify v1.9.0
 	github.com/gin-gonic/gin v1.10.0
-	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
 	github.com/stretchr/testify v1.9.0
 	github.com/tidwall/gjson v1.18.0
 	github.com/tidwall/sjson v1.2.5
@@ -13,6 +12,7 @@ require (
 )

 require (
+	github.com/billziss-gh/golib v0.2.0 // indirect
 	github.com/bytedance/sonic v1.11.6 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
@@ -1,3 +1,5 @@
+github.com/billziss-gh/golib v0.2.0 h1:NyvcAQdfvM8xokKkKotiligKjKXzuQD4PPykg1nKc/8=
+github.com/billziss-gh/golib v0.2.0/go.mod h1:mZpUYANXZkDKSnyYbX9gfnyxwe0ddRhUtfXcsD5r8dw=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
@@ -26,6 +26,8 @@ func main() {

 	silent := flag.Bool("silent", false, "disable all logging")

+	ignoreSigTerm := flag.Bool("ignore-sig-term", false, "ignore SIGTERM signal")
+
 	flag.Parse() // Parse the command-line flags

 	// Create a new Gin router
@@ -190,6 +192,10 @@ func main() {
 		log.SetOutput(io.Discard)
 	}

+	if !*silent {
+		fmt.Printf("My PID: %d\n", os.Getpid())
+	}
+
 	go func() {
 		log.Printf("simple-responder listening on %s\n", address)
 		// service connections
@@ -200,11 +206,36 @@ func main() {

 	// Wait for interrupt signal to gracefully shutdown the server with
 	// a timeout of 5 seconds.
-	quit := make(chan os.Signal, 1)
+	sigChan := make(chan os.Signal, 1)
 	// kill (no param) default send syscall.SIGTERM
 	// kill -2 is syscall.SIGINT
 	// kill -9 is syscall.SIGKILL but can't be catch, so don't need add it
-	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
-	<-quit
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	countSigInt := 0
+
+runloop:
+	for {
+		signal := <-sigChan
+		switch signal {
+		case syscall.SIGINT:
+			countSigInt++
+			if countSigInt > 1 {
+				break runloop
+			} else {
+				log.Println("Recieved SIGINT, send another SIGINT to shutdown")
+			}
+		case syscall.SIGTERM:
+			if *ignoreSigTerm {
+				log.Println("Ignoring SIGTERM")
+			} else {
+				log.Println("Recieved SIGTERM, shutting down")
+				break runloop
+			}
+		default:
+			break runloop
+		}
+	}
+
 	log.Println("simple-responder shutting down")
 }
@@ -4,11 +4,12 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"runtime"
 	"sort"
 	"strconv"
 	"strings"

-	"github.com/google/shlex"
+	"github.com/billziss-gh/golib/shlex"
 	"gopkg.in/yaml.v3"
 )

@@ -16,6 +17,7 @@ const DEFAULT_GROUP_ID = "(default)"

 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
+	CmdStop       string   `yaml:"cmdStop"`
 	Proxy         string   `yaml:"proxy"`
 	Aliases       []string `yaml:"aliases"`
 	Env           []string `yaml:"env"`
@@ -134,7 +136,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 	}

-	// iterate over the models and replace any ${PORT} with the next available port
 	// Get and sort all model IDs first, makes testing more consistent
 	modelIds := make([]string, 0, len(config.Models))
 	for modelId := range config.Models {
@@ -142,10 +143,10 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	}
 	sort.Strings(modelIds) // This guarantees stable iteration order

-	// iterate over the sorted models
 	nextPort := config.StartPort
 	for _, modelId := range modelIds {
 		modelConfig := config.Models[modelId]
+		// iterate over the models and replace any ${PORT} with the next available port
 		if strings.Contains(modelConfig.Cmd, "${PORT}") {
 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
 			if modelConfig.Proxy == "" {
@@ -159,6 +160,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
 		}
 	}
+
 	config = AddDefaultGroupToConfig(config)
 	// check that members are all unique in the groups
 	memberUsage := make(map[string]string) // maps member to group it appears in
@@ -228,14 +230,30 @@ func AddDefaultGroupToConfig(config Config) Config {
 }

 func SanitizeCommand(cmdStr string) ([]string, error) {
-	// Remove trailing backslashes
-	cmdStr = strings.ReplaceAll(cmdStr, "\\ \n", " ")
-	cmdStr = strings.ReplaceAll(cmdStr, "\\\n", " ")
+	var cleanedLines []string
+	for _, line := range strings.Split(cmdStr, "\n") {
+		trimmed := strings.TrimSpace(line)
+		// Skip comment lines
+		if strings.HasPrefix(trimmed, "#") {
+			continue
+		}
+		// Handle trailing backslashes by replacing with space
+		if strings.HasSuffix(trimmed, "\\") {
+			cleanedLines = append(cleanedLines, strings.TrimSuffix(trimmed, "\\")+" ")
+		} else {
+			cleanedLines = append(cleanedLines, line)
+		}
+	}
+
+	// put it back together
+	cmdStr = strings.Join(cleanedLines, "\n")

 	// Split the command into arguments
-	args, err := shlex.Split(cmdStr)
-	if err != nil {
-		return nil, err
+	var args []string
+	if runtime.GOOS == "windows" {
+		args = shlex.Windows.Split(cmdStr)
+	} else {
+		args = shlex.Posix.Split(cmdStr)
 	}

 	// Ensure the command is not empty
@@ -0,0 +1,42 @@
+//go:build !windows
+
+package proxy
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// Test a command with spaces and newlines
+	args, err := SanitizeCommand(`python model1.py \
+		-a "double quotes" \
+		--arg2 'single quotes'
+		-s
+		# comment 1
+		--arg3 123 \
+
+		  # comment 2
+		--arg4 '"string in string"'
+
+
+		# this will get stripped out as well as the white space above
+		-c "'single quoted'"
+		`)
+	assert.NoError(t, err)
+	assert.Equal(t, []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"--arg2", "single quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", `"string in string"`,
+		"-c", `'single quoted'`,
+	}, args)
+
+	// Test an empty command
+	args, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, args)
+}
@@ -258,34 +258,6 @@ func TestConfig_FindConfig(t *testing.T) {
 	assert.Equal(t, ModelConfig{}, modelConfig)
 }

-func TestConfig_SanitizeCommand(t *testing.T) {
-
-	// Test a command with spaces and newlines
-	args, err := SanitizeCommand(`python model1.py \
-    -a "double quotes" \
-    --arg2 'single quotes'
-	-s
-	--arg3 123 \
-	--arg4 '"string in string"'
-	-c "'single quoted'"
-	`)
-	assert.NoError(t, err)
-	assert.Equal(t, []string{
-		"python", "model1.py",
-		"-a", "double quotes",
-		"--arg2", "single quotes",
-		"-s",
-		"--arg3", "123",
-		"--arg4", `"string in string"`,
-		"-c", `'single quoted'`,
-	}, args)
-
-	// Test an empty command
-	args, err = SanitizeCommand("")
-	assert.Error(t, err)
-	assert.Nil(t, args)
-}
-
 func TestConfig_AutomaticPortAssignments(t *testing.T) {

 	t.Run("Default Port Ranges", func(t *testing.T) {
@@ -0,0 +1,41 @@
+//go:build windows
+
+package proxy
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// does not support single quoted strings like in config_posix_test.go
+	args, err := SanitizeCommand(`python model1.py \
+
+	-a "double quotes" \
+	-s
+	--arg3 123 \
+
+	   # comment 2
+	--arg4 '"string in string"'
+
+
+
+	# this will get stripped out as well as the white space above
+	-c "'single quoted'"
+	`)
+	assert.NoError(t, err)
+	assert.Equal(t, []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", "'string in string'", // this is a little weird but the lexer says so...?
+		"-c", `'single quoted'`,
+	}, args)
+
+	// Test an empty command
+	args, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, args)
+}
@@ -45,17 +45,26 @@ func TestMain(m *testing.M) {
 func getSimpleResponderPath() string {
 	goos := runtime.GOOS
 	goarch := runtime.GOARCH
-	return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
+
+	if goos == "windows" {
+		return filepath.Join("..", "build", "simple-responder.exe")
+	} else {
+		return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
+	}
 }

-func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
+func getTestPort() int {
 	portMutex.Lock()
 	defer portMutex.Unlock()

 	port := nextTestPort
 	nextTestPort++

-	return getTestSimpleResponderConfigPort(expectedMessage, port)
+	return port
+}
+
+func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
+	return getTestSimpleResponderConfigPort(expectedMessage, getTestPort())
 }

 func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
@@ -8,6 +8,7 @@ import (
 	"net/http"
 	"net/url"
 	"os/exec"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -67,6 +68,12 @@ type Process struct {

 	// for managing concurrency limits
 	concurrencyLimitSemaphore chan struct{}
+
+	// stop timeout waiting for graceful shutdown
+	gracefulStopTimeout time.Duration
+
+	// track that this happened
+	upstreamWasStoppedWithKill bool
 }

 func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
@@ -74,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
 	concurrentLimit := 10
 	if config.ConcurrencyLimit > 0 {
 		concurrentLimit = config.ConcurrencyLimit
-	} else {
-		proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
 	}
+
 	return &Process{
 		ID:                      ID,
 		config:                  config,
@@ -92,6 +98,10 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo

 		// concurrency limit
 		concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
+
+		// stop timeout
+		gracefulStopTimeout:        5 * time.Second,
+		upstreamWasStoppedWithKill: false,
 	}
 }

@@ -138,7 +148,9 @@ func isValidTransition(from, to ProcessState) bool {
 		return to == StateStopping
 	case StateStopping:
 		return to == StateStopped || to == StateShutdown
-	case StateFailed, StateShutdown:
+	case StateFailed:
+		return to == StateStopping
+	case StateShutdown:
 		return false // No transitions allowed from these states
 	}
 	return false
@@ -208,6 +220,15 @@ func (p *Process) start() error {
 	go func() {
 		exitErr := p.cmd.Wait()
 		p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
+
+		// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
+		// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
+		if p.upstreamWasStoppedWithKill {
+			p.proxyLogger.Debugf("<%s> process was killed, NOT sending exitErr: %v", p.ID, exitErr)
+			p.upstreamWasStoppedWithKill = false
+			return
+		}
+
 		p.cmdWaitChan <- exitErr
 	}()

@@ -339,16 +360,23 @@ func (p *Process) StopImmediately() {
 		return
 	}

-	p.proxyLogger.Debugf("<%s> Stopping process", p.ID)
+	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
+	currentState := p.CurrentState()

-	// calling Stop() when state is invalid is a no-op
-	if curState, err := p.swapState(StateReady, StateStopping); err != nil {
-		p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
-		return
+	if currentState == StateFailed {
+		if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
+			p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+			return
+		}
+	} else {
+		if curState, err := p.swapState(StateReady, StateStopping); err != nil {
+			p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+			return
+		}
 	}

 	// stop the process with a graceful exit timeout
-	p.stopCommand(5 * time.Second)
+	p.stopCommand(p.gracefulStopTimeout)

 	if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
 		p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
@@ -360,8 +388,14 @@ func (p *Process) StopImmediately() {
 // is in the state of starting, it will cancel it and shut it down. Once a process is in
 // the StateShutdown state, it can not be started again.
 func (p *Process) Shutdown() {
+	if !isValidTransition(p.CurrentState(), StateStopping) {
+		return
+	}
+
 	p.shutdownCancel()
-	p.stopCommand(5 * time.Second)
+	p.stopCommand(p.gracefulStopTimeout)
+
+	// just force it to this state since there is no recovery from shutdown
 	p.state = StateShutdown
 }

@@ -381,13 +415,44 @@ func (p *Process) stopCommand(sigtermTTL time.Duration) {
 		return
 	}

-	if err := p.terminateProcess(); err != nil {
-		p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
+	// if err := p.terminateProcess(); err != nil {
+	// 	p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
+	// }
+	// the default cmdStop to taskkill /f /t /pid ${PID}
+	if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
+		p.config.CmdStop = "taskkill /f /t /pid ${PID}"
+	}
+
+	if p.config.CmdStop != "" {
+		// replace ${PID} with the pid of the process
+		stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
+		if err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
+			return
+		}
+
+		p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
+
+		stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
+		stopCmd.Stdout = p.processLogger
+		stopCmd.Stderr = p.processLogger
+		stopCmd.Env = p.config.Env
+
+		if err := stopCmd.Run(); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
+			return
+		}
+	} else {
+		if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
+			return
+		}
 	}

 	select {
 	case <-sigtermTimeout.Done():
 		p.proxyLogger.Debugf("<%s> Process timed out waiting to stop, sending KILL signal (normal during shutdown)", p.ID)
+		p.upstreamWasStoppedWithKill = true
 		if err := p.cmd.Process.Kill(); err != nil {
 			p.proxyLogger.Errorf("<%s> Failed to kill process: %v", p.ID, err)
 		}
@@ -1,9 +0,0 @@
-//go:build !windows
-
-package proxy
-
-import "syscall"
-
-func (p *Process) terminateProcess() error {
-	return p.cmd.Process.Signal(syscall.SIGTERM)
-}
@@ -1,14 +0,0 @@
-//go:build windows
-
-package proxy
-
-import (
-	"fmt"
-	"os/exec"
-)
-
-func (p *Process) terminateProcess() error {
-	pid := fmt.Sprintf("%d", p.cmd.Process.Pid)
-	cmd := exec.Command("taskkill", "/f", "/t", "/pid", pid)
-	return cmd.Run()
-}
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"runtime"
 	"sync"
 	"testing"
 	"time"
@@ -393,3 +394,77 @@ func TestProcess_StopImmediately(t *testing.T) {
 	process.StopImmediately()
 	assert.Equal(t, process.CurrentState(), StateStopped)
 }
+
+// Test that SIGKILL is sent when gracefulStopTimeout is reached and properly terminates
+// the upstream command. The upstream is started with --ignore-sig-term so the graceful
+// stop cannot succeed, and the process must still end up in StateStopped.
+func TestProcess_ForceStopWithKill(t *testing.T) {
+
+	expectedMessage := "test_sigkill"
+	binaryPath := getSimpleResponderPath()
+	port := getTestPort()
+
+	config := ModelConfig{
+		// note --ignore-sig-term which ignores the SIGTERM signal so a SIGKILL must be sent
+		// to force the process to exit
+		Cmd:           fmt.Sprintf("%s --port %d --respond %s --silent --ignore-sig-term", binaryPath, port, expectedMessage),
+		Proxy:         fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint: "/health",
+	}
+
+	process := NewProcess("stop_immediate", 2, config, debugLogger, debugLogger)
+	defer process.Stop()
+
+	// reduce to make testing go faster
+	process.gracefulStopTimeout = time.Second
+
+	err := process.start()
+	assert.Nil(t, err)
+	assert.Equal(t, process.CurrentState(), StateReady)
+
+	// closed by the goroutine below once its assertions on the response have run
+	waitChan := make(chan struct{})
+	go func() {
+		// slow, but will get killed by StopImmediately
+		req := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=2s", nil)
+		w := httptest.NewRecorder()
+		process.ProxyRequest(w, req)
+
+		// StatusOK because that was already sent before the kill
+		assert.Equal(t, http.StatusOK, w.Code)
+
+		// unexpected EOF because the kill happened, the "1" is sent before the kill
+		// then the unexpected EOF is sent after the kill
+		if runtime.GOOS == "windows" {
+			assert.Contains(t, w.Body.String(), "wsarecv: An existing connection was forcibly closed by the remote host")
+		} else {
+			assert.Contains(t, w.Body.String(), "unexpected EOF")
+		}
+
+		close(waitChan)
+	}()
+
+	// brief pause before stopping; presumably gives the goroutine above time to
+	// issue its request first (NOTE(review): confirm 1ms is enough on slow CI)
+	<-time.After(time.Millisecond)
+	process.StopImmediately()
+	assert.Equal(t, process.CurrentState(), StateStopped)
+
+	// the request should have been interrupted by SIGKILL
+	<-waitChan
+}
+
+// Test that a process configured with a custom CmdStop (using the ${PID}
+// placeholder) can be started, reaches StateReady, and ends in StateStopped
+// after StopImmediately(). The stop command is taskkill on Windows and
+// kill -TERM everywhere else.
+func TestProcess_StopCmd(t *testing.T) {
+	config := getTestSimpleResponderConfig("test_stop_cmd")
+
+	if runtime.GOOS == "windows" {
+		config.CmdStop = "taskkill /f /t /pid ${PID}"
+	} else {
+		config.CmdStop = "kill -TERM ${PID}"
+	}
+
+	process := NewProcess("testStopCmd", 2, config, debugLogger, debugLogger)
+	defer process.Stop()
+
+	err := process.start()
+	assert.Nil(t, err)
+	assert.Equal(t, process.CurrentState(), StateReady)
+	process.StopImmediately()
+	assert.Equal(t, process.CurrentState(), StateStopped)
+}
@@ -334,7 +334,33 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {

 	// Iterate over sorted keys
 	for _, modelID := range modelIDs {
-		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+		// Get process state
+		processGroup := pm.findGroupByModelName(modelID)
+		var state string
+		if processGroup != nil {
+			process := processGroup.processes[modelID]
+			if process != nil {
+				var stateStr string
+				switch process.CurrentState() {
+				case StateReady:
+					stateStr = "Ready"
+				case StateStarting:
+					stateStr = "Starting"
+				case StateStopping:
+					stateStr = "Stopping"
+				case StateFailed:
+					stateStr = "Failed"
+				case StateShutdown:
+					stateStr = "Shutdown"
+				case StateStopped:
+					stateStr = "Stopped"
+				default:
+					stateStr = "Unknown"
+				}
+				state = stateStr
+			}
+		}
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a> - %s</li>", modelID, modelID, state))
 	}
 	html.WriteString("</ul></body></html>")
 	c.Header("Content-Type", "text/html")
@@ -374,7 +400,8 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {

 	// dechunk it as we already have all the body bytes see issue #11
 	c.Request.Header.Del("transfer-encoding")
-	c.Request.Header.Add("content-length", strconv.Itoa(len(bodyBytes)))
+	c.Request.Header.Set("content-length", strconv.Itoa(len(bodyBytes)))
+	c.Request.ContentLength = int64(len(bodyBytes))

 	if err := processGroup.ProxyRequest(realModelName, c.Writer, c.Request); err != nil {
 		pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error proxying request: %s", err.Error()))
@@ -14,6 +14,7 @@ import (
 	"time"

 	"github.com/stretchr/testify/assert"
+	"github.com/tidwall/gjson"
 )

 func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
@@ -339,7 +340,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
 			"model1": getTestSimpleResponderConfig("model1"),
 			"model2": getTestSimpleResponderConfig("model2"),
 		},
-		LogLevel: "debug",
+		LogLevel: "warn",
 	})

 	// Define a helper struct to parse the JSON response.
@@ -448,7 +449,6 @@ func TestProxyManager_AudioTranscriptionHandler(t *testing.T) {
 // Test useModelName in configuration sends overrides what is sent to upstream
 func TestProxyManager_UseModelName(t *testing.T) {
 	upstreamModelName := "upstreamModel"
-
 	modelConfig := getTestSimpleResponderConfig(upstreamModelName)
 	modelConfig.UseModelName = upstreamModelName

@@ -473,6 +473,12 @@ func TestProxyManager_UseModelName(t *testing.T) {
 		proxy.ServeHTTP(w, req)
 		assert.Equal(t, http.StatusOK, w.Code)
 		assert.Contains(t, w.Body.String(), upstreamModelName)
+
+		// make sure the content length was set correctly
+		// simple-responder will return the content length it got in the response
+		body := w.Body.Bytes()
+		contentLength := int(gjson.GetBytes(body, "h_content_length").Int())
+		assert.Equal(t, len(fmt.Sprintf(`{"model":"%s"}`, upstreamModelName)), contentLength)
 	})

 	t.Run("useModelName over rides requested model: /v1/audio/transcriptions", func(t *testing.T) {
@@ -0,0 +1,189 @@
+#!/bin/sh
+# This script installs llama-swap on Linux.
+# It detects the current operating system architecture and installs the appropriate version of llama-swap.
+
+set -eu
+
+red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
+
+status() { echo ">>> $*" >&2; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*"; }
+
+# available CMD: succeeds when `command -v` can resolve CMD, fails otherwise.
+available() { command -v $1 >/dev/null; }
+# require TOOL...: prints the subset of the given tools that are NOT available,
+# separated by spaces. Prints an empty line when every tool was found, so the
+# caller can test the captured output with `[ -n ... ]`.
+require() {
+    local MISSING=''
+    for TOOL in $*; do
+        if ! available $TOOL; then
+            MISSING="$MISSING $TOOL"
+        fi
+    done
+
+    # unquoted on purpose: word splitting drops the leading space in $MISSING
+    echo $MISSING
+}
+
+SUDO=
+if [ "$(id -u)" -ne 0 ]; then
+    if ! available sudo; then
+        error "This script requires superuser permissions. Please re-run as root."
+    fi
+
+    SUDO="sudo"
+fi
+
+NEEDS=$(require curl tee jq tar)
+if [ -n "$NEEDS" ]; then
+    status "ERROR: The following tools are required but missing:"
+    for NEED in $NEEDS; do
+        echo "  - $NEED"
+    done
+    exit 1
+fi
+
+[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
+
+ARCH=$(uname -m)
+case "$ARCH" in
+    x86_64) ARCH="amd64" ;;
+    aarch64|arm64) ARCH="arm64" ;;
+    *) error "Unsupported architecture: $ARCH" ;;
+esac
+
+IS_WSL2=false
+
+KERN=$(uname -r)
+case "$KERN" in
+    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
+    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
+    *) ;;
+esac
+
+# Download the latest llama-swap release for this architecture and extract the
+# llama-swap binary into /usr/local/bin.
+# Reads $ARCH (already normalised to amd64/arm64) and $SUDO.
+download_binary() {
+    ASSET_NAME="linux_$ARCH"
+
+    # Fetch the latest release info and extract the matching asset URL
+    # (any asset whose name contains $ASSET_NAME is selected)
+    DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
+        jq -r --arg name "$ASSET_NAME" \
+        '.assets[] | select(.name | contains($name)) | .browser_download_url')
+
+    # Check if a URL was successfully extracted
+    if [ -z "$DL_URL" ]; then
+        error "No matching asset found with name containing '$ASSET_NAME'."
+    fi
+
+    status "Downloading Linux $ARCH binary"
+    # stream the archive straight into tar; only the llama-swap member is extracted
+    curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
+}
+download_binary
+
+# Set up llama-swap as a systemd service:
+#   - create the llama-swap system user (home: /usr/share/llama-swap) if missing
+#   - add it to the render/video/docker groups when those groups exist
+#   - add the invoking user to the llama-swap group
+#   - write a default config.yaml, only when none exists yet
+#   - write /etc/systemd/system/llama-swap.service
+#   - enable the service and schedule a restart for script exit, when systemd
+#     is running; otherwise print a warning
+# Every privileged command is prefixed with $SUDO, which is empty when the
+# script already runs as root.
+configure_systemd() {
+    if ! id llama-swap >/dev/null 2>&1; then
+        status "Creating llama-swap user..."
+        $SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
+    fi
+    if getent group render >/dev/null 2>&1; then
+        status "Adding llama-swap user to render group..."
+        $SUDO usermod -a -G render llama-swap
+    fi
+    if getent group video >/dev/null 2>&1; then
+        status "Adding llama-swap user to video group..."
+        $SUDO usermod -a -G video llama-swap
+    fi
+    if getent group docker >/dev/null 2>&1; then
+        status "Adding llama-swap user to docker group..."
+        $SUDO usermod -a -G docker llama-swap
+    fi
+
+    status "Adding current user to llama-swap group..."
+    $SUDO usermod -a -G llama-swap $(whoami)
+
+    if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
+        status "Creating default config.yaml..."
+        # Write the file via $SUDO and hand ownership to the service user
+        # afterwards. "$SUDO -u llama-swap tee ..." must not be used here: when
+        # the script runs as root $SUDO is empty, so the command would expand
+        # to "-u llama-swap tee ..." and abort the install.
+        cat <<EOF | $SUDO tee /usr/share/llama-swap/config.yaml >/dev/null
+# default 15s likely to fail for default models due to downloading models
+healthCheckTimeout: 60
+
+models:
+  "qwen2.5":
+    cmd: |
+      docker run
+        --rm
+        -p \${PORT}:8080
+        --name qwen2.5
+      ghcr.io/ggml-org/llama.cpp:server
+        -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+    cmdStop: docker stop qwen2.5
+
+  "smollm2":
+    cmd: |
+      docker run
+        --rm
+        -p \${PORT}:8080
+        --name smollm2
+      ghcr.io/ggml-org/llama.cpp:server
+        -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
+    cmdStop: docker stop smollm2
+EOF
+        $SUDO chown llama-swap:llama-swap /usr/share/llama-swap/config.yaml
+    fi
+
+    status "Creating llama-swap systemd service..."
+    cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
+[Unit]
+Description=llama-swap
+After=network.target
+
+[Service]
+User=llama-swap
+Group=llama-swap
+
+# set this to match your environment
+ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
+
+Restart=on-failure
+RestartSec=3
+StartLimitBurst=3
+StartLimitInterval=30
+
+[Install]
+WantedBy=multi-user.target
+EOF
+    # "|| true" keeps `set -e` from aborting when systemd reports a
+    # non-running state through a non-zero exit status
+    SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
+    case $SYSTEMCTL_RUNNING in
+        running|degraded)
+            status "Enabling and starting llama-swap service..."
+            $SUDO systemctl daemon-reload
+            $SUDO systemctl enable llama-swap
+
+            # the (re)start is deferred to script exit via the EXIT trap
+            start_service() { $SUDO systemctl restart llama-swap; }
+            trap start_service EXIT
+            ;;
+        *)
+            warning "systemd is not running"
+            if [ "$IS_WSL2" = true ]; then
+                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
+            fi
+            ;;
+    esac
+}
+
+if available systemctl; then
+    configure_systemd
+fi
+
+install_success() {
+    status 'The llama-swap API is now available at 127.0.0.1:8080.'
+    status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
+    status 'Install complete.'
+}
+
+# WSL2 only supports GPUs via nvidia passthrough
+# so check for nvidia-smi to determine if GPU is available
+if [ "$IS_WSL2" = true ]; then
+    if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
+        status "Nvidia GPU detected."
+    fi
+    # report completion before leaving; without this WSL2 installs exit
+    # silently and never print the "Install complete" summary
+    install_success
+    exit 0
+fi
+
+install_success
@@ -0,0 +1,68 @@
+#!/bin/sh
+# This script uninstalls llama-swap on Linux.
+# It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
+
+set -eu
+
+red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
+
+status() { echo ">>> $*" >&2; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*"; }
+
+available() { command -v $1 >/dev/null; }
+
+# Privilege prefix for commands that need root: empty when the script is
+# already running as uid 0, "sudo" otherwise. Without sudo there is no way
+# to escalate, so the script stops with an error.
+SUDO=
+if [ "$(id -u)" -ne 0 ]; then
+    available sudo || error "This script requires superuser permissions. Please re-run as root."
+    SUDO="sudo"
+fi
+
+configure_systemd() {
+    status "Stopping llama-swap service..."
+    $SUDO systemctl stop llama-swap
+
+    status "Disabling llama-swap service..."
+    $SUDO systemctl disable llama-swap
+}
+if available systemctl; then
+    configure_systemd
+fi
+
+if available llama-swap; then
+    status "Removing llama-swap binary..."
+    $SUDO rm $(which llama-swap)
+fi
+
+if [ -f "/usr/share/llama-swap/config.yaml" ]; then
+    while true; do
+        printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
+        read answer
+        case "$answer" in
+            [Yy]* ) 
+                $SUDO rm -r /usr/share/llama-swap
+                break
+                ;;
+            [Nn]* | "" ) 
+                break
+                ;;
+            * ) 
+                echo "Invalid input. Please enter y or n."
+                ;;
+        esac
+    done
+fi
+
+# Remove the llama-swap account, but only if `id` can resolve it.
+if id llama-swap >/dev/null 2>&1; then
+    status "Removing llama-swap user..."
+    $SUDO userdel llama-swap
+fi
+
+# Remove the llama-swap group if it still exists at this point; the getent
+# guard skips groupdel when no such group is found.
+if getent group llama-swap >/dev/null 2>&1; then
+    status "Removing llama-swap group..."
+    $SUDO groupdel llama-swap
+fi
Author	SHA1	Message	Date
Benson Wong	e7af671d8e	remove noisy debug print message	2025-05-19 15:36:15 -07:00
Benson Wong	8e62098eef	add guard to avoid unnecessary logic in Process.Shutdown	2025-05-19 15:34:30 -07:00
choyuansu	c260907415	Add linux install and uninstall shell scripts (#139 ) Contribution for install, and uninstall llama-swap in linux.	2025-05-19 12:03:33 -07:00
Benson Wong	b83a5fa291	make Failed stated recoverable (#137 ) A process in the failed state can transition to stopped either by calling /unload or swapping to another model.	2025-05-16 19:54:44 -07:00
Benson Wong	6e2ff28d59	improve cmdStop docs [no ci]	2025-05-16 13:52:04 -07:00
Benson Wong	a8b81f2799	Add stopCmd for custom stopping instructions (#136 ) Allow configuration of how a model is stopped before swapping. Setting `cmdStop` in the configuration will override the default behaviour and enables better integration with other process/container managers like docker or podman.	2025-05-16 13:48:42 -07:00
Benson Wong	f9ee7156dc	update configuration examples for multiline yaml commands #133	2025-05-16 11:45:39 -07:00
fakezeta	2d00120781	Update proxymanager.go (#135 )	2025-05-16 06:45:09 -07:00
Benson Wong	afc9aef058	Fix #133 SanitizeCommand removes comments (#134 )	2025-05-15 15:28:50 -07:00
Benson Wong	d7b390df74	Add GH Action for Testing on Windows (#132 ) * Add windows specific test changes * Change the command line parsing library - Possible breaking changes for windows users!	2025-05-14 21:51:53 -07:00
Benson Wong	5025c2f1f3	Add GH windows tests (not working yet)	2025-05-14 19:58:22 -07:00
Benson Wong	e3a0b013c1	add content length test for #131	2025-05-14 19:50:01 -07:00
Fadenfire	f5763a94a0	Fix content length being incorrect when useModelName is used (#131 ) * Fix content length being incorrect when useModelName is used * Update c.Request.ContentLength as well	2025-05-14 19:37:54 -07:00
Benson Wong	8ada72eb57	Update issue templates	2025-05-14 16:36:32 -07:00
Benson Wong	2441b383d3	Make checking for process killed status more robust	2025-05-14 16:26:56 -07:00
Benson Wong	25f251699c	Prevent StateFailed after SIGKILL (#129 ) Closes #125	2025-05-14 10:47:35 -07:00
Benson Wong	7f37bcc6eb	Improve testing around using SIGKILL (#127 ) * Add test for SIGKILL of process * silent TestProxyManager_RunningEndpoint debug output * Ref #125	2025-05-13 21:21:52 -07:00