Add macros to Configuration schema (#149 )

* Add macros to Configuration schema * update docs
small doc update [skip ci]
2025-05-29 21:51:25 -07:00 · 2025-05-26 16:03:27 -07:00 · 2025-05-26 15:46:08 -07:00 · 2025-05-26 09:57:53 -07:00 · 2025-05-23 22:54:43 -07:00 · 2025-05-23 09:39:55 -07:00
23 changed files with 950 additions and 270 deletions
@@ -0,0 +1,37 @@
+---
+name: Bug Report
+about: Something is not working as expected...
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Expected behaviour**
+A clear and concise description of what you expected to happen.
+
+**Operating system and version**
+
+- OS: (linux, osx, windows, freebsd, etc)
+- GPUs: (list architecture)
+
+**My Configuration**
+
+```yaml
+# copy / paste your configuration here
+```
+
+**Proxy Logs**
+
+```
+# copy / paste from /logs
+```
+
+**Upstream Logs**
+
+```
+# copy/paste from /logs
+```
@@ -15,7 +15,8 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        platform: [intel, cuda, vulkan, cpu, musa]
+        #platform: [intel, cuda, vulkan, cpu, musa]
+        platform: [cuda, vulkan, cpu, musa]
      fail-fast: false
    steps:
      - name: Checkout code
@@ -0,0 +1,50 @@
+# Runs the Go test suite on a Windows runner for pushes and pull requests
+# that target main; it can also be started manually.
+name: Windows CI
+
+on:
+  push:
+    branches: [ "main" ]
+
+  pull_request:
+    branches: [ "main" ]
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+
+jobs:
+
+  run-tests:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.23'
+
+    # cache simple-responder to save the build time
+    # the key changes whenever simple-responder.go changes
+    - name: Restore Simple Responder
+      id: restore-simple-responder
+      uses: actions/cache/restore@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
+    # necessary for testing proxy/Process swapping
+    - name: Create simple-responder
+      # skip the build when the cached ./build directory was restored
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      shell: bash
+      run: make simple-responder-windows
+
+    - name: Save Simple Responder
+      # nothing new to save ... skip this step
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      id: save-simple-responder
+      uses: actions/cache/save@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
+    - name: Test all
+      shell: bash
+      run: make test-all
@@ -1,6 +1,4 @@
-# This workflow will build a golang project
-
-name: CI
+name: Linux CI

 on:
  push:
@@ -24,9 +22,26 @@ jobs:
      with:
        go-version: '1.23'

+    # cache simple-responder to save the build time
+    - name: Restore Simple Responder
+      id: restore-simple-responder
+      uses: actions/cache/restore@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
    # necessary for testing proxy/Process swapping
    - name: Create simple-responder
      run: make simple-responder

+    - name: Save Simple Responder
+      # nothing new to save ... skip this step
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      id: save-simple-responder
+      uses: actions/cache/save@v4
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+
    - name: Test all
      run: make test-all
@@ -20,10 +20,10 @@ clean:
 	rm -rf $(BUILD_DIR)

 test:
-	go test -short -v ./proxy
+	go test -short -v -count=1 ./proxy

 test-all:
-	go test -v ./proxy
+	go test -v -count=1 ./proxy

 # Build OSX binary
 mac:
@@ -46,6 +46,10 @@ simple-responder:
 	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 misc/simple-responder/simple-responder.go
 	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder_linux_amd64 misc/simple-responder/simple-responder.go

+simple-responder-windows:
+	@echo "Building simple responder for windows"
+	GOOS=windows GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder.exe misc/simple-responder/simple-responder.go
+
 # Ensure build directory exists
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -7,7 +7,7 @@

 llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.

-Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.

 ## Features:

@@ -45,151 +45,31 @@ llama-swap's configuration is purposefully simple.
 ```yaml
 models:
  "qwen2.5":
-    proxy: "http://127.0.0.1:9999"
-    cmd: >
+    cmd: |
      /app/llama-server
      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}

  "smollm2":
-    proxy: "http://127.0.0.1:9999"
-    cmd: >
+    cmd: |
      /app/llama-server
      -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}
 ```

-<details>
-<summary>But also very powerful ...</summary>
+But also very powerful:

-```yaml
-# Seconds to wait for llama.cpp to load and be ready to serve requests
-# Default (and minimum) is 15 seconds
-healthCheckTimeout: 60
+- ⚡ `groups` to run multiple models at once
+- ⚡ `macros` for reusable snippets
+- ⚡ `ttl` to automatically unload models
+- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
+- ⚡ `env` variables to pass custom environment to inference servers
+- ⚡ `useModelName` to override model names sent to upstream servers
+- ⚡ `healthCheckTimeout` to control model startup wait times
+- ⚡ `${PORT}` automatic port variables for dynamic port assignment
+- ⚡ Docker/podman compatible

-# Valid log levels: debug, info (default), warn, error
-logLevel: info
-
-# Automatic Port Values
-# use ${PORT} in model.cmd and model.proxy to use an automatic port number
-# when you use ${PORT} you can omit a custom model.proxy value, as it will
-# default to http://localhost:${PORT}
-
-# override the default port (5800) for automatic port values
-startPort: 10001
-
-# define valid model values and the upstream server start
-models:
-  "llama":
-    # multiline for readability
-    cmd: >
-      llama-server --port 8999
-      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
-
-    # environment variables to pass to the command
-    env:
-      - "CUDA_VISIBLE_DEVICES=0"
-
-    # where to reach the server started by cmd, make sure the ports match
-    # can be omitted if you use an automatic ${PORT} in cmd
-    proxy: http://127.0.0.1:8999
-
-    # aliases names to use this model for
-    aliases:
-      - "gpt-4o-mini"
-      - "gpt-3.5-turbo"
-
-    # check this path for an HTTP 200 OK before serving requests
-    # default: /health to match llama.cpp
-    # use "none" to skip endpoint checking, but may cause HTTP errors
-    # until the model is ready
-    checkEndpoint: /custom-endpoint
-
-    # automatically unload the model after this many seconds
-    # ttl values must be a value greater than 0
-    # default: 0 = never unload model
-    ttl: 60
-
-    # `useModelName` overrides the model name in the request
-    # and sends a specific name to the upstream server
-    useModelName: "qwen:qwq"
-
-  # unlisted models do not show up in /v1/models or /upstream lists
-  # but they can still be requested as normal
-  "qwen-unlisted":
-    unlisted: true
-    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
-
-  # Docker Support (v26.1.4+ required!)
-  "docker-llama":
-    proxy: "http://127.0.0.1:${PORT}"
-    cmd: >
-      docker run --name dockertest
-      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
-      ghcr.io/ggerganov/llama.cpp:server
-      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
-
-# Groups provide advanced controls over model swapping behaviour. Using groups
-# some models can be kept loaded indefinitely, while others are swapped out.
-#
-# Tips:
-#
-#  - models must be defined above in the Models section
-#  - a model can only be a member of one group
-#  - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
-#  - see issue #109 for details
-#
-# NOTE: the example below uses model names that are not defined above for demonstration purposes
-groups:
-  # group1 is the default behaviour of llama-swap where only one model is allowed
-  # to run a time across the whole llama-swap instance
-  "group1":
-    # swap controls the model swapping behaviour in within the group
-    # - true : only one model is allowed to run at a time
-    # - false: all models can run together, no swapping
-    swap: true
-
-    # exclusive controls how the group affects other groups
-    # - true: causes all other groups to unload their models when this group runs a model
-    # - false: does not affect other groups
-    exclusive: true
-
-    # members references the models defined above
-    members:
-      - "llama"
-      - "qwen-unlisted"
-
-  # models in this group are never unloaded
-  "group2":
-    swap: false
-    exclusive: false
-    members:
-      - "docker-llama"
-      # (not defined above, here for example)
-      - "modelA"
-      - "modelB"
-
-  "forever":
-    # setting persistent to true causes the group to never be affected by the swapping behaviour of
-    # other groups. It is a shortcut to keeping some models always loaded.
-    persistent: true
-
-    # set swap/exclusive to false to prevent swapping inside the group and effect on other groups
-    swap: false
-    exclusive: false
-    members:
-      - "forever-modelA"
-      - "forever-modelB"
-      - "forever-modelc"
-```
-
-### Use Case Examples
-
- [config.example.yaml](config.example.yaml) includes example for supporting `v1/embeddings` and `v1/rerank` endpoints
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds from 20% to 40%. This example includes a configurations Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
-</details>
+Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for full documentation.

 ## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

@@ -247,11 +127,11 @@ Pre-built binaries are available for Linux, FreeBSD and Darwin (OSX). These are
 1. Create a configuration file, see [config.example.yaml](config.example.yaml)
 1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
 1. Run the binary with `llama-swap --config path/to/config.yaml`.
-  Available flags:
-    - `--config`: Path to the configuration file (default: `config.yaml`).
-    - `--listen`: Address and port to listen on (default: `:8080`).
-    - `--version`: Show version information and exit.
-    - `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).
+   Available flags:
+   - `--config`: Path to the configuration file (default: `config.yaml`).
+   - `--listen`: Address and port to listen on (default: `:8080`).
+   - `--version`: Show version information and exit.
+   - `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).

 ### Building from source

@@ -292,32 +172,6 @@ Any OpenAI compatible server would work. llama-swap was originally designed for

 For Python based inference servers like vllm or tabbyAPI it is recommended to run them via podman or docker. This provides clean environment isolation as well as responding correctly to `SIGTERM` signals to shutdown.

-## Systemd Unit Files
-
-Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.
-
-`/etc/systemd/system/llama-swap.service`
-
-```
-[Unit]
-Description=llama-swap
-After=network.target
-
-[Service]
-User=nobody
-
-# set this to match your environment
-ExecStart=/path/to/llama-swap --config /path/to/llama-swap.config.yml
-
-Restart=on-failure
-RestartSec=3
-StartLimitBurst=3
-StartLimitInterval=30
-
-[Install]
-WantedBy=multi-user.target
-```
-
 ## Star History

 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -15,7 +15,7 @@ groups:

 models:
  "llama":
-    cmd: >
+    cmd: |
      models/llama-server-osx
      --port ${PORT}
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
@@ -38,7 +38,7 @@ models:
  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
-    cmd: >
+    cmd: |
      models/llama-server-osx --port ${PORT}
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
@@ -51,7 +51,7 @@ models:
  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
-    cmd: >
+    cmd: |
      models/llama-server-osx --port ${PORT}
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
@@ -59,7 +59,7 @@ models:

  # Docker Support (v26.1.4+ required!)
  "dockertest":
-    cmd: >
+    cmd: |
      docker run --name dockertest
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
@@ -5,7 +5,6 @@ go 1.23.0
 require (
 	github.com/fsnotify/fsnotify v1.9.0
 	github.com/gin-gonic/gin v1.10.0
-	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
 	github.com/stretchr/testify v1.9.0
 	github.com/tidwall/gjson v1.18.0
 	github.com/tidwall/sjson v1.2.5
@@ -13,6 +12,7 @@ require (
 )

 require (
+	github.com/billziss-gh/golib v0.2.0 // indirect
 	github.com/bytedance/sonic v1.11.6 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
@@ -1,3 +1,5 @@
+github.com/billziss-gh/golib v0.2.0 h1:NyvcAQdfvM8xokKkKotiligKjKXzuQD4PPykg1nKc/8=
+github.com/billziss-gh/golib v0.2.0/go.mod h1:mZpUYANXZkDKSnyYbX9gfnyxwe0ddRhUtfXcsD5r8dw=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
@@ -26,6 +26,8 @@ func main() {

 	silent := flag.Bool("silent", false, "disable all logging")

+	ignoreSigTerm := flag.Bool("ignore-sig-term", false, "ignore SIGTERM signal")
+
 	flag.Parse() // Parse the command-line flags

 	// Create a new Gin router
@@ -190,6 +192,10 @@ func main() {
 		log.SetOutput(io.Discard)
 	}

+	if !*silent {
+		fmt.Printf("My PID: %d\n", os.Getpid())
+	}
+
 	go func() {
 		log.Printf("simple-responder listening on %s\n", address)
 		// service connections
@@ -200,11 +206,36 @@ func main() {

 	// Wait for interrupt signal to gracefully shutdown the server with
 	// a timeout of 5 seconds.
-	quit := make(chan os.Signal, 1)
+	sigChan := make(chan os.Signal, 1)
 	// kill (no param) default send syscall.SIGTERM
 	// kill -2 is syscall.SIGINT
 	// kill -9 is syscall.SIGKILL but can't be catch, so don't need add it
-	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
-	<-quit
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	countSigInt := 0
+
+runloop:
+	for {
+		signal := <-sigChan
+		switch signal {
+		case syscall.SIGINT:
+			countSigInt++
+			if countSigInt > 1 {
+				break runloop
+			} else {
+				log.Println("Received SIGINT, send another SIGINT to shutdown")
+			}
+		case syscall.SIGTERM:
+			if *ignoreSigTerm {
+				log.Println("Ignoring SIGTERM")
+			} else {
+				log.Println("Received SIGTERM, shutting down")
+				break runloop
+			}
+		default:
+			break runloop
+		}
+	}
+
 	log.Println("simple-responder shutting down")
 }
@@ -4,11 +4,13 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"regexp"
+	"runtime"
 	"sort"
 	"strconv"
 	"strings"

-	"github.com/google/shlex"
+	"github.com/billziss-gh/golib/shlex"
 	"gopkg.in/yaml.v3"
 )

@@ -16,6 +18,7 @@ const DEFAULT_GROUP_ID = "(default)"

 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
+	CmdStop       string   `yaml:"cmdStop"`
 	Proxy         string   `yaml:"proxy"`
 	Aliases       []string `yaml:"aliases"`
 	Env           []string `yaml:"env"`
@@ -65,6 +68,9 @@ type Config struct {
 	Profiles           map[string][]string    `yaml:"profiles"`
 	Groups             map[string]GroupConfig `yaml:"groups"` /* key is group ID */

+	// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
+	Macros map[string]string `yaml:"macros"`
+
 	// map aliases to actual model IDs
 	aliases map[string]string

@@ -111,7 +117,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		return Config{}, err
 	}

-	if config.HealthCheckTimeout < 15 {
+	if config.HealthCheckTimeout == 0 {
+		// this high default timeout helps avoid failing health checks
+		// for configurations that wait for docker or have slower startup
+		config.HealthCheckTimeout = 120
+	} else if config.HealthCheckTimeout < 15 {
+		// set a minimum of 15 seconds
 		config.HealthCheckTimeout = 15
 	}

@@ -134,7 +145,30 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 	}

-	// iterate over the models and replace any ${PORT} with the next available port
+	/* check macro constraint rules:
+
+	- name must fit the regex ^[a-zA-Z0-9_-]+$
+	- names must be less than 64 characters (no reason, just cause)
+	- name can not be any reserved macros: PORT
+	- macro values must be less than 1024 characters
+	*/
+	// Validate every user-defined macro before it is substituted into any
+	// model field. Both length bounds are exclusive: a name may be at most
+	// 63 characters and a value at most 1023 characters.
+	macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
+	for macroName, macroValue := range config.Macros {
+		if len(macroName) >= 64 {
+			return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
+		}
+		if !macroNameRegex.MatchString(macroName) {
+			return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
+		}
+		if len(macroValue) >= 1024 {
+			// a 1024-character value is already rejected, so the largest
+			// accepted length reported to the user is 1023
+			return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1023 characters", macroName)
+		}
+		// reserved names are rejected; a switch keeps the list easy to extend
+		switch macroName {
+		case "PORT":
+			return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
+		}
+	}
+
 	// Get and sort all model IDs first, makes testing more consistent
 	modelIds := make([]string, 0, len(config.Models))
 	for modelId := range config.Models {
@@ -142,23 +176,56 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	}
 	sort.Strings(modelIds) // This guarantees stable iteration order

-	// iterate over the sorted models
 	nextPort := config.StartPort
 	for _, modelId := range modelIds {
 		modelConfig := config.Models[modelId]
-		if strings.Contains(modelConfig.Cmd, "${PORT}") {
-			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
+
+		// go through model config fields: cmd, cmdStop, proxy, checkEndPoint and replace macros with macro values
+		for macroName, macroValue := range config.Macros {
+			macroSlug := fmt.Sprintf("${%s}", macroName)
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
+			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
+		}
+
+		// only iterate over models that use ${PORT} to keep port numbers from increasing unnecessarily
+		if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
 			if modelConfig.Proxy == "" {
-				modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
-			} else {
-				modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
+				modelConfig.Proxy = "http://localhost:${PORT}"
 			}
+
+			nextPortStr := strconv.Itoa(nextPort)
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
 			nextPort++
-			config.Models[modelId] = modelConfig
 		} else if modelConfig.Proxy == "" {
 			return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
 		}
+
+		// make sure there are no unknown macros that have not been replaced
+		macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
+		fieldMap := map[string]string{
+			"cmd":           modelConfig.Cmd,
+			"cmdStop":       modelConfig.CmdStop,
+			"proxy":         modelConfig.Proxy,
+			"checkEndpoint": modelConfig.CheckEndpoint,
+		}
+
+		for fieldName, fieldValue := range fieldMap {
+			matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
+			for _, match := range matches {
+				macroName := match[1]
+				if _, exists := config.Macros[macroName]; !exists {
+					return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
+				}
+			}
+		}
+
+		config.Models[modelId] = modelConfig
 	}
+
 	config = AddDefaultGroupToConfig(config)
 	// check that members are all unique in the groups
 	memberUsage := make(map[string]string) // maps member to group it appears in
@@ -228,14 +295,30 @@ func AddDefaultGroupToConfig(config Config) Config {
 }

 func SanitizeCommand(cmdStr string) ([]string, error) {
-	// Remove trailing backslashes
-	cmdStr = strings.ReplaceAll(cmdStr, "\\ \n", " ")
-	cmdStr = strings.ReplaceAll(cmdStr, "\\\n", " ")
+	var cleanedLines []string
+	for _, line := range strings.Split(cmdStr, "\n") {
+		trimmed := strings.TrimSpace(line)
+		// Skip comment lines
+		if strings.HasPrefix(trimmed, "#") {
+			continue
+		}
+		// Handle trailing backslashes by replacing with space
+		if strings.HasSuffix(trimmed, "\\") {
+			cleanedLines = append(cleanedLines, strings.TrimSuffix(trimmed, "\\")+" ")
+		} else {
+			cleanedLines = append(cleanedLines, line)
+		}
+	}
+
+	// put it back together
+	cmdStr = strings.Join(cleanedLines, "\n")

 	// Split the command into arguments
-	args, err := shlex.Split(cmdStr)
-	if err != nil {
-		return nil, err
+	var args []string
+	if runtime.GOOS == "windows" {
+		args = shlex.Windows.Split(cmdStr)
+	} else {
+		args = shlex.Posix.Split(cmdStr)
 	}

 	// Ensure the command is not empty
@@ -0,0 +1,42 @@
+//go:build !windows
+
+package proxy
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// Test a command with spaces and newlines
+	args, err := SanitizeCommand(`python model1.py \
+		-a "double quotes" \
+		--arg2 'single quotes'
+		-s
+		# comment 1
+		--arg3 123 \
+
+		  # comment 2
+		--arg4 '"string in string"'
+
+
+		# this will get stripped out as well as the white space above
+		-c "'single quoted'"
+		`)
+	assert.NoError(t, err)
+	assert.Equal(t, []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"--arg2", "single quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", `"string in string"`,
+		"-c", `'single quoted'`,
+	}, args)
+
+	// Test an empty command
+	args, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, args)
+}
@@ -19,6 +19,8 @@ func TestConfig_Load(t *testing.T) {

 	tempFile := filepath.Join(tempDir, "config.yaml")
 	content := `
+macros:
+  svr-path: "path/to/server"
 models:
  model1:
    cmd: path/to/cmd --arg1 one
@@ -31,7 +33,7 @@ models:
      - "VAR2=value2"
    checkEndpoint: "/health"
  model2:
-    cmd: path/to/cmd --arg1 one
+    cmd: ${svr-path} --arg1 one
    proxy: "http://localhost:8081"
    aliases:
      - "m2"
@@ -76,6 +78,9 @@ groups:

 	expected := Config{
 		StartPort: 5800,
+		Macros: map[string]string{
+			"svr-path": "path/to/server",
+		},
 		Models: map[string]ModelConfig{
 			"model1": {
 				Cmd:           "path/to/cmd --arg1 one",
@@ -85,7 +90,7 @@ groups:
 				CheckEndpoint: "/health",
 			},
 			"model2": {
-				Cmd:           "path/to/cmd --arg1 one",
+				Cmd:           "path/to/server --arg1 one",
 				Proxy:         "http://localhost:8081",
 				Aliases:       []string{"m2"},
 				Env:           nil,
@@ -258,34 +263,6 @@ func TestConfig_FindConfig(t *testing.T) {
 	assert.Equal(t, ModelConfig{}, modelConfig)
 }

-func TestConfig_SanitizeCommand(t *testing.T) {
-
-	// Test a command with spaces and newlines
-	args, err := SanitizeCommand(`python model1.py \
-    -a "double quotes" \
-    --arg2 'single quotes'
-	-s
-	--arg3 123 \
-	--arg4 '"string in string"'
-	-c "'single quoted'"
-	`)
-	assert.NoError(t, err)
-	assert.Equal(t, []string{
-		"python", "model1.py",
-		"-a", "double quotes",
-		"--arg2", "single quotes",
-		"-s",
-		"--arg3", "123",
-		"--arg4", `"string in string"`,
-		"-c", `'single quoted'`,
-	}, args)
-
-	// Test an empty command
-	args, err = SanitizeCommand("")
-	assert.Error(t, err)
-	assert.Nil(t, args)
-}
-
 func TestConfig_AutomaticPortAssignments(t *testing.T) {

 	t.Run("Default Port Ranges", func(t *testing.T) {
@@ -359,3 +336,106 @@ models:
 		assert.Equal(t, "model model1 requires a proxy value when not using automatic ${PORT}", err.Error())
 	})
 }
+
+// TestConfig_MacroReplacement loads a configuration that defines several
+// macros and checks that they are expanded inside a model's cmd and cmdStop.
+// One macro value itself contains ${PORT}; the expected output shows it
+// resolving to the configured startPort (9990).
+func TestConfig_MacroReplacement(t *testing.T) {
+	content := `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+  argOne: "--arg1"
+  argTwo: "--arg2"
+  autoPort: "--port ${PORT}"
+
+models:
+  model1:
+    cmd: |
+      ${svr-path} ${argTwo}
+      # the automatic ${PORT} is replaced
+      ${autoPort}
+      ${argOne}
+      --arg3 three
+    cmdStop: |
+      /path/to/stop.sh --port ${PORT} ${argTwo}
+`
+
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	// the multi-line cmd is compared after SanitizeCommand splits it into
+	// arguments; the comment line inside cmd is absent from the expected output
+	sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
+	assert.NoError(t, err)
+	assert.Equal(t, "path/to/server --arg2 --port 9990 --arg1 --arg3 three", strings.Join(sanitizedCmd, " "))
+
+	// cmdStop must receive the same port value and macro expansion as cmd
+	sanitizedCmdStop, err := SanitizeCommand(config.Models["model1"].CmdStop)
+	assert.NoError(t, err)
+	assert.Equal(t, "/path/to/stop.sh --port 9990 --arg2", strings.Join(sanitizedCmdStop, " "))
+}
+
+// TestConfig_MacroErrorOnUnknownMacros checks that loading a configuration
+// fails when a model field references a macro that was never defined.
+// Each case places ${unknownMacro} in a different field (cmd, cmdStop, proxy,
+// checkEndpoint) and expects the error message to name that field.
+func TestConfig_MacroErrorOnUnknownMacros(t *testing.T) {
+	tests := []struct {
+		name    string
+		field   string
+		content string
+	}{
+		{
+			name:  "unknown macro in cmd",
+			field: "cmd",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: |
+      ${svr-path} --port ${PORT}
+      ${unknownMacro}
+`,
+		},
+		{
+			name:  "unknown macro in cmdStop",
+			field: "cmdStop",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    cmdStop: "kill ${unknownMacro}"
+`,
+		},
+		{
+			name:  "unknown macro in proxy",
+			field: "proxy",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    proxy: "http://localhost:${unknownMacro}"
+`,
+		},
+		{
+			name:  "unknown macro in checkEndpoint",
+			field: "checkEndpoint",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    checkEndpoint: "http://localhost:${unknownMacro}/health"
+`,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := LoadConfigFromReader(strings.NewReader(tt.content))
+			// assert.Error records a failure but does not stop the test, so
+			// only inspect the message when an error was actually returned;
+			// otherwise err.Error() would panic on a nil error
+			if assert.Error(t, err) {
+				assert.Contains(t, err.Error(), "unknown macro '${unknownMacro}' found in model1."+tt.field)
+			}
+		})
+	}
+}
@@ -0,0 +1,41 @@
+//go:build windows
+
+package proxy
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// does not support single quoted strings like in config_posix_test.go
+	args, err := SanitizeCommand(`python model1.py \
+
+	-a "double quotes" \
+	-s
+	--arg3 123 \
+
+	   # comment 2
+	--arg4 '"string in string"'
+
+
+
+	# this will get stripped out as well as the white space above
+	-c "'single quoted'"
+	`)
+	assert.NoError(t, err)
+	assert.Equal(t, []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", "'string in string'", // this is a little weird but the lexer says so...?
+		"-c", `'single quoted'`,
+	}, args)
+
+	// Test an empty command
+	args, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, args)
+}
@@ -45,17 +45,26 @@ func TestMain(m *testing.M) {
 func getSimpleResponderPath() string {
 	goos := runtime.GOOS
 	goarch := runtime.GOARCH
-	return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
+
+	if goos == "windows" {
+		return filepath.Join("..", "build", "simple-responder.exe")
+	} else {
+		return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
+	}
 }

-func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
+func getTestPort() int {
 	portMutex.Lock()
 	defer portMutex.Unlock()

 	port := nextTestPort
 	nextTestPort++

-	return getTestSimpleResponderConfigPort(expectedMessage, port)
+	return port
+}
+
+func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
+	return getTestSimpleResponderConfigPort(expectedMessage, getTestPort())
 }

 func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
@@ -8,6 +8,7 @@ import (
 	"net/http"
 	"net/url"
 	"os/exec"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -67,6 +68,12 @@ type Process struct {

 	// for managing concurrency limits
 	concurrencyLimitSemaphore chan struct{}
+
+	// stop timeout waiting for graceful shutdown
+	gracefulStopTimeout time.Duration
+
+	// track that this happened
+	upstreamWasStoppedWithKill bool
 }

 func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
@@ -74,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
 	concurrentLimit := 10
 	if config.ConcurrencyLimit > 0 {
 		concurrentLimit = config.ConcurrencyLimit
-	} else {
-		proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
 	}
+
 	return &Process{
 		ID:                      ID,
 		config:                  config,
@@ -92,6 +98,10 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo

 		// concurrency limit
 		concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
+
+		// stop timeout
+		gracefulStopTimeout:        10 * time.Second,
+		upstreamWasStoppedWithKill: false,
 	}
 }

@@ -138,7 +148,9 @@ func isValidTransition(from, to ProcessState) bool {
 		return to == StateStopping
 	case StateStopping:
 		return to == StateStopped || to == StateShutdown
-	case StateFailed, StateShutdown:
+	case StateFailed:
+		return to == StateStopping
+	case StateShutdown:
 		return false // No transitions allowed from these states
 	}
 	return false
@@ -208,6 +220,15 @@ func (p *Process) start() error {
 	go func() {
 		exitErr := p.cmd.Wait()
 		p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
+
+		// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
+		// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
+		if p.upstreamWasStoppedWithKill {
+			p.proxyLogger.Debugf("<%s> process was killed, NOT sending exitErr: %v", p.ID, exitErr)
+			p.upstreamWasStoppedWithKill = false
+			return
+		}
+
 		p.cmdWaitChan <- exitErr
 	}()

@@ -339,16 +360,23 @@ func (p *Process) StopImmediately() {
 		return
 	}

-	p.proxyLogger.Debugf("<%s> Stopping process", p.ID)
+	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
+	currentState := p.CurrentState()

-	// calling Stop() when state is invalid is a no-op
-	if curState, err := p.swapState(StateReady, StateStopping); err != nil {
-		p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
-		return
+	if currentState == StateFailed {
+		if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
+			p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+			return
+		}
+	} else {
+		if curState, err := p.swapState(StateReady, StateStopping); err != nil {
+			p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+			return
+		}
 	}

 	// stop the process with a graceful exit timeout
-	p.stopCommand(5 * time.Second)
+	p.stopCommand(p.gracefulStopTimeout)

 	if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
 		p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
@@ -360,8 +388,14 @@ func (p *Process) StopImmediately() {
 // is in the state of starting, it will cancel it and shut it down. Once a process is in
 // the StateShutdown state, it can not be started again.
 func (p *Process) Shutdown() {
+	if !isValidTransition(p.CurrentState(), StateStopping) {
+		return
+	}
+
 	p.shutdownCancel()
-	p.stopCommand(5 * time.Second)
+	p.stopCommand(p.gracefulStopTimeout)
+
+	// just force it to this state since there is no recovery from shutdown
 	p.state = StateShutdown
 }

@@ -381,13 +415,44 @@ func (p *Process) stopCommand(sigtermTTL time.Duration) {
 		return
 	}

-	if err := p.terminateProcess(); err != nil {
-		p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
+	// if err := p.terminateProcess(); err != nil {
+	// 	p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
+	// }
+	// the default cmdStop to taskkill /f /t /pid ${PID}
+	if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
+		p.config.CmdStop = "taskkill /f /t /pid ${PID}"
+	}
+
+	if p.config.CmdStop != "" {
+		// replace ${PID} with the pid of the process
+		stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
+		if err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
+			return
+		}
+
+		p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
+
+		stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
+		stopCmd.Stdout = p.processLogger
+		stopCmd.Stderr = p.processLogger
+		stopCmd.Env = p.config.Env
+
+		if err := stopCmd.Run(); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
+			return
+		}
+	} else {
+		if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
+			return
+		}
 	}

 	select {
 	case <-sigtermTimeout.Done():
 		p.proxyLogger.Debugf("<%s> Process timed out waiting to stop, sending KILL signal (normal during shutdown)", p.ID)
+		p.upstreamWasStoppedWithKill = true
 		if err := p.cmd.Process.Kill(); err != nil {
 			p.proxyLogger.Errorf("<%s> Failed to kill process: %v", p.ID, err)
 		}
@@ -1,9 +0,0 @@
-//go:build !windows
-
-package proxy
-
-import "syscall"
-
-func (p *Process) terminateProcess() error {
-	return p.cmd.Process.Signal(syscall.SIGTERM)
-}
@@ -1,14 +0,0 @@
-//go:build windows
-
-package proxy
-
-import (
-	"fmt"
-	"os/exec"
-)
-
-func (p *Process) terminateProcess() error {
-	pid := fmt.Sprintf("%d", p.cmd.Process.Pid)
-	cmd := exec.Command("taskkill", "/f", "/t", "/pid", pid)
-	return cmd.Run()
-}
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"runtime"
 	"sync"
 	"testing"
 	"time"
@@ -393,3 +394,77 @@ func TestProcess_StopImmediately(t *testing.T) {
 	process.StopImmediately()
 	assert.Equal(t, process.CurrentState(), StateStopped)
 }
+
+// Test that SIGKILL is sent when gracefulStopTimeout is reached and properly terminates
+// the upstream command
+//
+// The upstream is started with --ignore-sig-term, so the graceful stop request is
+// ignored and StopImmediately() has to fall through to the KILL path after
+// gracefulStopTimeout expires. A slow request is in flight while that happens.
+func TestProcess_ForceStopWithKill(t *testing.T) {
+
+	expectedMessage := "test_sigkill"
+	binaryPath := getSimpleResponderPath()
+	port := getTestPort()
+
+	config := ModelConfig{
+		// note --ignore-sig-term which ignores the SIGTERM signal so a SIGKILL must be sent
+		// to force the process to exit
+		Cmd:           fmt.Sprintf("%s --port %d --respond %s --silent --ignore-sig-term", binaryPath, port, expectedMessage),
+		Proxy:         fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint: "/health",
+	}
+
+	process := NewProcess("stop_immediate", 2, config, debugLogger, debugLogger)
+	defer process.Stop()
+
+	// reduce to make testing go faster (NewProcess sets this to 10 seconds)
+	process.gracefulStopTimeout = time.Second
+
+	err := process.start()
+	assert.Nil(t, err)
+	assert.Equal(t, process.CurrentState(), StateReady)
+
+	// closed by the goroutine below once its assertions on the interrupted
+	// request have run
+	waitChan := make(chan struct{})
+	go func() {
+		// slow request (delay=2s is longer than the 1s gracefulStopTimeout), so the
+		// upstream is killed by StopImmediately() before the response completes
+		req := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=2s", nil)
+		w := httptest.NewRecorder()
+		process.ProxyRequest(w, req)
+
+		// StatusOK because that was already sent before the kill
+		assert.Equal(t, http.StatusOK, w.Code)
+
+		// unexpected EOF because the kill happened, the "1" is sent before the kill
+		// then the unexpected EOF is sent after the kill
+		// (the error text written into the body differs on Windows)
+		if runtime.GOOS == "windows" {
+			assert.Contains(t, w.Body.String(), "wsarecv: An existing connection was forcibly closed by the remote host")
+		} else {
+			assert.Contains(t, w.Body.String(), "unexpected EOF")
+		}
+
+		close(waitChan)
+	}()
+
+	// short pause before stopping; presumably gives the goroutine time to issue
+	// its request first - TODO confirm 1ms is sufficient on slow CI runners
+	<-time.After(time.Millisecond)
+	process.StopImmediately()
+	assert.Equal(t, process.CurrentState(), StateStopped)
+
+	// the request should have been interrupted by SIGKILL
+	<-waitChan
+}
+
+// TestProcess_StopCmd starts a simple-responder upstream that has an explicit
+// CmdStop configured and checks that StopImmediately() leaves the process in
+// StateStopped.
+func TestProcess_StopCmd(t *testing.T) {
+	// ${PID} is replaced with the upstream's pid when the stop command is run;
+	// pick the command that suits the platform the test executes on.
+	stopCommand := "kill -TERM ${PID}"
+	if runtime.GOOS == "windows" {
+		stopCommand = "taskkill /f /t /pid ${PID}"
+	}
+
+	modelConfig := getTestSimpleResponderConfig("test_stop_cmd")
+	modelConfig.CmdStop = stopCommand
+
+	proc := NewProcess("testStopCmd", 2, modelConfig, debugLogger, debugLogger)
+	defer proc.Stop()
+
+	startErr := proc.start()
+	assert.Nil(t, startErr)
+	assert.Equal(t, proc.CurrentState(), StateReady)
+
+	proc.StopImmediately()
+	assert.Equal(t, proc.CurrentState(), StateStopped)
+}
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
 func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
 	var html strings.Builder

-	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")

 	// Extract keys and sort them
 	var modelIDs []string
@@ -334,7 +334,33 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {

 	// Iterate over sorted keys
 	for _, modelID := range modelIDs {
-		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+		// Get process state
+		processGroup := pm.findGroupByModelName(modelID)
+		var state string
+		if processGroup != nil {
+			process := processGroup.processes[modelID]
+			if process != nil {
+				var stateStr string
+				switch process.CurrentState() {
+				case StateReady:
+					stateStr = "Ready"
+				case StateStarting:
+					stateStr = "Starting"
+				case StateStopping:
+					stateStr = "Stopping"
+				case StateFailed:
+					stateStr = "Failed"
+				case StateShutdown:
+					stateStr = "Shutdown"
+				case StateStopped:
+					stateStr = "Stopped"
+				default:
+					stateStr = "Unknown"
+				}
+				state = stateStr
+			}
+		}
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a> - %s</li>", modelID, modelID, state))
 	}
 	html.WriteString("</ul></body></html>")
 	c.Header("Content-Type", "text/html")
@@ -374,7 +400,8 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {

 	// dechunk it as we already have all the body bytes see issue #11
 	c.Request.Header.Del("transfer-encoding")
-	c.Request.Header.Add("content-length", strconv.Itoa(len(bodyBytes)))
+	c.Request.Header.Set("content-length", strconv.Itoa(len(bodyBytes)))
+	c.Request.ContentLength = int64(len(bodyBytes))

 	if err := processGroup.ProxyRequest(realModelName, c.Writer, c.Request); err != nil {
 		pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error proxying request: %s", err.Error()))
@@ -14,6 +14,7 @@ import (
 	"time"

 	"github.com/stretchr/testify/assert"
+	"github.com/tidwall/gjson"
 )

 func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
@@ -339,7 +340,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
 			"model1": getTestSimpleResponderConfig("model1"),
 			"model2": getTestSimpleResponderConfig("model2"),
 		},
-		LogLevel: "debug",
+		LogLevel: "warn",
 	})

 	// Define a helper struct to parse the JSON response.
@@ -448,7 +449,6 @@ func TestProxyManager_AudioTranscriptionHandler(t *testing.T) {
 // Test useModelName in configuration sends overrides what is sent to upstream
 func TestProxyManager_UseModelName(t *testing.T) {
 	upstreamModelName := "upstreamModel"
-
 	modelConfig := getTestSimpleResponderConfig(upstreamModelName)
 	modelConfig.UseModelName = upstreamModelName

@@ -473,6 +473,12 @@ func TestProxyManager_UseModelName(t *testing.T) {
 		proxy.ServeHTTP(w, req)
 		assert.Equal(t, http.StatusOK, w.Code)
 		assert.Contains(t, w.Body.String(), upstreamModelName)
+
+		// make sure the content length was set correctly
+		// simple-responder will return the content length it got in the response
+		body := w.Body.Bytes()
+		contentLength := int(gjson.GetBytes(body, "h_content_length").Int())
+		assert.Equal(t, len(fmt.Sprintf(`{"model":"%s"}`, upstreamModelName)), contentLength)
 	})

 	t.Run("useModelName over rides requested model: /v1/audio/transcriptions", func(t *testing.T) {
@@ -0,0 +1,213 @@
+#!/bin/sh
+# This script installs llama-swap on Linux.
+# It detects the current operating system architecture and installs the appropriate version of llama-swap.
+
+set -eu
+
+LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
+
+# Terminal styling for diagnostics; both stay empty when tput is missing or fails.
+red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
+
+# All diagnostics go to stderr so they never mix with command output.
+#   status:  progress message
+#   error:   fatal message, terminates the script with exit status 1
+#   warning: non-fatal message
+status() { echo ">>> $*" >&2; }
+error() { echo "${red}ERROR:${plain} $*" >&2; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*" >&2; }
+
+# available TOOL: succeeds when TOOL resolves to a command.
+available() { command -v "$1" >/dev/null; }
+
+# require TOOL...: prints the names of the given tools that are not available,
+# each prefixed with a space; prints an empty line when nothing is missing.
+require() {
+    _MISSING=''
+    for TOOL in "$@"; do
+        if ! available "$TOOL"; then
+            _MISSING="$_MISSING $TOOL"
+        fi
+    done
+
+    echo "$_MISSING"
+}
+
+# Privileged commands are prefixed with $SUDO; it stays empty when the script
+# already runs as root.
+SUDO=
+if [ "$(id -u)" -ne 0 ]; then
+    if ! available sudo; then
+        error "This script requires superuser permissions. Please re-run as root."
+    fi
+
+    SUDO="sudo"
+fi
+
+# Abort early, listing every missing tool at once.
+NEEDS=$(require tee tar python3 mktemp)
+if [ -n "$NEEDS" ]; then
+    status "ERROR: The following tools are required but missing:"
+    for NEED in $NEEDS; do
+        echo "  - $NEED"
+    done
+    exit 1
+fi
+
+[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
+
+# Map the machine name reported by uname to the architecture suffix used when
+# selecting the release asset (linux_$ARCH).
+ARCH=$(uname -m)
+case "$ARCH" in
+    x86_64) ARCH="amd64" ;;
+    aarch64|arm64) ARCH="arm64" ;;
+    *) error "Unsupported architecture: $ARCH" ;;
+esac
+
+IS_WSL2=false
+
+# Detect WSL from the kernel release string: a release ending in WSL2/wsl2 is
+# accepted, one ending in just "icrosoft" is treated as WSL1 and rejected.
+KERN=$(uname -r)
+case "$KERN" in
+    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
+    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
+    *) ;;
+esac
+
+# Downloads the latest llama-swap release archive whose asset name contains
+# linux_$ARCH and extracts the llama-swap binary into /usr/local/bin.
+download_binary() {
+    ASSET_NAME="linux_$ARCH"
+
+    # Use a script-specific variable: assigning to TMPDIR would, when TMPDIR is
+    # exported, point every later command at a directory that gets deleted.
+    DOWNLOAD_DIR=$(mktemp -d)
+    # Covers the failure paths. The EXIT trap is replaced later in the script
+    # (configure_systemd installs its own), so the success path below removes
+    # the directory explicitly as well.
+    trap 'rm -rf "${DOWNLOAD_DIR}"' EXIT INT TERM HUP
+    PYTHON_SCRIPT=$(cat <<EOF
+import os
+import json
+import sys
+import urllib.request
+
+ASSET_NAME = "${ASSET_NAME}"
+
+with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
+    data = json.load(resp)
+    for asset in data.get("assets", []):
+        if ASSET_NAME in asset.get("name", ""):
+            url = asset["browser_download_url"]
+            break
+    else:
+        print("ERROR: Matching asset not found.", file=sys.stderr)
+        sys.exit(1)
+
+print("Downloading:", url, file=sys.stderr)
+output_path = os.path.join("${DOWNLOAD_DIR}", "llama-swap.tar.gz")
+urllib.request.urlretrieve(url, output_path)
+print(output_path)
+EOF
+)
+
+    # The Python script prints the archive path on stdout; progress and errors
+    # go to stderr so they do not end up in TARFILE.
+    TARFILE=$(python3 -c "$PYTHON_SCRIPT")
+    if [ ! -f "$TARFILE" ]; then
+        error "Failed to download binary."
+    fi
+
+    status "Extracting to /usr/local/bin"
+    $SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
+
+    rm -rf "${DOWNLOAD_DIR}"
+}
+download_binary
+
+configure_systemd() {
+    if ! id llama-swap >/dev/null 2>&1; then
+        status "Creating llama-swap user..."
+        $SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
+    fi
+    if getent group render >/dev/null 2>&1; then
+        status "Adding llama-swap user to render group..."
+        $SUDO usermod -a -G render llama-swap
+    fi
+    if getent group video >/dev/null 2>&1; then
+        status "Adding llama-swap user to video group..."
+        $SUDO usermod -a -G video llama-swap
+    fi
+    if getent group docker >/dev/null 2>&1; then
+        status "Adding llama-swap user to docker group..."
+        $SUDO usermod -a -G docker llama-swap
+    fi
+
+    status "Adding current user to llama-swap group..."
+    $SUDO usermod -a -G llama-swap "$(whoami)"
+
+    if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
+        status "Creating default config.yaml..."
+        cat <<EOF | $SUDO -u llama-swap tee /usr/share/llama-swap/config.yaml >/dev/null
+# default 15s likely to fail for default models due to downloading models
+healthCheckTimeout: 60
+
+models:
+  "qwen2.5":
+    cmd: |
+      docker run
+        --rm
+        -p \${PORT}:8080
+        --name qwen2.5
+      ghcr.io/ggml-org/llama.cpp:server
+        -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+    cmdStop: docker stop qwen2.5
+
+  "smollm2":
+    cmd: |
+      docker run
+        --rm
+        -p \${PORT}:8080
+        --name smollm2
+      ghcr.io/ggml-org/llama.cpp:server
+        -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
+    cmdStop: docker stop smollm2
+EOF
+    fi
+
+    status "Creating llama-swap systemd service..."
+    cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
+[Unit]
+Description=llama-swap
+After=network.target
+
+[Service]
+User=llama-swap
+Group=llama-swap
+
+# set this to match your environment
+ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
+
+Restart=on-failure
+RestartSec=3
+StartLimitBurst=3
+StartLimitInterval=30
+
+[Install]
+WantedBy=multi-user.target
+EOF
+    SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
+    case $SYSTEMCTL_RUNNING in
+        running|degraded)
+            status "Enabling and starting llama-swap service..."
+            $SUDO systemctl daemon-reload
+            $SUDO systemctl enable llama-swap
+
+            start_service() { $SUDO systemctl restart llama-swap; }
+            trap start_service EXIT
+            ;;
+        *)
+            warning "systemd is not running"
+            if [ "$IS_WSL2" = true ]; then
+                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
+            fi
+            ;;
+    esac
+}
+
+if available systemctl; then
+    configure_systemd
+fi
+
+# Prints the post-install summary: listen address and config file location.
+install_success() {
+    status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
+    status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
+    status 'Install complete.'
+}
+
+# WSL2 only supports GPUs via nvidia passthrough
+# so check for nvidia-smi to determine if GPU is available
+if [ "$IS_WSL2" = true ]; then
+    if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
+        status "Nvidia GPU detected."
+    fi
+    # print the summary before leaving; without it a WSL2 install ends silently
+    install_success
+    exit 0
+fi
+
+install_success
@@ -0,0 +1,68 @@
+#!/bin/sh
+# This script uninstalls llama-swap on Linux.
+# It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
+
+set -eu
+
+red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
+
+status() { echo ">>> $*" >&2; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*"; }
+
+available() { command -v $1 >/dev/null; }
+
+SUDO=
+if [ "$(id -u)" -ne 0 ]; then
+    if ! available sudo; then
+        error "This script requires superuser permissions. Please re-run as root."
+    fi
+
+    SUDO="sudo"
+fi
+
+configure_systemd() {
+    status "Stopping llama-swap service..."
+    $SUDO systemctl stop llama-swap
+
+    status "Disabling llama-swap service..."
+    $SUDO systemctl disable llama-swap
+}
+if available systemctl; then
+    configure_systemd
+fi
+
+# Remove the installed binary, wherever it is found on PATH.
+if available llama-swap; then
+    status "Removing llama-swap binary..."
+    # command -v is the same lookup available() uses; the quotes keep a path
+    # containing spaces in one piece
+    $SUDO rm -- "$(command -v llama-swap)"
+fi
+
+# Ask whether the configuration should be deleted; the default answer is No.
+# Answering yes removes the whole /usr/share/llama-swap directory.
+if [ -f "/usr/share/llama-swap/config.yaml" ]; then
+    while true; do
+        printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
+        # read fails at end of input (e.g. stdin is not a terminal); fall back
+        # to the empty default instead of letting `set -e` abort the uninstall.
+        # -r keeps backslashes in the answer literal.
+        answer=""
+        read -r answer || true
+        case "$answer" in
+            [Yy]* )
+                $SUDO rm -r /usr/share/llama-swap
+                break
+                ;;
+            [Nn]* | "" )
+                break
+                ;;
+            * )
+                echo "Invalid input. Please enter y or n."
+                ;;
+        esac
+    done
+fi
+
+# Remove the llama-swap account if it exists.
+if id llama-swap >/dev/null 2>&1; then
+    status "Removing llama-swap user..."
+    $SUDO userdel llama-swap
+fi
+
+# Remove the llama-swap group if it is still present at this point
+# (NOTE(review): depending on system configuration userdel may already have
+# removed it - hence the existence check).
+if getent group llama-swap >/dev/null 2>&1; then
+    status "Removing llama-swap group..."
+    $SUDO groupdel llama-swap
+fi
Author	SHA1	Message	Date
Benson Wong	1ac6499c08	Add macros to Configuration schema (#149 ) * Add macros to Configuration schema * update docs	2025-05-29 21:51:25 -07:00
Benson Wong	25f3dc25e7	small doc update [skip ci]	2025-05-26 16:03:27 -07:00
Benson Wong	8422e4e6a1	move some docs to the wiki [no-ci]	2025-05-26 15:46:08 -07:00
Benson Wong	02ee29d881	increase default healthCheckTimeout to 120s	2025-05-26 09:57:53 -07:00
Benson Wong	b2a891f8f4	Disable building of intel container until it's fixed upstream	2025-05-23 22:54:43 -07:00
Yuta Hayashibe	8d2b568897	Improve install script (#144 ) * Use `python3` instead of `curl` and `jq` * Use quote to word splitting * Remove undefined `local` in POSIX sh * Added `LLAMA_SWAP_DEFAULT_ADDRESS` to customize the server address * Added `mktemp` to `NEEDS`	2025-05-23 09:39:55 -07:00
Yuta Hayashibe	fb44cf4e08	Fix typos (#143 )	2025-05-23 08:40:15 -07:00
Benson Wong	02aee4e86d	remove noisy debug print message	2025-05-20 10:43:10 -07:00
Benson Wong	f45896d395	add guard to avoid unnecessary logic in Process.Shutdown	2025-05-20 10:43:09 -07:00
choyuansu	f7e46a359f	Add link to unload endpoint in upstream list (#140 ) * Add link to open /unload	2025-05-20 08:31:44 -07:00
choyuansu	c260907415	Add linux install and uninstall shell scripts (#139 ) Contribution for install, and uninstall llama-swap in linux.	2025-05-19 12:03:33 -07:00
Benson Wong	b83a5fa291	make Failed stated recoverable (#137 ) A process in the failed state can transition to stopped either by calling /unload or swapping to another model.	2025-05-16 19:54:44 -07:00
Benson Wong	6e2ff28d59	improve cmdStop docs [no ci]	2025-05-16 13:52:04 -07:00
Benson Wong	a8b81f2799	Add stopCmd for custom stopping instructions (#136 ) Allow configuration of how a model is stopped before swapping. Setting `cmdStop` in the configuration will override the default behaviour and enables better integration with other process/container managers like docker or podman.	2025-05-16 13:48:42 -07:00
Benson Wong	f9ee7156dc	update configuration examples for multiline yaml commands #133	2025-05-16 11:45:39 -07:00
fakezeta	2d00120781	Update proxymanager.go (#135 )	2025-05-16 06:45:09 -07:00
Benson Wong	afc9aef058	Fix #133 SanitizeCommand removes comments (#134 )	2025-05-15 15:28:50 -07:00
Benson Wong	d7b390df74	Add GH Action for Testing on Windows (#132 ) * Add windows specific test changes * Change the command line parsing library - Possible breaking changes for windows users!	2025-05-14 21:51:53 -07:00
Benson Wong	5025c2f1f3	Add GH windows tests (not working yet)	2025-05-14 19:58:22 -07:00
Benson Wong	e3a0b013c1	add content length test for #131	2025-05-14 19:50:01 -07:00
Fadenfire	f5763a94a0	Fix content length being incorrect when useModelName is used (#131 ) * Fix content length being incorrect when useModelName is used * Update c.Request.ContentLength as well	2025-05-14 19:37:54 -07:00
Benson Wong	8ada72eb57	Update issue templates	2025-05-14 16:36:32 -07:00
Benson Wong	2441b383d3	Make checking for process killed status more robust	2025-05-14 16:26:56 -07:00
Benson Wong	25f251699c	Prevent StateFailed after SIGKILL (#129 ) Closes #125	2025-05-14 10:47:35 -07:00
Benson Wong	7f37bcc6eb	Improve testing around using SIGKILL (#127 ) * Add test for SIGKILL of process * silent TestProxyManager_RunningEndpoint debug output * Ref #125	2025-05-13 21:21:52 -07:00