Compare commits

...

17 Commits

Author SHA1 Message Date
Benson Wong e7af671d8e remove noisy debug print message 2025-05-19 15:36:15 -07:00
Benson Wong 8e62098eef add guard to avoid unnecessary logic in Process.Shutdown 2025-05-19 15:34:30 -07:00
choyuansu c260907415 Add linux install and uninstall shell scripts (#139)
Contribution for install, and uninstall llama-swap in linux.
2025-05-19 12:03:33 -07:00
Benson Wong b83a5fa291 make Failed stated recoverable (#137)
A process in the failed state can transition to stopped either by calling /unload or swapping to another model.
2025-05-16 19:54:44 -07:00
Benson Wong 6e2ff28d59 improve cmdStop docs [no ci] 2025-05-16 13:52:04 -07:00
Benson Wong a8b81f2799 Add stopCmd for custom stopping instructions (#136)
Allow configuration of how a model is stopped before swapping. Setting `cmdStop` in the configuration will override the default behaviour and enables better integration with other process/container managers like docker or podman.
2025-05-16 13:48:42 -07:00
Benson Wong f9ee7156dc update configuration examples for multiline yaml commands #133 2025-05-16 11:45:39 -07:00
fakezeta 2d00120781 Update proxymanager.go (#135) 2025-05-16 06:45:09 -07:00
Benson Wong afc9aef058 Fix #133 SanitizeCommand removes comments (#134) 2025-05-15 15:28:50 -07:00
Benson Wong d7b390df74 Add GH Action for Testing on Windows (#132)
* Add windows specific test changes
* Change the command line parsing library - Possible breaking changes for windows users!
2025-05-14 21:51:53 -07:00
Benson Wong 5025c2f1f3 Add GH windows tests (not working yet) 2025-05-14 19:58:22 -07:00
Benson Wong e3a0b013c1 add content length test for #131 2025-05-14 19:50:01 -07:00
Fadenfire f5763a94a0 Fix content length being incorrect when useModelName is used (#131)
* Fix content length being incorrect when useModelName is used
* Update c.Request.ContentLength as well
2025-05-14 19:37:54 -07:00
Benson Wong 8ada72eb57 Update issue templates 2025-05-14 16:36:32 -07:00
Benson Wong 2441b383d3 Make checking for process killed status more robust 2025-05-14 16:26:56 -07:00
Benson Wong 25f251699c Prevent StateFailed after SIGKILL (#129)
Closes #125
2025-05-14 10:47:35 -07:00
Benson Wong 7f37bcc6eb Improve testing around using SIGKILL (#127)
* Add test for SIGKILL of process
* silent TestProxyManager_RunningEndpoint debug output
* Ref #125
2025-05-13 21:21:52 -07:00
22 changed files with 735 additions and 101 deletions
+37
View File
@@ -0,0 +1,37 @@
---
name: Bug Report
about: Something is not working as expected...
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Expected behaviour**
A clear and concise description of what you expected to happen.
**Operating system and version**
- OS: (linux, osx, windows, freebsd, etc)
- GPUs: (list architecture)
**My Configuration**
```yaml
# copy / paste your configuration here
```
**Proxy Logs**
```
# copy / paste from /logs
```
**Upstream Logs**
```
# copy/paste from /logs
```
+50
View File
@@ -0,0 +1,50 @@
name: Windows CI
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
# Allows manual triggering of the workflow
workflow_dispatch:
jobs:
run-tests:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.23'
# cache simple-responder to save the build time
- name: Restore Simple Responder
id: restore-simple-responder
uses: actions/cache/restore@v4
with:
path: ./build
key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
# necessary for testing proxy/Process swapping
- name: Create simple-responder
if: steps.restore-simple-responder.outputs.cache-hit != 'true'
shell: bash
run: make simple-responder-windows
- name: Save Simple Responder
# nothing new to save ... skip this step
if: steps.restore-simple-responder.outputs.cache-hit != 'true'
id: save-simple-responder
uses: actions/cache/save@v4
with:
path: ./build
key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
- name: Test all
shell: bash
run: make test-all
+18 -3
View File
@@ -1,6 +1,4 @@
# This workflow will build a golang project
name: CI
name: Linux CI
on:
push:
@@ -24,9 +22,26 @@ jobs:
with:
go-version: '1.23'
# cache simple-responder to save the build time
- name: Restore Simple Responder
id: restore-simple-responder
uses: actions/cache/restore@v4
with:
path: ./build
key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
# necessary for testing proxy/Process swapping
- name: Create simple-responder
run: make simple-responder
- name: Save Simple Responder
# nothing new to save ... skip this step
if: steps.restore-simple-responder.outputs.cache-hit != 'true'
id: save-simple-responder
uses: actions/cache/save@v4
with:
path: ./build
key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
- name: Test all
run: make test-all
+6 -2
View File
@@ -20,10 +20,10 @@ clean:
rm -rf $(BUILD_DIR)
test:
go test -short -v ./proxy
go test -short -v -count=1 ./proxy
test-all:
go test -v ./proxy
go test -v -count=1 ./proxy
# Build OSX binary
mac:
@@ -46,6 +46,10 @@ simple-responder:
GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 misc/simple-responder/simple-responder.go
GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder_linux_amd64 misc/simple-responder/simple-responder.go
simple-responder-windows:
@echo "Building simple responder for windows"
GOOS=windows GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder.exe misc/simple-responder/simple-responder.go
# Ensure build directory exists
$(BUILD_DIR):
mkdir -p $(BUILD_DIR)
+15 -9
View File
@@ -46,14 +46,14 @@ llama-swap's configuration is purposefully simple.
models:
"qwen2.5":
proxy: "http://127.0.0.1:9999"
cmd: >
cmd: |
/app/llama-server
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
--port 9999
"smollm2":
proxy: "http://127.0.0.1:9999"
cmd: >
cmd: |
/app/llama-server
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
--port 9999
@@ -82,7 +82,7 @@ startPort: 10001
models:
"llama":
# multiline for readability
cmd: >
cmd: |
llama-server --port 8999
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
@@ -123,12 +123,18 @@ models:
# Docker Support (v26.1.4+ required!)
"docker-llama":
proxy: "http://127.0.0.1:${PORT}"
cmd: >
cmd: |
docker run --name dockertest
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
ghcr.io/ggerganov/llama.cpp:server
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
# use a custom command to stop the model when swapping. By default
# this is SIGTERM on POSIX systems, and taskkill on Windows systems
# the ${PID} variable can be used in cmdStop, it will be automatically replaced
# with the PID of the running model
cmdStop: docker stop dockertest
# Groups provide advanced controls over model swapping behaviour. Using groups
# some models can be kept loaded indefinitely, while others are swapped out.
#
@@ -247,11 +253,11 @@ Pre-built binaries are available for Linux, FreeBSD and Darwin (OSX). These are
1. Create a configuration file, see [config.example.yaml](config.example.yaml)
1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
1. Run the binary with `llama-swap --config path/to/config.yaml`.
Available flags:
- `--config`: Path to the configuration file (default: `config.yaml`).
- `--listen`: Address and port to listen on (default: `:8080`).
- `--version`: Show version information and exit.
- `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).
Available flags:
- `--config`: Path to the configuration file (default: `config.yaml`).
- `--listen`: Address and port to listen on (default: `:8080`).
- `--version`: Show version information and exit.
- `--watch-config`: Automatically reload the configuration file when it changes. This will wait for in-flight requests to complete then stop all running models (default: `false`).
### Building from source
+4 -4
View File
@@ -15,7 +15,7 @@ groups:
models:
"llama":
cmd: >
cmd: |
models/llama-server-osx
--port ${PORT}
-m models/Llama-3.2-1B-Instruct-Q4_0.gguf
@@ -38,7 +38,7 @@ models:
# Embedding example with Nomic
# https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
"nomic":
cmd: >
cmd: |
models/llama-server-osx --port ${PORT}
-m models/nomic-embed-text-v1.5.Q8_0.gguf
--ctx-size 8192
@@ -51,7 +51,7 @@ models:
# Reranking example with bge-reranker
# https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
"bge-reranker":
cmd: >
cmd: |
models/llama-server-osx --port ${PORT}
-m models/bge-reranker-v2-m3-Q4_K_M.gguf
--ctx-size 8192
@@ -59,7 +59,7 @@ models:
# Docker Support (v26.1.4+ required!)
"dockertest":
cmd: >
cmd: |
docker run --name dockertest
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
ghcr.io/ggerganov/llama.cpp:server
+1 -1
View File
@@ -5,7 +5,6 @@ go 1.23.0
require (
github.com/fsnotify/fsnotify v1.9.0
github.com/gin-gonic/gin v1.10.0
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/stretchr/testify v1.9.0
github.com/tidwall/gjson v1.18.0
github.com/tidwall/sjson v1.2.5
@@ -13,6 +12,7 @@ require (
)
require (
github.com/billziss-gh/golib v0.2.0 // indirect
github.com/bytedance/sonic v1.11.6 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
+2
View File
@@ -1,3 +1,5 @@
github.com/billziss-gh/golib v0.2.0 h1:NyvcAQdfvM8xokKkKotiligKjKXzuQD4PPykg1nKc/8=
github.com/billziss-gh/golib v0.2.0/go.mod h1:mZpUYANXZkDKSnyYbX9gfnyxwe0ddRhUtfXcsD5r8dw=
github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
+34 -3
View File
@@ -26,6 +26,8 @@ func main() {
silent := flag.Bool("silent", false, "disable all logging")
ignoreSigTerm := flag.Bool("ignore-sig-term", false, "ignore SIGTERM signal")
flag.Parse() // Parse the command-line flags
// Create a new Gin router
@@ -190,6 +192,10 @@ func main() {
log.SetOutput(io.Discard)
}
if !*silent {
fmt.Printf("My PID: %d\n", os.Getpid())
}
go func() {
log.Printf("simple-responder listening on %s\n", address)
// service connections
@@ -200,11 +206,36 @@ func main() {
// Wait for interrupt signal to gracefully shutdown the server with
// a timeout of 5 seconds.
quit := make(chan os.Signal, 1)
sigChan := make(chan os.Signal, 1)
// kill (no param) default send syscall.SIGTERM
// kill -2 is syscall.SIGINT
// kill -9 is syscall.SIGKILL but can't be catch, so don't need add it
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
<-quit
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
countSigInt := 0
runloop:
for {
signal := <-sigChan
switch signal {
case syscall.SIGINT:
countSigInt++
if countSigInt > 1 {
break runloop
} else {
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
}
case syscall.SIGTERM:
if *ignoreSigTerm {
log.Println("Ignoring SIGTERM")
} else {
log.Println("Recieved SIGTERM, shutting down")
break runloop
}
default:
break runloop
}
}
log.Println("simple-responder shutting down")
}
+27 -9
View File
@@ -4,11 +4,12 @@ import (
"fmt"
"io"
"os"
"runtime"
"sort"
"strconv"
"strings"
"github.com/google/shlex"
"github.com/billziss-gh/golib/shlex"
"gopkg.in/yaml.v3"
)
@@ -16,6 +17,7 @@ const DEFAULT_GROUP_ID = "(default)"
type ModelConfig struct {
Cmd string `yaml:"cmd"`
CmdStop string `yaml:"cmdStop"`
Proxy string `yaml:"proxy"`
Aliases []string `yaml:"aliases"`
Env []string `yaml:"env"`
@@ -134,7 +136,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
}
}
// iterate over the models and replace any ${PORT} with the next available port
// Get and sort all model IDs first, makes testing more consistent
modelIds := make([]string, 0, len(config.Models))
for modelId := range config.Models {
@@ -142,10 +143,10 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
}
sort.Strings(modelIds) // This guarantees stable iteration order
// iterate over the sorted models
nextPort := config.StartPort
for _, modelId := range modelIds {
modelConfig := config.Models[modelId]
// iterate over the models and replace any ${PORT} with the next available port
if strings.Contains(modelConfig.Cmd, "${PORT}") {
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
if modelConfig.Proxy == "" {
@@ -159,6 +160,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
}
}
config = AddDefaultGroupToConfig(config)
// check that members are all unique in the groups
memberUsage := make(map[string]string) // maps member to group it appears in
@@ -228,14 +230,30 @@ func AddDefaultGroupToConfig(config Config) Config {
}
func SanitizeCommand(cmdStr string) ([]string, error) {
// Remove trailing backslashes
cmdStr = strings.ReplaceAll(cmdStr, "\\ \n", " ")
cmdStr = strings.ReplaceAll(cmdStr, "\\\n", " ")
var cleanedLines []string
for _, line := range strings.Split(cmdStr, "\n") {
trimmed := strings.TrimSpace(line)
// Skip comment lines
if strings.HasPrefix(trimmed, "#") {
continue
}
// Handle trailing backslashes by replacing with space
if strings.HasSuffix(trimmed, "\\") {
cleanedLines = append(cleanedLines, strings.TrimSuffix(trimmed, "\\")+" ")
} else {
cleanedLines = append(cleanedLines, line)
}
}
// put it back together
cmdStr = strings.Join(cleanedLines, "\n")
// Split the command into arguments
args, err := shlex.Split(cmdStr)
if err != nil {
return nil, err
var args []string
if runtime.GOOS == "windows" {
args = shlex.Windows.Split(cmdStr)
} else {
args = shlex.Posix.Split(cmdStr)
}
// Ensure the command is not empty
+42
View File
@@ -0,0 +1,42 @@
//go:build !windows
package proxy
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestConfig_SanitizeCommand(t *testing.T) {
// Test a command with spaces and newlines
args, err := SanitizeCommand(`python model1.py \
-a "double quotes" \
--arg2 'single quotes'
-s
# comment 1
--arg3 123 \
# comment 2
--arg4 '"string in string"'
# this will get stripped out as well as the white space above
-c "'single quoted'"
`)
assert.NoError(t, err)
assert.Equal(t, []string{
"python", "model1.py",
"-a", "double quotes",
"--arg2", "single quotes",
"-s",
"--arg3", "123",
"--arg4", `"string in string"`,
"-c", `'single quoted'`,
}, args)
// Test an empty command
args, err = SanitizeCommand("")
assert.Error(t, err)
assert.Nil(t, args)
}
-28
View File
@@ -258,34 +258,6 @@ func TestConfig_FindConfig(t *testing.T) {
assert.Equal(t, ModelConfig{}, modelConfig)
}
func TestConfig_SanitizeCommand(t *testing.T) {
// Test a command with spaces and newlines
args, err := SanitizeCommand(`python model1.py \
-a "double quotes" \
--arg2 'single quotes'
-s
--arg3 123 \
--arg4 '"string in string"'
-c "'single quoted'"
`)
assert.NoError(t, err)
assert.Equal(t, []string{
"python", "model1.py",
"-a", "double quotes",
"--arg2", "single quotes",
"-s",
"--arg3", "123",
"--arg4", `"string in string"`,
"-c", `'single quoted'`,
}, args)
// Test an empty command
args, err = SanitizeCommand("")
assert.Error(t, err)
assert.Nil(t, args)
}
func TestConfig_AutomaticPortAssignments(t *testing.T) {
t.Run("Default Port Ranges", func(t *testing.T) {
+41
View File
@@ -0,0 +1,41 @@
//go:build windows
package proxy
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestConfig_SanitizeCommand(t *testing.T) {
// does not support single quoted strings like in config_posix_test.go
args, err := SanitizeCommand(`python model1.py \
-a "double quotes" \
-s
--arg3 123 \
# comment 2
--arg4 '"string in string"'
# this will get stripped out as well as the white space above
-c "'single quoted'"
`)
assert.NoError(t, err)
assert.Equal(t, []string{
"python", "model1.py",
"-a", "double quotes",
"-s",
"--arg3", "123",
"--arg4", "'string in string'", // this is a little weird but the lexer says so...?
"-c", `'single quoted'`,
}, args)
// Test an empty command
args, err = SanitizeCommand("")
assert.Error(t, err)
assert.Nil(t, args)
}
+12 -3
View File
@@ -45,17 +45,26 @@ func TestMain(m *testing.M) {
func getSimpleResponderPath() string {
goos := runtime.GOOS
goarch := runtime.GOARCH
return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
if goos == "windows" {
return filepath.Join("..", "build", "simple-responder.exe")
} else {
return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
}
}
func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
func getTestPort() int {
portMutex.Lock()
defer portMutex.Unlock()
port := nextTestPort
nextTestPort++
return getTestSimpleResponderConfigPort(expectedMessage, port)
return port
}
func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
return getTestSimpleResponderConfigPort(expectedMessage, getTestPort())
}
func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
+77 -12
View File
@@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"os/exec"
"runtime"
"strconv"
"strings"
"sync"
@@ -67,6 +68,12 @@ type Process struct {
// for managing concurrency limits
concurrencyLimitSemaphore chan struct{}
// stop timeout waiting for graceful shutdown
gracefulStopTimeout time.Duration
// track that this happened
upstreamWasStoppedWithKill bool
}
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
@@ -74,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrentLimit := 10
if config.ConcurrencyLimit > 0 {
concurrentLimit = config.ConcurrencyLimit
} else {
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
}
return &Process{
ID: ID,
config: config,
@@ -92,6 +98,10 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
// concurrency limit
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout
gracefulStopTimeout: 5 * time.Second,
upstreamWasStoppedWithKill: false,
}
}
@@ -138,7 +148,9 @@ func isValidTransition(from, to ProcessState) bool {
return to == StateStopping
case StateStopping:
return to == StateStopped || to == StateShutdown
case StateFailed, StateShutdown:
case StateFailed:
return to == StateStopping
case StateShutdown:
return false // No transitions allowed from these states
}
return false
@@ -208,6 +220,15 @@ func (p *Process) start() error {
go func() {
exitErr := p.cmd.Wait()
p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
if p.upstreamWasStoppedWithKill {
p.proxyLogger.Debugf("<%s> process was killed, NOT sending exitErr: %v", p.ID, exitErr)
p.upstreamWasStoppedWithKill = false
return
}
p.cmdWaitChan <- exitErr
}()
@@ -339,16 +360,23 @@ func (p *Process) StopImmediately() {
return
}
p.proxyLogger.Debugf("<%s> Stopping process", p.ID)
p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
currentState := p.CurrentState()
// calling Stop() when state is invalid is a no-op
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
return
if currentState == StateFailed {
if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
return
}
} else {
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
return
}
}
// stop the process with a graceful exit timeout
p.stopCommand(5 * time.Second)
p.stopCommand(p.gracefulStopTimeout)
if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
@@ -360,8 +388,14 @@ func (p *Process) StopImmediately() {
// is in the state of starting, it will cancel it and shut it down. Once a process is in
// the StateShutdown state, it can not be started again.
func (p *Process) Shutdown() {
if !isValidTransition(p.CurrentState(), StateStopping) {
return
}
p.shutdownCancel()
p.stopCommand(5 * time.Second)
p.stopCommand(p.gracefulStopTimeout)
// just force it to this state since there is no recovery from shutdown
p.state = StateShutdown
}
@@ -381,13 +415,44 @@ func (p *Process) stopCommand(sigtermTTL time.Duration) {
return
}
if err := p.terminateProcess(); err != nil {
p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
// if err := p.terminateProcess(); err != nil {
// p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
// }
// the default cmdStop to taskkill /f /t /pid ${PID}
if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
p.config.CmdStop = "taskkill /f /t /pid ${PID}"
}
if p.config.CmdStop != "" {
// replace ${PID} with the pid of the process
stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
if err != nil {
p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
return
}
p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
stopCmd.Stdout = p.processLogger
stopCmd.Stderr = p.processLogger
stopCmd.Env = p.config.Env
if err := stopCmd.Run(); err != nil {
p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
return
}
} else {
if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
return
}
}
select {
case <-sigtermTimeout.Done():
p.proxyLogger.Debugf("<%s> Process timed out waiting to stop, sending KILL signal (normal during shutdown)", p.ID)
p.upstreamWasStoppedWithKill = true
if err := p.cmd.Process.Kill(); err != nil {
p.proxyLogger.Errorf("<%s> Failed to kill process: %v", p.ID, err)
}
-9
View File
@@ -1,9 +0,0 @@
//go:build !windows
package proxy
import "syscall"
func (p *Process) terminateProcess() error {
return p.cmd.Process.Signal(syscall.SIGTERM)
}
-14
View File
@@ -1,14 +0,0 @@
//go:build windows
package proxy
import (
"fmt"
"os/exec"
)
func (p *Process) terminateProcess() error {
pid := fmt.Sprintf("%d", p.cmd.Process.Pid)
cmd := exec.Command("taskkill", "/f", "/t", "/pid", pid)
return cmd.Run()
}
+75
View File
@@ -5,6 +5,7 @@ import (
"net/http"
"net/http/httptest"
"os"
"runtime"
"sync"
"testing"
"time"
@@ -393,3 +394,77 @@ func TestProcess_StopImmediately(t *testing.T) {
process.StopImmediately()
assert.Equal(t, process.CurrentState(), StateStopped)
}
// Test that SIGKILL is sent when gracefulStopTimeout is reached and properly terminates
// the upstream command
func TestProcess_ForceStopWithKill(t *testing.T) {
expectedMessage := "test_sigkill"
binaryPath := getSimpleResponderPath()
port := getTestPort()
config := ModelConfig{
// note --ignore-sig-term which ignores the SIGTERM signal so a SIGKILL must be sent
// to force the process to exit
Cmd: fmt.Sprintf("%s --port %d --respond %s --silent --ignore-sig-term", binaryPath, port, expectedMessage),
Proxy: fmt.Sprintf("http://127.0.0.1:%d", port),
CheckEndpoint: "/health",
}
process := NewProcess("stop_immediate", 2, config, debugLogger, debugLogger)
defer process.Stop()
// reduce to make testing go faster
process.gracefulStopTimeout = time.Second
err := process.start()
assert.Nil(t, err)
assert.Equal(t, process.CurrentState(), StateReady)
waitChan := make(chan struct{})
go func() {
// slow, but will get killed by StopImmediate
req := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=2s", nil)
w := httptest.NewRecorder()
process.ProxyRequest(w, req)
// StatusOK because that was already sent before the kill
assert.Equal(t, http.StatusOK, w.Code)
// unexpected EOF because the kill happened, the "1" is sent before the kill
// then the unexpected EOF is sent after the kill
if runtime.GOOS == "windows" {
assert.Contains(t, w.Body.String(), "wsarecv: An existing connection was forcibly closed by the remote host")
} else {
assert.Contains(t, w.Body.String(), "unexpected EOF")
}
close(waitChan)
}()
<-time.After(time.Millisecond)
process.StopImmediately()
assert.Equal(t, process.CurrentState(), StateStopped)
// the request should have been interrupted by SIGKILL
<-waitChan
}
func TestProcess_StopCmd(t *testing.T) {
config := getTestSimpleResponderConfig("test_stop_cmd")
if runtime.GOOS == "windows" {
config.CmdStop = "taskkill /f /t /pid ${PID}"
} else {
config.CmdStop = "kill -TERM ${PID}"
}
process := NewProcess("testStopCmd", 2, config, debugLogger, debugLogger)
defer process.Stop()
err := process.start()
assert.Nil(t, err)
assert.Equal(t, process.CurrentState(), StateReady)
process.StopImmediately()
assert.Equal(t, process.CurrentState(), StateStopped)
}
+29 -2
View File
@@ -334,7 +334,33 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
// Iterate over sorted keys
for _, modelID := range modelIDs {
html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
// Get process state
processGroup := pm.findGroupByModelName(modelID)
var state string
if processGroup != nil {
process := processGroup.processes[modelID]
if process != nil {
var stateStr string
switch process.CurrentState() {
case StateReady:
stateStr = "Ready"
case StateStarting:
stateStr = "Starting"
case StateStopping:
stateStr = "Stopping"
case StateFailed:
stateStr = "Failed"
case StateShutdown:
stateStr = "Shutdown"
case StateStopped:
stateStr = "Stopped"
default:
stateStr = "Unknown"
}
state = stateStr
}
}
html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a> - %s</li>", modelID, modelID, state))
}
html.WriteString("</ul></body></html>")
c.Header("Content-Type", "text/html")
@@ -374,7 +400,8 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
// dechunk it as we already have all the body bytes see issue #11
c.Request.Header.Del("transfer-encoding")
c.Request.Header.Add("content-length", strconv.Itoa(len(bodyBytes)))
c.Request.Header.Set("content-length", strconv.Itoa(len(bodyBytes)))
c.Request.ContentLength = int64(len(bodyBytes))
if err := processGroup.ProxyRequest(realModelName, c.Writer, c.Request); err != nil {
pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error proxying request: %s", err.Error()))
+8 -2
View File
@@ -14,6 +14,7 @@ import (
"time"
"github.com/stretchr/testify/assert"
"github.com/tidwall/gjson"
)
func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
@@ -339,7 +340,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
"model1": getTestSimpleResponderConfig("model1"),
"model2": getTestSimpleResponderConfig("model2"),
},
LogLevel: "debug",
LogLevel: "warn",
})
// Define a helper struct to parse the JSON response.
@@ -448,7 +449,6 @@ func TestProxyManager_AudioTranscriptionHandler(t *testing.T) {
// Test useModelName in configuration sends overrides what is sent to upstream
func TestProxyManager_UseModelName(t *testing.T) {
upstreamModelName := "upstreamModel"
modelConfig := getTestSimpleResponderConfig(upstreamModelName)
modelConfig.UseModelName = upstreamModelName
@@ -473,6 +473,12 @@ func TestProxyManager_UseModelName(t *testing.T) {
proxy.ServeHTTP(w, req)
assert.Equal(t, http.StatusOK, w.Code)
assert.Contains(t, w.Body.String(), upstreamModelName)
// make sure the content length was set correctly
// simple-responder will return the content length it got in the response
body := w.Body.Bytes()
contentLength := int(gjson.GetBytes(body, "h_content_length").Int())
assert.Equal(t, len(fmt.Sprintf(`{"model":"%s"}`, upstreamModelName)), contentLength)
})
t.Run("useModelName over rides requested model: /v1/audio/transcriptions", func(t *testing.T) {
+189
View File
@@ -0,0 +1,189 @@
#!/bin/sh
# This script installs llama-swap on Linux.
# It detects the current operating system architecture and installs the appropriate version of llama-swap.
set -eu
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v $1 >/dev/null; }
require() {
local MISSING=''
for TOOL in $*; do
if ! available $TOOL; then
MISSING="$MISSING $TOOL"
fi
done
echo $MISSING
}
SUDO=
if [ "$(id -u)" -ne 0 ]; then
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
NEEDS=$(require curl tee jq tar)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
echo " - $NEED"
done
exit 1
fi
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
ARCH=$(uname -m)
case "$ARCH" in
x86_64) ARCH="amd64" ;;
aarch64|arm64) ARCH="arm64" ;;
*) error "Unsupported architecture: $ARCH" ;;
esac
IS_WSL2=false
KERN=$(uname -r)
case "$KERN" in
*icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
*icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
*) ;;
esac
download_binary() {
ASSET_NAME="linux_$ARCH"
# Fetch the latest release info and extract the matching asset URL
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
jq -r --arg name "$ASSET_NAME" \
'.assets[] | select(.name | contains($name)) | .browser_download_url')
# Check if a URL was successfully extracted
if [ -z "$DL_URL" ]; then
error "No matching asset found with name containing '$ASSET_NAME'."
fi
status "Downloading Linux $ARCH binary"
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
}
download_binary
configure_systemd() {
if ! id llama-swap >/dev/null 2>&1; then
status "Creating llama-swap user..."
$SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
fi
if getent group render >/dev/null 2>&1; then
status "Adding llama-swap user to render group..."
$SUDO usermod -a -G render llama-swap
fi
if getent group video >/dev/null 2>&1; then
status "Adding llama-swap user to video group..."
$SUDO usermod -a -G video llama-swap
fi
if getent group docker >/dev/null 2>&1; then
status "Adding llama-swap user to docker group..."
$SUDO usermod -a -G docker llama-swap
fi
status "Adding current user to llama-swap group..."
$SUDO usermod -a -G llama-swap $(whoami)
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
status "Creating default config.yaml..."
cat <<EOF | $SUDO -u llama-swap tee /usr/share/llama-swap/config.yaml >/dev/null
# default 15s likely to fail for default models due to downloading models
healthCheckTimeout: 60
models:
"qwen2.5":
cmd: |
docker run
--rm
-p \${PORT}:8080
--name qwen2.5
ghcr.io/ggml-org/llama.cpp:server
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
cmdStop: docker stop qwen2.5
"smollm2":
cmd: |
docker run
--rm
-p \${PORT}:8080
--name smollm2
ghcr.io/ggml-org/llama.cpp:server
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
cmdStop: docker stop smollm2
EOF
fi
status "Creating llama-swap systemd service..."
cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
[Unit]
Description=llama-swap
After=network.target
[Service]
User=llama-swap
Group=llama-swap
# set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
Restart=on-failure
RestartSec=3
StartLimitBurst=3
StartLimitInterval=30
[Install]
WantedBy=multi-user.target
EOF
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
case $SYSTEMCTL_RUNNING in
running|degraded)
status "Enabling and starting llama-swap service..."
$SUDO systemctl daemon-reload
$SUDO systemctl enable llama-swap
start_service() { $SUDO systemctl restart llama-swap; }
trap start_service EXIT
;;
*)
warning "systemd is not running"
if [ "$IS_WSL2" = true ]; then
warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
fi
;;
esac
}
if available systemctl; then
configure_systemd
fi
install_success() {
status 'The llama-swap API is now available at 127.0.0.1:8080.'
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
status 'Install complete.'
}
# WSL2 only supports GPUs via nvidia passthrough
# so check for nvidia-smi to determine if GPU is available
if [ "$IS_WSL2" = true ]; then
if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
status "Nvidia GPU detected."
fi
exit 0
fi
install_success
+68
View File
@@ -0,0 +1,68 @@
#!/bin/sh
# This script uninstalls llama-swap on Linux.
# It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
set -eu
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v $1 >/dev/null; }
SUDO=
if [ "$(id -u)" -ne 0 ]; then
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
configure_systemd() {
status "Stopping llama-swap service..."
$SUDO systemctl stop llama-swap
status "Disabling llama-swap service..."
$SUDO systemctl disable llama-swap
}
if available systemctl; then
configure_systemd
fi
if available llama-swap; then
status "Removing llama-swap binary..."
$SUDO rm $(which llama-swap)
fi
if [ -f "/usr/share/llama-swap/config.yaml" ]; then
while true; do
printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
read answer
case "$answer" in
[Yy]* )
$SUDO rm -r /usr/share/llama-swap
break
;;
[Nn]* | "" )
break
;;
* )
echo "Invalid input. Please enter y or n."
;;
esac
done
fi
if id llama-swap >/dev/null 2>&1; then
status "Removing llama-swap user..."
$SUDO userdel llama-swap
fi
if getent group llama-swap >/dev/null 2>&1; then
status "Removing llama-swap group..."
$SUDO groupdel llama-swap
fi