Compare commits

..

10 Commits

Author SHA1 Message Date
Benson Wong 02ee29d881 increase default healthCheckTimeout to 120s 2025-05-26 09:57:53 -07:00
Benson Wong b2a891f8f4 Disable building of intel container until it's fixed upstream 2025-05-23 22:54:43 -07:00
Yuta Hayashibe 8d2b568897 Improve install script (#144)
* Use `python3` instead of `curl` and `jq`

* Use quote to word splitting

* Remove undefined `local` in POSIX sh

* Added `LLAMA_SWAP_DEFAULT_ADDRESS` to customize the server address

* Added `mktemp` to `NEEDS`
2025-05-23 09:39:55 -07:00
Yuta Hayashibe fb44cf4e08 Fix typos (#143) 2025-05-23 08:40:15 -07:00
Benson Wong 02aee4e86d remove noisy debug print message 2025-05-20 10:43:10 -07:00
Benson Wong f45896d395 add guard to avoid unnecessary logic in Process.Shutdown 2025-05-20 10:43:09 -07:00
choyuansu f7e46a359f Add link to unload endpoint in upstream list (#140)
* Add link to open /unload
2025-05-20 08:31:44 -07:00
choyuansu c260907415 Add linux install and uninstall shell scripts (#139)
Contribution for install, and uninstall llama-swap in linux.
2025-05-19 12:03:33 -07:00
Benson Wong b83a5fa291 make Failed stated recoverable (#137)
A process in the failed state can transition to stopped either by calling /unload or swapping to another model.
2025-05-16 19:54:44 -07:00
Benson Wong 6e2ff28d59 improve cmdStop docs [no ci] 2025-05-16 13:52:04 -07:00
8 changed files with 324 additions and 18 deletions
+2 -1
View File
@@ -15,7 +15,8 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
platform: [intel, cuda, vulkan, cpu, musa] #platform: [intel, cuda, vulkan, cpu, musa]
platform: [cuda, vulkan, cpu, musa]
fail-fast: false fail-fast: false
steps: steps:
- name: Checkout code - name: Checkout code
+7 -4
View File
@@ -7,7 +7,7 @@
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server. llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images. Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
## Features: ## Features:
@@ -63,9 +63,10 @@ models:
<summary>But also very powerful ...</summary> <summary>But also very powerful ...</summary>
```yaml ```yaml
# Seconds to wait for llama.cpp to load and be ready to serve requests # Seconds to wait for upstream to load and be ready to serve requests
# Default (and minimum) is 15 seconds # minimum is 15 seconds
healthCheckTimeout: 60 # default is 120 seconds
healthCheckTimeout: 500
# Valid log levels: debug, info (default), warn, error # Valid log levels: debug, info (default), warn, error
logLevel: info logLevel: info
@@ -131,6 +132,8 @@ models:
# use a custom command to stop the model when swapping. By default # use a custom command to stop the model when swapping. By default
# this is SIGTERM on POSIX systems, and taskkill on Windows systems # this is SIGTERM on POSIX systems, and taskkill on Windows systems
# the ${PID} variable can be used in cmdStop, it will be automatically replaced
# with the PID of the running model
cmdStop: docker stop dockertest cmdStop: docker stop dockertest
# Groups provide advanced controls over model swapping behaviour. Using groups # Groups provide advanced controls over model swapping behaviour. Using groups
+2 -2
View File
@@ -223,13 +223,13 @@ runloop:
if countSigInt > 1 { if countSigInt > 1 {
break runloop break runloop
} else { } else {
log.Println("Recieved SIGINT, send another SIGINT to shutdown") log.Println("Received SIGINT, send another SIGINT to shutdown")
} }
case syscall.SIGTERM: case syscall.SIGTERM:
if *ignoreSigTerm { if *ignoreSigTerm {
log.Println("Ignoring SIGTERM") log.Println("Ignoring SIGTERM")
} else { } else {
log.Println("Recieved SIGTERM, shutting down") log.Println("Received SIGTERM, shutting down")
break runloop break runloop
} }
default: default:
+6 -1
View File
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
return Config{}, err return Config{}, err
} }
if config.HealthCheckTimeout < 15 { if config.HealthCheckTimeout == 0 {
// this high default timeout helps avoid failing health checks
// for configurations that wait for docker or have slower startup
config.HealthCheckTimeout = 120
} else if config.HealthCheckTimeout < 15 {
// set a minimum of 15 seconds
config.HealthCheckTimeout = 15 config.HealthCheckTimeout = 15
} }
+23 -9
View File
@@ -81,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrentLimit := 10 concurrentLimit := 10
if config.ConcurrencyLimit > 0 { if config.ConcurrencyLimit > 0 {
concurrentLimit = config.ConcurrencyLimit concurrentLimit = config.ConcurrencyLimit
} else {
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
} }
return &Process{ return &Process{
ID: ID, ID: ID,
config: config, config: config,
@@ -101,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit), concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout // stop timeout
gracefulStopTimeout: 5 * time.Second, gracefulStopTimeout: 10 * time.Second,
upstreamWasStoppedWithKill: false, upstreamWasStoppedWithKill: false,
} }
} }
@@ -149,7 +148,9 @@ func isValidTransition(from, to ProcessState) bool {
return to == StateStopping return to == StateStopping
case StateStopping: case StateStopping:
return to == StateStopped || to == StateShutdown return to == StateStopped || to == StateShutdown
case StateFailed, StateShutdown: case StateFailed:
return to == StateStopping
case StateShutdown:
return false // No transitions allowed from these states return false // No transitions allowed from these states
} }
return false return false
@@ -359,12 +360,19 @@ func (p *Process) StopImmediately() {
return return
} }
p.proxyLogger.Debugf("<%s> Stopping process", p.ID) p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
currentState := p.CurrentState()
// calling Stop() when state is invalid is a no-op if currentState == StateFailed {
if curState, err := p.swapState(StateReady, StateStopping); err != nil { if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState) p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
return return
}
} else {
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
return
}
} }
// stop the process with a graceful exit timeout // stop the process with a graceful exit timeout
@@ -380,8 +388,14 @@ func (p *Process) StopImmediately() {
// is in the state of starting, it will cancel it and shut it down. Once a process is in // is in the state of starting, it will cancel it and shut it down. Once a process is in
// the StateShutdown state, it can not be started again. // the StateShutdown state, it can not be started again.
func (p *Process) Shutdown() { func (p *Process) Shutdown() {
if !isValidTransition(p.CurrentState(), StateStopping) {
return
}
p.shutdownCancel() p.shutdownCancel()
p.stopCommand(p.gracefulStopTimeout) p.stopCommand(p.gracefulStopTimeout)
// just force it to this state since there is no recovery from shutdown
p.state = StateShutdown p.state = StateShutdown
} }
+3 -1
View File
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
func (pm *ProxyManager) upstreamIndex(c *gin.Context) { func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
var html strings.Builder var html strings.Builder
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>") html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
// Extract keys and sort them // Extract keys and sort them
var modelIDs []string var modelIDs []string
@@ -352,6 +352,8 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
stateStr = "Failed" stateStr = "Failed"
case StateShutdown: case StateShutdown:
stateStr = "Shutdown" stateStr = "Shutdown"
case StateStopped:
stateStr = "Stopped"
default: default:
stateStr = "Unknown" stateStr = "Unknown"
} }
+213
View File
@@ -0,0 +1,213 @@
#!/bin/sh
# This script installs llama-swap on Linux.
# It detects the current operating system architecture and installs the appropriate version of llama-swap.
set -eu
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v "$1" >/dev/null; }
require() {
_MISSING=''
for TOOL in "$@"; do
if ! available "$TOOL"; then
_MISSING="$_MISSING $TOOL"
fi
done
echo "$_MISSING"
}
SUDO=
if [ "$(id -u)" -ne 0 ]; then
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
NEEDS=$(require tee tar python3 mktemp)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
echo " - $NEED"
done
exit 1
fi
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
ARCH=$(uname -m)
case "$ARCH" in
x86_64) ARCH="amd64" ;;
aarch64|arm64) ARCH="arm64" ;;
*) error "Unsupported architecture: $ARCH" ;;
esac
IS_WSL2=false
KERN=$(uname -r)
case "$KERN" in
*icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
*icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
*) ;;
esac
download_binary() {
ASSET_NAME="linux_$ARCH"
TMPDIR=$(mktemp -d)
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
PYTHON_SCRIPT=$(cat <<EOF
import os
import json
import sys
import urllib.request
ASSET_NAME = "${ASSET_NAME}"
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
data = json.load(resp)
for asset in data.get("assets", []):
if ASSET_NAME in asset.get("name", ""):
url = asset["browser_download_url"]
break
else:
print("ERROR: Matching asset not found.", file=sys.stderr)
exit(1)
print("Downloading:", url, file=sys.stderr)
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
urllib.request.urlretrieve(url, output_path)
print(output_path)
EOF
)
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
if [ ! -f "$TARFILE" ]; then
error "Failed to download binary."
fi
status "Extracting to /usr/local/bin"
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
}
download_binary
configure_systemd() {
if ! id llama-swap >/dev/null 2>&1; then
status "Creating llama-swap user..."
$SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
fi
if getent group render >/dev/null 2>&1; then
status "Adding llama-swap user to render group..."
$SUDO usermod -a -G render llama-swap
fi
if getent group video >/dev/null 2>&1; then
status "Adding llama-swap user to video group..."
$SUDO usermod -a -G video llama-swap
fi
if getent group docker >/dev/null 2>&1; then
status "Adding llama-swap user to docker group..."
$SUDO usermod -a -G docker llama-swap
fi
status "Adding current user to llama-swap group..."
$SUDO usermod -a -G llama-swap "$(whoami)"
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
status "Creating default config.yaml..."
cat <<EOF | $SUDO -u llama-swap tee /usr/share/llama-swap/config.yaml >/dev/null
# default 15s likely to fail for default models due to downloading models
healthCheckTimeout: 60
models:
"qwen2.5":
cmd: |
docker run
--rm
-p \${PORT}:8080
--name qwen2.5
ghcr.io/ggml-org/llama.cpp:server
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
cmdStop: docker stop qwen2.5
"smollm2":
cmd: |
docker run
--rm
-p \${PORT}:8080
--name smollm2
ghcr.io/ggml-org/llama.cpp:server
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
cmdStop: docker stop smollm2
EOF
fi
status "Creating llama-swap systemd service..."
cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
[Unit]
Description=llama-swap
After=network.target
[Service]
User=llama-swap
Group=llama-swap
# set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
Restart=on-failure
RestartSec=3
StartLimitBurst=3
StartLimitInterval=30
[Install]
WantedBy=multi-user.target
EOF
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
case $SYSTEMCTL_RUNNING in
running|degraded)
status "Enabling and starting llama-swap service..."
$SUDO systemctl daemon-reload
$SUDO systemctl enable llama-swap
start_service() { $SUDO systemctl restart llama-swap; }
trap start_service EXIT
;;
*)
warning "systemd is not running"
if [ "$IS_WSL2" = true ]; then
warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
fi
;;
esac
}
if available systemctl; then
configure_systemd
fi
install_success() {
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
status 'Install complete.'
}
# WSL2 only supports GPUs via nvidia passthrough
# so check for nvidia-smi to determine if GPU is available
if [ "$IS_WSL2" = true ]; then
if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
status "Nvidia GPU detected."
fi
exit 0
fi
install_success
+68
View File
@@ -0,0 +1,68 @@
#!/bin/sh
# This script uninstalls llama-swap on Linux.
# It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
set -eu
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v $1 >/dev/null; }
SUDO=
if [ "$(id -u)" -ne 0 ]; then
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
configure_systemd() {
status "Stopping llama-swap service..."
$SUDO systemctl stop llama-swap
status "Disabling llama-swap service..."
$SUDO systemctl disable llama-swap
}
if available systemctl; then
configure_systemd
fi
if available llama-swap; then
status "Removing llama-swap binary..."
$SUDO rm $(which llama-swap)
fi
if [ -f "/usr/share/llama-swap/config.yaml" ]; then
while true; do
printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
read answer
case "$answer" in
[Yy]* )
$SUDO rm -r /usr/share/llama-swap
break
;;
[Nn]* | "" )
break
;;
* )
echo "Invalid input. Please enter y or n."
;;
esac
done
fi
if id llama-swap >/dev/null 2>&1; then
status "Removing llama-swap user..."
$SUDO userdel llama-swap
fi
if getent group llama-swap >/dev/null 2>&1; then
status "Removing llama-swap group..."
$SUDO groupdel llama-swap
fi