Compare commits

...

7 Commits

Author SHA1 Message Date
Benson Wong 02ee29d881 increase default healthCheckTimeout to 120s 2025-05-26 09:57:53 -07:00
Benson Wong b2a891f8f4 Disable building of intel container until it's fixed upstream 2025-05-23 22:54:43 -07:00
Yuta Hayashibe 8d2b568897 Improve install script (#144)
* Use `python3` instead of `curl` and `jq`

* Use quote to word splitting

* Remove undefined `local` in POSIX sh

* Added `LLAMA_SWAP_DEFAULT_ADDRESS` to customize the server address

* Added `mktemp` to `NEEDS`
2025-05-23 09:39:55 -07:00
Yuta Hayashibe fb44cf4e08 Fix typos (#143) 2025-05-23 08:40:15 -07:00
Benson Wong 02aee4e86d remove noisy debug print message 2025-05-20 10:43:10 -07:00
Benson Wong f45896d395 add guard to avoid unnecessary logic in Process.Shutdown 2025-05-20 10:43:09 -07:00
choyuansu f7e46a359f Add link to unload endpoint in upstream list (#140)
* Add link to open /unload
2025-05-20 08:31:44 -07:00
7 changed files with 67 additions and 31 deletions
+2 -1
View File
@@ -15,7 +15,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
platform: [intel, cuda, vulkan, cpu, musa]
#platform: [intel, cuda, vulkan, cpu, musa]
platform: [cuda, vulkan, cpu, musa]
fail-fast: false
steps:
- name: Checkout code
+5 -4
View File
@@ -7,7 +7,7 @@
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
## Features:
@@ -63,9 +63,10 @@ models:
<summary>But also very powerful ...</summary>
```yaml
# Seconds to wait for llama.cpp to load and be ready to serve requests
# Default (and minimum) is 15 seconds
healthCheckTimeout: 60
# Seconds to wait for upstream to load and be ready to serve requests
# minimum is 15 seconds
# default is 120 seconds
healthCheckTimeout: 500
# Valid log levels: debug, info (default), warn, error
logLevel: info
+2 -2
View File
@@ -223,13 +223,13 @@ runloop:
if countSigInt > 1 {
break runloop
} else {
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
log.Println("Received SIGINT, send another SIGINT to shutdown")
}
case syscall.SIGTERM:
if *ignoreSigTerm {
log.Println("Ignoring SIGTERM")
} else {
log.Println("Recieved SIGTERM, shutting down")
log.Println("Received SIGTERM, shutting down")
break runloop
}
default:
+6 -1
View File
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
return Config{}, err
}
if config.HealthCheckTimeout < 15 {
if config.HealthCheckTimeout == 0 {
// this high default timeout helps avoid failing health checks
// for configurations that wait for docker or have slower startup
config.HealthCheckTimeout = 120
} else if config.HealthCheckTimeout < 15 {
// set a minimum of 15 seconds
config.HealthCheckTimeout = 15
}
+8 -3
View File
@@ -81,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrentLimit := 10
if config.ConcurrencyLimit > 0 {
concurrentLimit = config.ConcurrencyLimit
} else {
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
}
return &Process{
ID: ID,
config: config,
@@ -101,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout
gracefulStopTimeout: 5 * time.Second,
gracefulStopTimeout: 10 * time.Second,
upstreamWasStoppedWithKill: false,
}
}
@@ -389,8 +388,14 @@ func (p *Process) StopImmediately() {
// is in the state of starting, it will cancel it and shut it down. Once a process is in
// the StateShutdown state, it can not be started again.
func (p *Process) Shutdown() {
if !isValidTransition(p.CurrentState(), StateStopping) {
return
}
p.shutdownCancel()
p.stopCommand(p.gracefulStopTimeout)
// just force it to this state since there is no recovery from shutdown
p.state = StateShutdown
}
+1 -1
View File
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
var html strings.Builder
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
// Extract keys and sort them
var modelIDs []string
+43 -19
View File
@@ -4,6 +4,8 @@
set -eu
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
@@ -11,16 +13,16 @@ status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v $1 >/dev/null; }
available() { command -v "$1" >/dev/null; }
require() {
local MISSING=''
for TOOL in $*; do
if ! available $TOOL; then
MISSING="$MISSING $TOOL"
_MISSING=''
for TOOL in "$@"; do
if ! available "$TOOL"; then
_MISSING="$_MISSING $TOOL"
fi
done
echo $MISSING
echo "$_MISSING"
}
SUDO=
@@ -32,7 +34,7 @@ if [ "$(id -u)" -ne 0 ]; then
SUDO="sudo"
fi
NEEDS=$(require curl tee jq tar)
NEEDS=$(require tee tar python3 mktemp)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
@@ -62,18 +64,40 @@ esac
download_binary() {
ASSET_NAME="linux_$ARCH"
# Fetch the latest release info and extract the matching asset URL
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
jq -r --arg name "$ASSET_NAME" \
'.assets[] | select(.name | contains($name)) | .browser_download_url')
TMPDIR=$(mktemp -d)
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
PYTHON_SCRIPT=$(cat <<EOF
import os
import json
import sys
import urllib.request
# Check if a URL was successfully extracted
if [ -z "$DL_URL" ]; then
error "No matching asset found with name containing '$ASSET_NAME'."
ASSET_NAME = "${ASSET_NAME}"
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
data = json.load(resp)
for asset in data.get("assets", []):
if ASSET_NAME in asset.get("name", ""):
url = asset["browser_download_url"]
break
else:
print("ERROR: Matching asset not found.", file=sys.stderr)
exit(1)
print("Downloading:", url, file=sys.stderr)
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
urllib.request.urlretrieve(url, output_path)
print(output_path)
EOF
)
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
if [ ! -f "$TARFILE" ]; then
error "Failed to download binary."
fi
status "Downloading Linux $ARCH binary"
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
status "Extracting to /usr/local/bin"
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
}
download_binary
@@ -96,7 +120,7 @@ configure_systemd() {
fi
status "Adding current user to llama-swap group..."
$SUDO usermod -a -G llama-swap $(whoami)
$SUDO usermod -a -G llama-swap "$(whoami)"
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
status "Creating default config.yaml..."
@@ -138,7 +162,7 @@ User=llama-swap
Group=llama-swap
# set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
Restart=on-failure
RestartSec=3
@@ -172,7 +196,7 @@ if available systemctl; then
fi
install_success() {
status 'The llama-swap API is now available at 127.0.0.1:8080.'
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
status 'Install complete.'
}