Compare commits

..

2 Commits

Author SHA1 Message Date
Benson Wong e7af671d8e remove noisy debug print message 2025-05-19 15:36:15 -07:00
Benson Wong 8e62098eef add guard to avoid unnecessary logic in Process.Shutdown 2025-05-19 15:34:30 -07:00
7 changed files with 29 additions and 60 deletions
+1 -2
View File
@@ -15,8 +15,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
#platform: [intel, cuda, vulkan, cpu, musa] platform: [intel, cuda, vulkan, cpu, musa]
platform: [cuda, vulkan, cpu, musa]
fail-fast: false fail-fast: false
steps: steps:
- name: Checkout code - name: Checkout code
+4 -5
View File
@@ -7,7 +7,7 @@
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server. llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images. Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
## Features: ## Features:
@@ -63,10 +63,9 @@ models:
<summary>But also very powerful ...</summary> <summary>But also very powerful ...</summary>
```yaml ```yaml
# Seconds to wait for upstream to load and be ready to serve requests # Seconds to wait for llama.cpp to load and be ready to serve requests
# minimum is 15 seconds # Default (and minimum) is 15 seconds
# default is 120 seconds healthCheckTimeout: 60
healthCheckTimeout: 500
# Valid log levels: debug, info (default), warn, error # Valid log levels: debug, info (default), warn, error
logLevel: info logLevel: info
+2 -2
View File
@@ -223,13 +223,13 @@ runloop:
if countSigInt > 1 { if countSigInt > 1 {
break runloop break runloop
} else { } else {
log.Println("Received SIGINT, send another SIGINT to shutdown") log.Println("Recieved SIGINT, send another SIGINT to shutdown")
} }
case syscall.SIGTERM: case syscall.SIGTERM:
if *ignoreSigTerm { if *ignoreSigTerm {
log.Println("Ignoring SIGTERM") log.Println("Ignoring SIGTERM")
} else { } else {
log.Println("Received SIGTERM, shutting down") log.Println("Recieved SIGTERM, shutting down")
break runloop break runloop
} }
default: default:
+1 -6
View File
@@ -113,12 +113,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
return Config{}, err return Config{}, err
} }
if config.HealthCheckTimeout == 0 { if config.HealthCheckTimeout < 15 {
// this high default timeout helps avoid failing health checks
// for configurations that wait for docker or have slower startup
config.HealthCheckTimeout = 120
} else if config.HealthCheckTimeout < 15 {
// set a minimum of 15 seconds
config.HealthCheckTimeout = 15 config.HealthCheckTimeout = 15
} }
+1 -1
View File
@@ -100,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit), concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout // stop timeout
gracefulStopTimeout: 10 * time.Second, gracefulStopTimeout: 5 * time.Second,
upstreamWasStoppedWithKill: false, upstreamWasStoppedWithKill: false,
} }
} }
+1 -1
View File
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
func (pm *ProxyManager) upstreamIndex(c *gin.Context) { func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
var html strings.Builder var html strings.Builder
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>") html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
// Extract keys and sort them // Extract keys and sort them
var modelIDs []string var modelIDs []string
+19 -43
View File
@@ -4,8 +4,6 @@
set -eu set -eu
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)" red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)" plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
@@ -13,16 +11,16 @@ status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; } error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; } warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v "$1" >/dev/null; } available() { command -v $1 >/dev/null; }
require() { require() {
_MISSING='' local MISSING=''
for TOOL in "$@"; do for TOOL in $*; do
if ! available "$TOOL"; then if ! available $TOOL; then
_MISSING="$_MISSING $TOOL" MISSING="$MISSING $TOOL"
fi fi
done done
echo "$_MISSING" echo $MISSING
} }
SUDO= SUDO=
@@ -34,7 +32,7 @@ if [ "$(id -u)" -ne 0 ]; then
SUDO="sudo" SUDO="sudo"
fi fi
NEEDS=$(require tee tar python3 mktemp) NEEDS=$(require curl tee jq tar)
if [ -n "$NEEDS" ]; then if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:" status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do for NEED in $NEEDS; do
@@ -64,40 +62,18 @@ esac
download_binary() { download_binary() {
ASSET_NAME="linux_$ARCH" ASSET_NAME="linux_$ARCH"
TMPDIR=$(mktemp -d) # Fetch the latest release info and extract the matching asset URL
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
PYTHON_SCRIPT=$(cat <<EOF jq -r --arg name "$ASSET_NAME" \
import os '.assets[] | select(.name | contains($name)) | .browser_download_url')
import json
import sys
import urllib.request
ASSET_NAME = "${ASSET_NAME}" # Check if a URL was successfully extracted
if [ -z "$DL_URL" ]; then
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp: error "No matching asset found with name containing '$ASSET_NAME'."
data = json.load(resp)
for asset in data.get("assets", []):
if ASSET_NAME in asset.get("name", ""):
url = asset["browser_download_url"]
break
else:
print("ERROR: Matching asset not found.", file=sys.stderr)
exit(1)
print("Downloading:", url, file=sys.stderr)
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
urllib.request.urlretrieve(url, output_path)
print(output_path)
EOF
)
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
if [ ! -f "$TARFILE" ]; then
error "Failed to download binary."
fi fi
status "Extracting to /usr/local/bin" status "Downloading Linux $ARCH binary"
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
} }
download_binary download_binary
@@ -120,7 +96,7 @@ configure_systemd() {
fi fi
status "Adding current user to llama-swap group..." status "Adding current user to llama-swap group..."
$SUDO usermod -a -G llama-swap "$(whoami)" $SUDO usermod -a -G llama-swap $(whoami)
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
status "Creating default config.yaml..." status "Creating default config.yaml..."
@@ -162,7 +138,7 @@ User=llama-swap
Group=llama-swap Group=llama-swap
# set this to match your environment # set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS} ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
Restart=on-failure Restart=on-failure
RestartSec=3 RestartSec=3
@@ -196,7 +172,7 @@ if available systemctl; then
fi fi
install_success() { install_success() {
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}" status 'The llama-swap API is now available at 127.0.0.1:8080.'
status 'Customize the config file at /usr/share/llama-swap/config.yaml.' status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
status 'Install complete.' status 'Install complete.'
} }