Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02ee29d881 | |||
| b2a891f8f4 | |||
| 8d2b568897 | |||
| fb44cf4e08 | |||
| 02aee4e86d | |||
| f45896d395 | |||
| f7e46a359f | |||
| c260907415 |
@@ -15,7 +15,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
platform: [intel, cuda, vulkan, cpu, musa]
|
||||
#platform: [intel, cuda, vulkan, cpu, musa]
|
||||
platform: [cuda, vulkan, cpu, musa]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Checkout code
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||
|
||||
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||
|
||||
## Features:
|
||||
|
||||
@@ -63,9 +63,10 @@ models:
|
||||
<summary>But also very powerful ...</summary>
|
||||
|
||||
```yaml
|
||||
# Seconds to wait for llama.cpp to load and be ready to serve requests
|
||||
# Default (and minimum) is 15 seconds
|
||||
healthCheckTimeout: 60
|
||||
# Seconds to wait for upstream to load and be ready to serve requests
|
||||
# minimum is 15 seconds
|
||||
# default is 120 seconds
|
||||
healthCheckTimeout: 500
|
||||
|
||||
# Valid log levels: debug, info (default), warn, error
|
||||
logLevel: info
|
||||
|
||||
@@ -223,13 +223,13 @@ runloop:
|
||||
if countSigInt > 1 {
|
||||
break runloop
|
||||
} else {
|
||||
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
|
||||
log.Println("Received SIGINT, send another SIGINT to shutdown")
|
||||
}
|
||||
case syscall.SIGTERM:
|
||||
if *ignoreSigTerm {
|
||||
log.Println("Ignoring SIGTERM")
|
||||
} else {
|
||||
log.Println("Recieved SIGTERM, shutting down")
|
||||
log.Println("Received SIGTERM, shutting down")
|
||||
break runloop
|
||||
}
|
||||
default:
|
||||
|
||||
+6
-1
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
if config.HealthCheckTimeout < 15 {
|
||||
if config.HealthCheckTimeout == 0 {
|
||||
// this high default timeout helps avoid failing health checks
|
||||
// for configurations that wait for docker or have slower startup
|
||||
config.HealthCheckTimeout = 120
|
||||
} else if config.HealthCheckTimeout < 15 {
|
||||
// set a minimum of 15 seconds
|
||||
config.HealthCheckTimeout = 15
|
||||
}
|
||||
|
||||
|
||||
+8
-3
@@ -81,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
||||
concurrentLimit := 10
|
||||
if config.ConcurrencyLimit > 0 {
|
||||
concurrentLimit = config.ConcurrencyLimit
|
||||
} else {
|
||||
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
|
||||
}
|
||||
|
||||
return &Process{
|
||||
ID: ID,
|
||||
config: config,
|
||||
@@ -101,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
||||
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
||||
|
||||
// stop timeout
|
||||
gracefulStopTimeout: 5 * time.Second,
|
||||
gracefulStopTimeout: 10 * time.Second,
|
||||
upstreamWasStoppedWithKill: false,
|
||||
}
|
||||
}
|
||||
@@ -389,8 +388,14 @@ func (p *Process) StopImmediately() {
|
||||
// is in the state of starting, it will cancel it and shut it down. Once a process is in
|
||||
// the StateShutdown state, it can not be started again.
|
||||
func (p *Process) Shutdown() {
|
||||
// Nothing to do unless moving from the current state to StateStopping
// is a valid transition.
if !isValidTransition(p.CurrentState(), StateStopping) {
|
||||
return
|
||||
}
|
||||
|
||||
// Call shutdownCancel first, then stop the command, passing
// gracefulStopTimeout to stopCommand.
p.shutdownCancel()
|
||||
p.stopCommand(p.gracefulStopTimeout)
|
||||
|
||||
// just force it to this state since there is no recovery from shutdown
// NOTE(review): the field is assigned directly while readers go through
// CurrentState(); confirm whether this write needs the same synchronization.
|
||||
p.state = StateShutdown
|
||||
}
|
||||
|
||||
|
||||
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
||||
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
||||
var html strings.Builder
|
||||
|
||||
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
|
||||
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
|
||||
|
||||
// Extract keys and sort them
|
||||
var modelIDs []string
|
||||
|
||||
@@ -0,0 +1,213 @@
|
||||
#!/bin/sh
|
||||
# This script installs llama-swap on Linux.
|
||||
# It detects the current operating system architecture and installs the appropriate version of llama-swap.
|
||||
|
||||
set -eu
|
||||
|
||||
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
|
||||
|
||||
# Escape sequences used to highlight ERROR/WARNING prefixes: "red" captures the
# output of "tput bold" and "tput setaf 1", "plain" the output of "tput sgr0".
# Each tput failure is ignored ("|| :") and stderr of the subshell is closed
# ("2>&-"), so a failing tput contributes nothing to the variable.
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
||||
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
||||
|
||||
# status MESSAGE... - print a progress line, prefixed with ">>>", on stderr.
status() {
    echo ">>> $*" >&2
}

# error MESSAGE... - print a highlighted ERROR line on stdout, then terminate
# the shell with exit status 1.
error() {
    echo "${red}ERROR:${plain} $*"
    exit 1
}

# warning MESSAGE... - print a highlighted WARNING line on stdout and continue.
warning() {
    echo "${red}WARNING:${plain} $*"
}
|
||||
|
||||
# available TOOL - succeed when TOOL resolves to a command; the resolved
# name/path printed by "command -v" is discarded.
available() {
    command -v "$1" >/dev/null
}
|
||||
# require TOOL... - print, on one line, the names of the given tools that are
# not available, each preceded by a single space. Prints an empty line when
# every tool is present.
require() {
    _MISSING=''
    for TOOL do
        available "$TOOL" || _MISSING="$_MISSING $TOOL"
    done

    echo "$_MISSING"
}
|
||||
|
||||
# Privileged commands below are prefixed with $SUDO: it stays empty when the
# script already runs as uid 0 and becomes "sudo" otherwise. Without root and
# without sudo the script cannot continue.
SUDO=
if [ "$(id -u)" -ne 0 ]; then
    available sudo || error "This script requires superuser permissions. Please re-run as root."
    SUDO="sudo"
fi
|
||||
|
||||
# Check the external tools this script relies on; when any are missing, list
# each one and stop with exit status 1.
NEEDS=$(require tee tar python3 mktemp)
[ -z "$NEEDS" ] || {
    status "ERROR: The following tools are required but missing:"
    for NEED in $NEEDS; do
        echo " - $NEED"
    done
    exit 1
}
|
||||
|
||||
# Refuse to run on anything but Linux.
if [ "$(uname -s)" != "Linux" ]; then
    error 'This script is intended to run on Linux only.'
fi

# Map the machine hardware name onto the architecture label that is later
# appended to "linux_" to select a release asset.
ARCH=$(uname -m)
case "$ARCH" in
    aarch64 | arm64) ARCH="arm64" ;;
    x86_64)          ARCH="amd64" ;;
    *)               error "Unsupported architecture: $ARCH" ;;
esac
|
||||
|
||||
# Detect WSL from the kernel release string. A release matching
# "*icrosoft*WSL2" (either case of the suffix) sets IS_WSL2; a release that
# merely ends in "icrosoft" is treated as WSL1 and rejected. Anything else
# leaves IS_WSL2 at false.
IS_WSL2=false

KERN=$(uname -r)
case "$KERN" in
    *icrosoft*WSL2 | *icrosoft*wsl2)
        IS_WSL2=true
        ;;
    *icrosoft)
        error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'"
        ;;
esac
|
||||
|
||||
# download_binary - download the latest llama-swap release archive whose asset
# name contains "linux_$ARCH" and extract the llama-swap binary into
# /usr/local/bin.
#
# Reads:  ARCH, SUDO
# Needs:  python3, mktemp, tar, and network access to api.github.com
# Exits:  through error() when no archive was produced; under "set -e" any
#         failing step also aborts the script.
download_binary() {
    ASSET_NAME="linux_$ARCH"

    # Keep the scratch directory in its own variable. TMPDIR is the standard
    # variable consulted by mktemp and other tools; overwriting it could point
    # later commands at a directory that is about to be deleted.
    DOWNLOAD_DIR=$(mktemp -d)

    # Safety net for failures and signals. An EXIT trap installed later in the
    # script replaces this one, so the success path below also removes the
    # directory explicitly.
    trap 'rm -rf "${DOWNLOAD_DIR}"' EXIT INT TERM HUP

    # The heredoc delimiter is unquoted on purpose: ${ASSET_NAME} and
    # ${DOWNLOAD_DIR} are substituted by the shell before Python runs.
    # The Python code must start at column 0 because the heredoc text is passed
    # to the interpreter verbatim.
    PYTHON_SCRIPT=$(cat <<EOF
import os
import json
import sys
import urllib.request

ASSET_NAME = "${ASSET_NAME}"

with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
    data = json.load(resp)

for asset in data.get("assets", []):
    if ASSET_NAME in asset.get("name", ""):
        url = asset["browser_download_url"]
        break
else:
    print("ERROR: Matching asset not found.", file=sys.stderr)
    sys.exit(1)

print("Downloading:", url, file=sys.stderr)
output_path = os.path.join("${DOWNLOAD_DIR}", "llama-swap.tar.gz")
urllib.request.urlretrieve(url, output_path)
print(output_path)
EOF
)

    # The script prints progress on stderr and only the archive path on stdout.
    TARFILE=$(python3 -c "$PYTHON_SCRIPT")
    if [ ! -f "$TARFILE" ]; then
        error "Failed to download binary."
    fi

    status "Extracting to /usr/local/bin"
    $SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap

    # Explicit cleanup; see the note on the trap above.
    rm -rf "$DOWNLOAD_DIR"
}
download_binary
|
||||
|
||||
# configure_systemd - create the llama-swap service account, write a starter
# config.yaml (only when none exists) and a systemd unit, then enable the
# service when systemd reports "running" or "degraded".
#
# Reads: SUDO, IS_WSL2, LLAMA_SWAP_DEFAULT_ADDRESS
configure_systemd() {
    if ! id llama-swap >/dev/null 2>&1; then
        status "Creating llama-swap user..."
        $SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
    fi

    # Add the service account to each of these groups that exists on the host
    # (presumably for GPU device and Docker access - confirm if extending).
    for GROUP in render video docker; do
        if getent group "$GROUP" >/dev/null 2>&1; then
            status "Adding llama-swap user to $GROUP group..."
            $SUDO usermod -a -G "$GROUP" llama-swap
        fi
    done

    status "Adding current user to llama-swap group..."
    $SUDO usermod -a -G llama-swap "$(whoami)"

    # Test with $SUDO: the account's home directory may not be readable by the
    # invoking user, and an unprivileged test would then overwrite an existing
    # config.
    if ! $SUDO test -f "/usr/share/llama-swap/config.yaml"; then
        status "Creating default config.yaml..."
        # Write through "$SUDO tee" and hand the file over with chown. Do not
        # use "$SUDO -u llama-swap tee": when the script runs as root SUDO is
        # empty and that would expand to the command "-u".
        cat <<EOF | $SUDO tee /usr/share/llama-swap/config.yaml >/dev/null
# The example models are downloaded on first use, which can take longer than
# the default health check timeout of 120 seconds (the minimum is 15 seconds).
healthCheckTimeout: 300

models:
  "qwen2.5":
    cmd: |
      docker run
      --rm
      -p \${PORT}:8080
      --name qwen2.5
      ghcr.io/ggml-org/llama.cpp:server
      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
    cmdStop: docker stop qwen2.5

  "smollm2":
    cmd: |
      docker run
      --rm
      -p \${PORT}:8080
      --name smollm2
      ghcr.io/ggml-org/llama.cpp:server
      -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
    cmdStop: docker stop smollm2
EOF
        $SUDO chown llama-swap:llama-swap /usr/share/llama-swap/config.yaml
    fi

    status "Creating llama-swap systemd service..."
    cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
[Unit]
Description=llama-swap
After=network.target

[Service]
User=llama-swap
Group=llama-swap

# set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}

Restart=on-failure
RestartSec=3
StartLimitBurst=3
StartLimitInterval=30

[Install]
WantedBy=multi-user.target
EOF
    # "|| true" keeps "set -e" from aborting when systemd is not running;
    # the printed state is what gets inspected.
    SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
    case $SYSTEMCTL_RUNNING in
        running|degraded)
            status "Enabling and starting llama-swap service..."
            $SUDO systemctl daemon-reload
            $SUDO systemctl enable llama-swap

            # The (re)start is deferred to script exit. Note that this
            # replaces any EXIT trap installed earlier in the script.
            start_service() { $SUDO systemctl restart llama-swap; }
            trap start_service EXIT
            ;;
        *)
            warning "systemd is not running"
            if [ "$IS_WSL2" = true ]; then
                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
            fi
            ;;
    esac
}

if available systemctl; then
    configure_systemd
fi
|
||||
|
||||
# install_success - print the closing messages (API address, config file
# location, completion notice) through status, i.e. on stderr.
install_success() {
    status "The llama-swap API is now available at http://$LLAMA_SWAP_DEFAULT_ADDRESS"
    status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
    status 'Install complete.'
}
|
||||
|
||||
# WSL2 only supports GPUs via nvidia passthrough,
# so check for nvidia-smi to determine if a GPU is available.
if [ "$IS_WSL2" = true ]; then
    if available nvidia-smi && nvidia-smi | grep -q "CUDA Version: [0-9]*\.[0-9]*"; then
        status "Nvidia GPU detected."
    fi
fi

# Report completion on every platform, WSL2 included; this is the last step of
# the script.
install_success
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/bin/sh
|
||||
# This script uninstalls llama-swap on Linux.
|
||||
# It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
|
||||
|
||||
set -eu
|
||||
|
||||
# Escape sequences used to highlight ERROR/WARNING prefixes: "red" captures the
# output of "tput bold" and "tput setaf 1", "plain" the output of "tput sgr0".
# Each tput failure is ignored ("|| :") and stderr of the subshell is closed
# ("2>&-"), so a failing tput contributes nothing to the variable.
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
||||
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
||||
|
||||
# status MESSAGE... - print a progress line, prefixed with ">>>", on stderr.
status() {
    echo ">>> $*" >&2
}

# error MESSAGE... - print a highlighted ERROR line on stdout, then terminate
# the shell with exit status 1.
error() {
    echo "${red}ERROR:${plain} $*"
    exit 1
}

# warning MESSAGE... - print a highlighted WARNING line on stdout and continue.
warning() {
    echo "${red}WARNING:${plain} $*"
}
|
||||
|
||||
# available TOOL - succeed when TOOL resolves to a command. The argument is
# quoted so a name containing whitespace or glob characters is looked up
# verbatim instead of being split into several words.
available() { command -v "$1" >/dev/null; }
|
||||
|
||||
# Privileged commands below are prefixed with $SUDO: it stays empty when the
# script already runs as uid 0 and becomes "sudo" otherwise. Without root and
# without sudo the script cannot continue.
SUDO=
if [ "$(id -u)" -ne 0 ]; then
    available sudo || error "This script requires superuser permissions. Please re-run as root."
    SUDO="sudo"
fi
|
||||
|
||||
# configure_systemd - stop and disable the llama-swap service and remove the
# unit file at /etc/systemd/system/llama-swap.service.
#
# Failures of the individual systemctl calls are reported as warnings instead
# of aborting: under "set -e" a service that is already stopped, disabled or
# absent would otherwise end the uninstall before anything else is removed.
configure_systemd() {
    status "Stopping llama-swap service..."
    $SUDO systemctl stop llama-swap || warning "could not stop the llama-swap service"

    status "Disabling llama-swap service..."
    $SUDO systemctl disable llama-swap || warning "could not disable the llama-swap service"

    if [ -f /etc/systemd/system/llama-swap.service ]; then
        status "Removing llama-swap systemd unit..."
        $SUDO rm -f /etc/systemd/system/llama-swap.service
        # Make systemd forget the unit that was just deleted.
        $SUDO systemctl daemon-reload || warning "systemctl daemon-reload failed"
    fi
}
if available systemctl; then
    configure_systemd
fi
|
||||
|
||||
# Remove the llama-swap binary wherever it is found on PATH. "command -v" is
# used for the lookup (the same mechanism as available(), and unlike "which"
# it is part of POSIX); the result is quoted so a path containing whitespace
# stays one argument.
if available llama-swap; then
    status "Removing llama-swap binary..."
    $SUDO rm "$(command -v llama-swap)"
fi
|
||||
|
||||
# Ask whether the configuration should be removed. Answering yes deletes the
# whole /usr/share/llama-swap directory; an empty answer means "no".
if [ -f "/usr/share/llama-swap/config.yaml" ]; then
    while true; do
        printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
        # -r keeps backslashes literal. At end of input read fails; treat that
        # as the default answer so "set -e" does not abort the uninstall here.
        if ! read -r answer; then
            answer=""
        fi
        case "$answer" in
            [Yy]*)
                $SUDO rm -r /usr/share/llama-swap
                break
                ;;
            [Nn]* | "")
                break
                ;;
            *)
                echo "Invalid input. Please enter y or n."
                ;;
        esac
    done
fi
|
||||
|
||||
# Delete the llama-swap account when it exists.
if id llama-swap >/dev/null 2>&1
then
    status "Removing llama-swap user..."
    $SUDO userdel llama-swap
fi

# Delete the llama-swap group when it (still) exists after the account removal.
if getent group llama-swap >/dev/null 2>&1
then
    status "Removing llama-swap group..."
    $SUDO groupdel llama-swap
fi
|
||||
Reference in New Issue
Block a user