Compare commits

..

4 Commits

Author SHA1 Message Date
Benson Wong 02ee29d881 increase default healthCheckTimeout to 120s 2025-05-26 09:57:53 -07:00
Benson Wong b2a891f8f4 Disable building of intel container until it's fixed upstream 2025-05-23 22:54:43 -07:00
Yuta Hayashibe 8d2b568897 Improve install script (#144)
* Use `python3` instead of `curl` and `jq`

* Use quotes to prevent word splitting

* Remove undefined `local` in POSIX sh

* Added `LLAMA_SWAP_DEFAULT_ADDRESS` to customize the server address

* Added `mktemp` to `NEEDS`
2025-05-23 09:39:55 -07:00
Yuta Hayashibe fb44cf4e08 Fix typos (#143) 2025-05-23 08:40:15 -07:00
6 changed files with 59 additions and 28 deletions
+2 -1
View File
@@ -15,7 +15,8 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
platform: [intel, cuda, vulkan, cpu, musa] #platform: [intel, cuda, vulkan, cpu, musa]
platform: [cuda, vulkan, cpu, musa]
fail-fast: false fail-fast: false
steps: steps:
- name: Checkout code - name: Checkout code
+5 -4
View File
@@ -7,7 +7,7 @@
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server. llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images. Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
## Features: ## Features:
@@ -63,9 +63,10 @@ models:
<summary>But also very powerful ...</summary> <summary>But also very powerful ...</summary>
```yaml ```yaml
# Seconds to wait for llama.cpp to load and be ready to serve requests # Seconds to wait for upstream to load and be ready to serve requests
# Default (and minimum) is 15 seconds # minimum is 15 seconds
healthCheckTimeout: 60 # default is 120 seconds
healthCheckTimeout: 500
# Valid log levels: debug, info (default), warn, error # Valid log levels: debug, info (default), warn, error
logLevel: info logLevel: info
+2 -2
View File
@@ -223,13 +223,13 @@ runloop:
if countSigInt > 1 { if countSigInt > 1 {
break runloop break runloop
} else { } else {
log.Println("Recieved SIGINT, send another SIGINT to shutdown") log.Println("Received SIGINT, send another SIGINT to shutdown")
} }
case syscall.SIGTERM: case syscall.SIGTERM:
if *ignoreSigTerm { if *ignoreSigTerm {
log.Println("Ignoring SIGTERM") log.Println("Ignoring SIGTERM")
} else { } else {
log.Println("Recieved SIGTERM, shutting down") log.Println("Received SIGTERM, shutting down")
break runloop break runloop
} }
default: default:
+6 -1
View File
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
return Config{}, err return Config{}, err
} }
if config.HealthCheckTimeout < 15 { if config.HealthCheckTimeout == 0 {
// this high default timeout helps avoid failing health checks
// for configurations that wait for docker or have slower startup
config.HealthCheckTimeout = 120
} else if config.HealthCheckTimeout < 15 {
// set a minimum of 15 seconds
config.HealthCheckTimeout = 15 config.HealthCheckTimeout = 15
} }
+1 -1
View File
@@ -100,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit), concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout // stop timeout
gracefulStopTimeout: 5 * time.Second, gracefulStopTimeout: 10 * time.Second,
upstreamWasStoppedWithKill: false, upstreamWasStoppedWithKill: false,
} }
} }
+43 -19
View File
@@ -4,6 +4,8 @@
set -eu set -eu
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)" red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)" plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
@@ -11,16 +13,16 @@ status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; } error() { echo "${red}ERROR:${plain} $*"; exit 1; }
warning() { echo "${red}WARNING:${plain} $*"; } warning() { echo "${red}WARNING:${plain} $*"; }
available() { command -v $1 >/dev/null; } available() { command -v "$1" >/dev/null; }
require() { require() {
local MISSING='' _MISSING=''
for TOOL in $*; do for TOOL in "$@"; do
if ! available $TOOL; then if ! available "$TOOL"; then
MISSING="$MISSING $TOOL" _MISSING="$_MISSING $TOOL"
fi fi
done done
echo $MISSING echo "$_MISSING"
} }
SUDO= SUDO=
@@ -32,7 +34,7 @@ if [ "$(id -u)" -ne 0 ]; then
SUDO="sudo" SUDO="sudo"
fi fi
NEEDS=$(require curl tee jq tar) NEEDS=$(require tee tar python3 mktemp)
if [ -n "$NEEDS" ]; then if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:" status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do for NEED in $NEEDS; do
@@ -62,18 +64,40 @@ esac
download_binary() { download_binary() {
ASSET_NAME="linux_$ARCH" ASSET_NAME="linux_$ARCH"
# Fetch the latest release info and extract the matching asset URL TMPDIR=$(mktemp -d)
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \ trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
jq -r --arg name "$ASSET_NAME" \ PYTHON_SCRIPT=$(cat <<EOF
'.assets[] | select(.name | contains($name)) | .browser_download_url') import os
import json
import sys
import urllib.request
# Check if a URL was successfully extracted ASSET_NAME = "${ASSET_NAME}"
if [ -z "$DL_URL" ]; then
error "No matching asset found with name containing '$ASSET_NAME'." with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
data = json.load(resp)
for asset in data.get("assets", []):
if ASSET_NAME in asset.get("name", ""):
url = asset["browser_download_url"]
break
else:
print("ERROR: Matching asset not found.", file=sys.stderr)
exit(1)
print("Downloading:", url, file=sys.stderr)
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
urllib.request.urlretrieve(url, output_path)
print(output_path)
EOF
)
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
if [ ! -f "$TARFILE" ]; then
error "Failed to download binary."
fi fi
status "Downloading Linux $ARCH binary" status "Extracting to /usr/local/bin"
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap $SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
} }
download_binary download_binary
@@ -96,7 +120,7 @@ configure_systemd() {
fi fi
status "Adding current user to llama-swap group..." status "Adding current user to llama-swap group..."
$SUDO usermod -a -G llama-swap $(whoami) $SUDO usermod -a -G llama-swap "$(whoami)"
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
status "Creating default config.yaml..." status "Creating default config.yaml..."
@@ -138,7 +162,7 @@ User=llama-swap
Group=llama-swap Group=llama-swap
# set this to match your environment # set this to match your environment
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
Restart=on-failure Restart=on-failure
RestartSec=3 RestartSec=3
@@ -172,7 +196,7 @@ if available systemctl; then
fi fi
install_success() { install_success() {
status 'The llama-swap API is now available at 127.0.0.1:8080.' status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
status 'Customize the config file at /usr/share/llama-swap/config.yaml.' status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
status 'Install complete.' status 'Install complete.'
} }