increase default healthCheckTimeout to 120s

Disable building of intel container until it's fixed upstream
Improve install script (#144 )
2025-05-26 09:57:53 -07:00 · 2025-05-23 22:54:43 -07:00 · 2025-05-23 09:39:55 -07:00 · 2025-05-23 08:40:15 -07:00 · 2025-05-20 10:43:10 -07:00 · 2025-05-20 10:43:09 -07:00
8 changed files with 324 additions and 18 deletions
@@ -15,7 +15,8 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        platform: [intel, cuda, vulkan, cpu, musa]
+        #platform: [intel, cuda, vulkan, cpu, musa]
        platform: [cuda, vulkan, cpu, musa]
      fail-fast: false
    steps:
      - name: Checkout code
@@ -7,7 +7,7 @@
 llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
 ## Features:
@@ -63,9 +63,10 @@ models:
 <summary>But also very powerful ...</summary>
 ```yaml
-# Seconds to wait for llama.cpp to load and be ready to serve requests
+# Seconds to wait for upstream to load and be ready to serve requests
-# Default (and minimum) is 15 seconds
+# minimum is 15 seconds
-healthCheckTimeout: 60
+# default is 120 seconds
 healthCheckTimeout: 500
 # Valid log levels: debug, info (default), warn, error
 logLevel: info
@@ -131,6 +132,8 @@ models:
    # use a custom command to stop the model when swapping. By default
    # this is SIGTERM on POSIX systems, and taskkill on Windows systems
    # the ${PID} variable can be used in cmdStop, it will be automatically replaced
    # with the PID of the running model
    cmdStop: docker stop dockertest
 # Groups provide advanced controls over model swapping behaviour. Using groups
@@ -223,13 +223,13 @@ runloop:
 			if countSigInt > 1 {
 				break runloop
 			} else {
-				log.Println("Recieved SIGINT, send another SIGINT to shutdown")
+				log.Println("Received SIGINT, send another SIGINT to shutdown")
 			}
 		case syscall.SIGTERM:
 			if *ignoreSigTerm {
 				log.Println("Ignoring SIGTERM")
 			} else {
-				log.Println("Recieved SIGTERM, shutting down")
+				log.Println("Received SIGTERM, shutting down")
 				break runloop
 			}
 		default:
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		return Config{}, err
 	}
-	if config.HealthCheckTimeout < 15 {
+	if config.HealthCheckTimeout == 0 {
 		// this high default timeout helps avoid failing health checks
 		// for configurations that wait for docker or have slower startup
 		config.HealthCheckTimeout = 120
 	} else if config.HealthCheckTimeout < 15 {
 		// set a minimum of 15 seconds
 		config.HealthCheckTimeout = 15
 	}
@@ -81,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
 	concurrentLimit := 10
 	if config.ConcurrencyLimit > 0 {
 		concurrentLimit = config.ConcurrencyLimit
 	} else {
 		proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
 	}
 	return &Process{
 		ID:                      ID,
 		config:                  config,
@@ -101,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
 		concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
 		// stop timeout
-		gracefulStopTimeout:        5 * time.Second,
+		gracefulStopTimeout:        10 * time.Second,
 		upstreamWasStoppedWithKill: false,
 	}
 }
@@ -149,7 +148,9 @@ func isValidTransition(from, to ProcessState) bool {
 		return to == StateStopping
 	case StateStopping:
 		return to == StateStopped || to == StateShutdown
-	case StateFailed, StateShutdown:
+	case StateFailed:
 		return to == StateStopping
 	case StateShutdown:
 		return false // No transitions allowed from these states
 	}
 	return false
@@ -359,12 +360,19 @@ func (p *Process) StopImmediately() {
 		return
 	}
-	p.proxyLogger.Debugf("<%s> Stopping process", p.ID)
+	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
 	currentState := p.CurrentState()
-	// calling Stop() when state is invalid is a no-op
+	if currentState == StateFailed {
-	if curState, err := p.swapState(StateReady, StateStopping); err != nil {
+		if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
-		p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+			p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
-		return
+			return
 		}
 	} else {
 		if curState, err := p.swapState(StateReady, StateStopping); err != nil {
 			p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
 			return
 		}
 	}
 	// stop the process with a graceful exit timeout
@@ -380,8 +388,14 @@ func (p *Process) StopImmediately() {
 // is in the state of starting, it will cancel it and shut it down. Once a process is in
 // the StateShutdown state, it can not be started again.
 func (p *Process) Shutdown() {
 	// Skip all shutdown work when isValidTransition reports that the current
 	// state is not allowed to move to StateStopping.
 	if !isValidTransition(p.CurrentState(), StateStopping) {
 		return
 	}
 	// NOTE(review): shutdownCancel is defined outside this view; presumably it
 	// cancels an in-progress start - confirm.
 	p.shutdownCancel()
 	// Stop the upstream command, passing the configured graceful stop timeout.
 	p.stopCommand(p.gracefulStopTimeout)
 	// just force it to this state since there is no recovery from shutdown
 	// (assigned directly instead of going through swapState).
 	p.state = StateShutdown
 }
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
 func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
 	var html strings.Builder
-	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
 	// Extract keys and sort them
 	var modelIDs []string
@@ -352,6 +352,8 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
 					stateStr = "Failed"
 				case StateShutdown:
 					stateStr = "Shutdown"
 				case StateStopped:
 					stateStr = "Stopped"
 				default:
 					stateStr = "Unknown"
 				}
@@ -0,0 +1,213 @@
 #!/bin/sh
 # This script installs llama-swap on Linux.
 # It detects the current operating system architecture and installs the appropriate version of llama-swap.
 # Abort on the first failing command (-e) and on any use of an unset variable (-u).
 set -eu
 # Address passed to "llama-swap -listen" in the generated systemd unit and shown
 # in the final status message; can be overridden from the environment.
 LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
 # Terminal sequences for bold red and for resetting attributes. Both end up
 # empty when tput is missing or fails, because failures are swallowed with ":".
 red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
 plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
 # status MSG...: print a progress line on stderr.
 status() { echo ">>> $*" >&2; }
 # error MSG...: print an error on stderr (like status, so diagnostics never
 # pollute stdout) and terminate the script with exit code 1.
 error() { echo "${red}ERROR:${plain} $*" >&2; exit 1; }
 # warning MSG...: print a non-fatal warning on stderr.
 warning() { echo "${red}WARNING:${plain} $*" >&2; }
 # available TOOL: succeed when TOOL resolves to a command.
 available() { command -v "$1" >/dev/null; }
 # require TOOL...: print the subset of the given tools that cannot be found,
 # each preceded by a space. Prints an empty line when nothing is missing.
 require() {
    _ABSENT=''
    for _CANDIDATE do
        available "$_CANDIDATE" || _ABSENT="$_ABSENT $_CANDIDATE"
    done
    echo "$_ABSENT"
 }
 # Privileged commands are prefixed with $SUDO, which stays empty when the
 # script already runs as root.
 SUDO=
 if [ "$(id -u)" -ne 0 ]; then
    available sudo || error "This script requires superuser permissions. Please re-run as root."
    SUDO="sudo"
 fi
 # Stop early, listing every missing tool, before anything is changed.
 NEEDS=$(require tee tar python3 mktemp)
 [ -z "$NEEDS" ] || {
    status "ERROR: The following tools are required but missing:"
    for NEED in $NEEDS; do
        echo "  - $NEED"
    done
    exit 1
 }
 if [ "$(uname -s)" != "Linux" ]; then
    error 'This script is intended to run on Linux only.'
 fi
 # Map the machine name reported by uname onto the release asset suffix.
 ARCH=$(uname -m)
 case "$ARCH" in
    aarch64|arm64) ARCH="arm64" ;;
    x86_64) ARCH="amd64" ;;
    *) error "Unsupported architecture: $ARCH" ;;
 esac
 # Detect WSL from the kernel release string; WSL1 is rejected outright.
 IS_WSL2=false
 KERNEL_RELEASE=$(uname -r)
 case "$KERNEL_RELEASE" in
    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true ;;
    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
 esac
 # download_binary: look up the latest GitHub release of llama-swap, download
 # the asset whose name contains "linux_$ARCH" into a private temporary
 # directory and extract the llama-swap binary into /usr/local/bin.
 #
 # Reads:  ARCH, SUDO
 # Exits:  non-zero when no matching asset exists or the download fails.
 download_binary() {
    ASSET_NAME="linux_$ARCH"
    # Use a dedicated variable name: TMPDIR is a well-known environment variable
    # consulted by other tools and should not be overwritten.
    DOWNLOAD_DIR=$(mktemp -d)
    # Safety net for failures and signals while downloading.
    trap 'rm -rf "${DOWNLOAD_DIR}"' EXIT INT TERM HUP
    # The heredoc is unquoted on purpose so that ${ASSET_NAME} and
    # ${DOWNLOAD_DIR} are substituted into the Python source by the shell.
    PYTHON_SCRIPT=$(cat <<EOF
 import os
 import json
 import sys
 import urllib.request
 ASSET_NAME = "${ASSET_NAME}"
 with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
    data = json.load(resp)
    for asset in data.get("assets", []):
        if ASSET_NAME in asset.get("name", ""):
            url = asset["browser_download_url"]
            break
    else:
        print("ERROR: Matching asset not found.", file=sys.stderr)
        sys.exit(1)
 print("Downloading:", url, file=sys.stderr)
 output_path = os.path.join("${DOWNLOAD_DIR}", "llama-swap.tar.gz")
 urllib.request.urlretrieve(url, output_path)
 print(output_path)
 EOF
 )
    # The script prints progress on stderr and only the archive path on stdout.
    TARFILE=$(python3 -c "$PYTHON_SCRIPT")
    if [ ! -f "$TARFILE" ]; then
        error "Failed to download binary."
    fi
    status "Extracting to /usr/local/bin"
    $SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
    # Remove the download explicitly instead of relying on the EXIT trap alone:
    # a shell keeps a single EXIT trap, so a trap installed later in the script
    # would replace the cleanup above and leave the directory behind.
    rm -rf "${DOWNLOAD_DIR}"
 }
 download_binary
 # configure_systemd: prepare the host to run llama-swap as a systemd service.
 #
 # Steps:
 #   1. create the llama-swap system user (home: /usr/share/llama-swap) if absent
 #   2. add that user to the render, video and docker groups when they exist
 #   3. add the invoking user to the llama-swap group
 #   4. write a default config.yaml unless one is already present
 #   5. (re)write /etc/systemd/system/llama-swap.service
 #   6. enable the service and schedule a restart for script exit when systemd
 #      is running; otherwise only print a warning
 #
 # Reads: SUDO, LLAMA_SWAP_DEFAULT_ADDRESS, IS_WSL2
 configure_systemd() {
    if ! id llama-swap >/dev/null 2>&1; then
        status "Creating llama-swap user..."
        $SUDO useradd -r -s /bin/false -U -m -d /usr/share/llama-swap llama-swap
    fi
    # Supplementary groups are only added when they exist on this host.
    for _GROUP in render video docker; do
        if getent group "$_GROUP" >/dev/null 2>&1; then
            status "Adding llama-swap user to $_GROUP group..."
            $SUDO usermod -a -G "$_GROUP" llama-swap
        fi
    done
    status "Adding current user to llama-swap group..."
    $SUDO usermod -a -G llama-swap "$(whoami)"
    if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
        status "Creating default config.yaml..."
        # Write the file with plain $SUDO and hand it to the llama-swap user
        # afterwards. "$SUDO -u llama-swap tee" breaks when the script runs as
        # root: SUDO is empty there, so the shell tries to execute "-u".
        # \${PORT} is escaped so it reaches the config file literally.
        cat <<EOF | $SUDO tee /usr/share/llama-swap/config.yaml >/dev/null
 # default 15s likely to fail for default models due to downloading models
 healthCheckTimeout: 60
 models:
  "qwen2.5":
    cmd: |
      docker run
        --rm
        -p \${PORT}:8080
        --name qwen2.5
      ghcr.io/ggml-org/llama.cpp:server
        -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
    cmdStop: docker stop qwen2.5
  "smollm2":
    cmd: |
      docker run
        --rm
        -p \${PORT}:8080
        --name smollm2
      ghcr.io/ggml-org/llama.cpp:server
        -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
    cmdStop: docker stop smollm2
 EOF
        $SUDO chown "llama-swap:$(id -gn llama-swap)" /usr/share/llama-swap/config.yaml
    fi
    status "Creating llama-swap systemd service..."
    cat <<EOF | $SUDO tee /etc/systemd/system/llama-swap.service >/dev/null
 [Unit]
 Description=llama-swap
 After=network.target
 [Service]
 User=llama-swap
 Group=llama-swap
 # set this to match your environment
 ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
 Restart=on-failure
 RestartSec=3
 StartLimitBurst=3
 StartLimitInterval=30
 [Install]
 WantedBy=multi-user.target
 EOF
    # "|| true" keeps set -e from aborting when systemd reports a non-running state.
    SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
    case $SYSTEMCTL_RUNNING in
        running|degraded)
            status "Enabling and starting llama-swap service..."
            $SUDO systemctl daemon-reload
            $SUDO systemctl enable llama-swap
            # The restart is deferred to script exit. Note that this replaces
            # any EXIT trap installed earlier in the script.
            start_service() { $SUDO systemctl restart llama-swap; }
            trap start_service EXIT
            ;;
        *)
            warning "systemd is not running"
            if [ "$IS_WSL2" = true ]; then
                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
            fi
            ;;
    esac
 }
 if available systemctl; then
    configure_systemd
 fi
 # install_success: print the closing messages (listen address and config
 # location) on stderr via status.
 install_success() {
    status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
    status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
    status 'Install complete.'
 }
 # WSL2 only supports GPUs via nvidia passthrough
 # so check for nvidia-smi to determine if GPU is available
 if [ "$IS_WSL2" = true ]; then
    if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
        status "Nvidia GPU detected."
    fi
 fi
 # Report success on every platform. Previously the WSL2 branch ended with
 # "exit 0" before this call, so WSL2 users never saw the completion message.
 install_success
@@ -0,0 +1,68 @@
 #!/bin/sh
 # This script uninstalls llama-swap on Linux.
 # It removes the binary, systemd service, config.yaml (optional), and llama-swap user and group.
 # Abort on the first failing command (-e) and on any use of an unset variable (-u).
 set -eu
 # Terminal sequences for bold red and for resetting attributes. Both end up
 # empty when tput is missing or fails, because failures are swallowed with ":".
 red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
 plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
 # status MSG...: print a progress line on stderr.
 status() { echo ">>> $*" >&2; }
 # error MSG...: print an error on stderr (like status) and exit with code 1.
 error() { echo "${red}ERROR:${plain} $*" >&2; exit 1; }
 # warning MSG...: print a non-fatal warning on stderr.
 warning() { echo "${red}WARNING:${plain} $*" >&2; }
 # available TOOL: succeed when TOOL resolves to a command. The argument is
 # quoted so it is never word-split or glob-expanded.
 available() { command -v "$1" >/dev/null; }
 # Privileged commands are prefixed with $SUDO, which stays empty when the
 # script already runs as root.
 SUDO=
 if [ "$(id -u)" -ne 0 ]; then
    available sudo || error "This script requires superuser permissions. Please re-run as root."
    SUDO="sudo"
 fi
 # configure_systemd: stop and disable the llama-swap service, then delete the
 # unit file the installer wrote to /etc/systemd/system/llama-swap.service.
 #
 # systemctl failures (unit never installed, systemd not running) are reported
 # as warnings instead of aborting under "set -e", so the remaining uninstall
 # steps still run.
 configure_systemd() {
    status "Stopping llama-swap service..."
    $SUDO systemctl stop llama-swap || warning "could not stop llama-swap service"
    status "Disabling llama-swap service..."
    $SUDO systemctl disable llama-swap || warning "could not disable llama-swap service"
    if [ -f /etc/systemd/system/llama-swap.service ]; then
        status "Removing llama-swap systemd unit..."
        $SUDO rm -f /etc/systemd/system/llama-swap.service
        # Let systemd forget the removed unit; ignored when systemd is not running.
        $SUDO systemctl daemon-reload || :
    fi
 }
 if available systemctl; then
    configure_systemd
 fi
 # Remove the llama-swap executable found on PATH. "command -v" is the POSIX
 # replacement for "which"; the result is quoted so that a path containing
 # spaces or glob characters is passed to rm as a single argument.
 if available llama-swap; then
    status "Removing llama-swap binary..."
    $SUDO rm -- "$(command -v llama-swap)"
 fi
 # Ask whether to delete the configuration. Answering yes removes the whole
 # /usr/share/llama-swap directory, not only config.yaml; an empty answer or
 # end of input keeps it.
 if [ -f "/usr/share/llama-swap/config.yaml" ]; then
    while true; do
        printf "Delete config.yaml (/usr/share/llama-swap/config.yaml)? [y/N] " >&2
        # -r keeps backslashes literal. A failing read (EOF, e.g. when stdin is
        # not a terminal) must not abort the script under "set -e"; the answer
        # is then empty and falls into the default "no" branch.
        answer=""
        read -r answer || :
        case "$answer" in
            [Yy]* )
                $SUDO rm -r /usr/share/llama-swap
                break
                ;;
            [Nn]* | "" )
                break
                ;;
            * )
                echo "Invalid input. Please enter y or n."
                ;;
        esac
    done
 fi
 # Remove the llama-swap account when it exists.
 if id llama-swap >/dev/null 2>&1; then
    status "Removing llama-swap user..."
    $SUDO userdel llama-swap
 fi
 # Remove the llama-swap group when it still exists at this point.
 # NOTE(review): on some distributions userdel presumably removes the user's
 # private group already, which would explain the existence check - confirm.
 if getent group llama-swap >/dev/null 2>&1; then
    status "Removing llama-swap group..."
    $SUDO groupdel llama-swap
 fi
Author	SHA1	Message	Date
Benson Wong	02ee29d881	increase default healthCheckTimeout to 120s	2025-05-26 09:57:53 -07:00
Benson Wong	b2a891f8f4	Disable building of intel container until it's fixed upstream	2025-05-23 22:54:43 -07:00
Yuta Hayashibe	8d2b568897	Improve install script (#144 ) * Use `python3` instead of `curl` and `jq` * Use quote to word splitting * Remove undefined `local` in POSIX sh * Added `LLAMA_SWAP_DEFAULT_ADDRESS` to customize the server address * Added `mktemp` to `NEEDS`	2025-05-23 09:39:55 -07:00
Yuta Hayashibe	fb44cf4e08	Fix typos (#143 )	2025-05-23 08:40:15 -07:00
Benson Wong	02aee4e86d	remove noisy debug print message	2025-05-20 10:43:10 -07:00
Benson Wong	f45896d395	add guard to avoid unnecessary logic in Process.Shutdown	2025-05-20 10:43:09 -07:00
choyuansu	f7e46a359f	Add link to unload endpoint in upstream list (#140 ) * Add link to open /unload	2025-05-20 08:31:44 -07:00
choyuansu	c260907415	Add linux install and uninstall shell scripts (#139 ) Contribution for install, and uninstall llama-swap in linux.	2025-05-19 12:03:33 -07:00
Benson Wong	b83a5fa291	make Failed stated recoverable (#137 ) A process in the failed state can transition to stopped either by calling /unload or swapping to another model.	2025-05-16 19:54:44 -07:00
Benson Wong	6e2ff28d59	improve cmdStop docs [no ci]	2025-05-16 13:52:04 -07:00