Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02ee29d881 | |||
| b2a891f8f4 | |||
| 8d2b568897 | |||
| fb44cf4e08 | |||
| 02aee4e86d | |||
| f45896d395 | |||
| f7e46a359f |
@@ -15,7 +15,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
platform: [intel, cuda, vulkan, cpu, musa]
|
||||
#platform: [intel, cuda, vulkan, cpu, musa]
|
||||
platform: [cuda, vulkan, cpu, musa]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Checkout code
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||
|
||||
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||
|
||||
## Features:
|
||||
|
||||
@@ -63,9 +63,10 @@ models:
|
||||
<summary>But also very powerful ...</summary>
|
||||
|
||||
```yaml
|
||||
# Seconds to wait for llama.cpp to load and be ready to serve requests
|
||||
# Default (and minimum) is 15 seconds
|
||||
healthCheckTimeout: 60
|
||||
# Seconds to wait for upstream to load and be ready to serve requests
|
||||
# minimum is 15 seconds
|
||||
# default is 120 seconds
|
||||
healthCheckTimeout: 500
|
||||
|
||||
# Valid log levels: debug, info (default), warn, error
|
||||
logLevel: info
|
||||
|
||||
@@ -223,13 +223,13 @@ runloop:
|
||||
if countSigInt > 1 {
|
||||
break runloop
|
||||
} else {
|
||||
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
|
||||
log.Println("Received SIGINT, send another SIGINT to shutdown")
|
||||
}
|
||||
case syscall.SIGTERM:
|
||||
if *ignoreSigTerm {
|
||||
log.Println("Ignoring SIGTERM")
|
||||
} else {
|
||||
log.Println("Recieved SIGTERM, shutting down")
|
||||
log.Println("Received SIGTERM, shutting down")
|
||||
break runloop
|
||||
}
|
||||
default:
|
||||
|
||||
+6
-1
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
if config.HealthCheckTimeout < 15 {
|
||||
if config.HealthCheckTimeout == 0 {
|
||||
// this high default timeout helps avoid failing health checks
|
||||
// for configurations that wait for docker or have slower startup
|
||||
config.HealthCheckTimeout = 120
|
||||
} else if config.HealthCheckTimeout < 15 {
|
||||
// set a minimum of 15 seconds
|
||||
config.HealthCheckTimeout = 15
|
||||
}
|
||||
|
||||
|
||||
+8
-3
@@ -81,9 +81,8 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
||||
concurrentLimit := 10
|
||||
if config.ConcurrencyLimit > 0 {
|
||||
concurrentLimit = config.ConcurrencyLimit
|
||||
} else {
|
||||
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
|
||||
}
|
||||
|
||||
return &Process{
|
||||
ID: ID,
|
||||
config: config,
|
||||
@@ -101,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
||||
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
||||
|
||||
// stop timeout
|
||||
gracefulStopTimeout: 5 * time.Second,
|
||||
gracefulStopTimeout: 10 * time.Second,
|
||||
upstreamWasStoppedWithKill: false,
|
||||
}
|
||||
}
|
||||
@@ -389,8 +388,14 @@ func (p *Process) StopImmediately() {
|
||||
// is in the state of starting, it will cancel it and shut it down. Once a process is in
|
||||
// the StateShutdown state, it can not be started again.
|
||||
func (p *Process) Shutdown() {
|
||||
if !isValidTransition(p.CurrentState(), StateStopping) {
|
||||
return
|
||||
}
|
||||
|
||||
p.shutdownCancel()
|
||||
p.stopCommand(p.gracefulStopTimeout)
|
||||
|
||||
// just force it to this state since there is no recovery from shutdown
|
||||
p.state = StateShutdown
|
||||
}
|
||||
|
||||
|
||||
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
||||
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
||||
var html strings.Builder
|
||||
|
||||
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
|
||||
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
|
||||
|
||||
// Extract keys and sort them
|
||||
var modelIDs []string
|
||||
|
||||
+43
-19
@@ -4,6 +4,8 @@
|
||||
|
||||
set -eu
|
||||
|
||||
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
|
||||
|
||||
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
||||
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
||||
|
||||
@@ -11,16 +13,16 @@ status() { echo ">>> $*" >&2; }
|
||||
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
|
||||
warning() { echo "${red}WARNING:${plain} $*"; }
|
||||
|
||||
available() { command -v $1 >/dev/null; }
|
||||
available() { command -v "$1" >/dev/null; }
|
||||
require() {
|
||||
local MISSING=''
|
||||
for TOOL in $*; do
|
||||
if ! available $TOOL; then
|
||||
MISSING="$MISSING $TOOL"
|
||||
_MISSING=''
|
||||
for TOOL in "$@"; do
|
||||
if ! available "$TOOL"; then
|
||||
_MISSING="$_MISSING $TOOL"
|
||||
fi
|
||||
done
|
||||
|
||||
echo $MISSING
|
||||
echo "$_MISSING"
|
||||
}
|
||||
|
||||
SUDO=
|
||||
@@ -32,7 +34,7 @@ if [ "$(id -u)" -ne 0 ]; then
|
||||
SUDO="sudo"
|
||||
fi
|
||||
|
||||
NEEDS=$(require curl tee jq tar)
|
||||
NEEDS=$(require tee tar python3 mktemp)
|
||||
if [ -n "$NEEDS" ]; then
|
||||
status "ERROR: The following tools are required but missing:"
|
||||
for NEED in $NEEDS; do
|
||||
@@ -62,18 +64,40 @@ esac
|
||||
download_binary() {
|
||||
ASSET_NAME="linux_$ARCH"
|
||||
|
||||
# Fetch the latest release info and extract the matching asset URL
|
||||
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
|
||||
jq -r --arg name "$ASSET_NAME" \
|
||||
'.assets[] | select(.name | contains($name)) | .browser_download_url')
|
||||
TMPDIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
|
||||
PYTHON_SCRIPT=$(cat <<EOF
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
# Check if a URL was successfully extracted
|
||||
if [ -z "$DL_URL" ]; then
|
||||
error "No matching asset found with name containing '$ASSET_NAME'."
|
||||
ASSET_NAME = "${ASSET_NAME}"
|
||||
|
||||
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
|
||||
data = json.load(resp)
|
||||
for asset in data.get("assets", []):
|
||||
if ASSET_NAME in asset.get("name", ""):
|
||||
url = asset["browser_download_url"]
|
||||
break
|
||||
else:
|
||||
print("ERROR: Matching asset not found.", file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
print("Downloading:", url, file=sys.stderr)
|
||||
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
|
||||
urllib.request.urlretrieve(url, output_path)
|
||||
print(output_path)
|
||||
EOF
|
||||
)
|
||||
|
||||
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
|
||||
if [ ! -f "$TARFILE" ]; then
|
||||
error "Failed to download binary."
|
||||
fi
|
||||
|
||||
status "Downloading Linux $ARCH binary"
|
||||
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
|
||||
status "Extracting to /usr/local/bin"
|
||||
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
|
||||
}
|
||||
download_binary
|
||||
|
||||
@@ -96,7 +120,7 @@ configure_systemd() {
|
||||
fi
|
||||
|
||||
status "Adding current user to llama-swap group..."
|
||||
$SUDO usermod -a -G llama-swap $(whoami)
|
||||
$SUDO usermod -a -G llama-swap "$(whoami)"
|
||||
|
||||
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
|
||||
status "Creating default config.yaml..."
|
||||
@@ -138,7 +162,7 @@ User=llama-swap
|
||||
Group=llama-swap
|
||||
|
||||
# set this to match your environment
|
||||
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
|
||||
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
|
||||
|
||||
Restart=on-failure
|
||||
RestartSec=3
|
||||
@@ -172,7 +196,7 @@ if available systemctl; then
|
||||
fi
|
||||
|
||||
install_success() {
|
||||
status 'The llama-swap API is now available at 127.0.0.1:8080.'
|
||||
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
|
||||
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
|
||||
status 'Install complete.'
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user