Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02ee29d881 | |||
| b2a891f8f4 | |||
| 8d2b568897 | |||
| fb44cf4e08 |
@@ -15,7 +15,8 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
platform: [intel, cuda, vulkan, cpu, musa]
|
#platform: [intel, cuda, vulkan, cpu, musa]
|
||||||
|
platform: [cuda, vulkan, cpu, musa]
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||||
|
|
||||||
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||||
|
|
||||||
## Features:
|
## Features:
|
||||||
|
|
||||||
@@ -63,9 +63,10 @@ models:
|
|||||||
<summary>But also very powerful ...</summary>
|
<summary>But also very powerful ...</summary>
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# Seconds to wait for llama.cpp to load and be ready to serve requests
|
# Seconds to wait for upstream to load and be ready to serve requests
|
||||||
# Default (and minimum) is 15 seconds
|
# minimum is 15 seconds
|
||||||
healthCheckTimeout: 60
|
# default is 120 seconds
|
||||||
|
healthCheckTimeout: 500
|
||||||
|
|
||||||
# Valid log levels: debug, info (default), warn, error
|
# Valid log levels: debug, info (default), warn, error
|
||||||
logLevel: info
|
logLevel: info
|
||||||
|
|||||||
@@ -223,13 +223,13 @@ runloop:
|
|||||||
if countSigInt > 1 {
|
if countSigInt > 1 {
|
||||||
break runloop
|
break runloop
|
||||||
} else {
|
} else {
|
||||||
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
|
log.Println("Received SIGINT, send another SIGINT to shutdown")
|
||||||
}
|
}
|
||||||
case syscall.SIGTERM:
|
case syscall.SIGTERM:
|
||||||
if *ignoreSigTerm {
|
if *ignoreSigTerm {
|
||||||
log.Println("Ignoring SIGTERM")
|
log.Println("Ignoring SIGTERM")
|
||||||
} else {
|
} else {
|
||||||
log.Println("Recieved SIGTERM, shutting down")
|
log.Println("Received SIGTERM, shutting down")
|
||||||
break runloop
|
break runloop
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|||||||
+6
-1
@@ -113,7 +113,12 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
return Config{}, err
|
return Config{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.HealthCheckTimeout < 15 {
|
if config.HealthCheckTimeout == 0 {
|
||||||
|
// this high default timeout helps avoid failing health checks
|
||||||
|
// for configurations that wait for docker or have slower startup
|
||||||
|
config.HealthCheckTimeout = 120
|
||||||
|
} else if config.HealthCheckTimeout < 15 {
|
||||||
|
// set a minimum of 15 seconds
|
||||||
config.HealthCheckTimeout = 15
|
config.HealthCheckTimeout = 15
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+1
-1
@@ -100,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
|||||||
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
||||||
|
|
||||||
// stop timeout
|
// stop timeout
|
||||||
gracefulStopTimeout: 5 * time.Second,
|
gracefulStopTimeout: 10 * time.Second,
|
||||||
upstreamWasStoppedWithKill: false,
|
upstreamWasStoppedWithKill: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+43
-19
@@ -4,6 +4,8 @@
|
|||||||
|
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
|
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
|
||||||
|
|
||||||
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
||||||
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
||||||
|
|
||||||
@@ -11,16 +13,16 @@ status() { echo ">>> $*" >&2; }
|
|||||||
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
|
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
|
||||||
warning() { echo "${red}WARNING:${plain} $*"; }
|
warning() { echo "${red}WARNING:${plain} $*"; }
|
||||||
|
|
||||||
available() { command -v $1 >/dev/null; }
|
available() { command -v "$1" >/dev/null; }
|
||||||
require() {
|
require() {
|
||||||
local MISSING=''
|
_MISSING=''
|
||||||
for TOOL in $*; do
|
for TOOL in "$@"; do
|
||||||
if ! available $TOOL; then
|
if ! available "$TOOL"; then
|
||||||
MISSING="$MISSING $TOOL"
|
_MISSING="$_MISSING $TOOL"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo $MISSING
|
echo "$_MISSING"
|
||||||
}
|
}
|
||||||
|
|
||||||
SUDO=
|
SUDO=
|
||||||
@@ -32,7 +34,7 @@ if [ "$(id -u)" -ne 0 ]; then
|
|||||||
SUDO="sudo"
|
SUDO="sudo"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
NEEDS=$(require curl tee jq tar)
|
NEEDS=$(require tee tar python3 mktemp)
|
||||||
if [ -n "$NEEDS" ]; then
|
if [ -n "$NEEDS" ]; then
|
||||||
status "ERROR: The following tools are required but missing:"
|
status "ERROR: The following tools are required but missing:"
|
||||||
for NEED in $NEEDS; do
|
for NEED in $NEEDS; do
|
||||||
@@ -62,18 +64,40 @@ esac
|
|||||||
download_binary() {
|
download_binary() {
|
||||||
ASSET_NAME="linux_$ARCH"
|
ASSET_NAME="linux_$ARCH"
|
||||||
|
|
||||||
# Fetch the latest release info and extract the matching asset URL
|
TMPDIR=$(mktemp -d)
|
||||||
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
|
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
|
||||||
jq -r --arg name "$ASSET_NAME" \
|
PYTHON_SCRIPT=$(cat <<EOF
|
||||||
'.assets[] | select(.name | contains($name)) | .browser_download_url')
|
import os
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
# Check if a URL was successfully extracted
|
ASSET_NAME = "${ASSET_NAME}"
|
||||||
if [ -z "$DL_URL" ]; then
|
|
||||||
error "No matching asset found with name containing '$ASSET_NAME'."
|
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
|
||||||
|
data = json.load(resp)
|
||||||
|
for asset in data.get("assets", []):
|
||||||
|
if ASSET_NAME in asset.get("name", ""):
|
||||||
|
url = asset["browser_download_url"]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print("ERROR: Matching asset not found.", file=sys.stderr)
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
print("Downloading:", url, file=sys.stderr)
|
||||||
|
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
|
||||||
|
urllib.request.urlretrieve(url, output_path)
|
||||||
|
print(output_path)
|
||||||
|
EOF
|
||||||
|
)
|
||||||
|
|
||||||
|
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
|
||||||
|
if [ ! -f "$TARFILE" ]; then
|
||||||
|
error "Failed to download binary."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
status "Downloading Linux $ARCH binary"
|
status "Extracting to /usr/local/bin"
|
||||||
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
|
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
|
||||||
}
|
}
|
||||||
download_binary
|
download_binary
|
||||||
|
|
||||||
@@ -96,7 +120,7 @@ configure_systemd() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
status "Adding current user to llama-swap group..."
|
status "Adding current user to llama-swap group..."
|
||||||
$SUDO usermod -a -G llama-swap $(whoami)
|
$SUDO usermod -a -G llama-swap "$(whoami)"
|
||||||
|
|
||||||
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
|
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
|
||||||
status "Creating default config.yaml..."
|
status "Creating default config.yaml..."
|
||||||
@@ -138,7 +162,7 @@ User=llama-swap
|
|||||||
Group=llama-swap
|
Group=llama-swap
|
||||||
|
|
||||||
# set this to match your environment
|
# set this to match your environment
|
||||||
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
|
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
|
||||||
|
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=3
|
RestartSec=3
|
||||||
@@ -172,7 +196,7 @@ if available systemctl; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
install_success() {
|
install_success() {
|
||||||
status 'The llama-swap API is now available at 127.0.0.1:8080.'
|
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
|
||||||
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
|
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
|
||||||
status 'Install complete.'
|
status 'Install complete.'
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user