Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e7af671d8e | |||
| 8e62098eef |
@@ -15,8 +15,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
#platform: [intel, cuda, vulkan, cpu, musa]
|
platform: [intel, cuda, vulkan, cpu, musa]
|
||||||
platform: [cuda, vulkan, cpu, musa]
|
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||||
|
|
||||||
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
||||||
|
|
||||||
## Features:
|
## Features:
|
||||||
|
|
||||||
@@ -45,31 +45,157 @@ llama-swap's configuration is purposefully simple.
|
|||||||
```yaml
|
```yaml
|
||||||
models:
|
models:
|
||||||
"qwen2.5":
|
"qwen2.5":
|
||||||
|
proxy: "http://127.0.0.1:9999"
|
||||||
cmd: |
|
cmd: |
|
||||||
/app/llama-server
|
/app/llama-server
|
||||||
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
|
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
|
||||||
--port ${PORT}
|
--port 9999
|
||||||
|
|
||||||
"smollm2":
|
"smollm2":
|
||||||
|
proxy: "http://127.0.0.1:9999"
|
||||||
cmd: |
|
cmd: |
|
||||||
/app/llama-server
|
/app/llama-server
|
||||||
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
|
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
|
||||||
--port ${PORT}
|
--port 9999
|
||||||
```
|
```
|
||||||
|
|
||||||
But also very powerful:
|
<details>
|
||||||
|
<summary>But also very powerful ...</summary>
|
||||||
|
|
||||||
- ⚡ `groups` to run multiple models at once
|
```yaml
|
||||||
- ⚡ `macros` for reusable snippets
|
# Seconds to wait for llama.cpp to load and be ready to serve requests
|
||||||
- ⚡ `ttl` to automatically unload models
|
# Default (and minimum) is 15 seconds
|
||||||
- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
|
healthCheckTimeout: 60
|
||||||
- ⚡ `env` variables to pass custom environment to inference servers
|
|
||||||
- ⚡ `useModelName` to override model names sent to upstream servers
|
|
||||||
- ⚡ `healthCheckTimeout` to control model startup wait times
|
|
||||||
- ⚡ `${PORT}` automatic port variables for dynamic port assignment
|
|
||||||
- ⚡ Docker/podman compatible
|
|
||||||
|
|
||||||
Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for full documentation.
|
# Valid log levels: debug, info (default), warn, error
|
||||||
|
logLevel: info
|
||||||
|
|
||||||
|
# Automatic Port Values
|
||||||
|
# use ${PORT} in model.cmd and model.proxy to use an automatic port number
|
||||||
|
# when you use ${PORT} you can omit a custom model.proxy value, as it will
|
||||||
|
# default to http://localhost:${PORT}
|
||||||
|
|
||||||
|
# override the default port (5800) for automatic port values
|
||||||
|
startPort: 10001
|
||||||
|
|
||||||
|
# define valid model values and the upstream server start
|
||||||
|
models:
|
||||||
|
"llama":
|
||||||
|
# multiline for readability
|
||||||
|
cmd: |
|
||||||
|
llama-server --port 8999
|
||||||
|
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
|
||||||
|
|
||||||
|
# environment variables to pass to the command
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=0"
|
||||||
|
|
||||||
|
# where to reach the server started by cmd, make sure the ports match
|
||||||
|
# can be omitted if you use an automatic ${PORT} in cmd
|
||||||
|
proxy: http://127.0.0.1:8999
|
||||||
|
|
||||||
|
# alias names that can also be used to request this model
|
||||||
|
aliases:
|
||||||
|
- "gpt-4o-mini"
|
||||||
|
- "gpt-3.5-turbo"
|
||||||
|
|
||||||
|
# check this path for an HTTP 200 OK before serving requests
|
||||||
|
# default: /health to match llama.cpp
|
||||||
|
# use "none" to skip endpoint checking, but may cause HTTP errors
|
||||||
|
# until the model is ready
|
||||||
|
checkEndpoint: /custom-endpoint
|
||||||
|
|
||||||
|
# automatically unload the model after this many seconds
|
||||||
|
# ttl values must be a value greater than 0
|
||||||
|
# default: 0 = never unload model
|
||||||
|
ttl: 60
|
||||||
|
|
||||||
|
# `useModelName` overrides the model name in the request
|
||||||
|
# and sends a specific name to the upstream server
|
||||||
|
useModelName: "qwen:qwq"
|
||||||
|
|
||||||
|
# unlisted models do not show up in /v1/models or /upstream lists
|
||||||
|
# but they can still be requested as normal
|
||||||
|
"qwen-unlisted":
|
||||||
|
unlisted: true
|
||||||
|
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"docker-llama":
|
||||||
|
proxy: "http://127.0.0.1:${PORT}"
|
||||||
|
cmd: |
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
|
# use a custom command to stop the model when swapping. By default
|
||||||
|
# this is SIGTERM on POSIX systems, and taskkill on Windows systems
|
||||||
|
# the ${PID} variable can be used in cmdStop, it will be automatically replaced
|
||||||
|
# with the PID of the running model
|
||||||
|
cmdStop: docker stop dockertest
|
||||||
|
|
||||||
|
# Groups provide advanced controls over model swapping behaviour. Using groups
|
||||||
|
# some models can be kept loaded indefinitely, while others are swapped out.
|
||||||
|
#
|
||||||
|
# Tips:
|
||||||
|
#
|
||||||
|
# - models must be defined above in the Models section
|
||||||
|
# - a model can only be a member of one group
|
||||||
|
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
|
||||||
|
# - see issue #109 for details
|
||||||
|
#
|
||||||
|
# NOTE: the example below uses model names that are not defined above for demonstration purposes
|
||||||
|
groups:
|
||||||
|
# group1 is the default behaviour of llama-swap where only one model is allowed
|
||||||
|
# to run at a time across the whole llama-swap instance
|
||||||
|
"group1":
|
||||||
|
# swap controls the model swapping behaviour within the group
|
||||||
|
# - true : only one model is allowed to run at a time
|
||||||
|
# - false: all models can run together, no swapping
|
||||||
|
swap: true
|
||||||
|
|
||||||
|
# exclusive controls how the group affects other groups
|
||||||
|
# - true: causes all other groups to unload their models when this group runs a model
|
||||||
|
# - false: does not affect other groups
|
||||||
|
exclusive: true
|
||||||
|
|
||||||
|
# members references the models defined above
|
||||||
|
members:
|
||||||
|
- "llama"
|
||||||
|
- "qwen-unlisted"
|
||||||
|
|
||||||
|
# models in this group are never unloaded
|
||||||
|
"group2":
|
||||||
|
swap: false
|
||||||
|
exclusive: false
|
||||||
|
members:
|
||||||
|
- "docker-llama"
|
||||||
|
# (not defined above, here for example)
|
||||||
|
- "modelA"
|
||||||
|
- "modelB"
|
||||||
|
|
||||||
|
"forever":
|
||||||
|
# setting persistent to true causes the group to never be affected by the swapping behaviour of
|
||||||
|
# other groups. It is a shortcut to keeping some models always loaded.
|
||||||
|
persistent: true
|
||||||
|
|
||||||
|
# set swap/exclusive to false to prevent swapping inside the group and any effect on other groups
|
||||||
|
swap: false
|
||||||
|
exclusive: false
|
||||||
|
members:
|
||||||
|
- "forever-modelA"
|
||||||
|
- "forever-modelB"
|
||||||
|
- "forever-modelc"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Use Case Examples
|
||||||
|
|
||||||
|
- [config.example.yaml](config.example.yaml) includes examples for supporting `v1/embeddings` and `v1/rerank` endpoints
|
||||||
|
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds from 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
|
||||||
|
- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
|
||||||
|
- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
|
||||||
|
</details>
|
||||||
|
|
||||||
## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))
|
## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))
|
||||||
|
|
||||||
@@ -172,6 +298,32 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
|
|||||||
|
|
||||||
For Python based inference servers like vllm or tabbyAPI it is recommended to run them via podman or docker. This provides clean environment isolation as well as responding correctly to `SIGTERM` signals to shutdown.
|
For Python based inference servers like vllm or tabbyAPI it is recommended to run them via podman or docker. This provides clean environment isolation as well as responding correctly to `SIGTERM` signals to shutdown.
|
||||||
|
|
||||||
|
## Systemd Unit Files
|
||||||
|
|
||||||
|
Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.
|
||||||
|
|
||||||
|
`/etc/systemd/system/llama-swap.service`
|
||||||
|
|
||||||
|
```
|
||||||
|
[Unit]
|
||||||
|
Description=llama-swap
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=nobody
|
||||||
|
|
||||||
|
# set this to match your environment
|
||||||
|
ExecStart=/path/to/llama-swap --config /path/to/llama-swap.config.yml
|
||||||
|
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=3
|
||||||
|
StartLimitBurst=3
|
||||||
|
StartLimitInterval=30
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
[](https://www.star-history.com/#mostlygeek/llama-swap&Date)
|
[](https://www.star-history.com/#mostlygeek/llama-swap&Date)
|
||||||
|
|||||||
@@ -223,13 +223,13 @@ runloop:
|
|||||||
if countSigInt > 1 {
|
if countSigInt > 1 {
|
||||||
break runloop
|
break runloop
|
||||||
} else {
|
} else {
|
||||||
log.Println("Received SIGINT, send another SIGINT to shutdown")
|
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
|
||||||
}
|
}
|
||||||
case syscall.SIGTERM:
|
case syscall.SIGTERM:
|
||||||
if *ignoreSigTerm {
|
if *ignoreSigTerm {
|
||||||
log.Println("Ignoring SIGTERM")
|
log.Println("Ignoring SIGTERM")
|
||||||
} else {
|
} else {
|
||||||
log.Println("Received SIGTERM, shutting down")
|
log.Println("Recieved SIGTERM, shutting down")
|
||||||
break runloop
|
break runloop
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|||||||
+8
-73
@@ -4,7 +4,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -68,9 +67,6 @@ type Config struct {
|
|||||||
Profiles map[string][]string `yaml:"profiles"`
|
Profiles map[string][]string `yaml:"profiles"`
|
||||||
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
|
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
|
||||||
|
|
||||||
// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
|
|
||||||
Macros map[string]string `yaml:"macros"`
|
|
||||||
|
|
||||||
// map aliases to actual model IDs
|
// map aliases to actual model IDs
|
||||||
aliases map[string]string
|
aliases map[string]string
|
||||||
|
|
||||||
@@ -117,12 +113,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
return Config{}, err
|
return Config{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.HealthCheckTimeout == 0 {
|
if config.HealthCheckTimeout < 15 {
|
||||||
// this high default timeout helps avoid failing health checks
|
|
||||||
// for configurations that wait for docker or have slower startup
|
|
||||||
config.HealthCheckTimeout = 120
|
|
||||||
} else if config.HealthCheckTimeout < 15 {
|
|
||||||
// set a minimum of 15 seconds
|
|
||||||
config.HealthCheckTimeout = 15
|
config.HealthCheckTimeout = 15
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,30 +136,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check macro constraint rules:
|
|
||||||
|
|
||||||
- name must fit the regex ^[a-zA-Z0-9_-]+$
|
|
||||||
- names must be less than 64 characters (no reason, just cause)
|
|
||||||
- name can not be any reserved macros: PORT
|
|
||||||
- macro values must be less than 1024 characters
|
|
||||||
*/
|
|
||||||
macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
|
|
||||||
for macroName, macroValue := range config.Macros {
|
|
||||||
if len(macroName) >= 64 {
|
|
||||||
return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
|
|
||||||
}
|
|
||||||
if !macroNameRegex.MatchString(macroName) {
|
|
||||||
return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
|
|
||||||
}
|
|
||||||
if len(macroValue) >= 1024 {
|
|
||||||
return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1024 characters", macroName)
|
|
||||||
}
|
|
||||||
switch macroName {
|
|
||||||
case "PORT":
|
|
||||||
return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get and sort all model IDs first, makes testing more consistent
|
// Get and sort all model IDs first, makes testing more consistent
|
||||||
modelIds := make([]string, 0, len(config.Models))
|
modelIds := make([]string, 0, len(config.Models))
|
||||||
for modelId := range config.Models {
|
for modelId := range config.Models {
|
||||||
@@ -179,51 +146,19 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
nextPort := config.StartPort
|
nextPort := config.StartPort
|
||||||
for _, modelId := range modelIds {
|
for _, modelId := range modelIds {
|
||||||
modelConfig := config.Models[modelId]
|
modelConfig := config.Models[modelId]
|
||||||
|
// iterate over the models and replace any ${PORT} with the next available port
|
||||||
// go through model config fields: cmd, cmdStop, proxy, checkEndPoint and replace macros with macro values
|
if strings.Contains(modelConfig.Cmd, "${PORT}") {
|
||||||
for macroName, macroValue := range config.Macros {
|
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
|
||||||
macroSlug := fmt.Sprintf("${%s}", macroName)
|
|
||||||
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
|
|
||||||
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
|
|
||||||
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
|
|
||||||
modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
|
|
||||||
}
|
|
||||||
|
|
||||||
// only iterate over models that use ${PORT} to keep port numbers from increasing unnecessarily
|
|
||||||
if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
|
|
||||||
if modelConfig.Proxy == "" {
|
if modelConfig.Proxy == "" {
|
||||||
modelConfig.Proxy = "http://localhost:${PORT}"
|
modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
|
||||||
|
} else {
|
||||||
|
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
|
||||||
}
|
}
|
||||||
|
|
||||||
nextPortStr := strconv.Itoa(nextPort)
|
|
||||||
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
|
|
||||||
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
|
|
||||||
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
|
|
||||||
nextPort++
|
nextPort++
|
||||||
|
config.Models[modelId] = modelConfig
|
||||||
} else if modelConfig.Proxy == "" {
|
} else if modelConfig.Proxy == "" {
|
||||||
return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
|
return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
|
||||||
}
|
}
|
||||||
|
|
||||||
// make sure there are no unknown macros that have not been replaced
|
|
||||||
macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
|
|
||||||
fieldMap := map[string]string{
|
|
||||||
"cmd": modelConfig.Cmd,
|
|
||||||
"cmdStop": modelConfig.CmdStop,
|
|
||||||
"proxy": modelConfig.Proxy,
|
|
||||||
"checkEndpoint": modelConfig.CheckEndpoint,
|
|
||||||
}
|
|
||||||
|
|
||||||
for fieldName, fieldValue := range fieldMap {
|
|
||||||
matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
|
|
||||||
for _, match := range matches {
|
|
||||||
macroName := match[1]
|
|
||||||
if _, exists := config.Macros[macroName]; !exists {
|
|
||||||
return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
config.Models[modelId] = modelConfig
|
|
||||||
}
|
}
|
||||||
|
|
||||||
config = AddDefaultGroupToConfig(config)
|
config = AddDefaultGroupToConfig(config)
|
||||||
|
|||||||
+2
-110
@@ -19,8 +19,6 @@ func TestConfig_Load(t *testing.T) {
|
|||||||
|
|
||||||
tempFile := filepath.Join(tempDir, "config.yaml")
|
tempFile := filepath.Join(tempDir, "config.yaml")
|
||||||
content := `
|
content := `
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
models:
|
models:
|
||||||
model1:
|
model1:
|
||||||
cmd: path/to/cmd --arg1 one
|
cmd: path/to/cmd --arg1 one
|
||||||
@@ -33,7 +31,7 @@ models:
|
|||||||
- "VAR2=value2"
|
- "VAR2=value2"
|
||||||
checkEndpoint: "/health"
|
checkEndpoint: "/health"
|
||||||
model2:
|
model2:
|
||||||
cmd: ${svr-path} --arg1 one
|
cmd: path/to/cmd --arg1 one
|
||||||
proxy: "http://localhost:8081"
|
proxy: "http://localhost:8081"
|
||||||
aliases:
|
aliases:
|
||||||
- "m2"
|
- "m2"
|
||||||
@@ -78,9 +76,6 @@ groups:
|
|||||||
|
|
||||||
expected := Config{
|
expected := Config{
|
||||||
StartPort: 5800,
|
StartPort: 5800,
|
||||||
Macros: map[string]string{
|
|
||||||
"svr-path": "path/to/server",
|
|
||||||
},
|
|
||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"model1": {
|
"model1": {
|
||||||
Cmd: "path/to/cmd --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
@@ -90,7 +85,7 @@ groups:
|
|||||||
CheckEndpoint: "/health",
|
CheckEndpoint: "/health",
|
||||||
},
|
},
|
||||||
"model2": {
|
"model2": {
|
||||||
Cmd: "path/to/server --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
Proxy: "http://localhost:8081",
|
Proxy: "http://localhost:8081",
|
||||||
Aliases: []string{"m2"},
|
Aliases: []string{"m2"},
|
||||||
Env: nil,
|
Env: nil,
|
||||||
@@ -336,106 +331,3 @@ models:
|
|||||||
assert.Equal(t, "model model1 requires a proxy value when not using automatic ${PORT}", err.Error())
|
assert.Equal(t, "model model1 requires a proxy value when not using automatic ${PORT}", err.Error())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestConfig_MacroReplacement(t *testing.T) {
|
|
||||||
content := `
|
|
||||||
startPort: 9990
|
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
argOne: "--arg1"
|
|
||||||
argTwo: "--arg2"
|
|
||||||
autoPort: "--port ${PORT}"
|
|
||||||
|
|
||||||
models:
|
|
||||||
model1:
|
|
||||||
cmd: |
|
|
||||||
${svr-path} ${argTwo}
|
|
||||||
# the automatic ${PORT} is replaced
|
|
||||||
${autoPort}
|
|
||||||
${argOne}
|
|
||||||
--arg3 three
|
|
||||||
cmdStop: |
|
|
||||||
/path/to/stop.sh --port ${PORT} ${argTwo}
|
|
||||||
`
|
|
||||||
|
|
||||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
|
||||||
assert.NoError(t, err)
|
|
||||||
sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
|
|
||||||
assert.NoError(t, err)
|
|
||||||
assert.Equal(t, "path/to/server --arg2 --port 9990 --arg1 --arg3 three", strings.Join(sanitizedCmd, " "))
|
|
||||||
|
|
||||||
sanitizedCmdStop, err := SanitizeCommand(config.Models["model1"].CmdStop)
|
|
||||||
assert.NoError(t, err)
|
|
||||||
assert.Equal(t, "/path/to/stop.sh --port 9990 --arg2", strings.Join(sanitizedCmdStop, " "))
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestConfig_MacroErrorOnUnknownMacros(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
field string
|
|
||||||
content string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "unknown macro in cmd",
|
|
||||||
field: "cmd",
|
|
||||||
content: `
|
|
||||||
startPort: 9990
|
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
models:
|
|
||||||
model1:
|
|
||||||
cmd: |
|
|
||||||
${svr-path} --port ${PORT}
|
|
||||||
${unknownMacro}
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "unknown macro in cmdStop",
|
|
||||||
field: "cmdStop",
|
|
||||||
content: `
|
|
||||||
startPort: 9990
|
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
models:
|
|
||||||
model1:
|
|
||||||
cmd: "${svr-path} --port ${PORT}"
|
|
||||||
cmdStop: "kill ${unknownMacro}"
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "unknown macro in proxy",
|
|
||||||
field: "proxy",
|
|
||||||
content: `
|
|
||||||
startPort: 9990
|
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
models:
|
|
||||||
model1:
|
|
||||||
cmd: "${svr-path} --port ${PORT}"
|
|
||||||
proxy: "http://localhost:${unknownMacro}"
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "unknown macro in checkEndpoint",
|
|
||||||
field: "checkEndpoint",
|
|
||||||
content: `
|
|
||||||
startPort: 9990
|
|
||||||
macros:
|
|
||||||
svr-path: "path/to/server"
|
|
||||||
models:
|
|
||||||
model1:
|
|
||||||
cmd: "${svr-path} --port ${PORT}"
|
|
||||||
checkEndpoint: "http://localhost:${unknownMacro}/health"
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
_, err := LoadConfigFromReader(strings.NewReader(tt.content))
|
|
||||||
assert.Error(t, err)
|
|
||||||
assert.Contains(t, err.Error(), "unknown macro '${unknownMacro}' found in model1."+tt.field)
|
|
||||||
//t.Log(err)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
+1
-1
@@ -100,7 +100,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
|||||||
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
||||||
|
|
||||||
// stop timeout
|
// stop timeout
|
||||||
gracefulStopTimeout: 10 * time.Second,
|
gracefulStopTimeout: 5 * time.Second,
|
||||||
upstreamWasStoppedWithKill: false,
|
upstreamWasStoppedWithKill: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -319,7 +319,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
|||||||
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
||||||
var html strings.Builder
|
var html strings.Builder
|
||||||
|
|
||||||
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><a href=\"/unload\">Unload all models</a><ul>")
|
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
|
||||||
|
|
||||||
// Extract keys and sort them
|
// Extract keys and sort them
|
||||||
var modelIDs []string
|
var modelIDs []string
|
||||||
|
|||||||
+19
-43
@@ -4,8 +4,6 @@
|
|||||||
|
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
LLAMA_SWAP_DEFAULT_ADDRESS=${LLAMA_SWAP_DEFAULT_ADDRESS:-"127.0.0.1:8080"}
|
|
||||||
|
|
||||||
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
|
||||||
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
|
||||||
|
|
||||||
@@ -13,16 +11,16 @@ status() { echo ">>> $*" >&2; }
|
|||||||
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
|
error() { echo "${red}ERROR:${plain} $*"; exit 1; }
|
||||||
warning() { echo "${red}WARNING:${plain} $*"; }
|
warning() { echo "${red}WARNING:${plain} $*"; }
|
||||||
|
|
||||||
available() { command -v "$1" >/dev/null; }
|
available() { command -v $1 >/dev/null; }
|
||||||
require() {
|
require() {
|
||||||
_MISSING=''
|
local MISSING=''
|
||||||
for TOOL in "$@"; do
|
for TOOL in $*; do
|
||||||
if ! available "$TOOL"; then
|
if ! available $TOOL; then
|
||||||
_MISSING="$_MISSING $TOOL"
|
MISSING="$MISSING $TOOL"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "$_MISSING"
|
echo $MISSING
|
||||||
}
|
}
|
||||||
|
|
||||||
SUDO=
|
SUDO=
|
||||||
@@ -34,7 +32,7 @@ if [ "$(id -u)" -ne 0 ]; then
|
|||||||
SUDO="sudo"
|
SUDO="sudo"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
NEEDS=$(require tee tar python3 mktemp)
|
NEEDS=$(require curl tee jq tar)
|
||||||
if [ -n "$NEEDS" ]; then
|
if [ -n "$NEEDS" ]; then
|
||||||
status "ERROR: The following tools are required but missing:"
|
status "ERROR: The following tools are required but missing:"
|
||||||
for NEED in $NEEDS; do
|
for NEED in $NEEDS; do
|
||||||
@@ -64,40 +62,18 @@ esac
|
|||||||
download_binary() {
|
download_binary() {
|
||||||
ASSET_NAME="linux_$ARCH"
|
ASSET_NAME="linux_$ARCH"
|
||||||
|
|
||||||
TMPDIR=$(mktemp -d)
|
# Fetch the latest release info and extract the matching asset URL
|
||||||
trap 'rm -rf "${TMPDIR}"' EXIT INT TERM HUP
|
DL_URL=$(curl -s "https://api.github.com/repos/mostlygeek/llama-swap/releases/latest" | \
|
||||||
PYTHON_SCRIPT=$(cat <<EOF
|
jq -r --arg name "$ASSET_NAME" \
|
||||||
import os
|
'.assets[] | select(.name | contains($name)) | .browser_download_url')
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
ASSET_NAME = "${ASSET_NAME}"
|
# Check if a URL was successfully extracted
|
||||||
|
if [ -z "$DL_URL" ]; then
|
||||||
with urllib.request.urlopen("https://api.github.com/repos/mostlygeek/llama-swap/releases/latest") as resp:
|
error "No matching asset found with name containing '$ASSET_NAME'."
|
||||||
data = json.load(resp)
|
|
||||||
for asset in data.get("assets", []):
|
|
||||||
if ASSET_NAME in asset.get("name", ""):
|
|
||||||
url = asset["browser_download_url"]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
print("ERROR: Matching asset not found.", file=sys.stderr)
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
print("Downloading:", url, file=sys.stderr)
|
|
||||||
output_path = os.path.join("${TMPDIR}", "llama-swap.tar.gz")
|
|
||||||
urllib.request.urlretrieve(url, output_path)
|
|
||||||
print(output_path)
|
|
||||||
EOF
|
|
||||||
)
|
|
||||||
|
|
||||||
TARFILE=$(python3 -c "$PYTHON_SCRIPT")
|
|
||||||
if [ ! -f "$TARFILE" ]; then
|
|
||||||
error "Failed to download binary."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
status "Extracting to /usr/local/bin"
|
status "Downloading Linux $ARCH binary"
|
||||||
$SUDO tar -xzf "$TARFILE" -C /usr/local/bin llama-swap
|
curl -s -L "$DL_URL" | $SUDO tar -xzf - -C /usr/local/bin llama-swap
|
||||||
}
|
}
|
||||||
download_binary
|
download_binary
|
||||||
|
|
||||||
@@ -120,7 +96,7 @@ configure_systemd() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
status "Adding current user to llama-swap group..."
|
status "Adding current user to llama-swap group..."
|
||||||
$SUDO usermod -a -G llama-swap "$(whoami)"
|
$SUDO usermod -a -G llama-swap $(whoami)
|
||||||
|
|
||||||
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
|
if [ ! -f "/usr/share/llama-swap/config.yaml" ]; then
|
||||||
status "Creating default config.yaml..."
|
status "Creating default config.yaml..."
|
||||||
@@ -162,7 +138,7 @@ User=llama-swap
|
|||||||
Group=llama-swap
|
Group=llama-swap
|
||||||
|
|
||||||
# set this to match your environment
|
# set this to match your environment
|
||||||
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config -listen ${LLAMA_SWAP_DEFAULT_ADDRESS}
|
ExecStart=/usr/local/bin/llama-swap --config /usr/share/llama-swap/config.yaml --watch-config
|
||||||
|
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=3
|
RestartSec=3
|
||||||
@@ -196,7 +172,7 @@ if available systemctl; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
install_success() {
|
install_success() {
|
||||||
status "The llama-swap API is now available at http://${LLAMA_SWAP_DEFAULT_ADDRESS}"
|
status 'The llama-swap API is now available at 127.0.0.1:8080.'
|
||||||
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
|
status 'Customize the config file at /usr/share/llama-swap/config.yaml.'
|
||||||
status 'Install complete.'
|
status 'Install complete.'
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user