Compare commits

..

6 Commits

Author SHA1 Message Date
Benson Wong 6cf0962807 Add custom check endpoint
Replace previously hardcoded value for /health to check when the server became ready to serve traffic. With this the server can support any server that provides an an OpenAI compatible inference endpoint.
2024-10-11 22:02:14 -07:00
Benson Wong 8eb5b7b6c4 Add custom check endpoint
Replace previously hardcoded value for `/health` to check when the
server became ready to serve traffic. With this the server can support
any server that provides an an OpenAI compatible inference endpoint.
2024-10-11 21:59:21 -07:00
Benson Wong 5a57688aa8 add .vscode to .gitignore 2024-10-05 19:37:00 -07:00
Benson Wong b79b7ef3d9 add goreleaser config to limit GOOS and GOARCH builds 2024-10-04 21:46:55 -07:00
Benson Wong 476086c066 Add Cmd.Wait() to prevent creation of zombie child processes see: #1 2024-10-04 21:38:29 -07:00
Benson Wong 4fae7cf946 update docs 2024-10-04 21:11:08 -07:00
9 changed files with 68 additions and 16 deletions
+3 -1
View File
@@ -1,3 +1,5 @@
.aider*
.env
build/
build/
dist/
.vscode
+11
View File
@@ -0,0 +1,11 @@
version: 2
builds:
- env:
- CGO_ENABLED=0
goos:
- linux
- darwin
goarch:
- amd64
- arm64
+16 -3
View File
@@ -30,6 +30,13 @@ models:
- "gpt-4o-mini"
- "gpt-3.5-turbo"
# wait for this path to return an HTTP 200 before serving requests
# defaults to /health to match llama.cpp
#
# use "none" to skip endpoint checking. This may cause requests to fail
# until the server is ready
checkEndpoint: "/custom-endpoint"
"qwen":
# environment variables to pass to the command
env:
@@ -38,7 +45,7 @@ models:
proxy: "http://127.0.0.1:8999"
```
## Deployment
## Installation
1. Create a configuration file, see [config.example.yaml](config.example.yaml)
1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
@@ -47,7 +54,7 @@ models:
## Systemd Unit Files
Use this unit file to start llama-swap on boot
Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.
`/etc/systemd/system/llama-swap.service`
```
@@ -68,4 +75,10 @@ StartLimitInterval=30
[Install]
WantedBy=multi-user.target
```
```
## Building from Source
1. Install golang for your system
1. run `make clean all`
1. binaries will be built into `build/` directory
+8
View File
@@ -10,6 +10,10 @@ models:
# list of model name aliases this llama.cpp instance can serve
aliases:
- "gpt-4o-mini"
# check this path for a HTTP 200 response for the server to be ready
checkEndpoint: "/health"
"qwen":
cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
proxy: "http://127.0.0.1:8999"
@@ -24,6 +28,10 @@ models:
cmd: "build/simple-responder --port 8999"
proxy: "http://127.0.0.1:8999"
# use "none" to skip check. Caution this may cause some requests to fail
# until the upstream server is ready for traffic
checkEndpoint: "none"
# don't use this, just for testing if things are broken
"broken":
cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
+1 -1
View File
@@ -2,4 +2,4 @@ module github.com/mostlygeek/llama-swap
go 1.23.0
require gopkg.in/yaml.v3 v3.0.1 // indirect
require gopkg.in/yaml.v3 v3.0.1
+1
View File
@@ -1,3 +1,4 @@
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+1 -1
View File
@@ -25,7 +25,7 @@ func main() {
proxyManager := proxy.New(config)
http.HandleFunc("/", proxyManager.HandleFunc)
fmt.Println("llamagate listening on " + *listenStr)
fmt.Println("llama-swap listening on " + *listenStr)
if err := http.ListenAndServe(*listenStr, nil); err != nil {
fmt.Printf("Error starting server: %v\n", err)
os.Exit(1)
+5 -4
View File
@@ -7,10 +7,11 @@ import (
)
type ModelConfig struct {
Cmd string `yaml:"cmd"`
Proxy string `yaml:"proxy"`
Aliases []string `yaml:"aliases"`
Env []string `yaml:"env"`
Cmd string `yaml:"cmd"`
Proxy string `yaml:"proxy"`
Aliases []string `yaml:"aliases"`
Env []string `yaml:"env"`
CheckEndpoint string `yaml:"checkEndpoint"`
}
type Config struct {
+22 -6
View File
@@ -7,6 +7,7 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"os"
"os/exec"
"strings"
@@ -57,6 +58,9 @@ func (pm *ProxyManager) swapModel(requestedModel string) error {
// kill the current running one to swap it
if pm.currentCmd != nil {
pm.currentCmd.Process.Signal(syscall.SIGTERM)
// wait for it to end
pm.currentCmd.Process.Wait()
}
pm.currentConfig = modelConfig
@@ -86,11 +90,23 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
return fmt.Errorf("no upstream available to check /health")
}
checkEndpoint := strings.TrimSpace(pm.currentConfig.CheckEndpoint)
if checkEndpoint == "none" {
return nil
}
// keep default behaviour
if checkEndpoint == "" {
checkEndpoint = "/health"
}
proxyTo := pm.currentConfig.Proxy
maxDuration := time.Second * time.Duration(pm.config.HealthCheckTimeout)
healthURL := proxyTo + "/health"
healthURL, err := url.JoinPath(proxyTo, checkEndpoint)
if err != nil {
return fmt.Errorf("failed to create health url with with %s and path %s", proxyTo, checkEndpoint)
}
client := &http.Client{}
startTime := time.Now()
@@ -109,12 +125,12 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
// if TCP dial can't connect any HTTP response after 5 seconds
// exit quickly.
if time.Since(startTime) > 5*time.Second {
return fmt.Errorf("/healthy endpoint took more than 5 seconds to respond")
return fmt.Errorf("health check endpoint took more than 5 seconds to respond")
}
}
if time.Since(startTime) >= maxDuration {
return fmt.Errorf("failed to check /healthy from: %s", healthURL)
return fmt.Errorf("failed to check health from: %s", healthURL)
}
time.Sleep(time.Second)
continue
@@ -124,7 +140,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
return nil
}
if time.Since(startTime) >= maxDuration {
return fmt.Errorf("failed to check /healthy from: %s", healthURL)
return fmt.Errorf("failed to check health from: %s", healthURL)
}
time.Sleep(time.Second)
}