Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6cf0962807 | |||
| 8eb5b7b6c4 | |||
| 5a57688aa8 | |||
| b79b7ef3d9 | |||
| 476086c066 | |||
| 4fae7cf946 |
@@ -1,3 +1,5 @@
|
|||||||
.aider*
|
.aider*
|
||||||
.env
|
.env
|
||||||
build/
|
build/
|
||||||
|
dist/
|
||||||
|
.vscode
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
builds:
|
||||||
|
- env:
|
||||||
|
- CGO_ENABLED=0
|
||||||
|
goos:
|
||||||
|
- linux
|
||||||
|
- darwin
|
||||||
|
goarch:
|
||||||
|
- amd64
|
||||||
|
- arm64
|
||||||
@@ -30,6 +30,13 @@ models:
|
|||||||
- "gpt-4o-mini"
|
- "gpt-4o-mini"
|
||||||
- "gpt-3.5-turbo"
|
- "gpt-3.5-turbo"
|
||||||
|
|
||||||
|
# wait for this path to return an HTTP 200 before serving requests
|
||||||
|
# defaults to /health to match llama.cpp
|
||||||
|
#
|
||||||
|
# use "none" to skip endpoint checking. This may cause requests to fail
|
||||||
|
# until the server is ready
|
||||||
|
checkEndpoint: "/custom-endpoint"
|
||||||
|
|
||||||
"qwen":
|
"qwen":
|
||||||
# environment variables to pass to the command
|
# environment variables to pass to the command
|
||||||
env:
|
env:
|
||||||
@@ -38,7 +45,7 @@ models:
|
|||||||
proxy: "http://127.0.0.1:8999"
|
proxy: "http://127.0.0.1:8999"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Deployment
|
## Installation
|
||||||
|
|
||||||
1. Create a configuration file, see [config.example.yaml](config.example.yaml)
|
1. Create a configuration file, see [config.example.yaml](config.example.yaml)
|
||||||
1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
|
1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
|
||||||
@@ -47,7 +54,7 @@ models:
|
|||||||
|
|
||||||
## Systemd Unit Files
|
## Systemd Unit Files
|
||||||
|
|
||||||
Use this unit file to start llama-swap on boot
|
Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.
|
||||||
|
|
||||||
`/etc/systemd/system/llama-swap.service`
|
`/etc/systemd/system/llama-swap.service`
|
||||||
```
|
```
|
||||||
@@ -69,3 +76,9 @@ StartLimitInterval=30
|
|||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Building from Source
|
||||||
|
|
||||||
|
1. Install Go for your system
|
||||||
|
1. Run `make clean all`
|
||||||
|
1. Binaries will be built into the `build/` directory
|
||||||
|
|||||||
@@ -10,6 +10,10 @@ models:
|
|||||||
# list of model name aliases this llama.cpp instance can serve
|
# list of model name aliases this llama.cpp instance can serve
|
||||||
aliases:
|
aliases:
|
||||||
- "gpt-4o-mini"
|
- "gpt-4o-mini"
|
||||||
|
|
||||||
|
# the server is considered ready once this path returns an HTTP 200 response
|
||||||
|
checkEndpoint: "/health"
|
||||||
|
|
||||||
"qwen":
|
"qwen":
|
||||||
cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
|
cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
|
||||||
proxy: "http://127.0.0.1:8999"
|
proxy: "http://127.0.0.1:8999"
|
||||||
@@ -24,6 +28,10 @@ models:
|
|||||||
cmd: "build/simple-responder --port 8999"
|
cmd: "build/simple-responder --port 8999"
|
||||||
proxy: "http://127.0.0.1:8999"
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
|
||||||
|
# use "none" to skip the check. Caution: this may cause some requests to fail
|
||||||
|
# until the upstream server is ready for traffic
|
||||||
|
checkEndpoint: "none"
|
||||||
|
|
||||||
# don't use this, just for testing if things are broken
|
# don't use this, just for testing if things are broken
|
||||||
"broken":
|
"broken":
|
||||||
cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
|
cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
|
||||||
|
|||||||
@@ -2,4 +2,4 @@ module github.com/mostlygeek/llama-swap
|
|||||||
|
|
||||||
go 1.23.0
|
go 1.23.0
|
||||||
|
|
||||||
require gopkg.in/yaml.v3 v3.0.1 // indirect
|
require gopkg.in/yaml.v3 v3.0.1
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
+1
-1
@@ -25,7 +25,7 @@ func main() {
|
|||||||
proxyManager := proxy.New(config)
|
proxyManager := proxy.New(config)
|
||||||
http.HandleFunc("/", proxyManager.HandleFunc)
|
http.HandleFunc("/", proxyManager.HandleFunc)
|
||||||
|
|
||||||
fmt.Println("llamagate listening on " + *listenStr)
|
fmt.Println("llama-swap listening on " + *listenStr)
|
||||||
if err := http.ListenAndServe(*listenStr, nil); err != nil {
|
if err := http.ListenAndServe(*listenStr, nil); err != nil {
|
||||||
fmt.Printf("Error starting server: %v\n", err)
|
fmt.Printf("Error starting server: %v\n", err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
|
|||||||
+5
-4
@@ -7,10 +7,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type ModelConfig struct {
|
type ModelConfig struct {
|
||||||
Cmd string `yaml:"cmd"`
|
Cmd string `yaml:"cmd"`
|
||||||
Proxy string `yaml:"proxy"`
|
Proxy string `yaml:"proxy"`
|
||||||
Aliases []string `yaml:"aliases"`
|
Aliases []string `yaml:"aliases"`
|
||||||
Env []string `yaml:"env"`
|
Env []string `yaml:"env"`
|
||||||
|
CheckEndpoint string `yaml:"checkEndpoint"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
|
|||||||
+22
-6
@@ -7,6 +7,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -57,6 +58,9 @@ func (pm *ProxyManager) swapModel(requestedModel string) error {
|
|||||||
// kill the current running one to swap it
|
// kill the current running one to swap it
|
||||||
if pm.currentCmd != nil {
|
if pm.currentCmd != nil {
|
||||||
pm.currentCmd.Process.Signal(syscall.SIGTERM)
|
pm.currentCmd.Process.Signal(syscall.SIGTERM)
|
||||||
|
|
||||||
|
// wait for it to end
|
||||||
|
pm.currentCmd.Process.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
pm.currentConfig = modelConfig
|
pm.currentConfig = modelConfig
|
||||||
@@ -86,11 +90,23 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
|
|||||||
return fmt.Errorf("no upstream available to check /health")
|
return fmt.Errorf("no upstream available to check /health")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
checkEndpoint := strings.TrimSpace(pm.currentConfig.CheckEndpoint)
|
||||||
|
|
||||||
|
if checkEndpoint == "none" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep default behaviour
|
||||||
|
if checkEndpoint == "" {
|
||||||
|
checkEndpoint = "/health"
|
||||||
|
}
|
||||||
|
|
||||||
proxyTo := pm.currentConfig.Proxy
|
proxyTo := pm.currentConfig.Proxy
|
||||||
|
|
||||||
maxDuration := time.Second * time.Duration(pm.config.HealthCheckTimeout)
|
maxDuration := time.Second * time.Duration(pm.config.HealthCheckTimeout)
|
||||||
|
healthURL, err := url.JoinPath(proxyTo, checkEndpoint)
|
||||||
healthURL := proxyTo + "/health"
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create health url with with %s and path %s", proxyTo, checkEndpoint)
|
||||||
|
}
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
@@ -109,12 +125,12 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
|
|||||||
// if TCP dial can't connect any HTTP response after 5 seconds
|
// if TCP dial can't connect any HTTP response after 5 seconds
|
||||||
// exit quickly.
|
// exit quickly.
|
||||||
if time.Since(startTime) > 5*time.Second {
|
if time.Since(startTime) > 5*time.Second {
|
||||||
return fmt.Errorf("/healthy endpoint took more than 5 seconds to respond")
|
return fmt.Errorf("health check endpoint took more than 5 seconds to respond")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if time.Since(startTime) >= maxDuration {
|
if time.Since(startTime) >= maxDuration {
|
||||||
return fmt.Errorf("failed to check /healthy from: %s", healthURL)
|
return fmt.Errorf("failed to check health from: %s", healthURL)
|
||||||
}
|
}
|
||||||
time.Sleep(time.Second)
|
time.Sleep(time.Second)
|
||||||
continue
|
continue
|
||||||
@@ -124,7 +140,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if time.Since(startTime) >= maxDuration {
|
if time.Since(startTime) >= maxDuration {
|
||||||
return fmt.Errorf("failed to check /healthy from: %s", healthURL)
|
return fmt.Errorf("failed to check health from: %s", healthURL)
|
||||||
}
|
}
|
||||||
time.Sleep(time.Second)
|
time.Sleep(time.Second)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user