Add custom check endpoint

Replace previously hardcoded value for /health to check when the server became ready to serve traffic. With this, the server can support any server that provides an OpenAI-compatible inference endpoint.
2024-10-11 22:02:14 -07:00 · 2024-10-11 21:59:21 -07:00 · 2024-10-05 19:37:00 -07:00 · 2024-10-04 21:46:55 -07:00 · 2024-10-04 21:38:29 -07:00 · 2024-10-04 21:11:08 -07:00
9 changed files with 68 additions and 16 deletions
@@ -1,3 +1,5 @@
 .aider*
 .env
-build/
+build/
+dist/
+.vscode
@@ -0,0 +1,11 @@
+version: 2
+
+builds:
+  - env:
+      - CGO_ENABLED=0
+    goos:
+      - linux
+      - darwin
+    goarch:
+      - amd64
+      - arm64
@@ -30,6 +30,13 @@ models:
    - "gpt-4o-mini"
    - "gpt-3.5-turbo"

+    # wait for this path to return an HTTP 200 before serving requests
+    # defaults to /health to match llama.cpp
+    #
+    # use "none" to skip endpoint checking. This may cause requests to fail
+    # until the server is ready
+    checkEndpoint: "/custom-endpoint"
+
  "qwen":
    # environment variables to pass to the command
    env:
@@ -38,7 +45,7 @@ models:
    proxy: "http://127.0.0.1:8999"
 ```

-## Deployment
+## Installation

 1. Create a configuration file, see [config.example.yaml](config.example.yaml)
 1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
@@ -47,7 +54,7 @@ models:

 ## Systemd Unit Files

-Use this unit file to start llama-swap on boot
+Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.

 `/etc/systemd/system/llama-swap.service`
 ```
@@ -68,4 +75,10 @@ StartLimitInterval=30

 [Install]
 WantedBy=multi-user.target
-```
+```
+
+## Building from Source
+
+1. Install Go for your system
+1. Run `make clean all`
+1. Binaries will be built into the `build/` directory
@@ -10,6 +10,10 @@ models:
    # list of model name aliases this llama.cpp instance can serve
    aliases:
    - "gpt-4o-mini"
+
+    # check this path for an HTTP 200 response for the server to be ready
+    checkEndpoint: "/health"
+
  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
@@ -24,6 +28,10 @@ models:
    cmd: "build/simple-responder --port 8999"
    proxy: "http://127.0.0.1:8999"

+    # use "none" to skip the check. Caution: this may cause some requests to fail
+    # until the upstream server is ready for traffic
+    checkEndpoint: "none"
+
  # don't use this, just for testing if things are broken
  "broken":
    cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
@@ -2,4 +2,4 @@ module github.com/mostlygeek/llama-swap

 go 1.23.0

-require gopkg.in/yaml.v3 v3.0.1 // indirect
+require gopkg.in/yaml.v3 v3.0.1
@@ -1,3 +1,4 @@
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
@@ -25,7 +25,7 @@ func main() {
 	proxyManager := proxy.New(config)
 	http.HandleFunc("/", proxyManager.HandleFunc)

-	fmt.Println("llamagate listening on " + *listenStr)
+	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := http.ListenAndServe(*listenStr, nil); err != nil {
 		fmt.Printf("Error starting server: %v\n", err)
 		os.Exit(1)
@@ -7,10 +7,11 @@ import (
 )

 type ModelConfig struct {
-	Cmd     string   `yaml:"cmd"`
-	Proxy   string   `yaml:"proxy"`
-	Aliases []string `yaml:"aliases"`
-	Env     []string `yaml:"env"`
+	Cmd           string   `yaml:"cmd"`
+	Proxy         string   `yaml:"proxy"`
+	Aliases       []string `yaml:"aliases"`
+	Env           []string `yaml:"env"`
+	CheckEndpoint string   `yaml:"checkEndpoint"`
 }

 type Config struct {
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
 	"os"
 	"os/exec"
 	"strings"
@@ -57,6 +58,9 @@ func (pm *ProxyManager) swapModel(requestedModel string) error {
 	// kill the current running one to swap it
 	if pm.currentCmd != nil {
 		pm.currentCmd.Process.Signal(syscall.SIGTERM)
+
+		// wait for it to end
+		pm.currentCmd.Process.Wait()
 	}

 	pm.currentConfig = modelConfig
@@ -86,11 +90,23 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
 		return fmt.Errorf("no upstream available to check /health")
 	}

+	checkEndpoint := strings.TrimSpace(pm.currentConfig.CheckEndpoint)
+
+	if checkEndpoint == "none" {
+		return nil
+	}
+
+	// keep default behaviour
+	if checkEndpoint == "" {
+		checkEndpoint = "/health"
+	}
+
 	proxyTo := pm.currentConfig.Proxy
-
 	maxDuration := time.Second * time.Duration(pm.config.HealthCheckTimeout)
-
-	healthURL := proxyTo + "/health"
+	healthURL, err := url.JoinPath(proxyTo, checkEndpoint)
+	if err != nil {
+		return fmt.Errorf("failed to create health url with with %s and path %s", proxyTo, checkEndpoint)
+	}
 	client := &http.Client{}
 	startTime := time.Now()

@@ -109,12 +125,12 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
 				// if TCP dial can't connect any HTTP response after 5 seconds
 				// exit quickly.
 				if time.Since(startTime) > 5*time.Second {
-					return fmt.Errorf("/healthy endpoint took more than 5 seconds to respond")
+					return fmt.Errorf("health check endpoint took more than 5 seconds to respond")
 				}
 			}

 			if time.Since(startTime) >= maxDuration {
-				return fmt.Errorf("failed to check /healthy from: %s", healthURL)
+				return fmt.Errorf("failed to check health from: %s", healthURL)
 			}
 			time.Sleep(time.Second)
 			continue
@@ -124,7 +140,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
 			return nil
 		}
 		if time.Since(startTime) >= maxDuration {
-			return fmt.Errorf("failed to check /healthy from: %s", healthURL)
+			return fmt.Errorf("failed to check health from: %s", healthURL)
 		}
 		time.Sleep(time.Second)
 	}
Author	SHA1	Message	Date
Benson Wong	6cf0962807	Add custom check endpoint Replace previously hardcoded value for /health to check when the server became ready to serve traffic. With this the server can support any server that provides an OpenAI compatible inference endpoint.	2024-10-11 22:02:14 -07:00
Benson Wong	8eb5b7b6c4	Add custom check endpoint Replace previously hardcoded value for `/health` to check when the server became ready to serve traffic. With this the server can support any server that provides an OpenAI compatible inference endpoint.	2024-10-11 21:59:21 -07:00
Benson Wong	5a57688aa8	add .vscode to .gitignore	2024-10-05 19:37:00 -07:00
Benson Wong	b79b7ef3d9	add goreleaser config to limit GOOS and GOARCH builds	2024-10-04 21:46:55 -07:00
Benson Wong	476086c066	Add Cmd.Wait() to prevent creation of zombie child processes see: #1	2024-10-04 21:38:29 -07:00
Benson Wong	4fae7cf946	update docs	2024-10-04 21:11:08 -07:00