small tweak to example config

Add ${MODEL_ID} macro (#226 )
The automatic ${MODEL_ID} macro includes the name of the model and can be used in Cmd and CmdStop.
2025-09-01 21:26:58 -07:00 · 2025-09-01 21:21:37 -07:00 · 2025-08-28 23:44:37 -07:00 · 2025-08-28 22:47:28 -07:00 · 2025-08-28 22:03:14 -07:00 · 2025-08-28 21:41:02 -07:00
10 changed files with 153 additions and 26 deletions
@@ -7,7 +7,7 @@
 llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary, use one of the provided docker images, or install with Homebrew.
 ## Features:
@@ -23,6 +23,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
 - ✅ llama-server (llama.cpp) supported endpoints:
  - `v1/rerank`, `v1/reranking`, `/rerank`
  - `/infill` - for code infilling
  - `/completion` - for completion endpoint
 - ✅ llama-swap custom API endpoints
  - `/ui` - web UI
  - `/log` - remote log monitoring
@@ -206,4 +207,7 @@ For Python based inference servers like vllm or tabbyAPI it is recommended to ru
 ## Star History
 > [!NOTE]
 > ⭐️ Star this project to help others discover it! 
 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -49,8 +49,8 @@ macros:
 # - required
 # - each key is the model's ID, used in API requests
 # - model settings have default values that are used if they are not defined here
-# - below are examples of the various settings a model can have:
+# - the model's ID is available in the ${MODEL_ID} macro, which can also be used inside the macros defined above
-# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
+# - below are examples of all the settings a model can have
 models:
  # keys are the model names used in API requests
@@ -148,12 +148,12 @@ models:
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
  # Docker example:
-  # container run times like Docker and Podman can be used reliably with a
+  # container runtimes like Docker and Podman can be used reliably with
-  # a combination of cmd and cmdStop.
+  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
-      docker run --name dockertest
+      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
@@ -167,7 +167,7 @@ models:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
-    cmdStop: docker stop dockertest
+    cmdStop: docker stop ${MODEL_ID}
 # groups: a dictionary of group settings
 # - optional, default: empty dictionary
@@ -153,6 +153,19 @@ func main() {
 	})
 	// llama-server compatibility: /completion
 	r.POST("/completion", func(c *gin.Context) {
 		c.Header("Content-Type", "application/json")
 		c.JSON(http.StatusOK, gin.H{
 			"responseMessage": *responseMessage,
 			"usage": gin.H{
 				"completion_tokens": 10,
 				"prompt_tokens":     25,
 				"total_tokens":      35,
 			},
 		})
 	})
 	// issue #41
 	r.POST("/v1/audio/transcriptions", func(c *gin.Context) {
 		// Parse the multipart form
@@ -237,7 +237,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	- name must fit the regex ^[a-zA-Z0-9_-]+$
 	- names must be less than 64 characters (no reason, just cause)
-	- name can not be any reserved macros: PORT
+	- name can not be any reserved macros: PORT, MODEL_ID
 	- macro values must be less than 1024 characters
 	*/
 	macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
@@ -253,6 +253,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 		// Reject user-defined macros that collide with the automatic ones.
 		// Go switch cases never fall through implicitly, so an empty
 		// `case "PORT":` followed by `case "MODEL_ID":` would silently accept
 		// PORT; both reserved names must be listed in the same case.
 		switch macroName {
 		case "PORT", "MODEL_ID":
 			return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
 		}
 	}
@@ -296,6 +297,11 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			nextPort++
 		}
 		if strings.Contains(modelConfig.Cmd, "${MODEL_ID}") || strings.Contains(modelConfig.CmdStop, "${MODEL_ID}") {
 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${MODEL_ID}", modelId)
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${MODEL_ID}", modelId)
 		}
 		// make sure there are no unknown macros that have not been replaced
 		macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
 		fieldMap := map[string]string{
@@ -440,3 +440,44 @@ models:
 	expectedCmd := "/user/llama.cpp/build/bin/llama-server --port 9990 --model /path/to/model.gguf -ngl 99"
 	assert.Equal(t, expectedCmd, cmdStr, "Final command does not match expected structure")
 }
 // TestConfig_MacroModelId checks that the automatic ${MODEL_ID} macro is
 // replaced with the model's ID in both cmd and cmdStop. It covers three
 // shapes: the macro written directly in cmd (model1), the macro reaching
 // cmd/cmdStop through user-defined macros (model2), and a model ID that
 // contains '/' and ':' characters (author/model:F16).
 func TestConfig_MacroModelId(t *testing.T) {
 	content := `
 startPort: 9000
 macros:
  "docker-llama": docker run --name ${MODEL_ID} -p ${PORT}:8080 docker_img
  "docker-stop": docker stop ${MODEL_ID}
 models:
  model1:
    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
  model2:
    cmd: ${docker-llama}
    cmdStop: ${docker-stop}
  author/model:F16:
    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
    cmdStop: stop
 `
 	config, err := LoadConfigFromReader(strings.NewReader(content))
 	assert.NoError(t, err)
 	// direct use of ${MODEL_ID} in cmd
 	// NOTE(review): the expected ports (9000 for author/model:F16, 9001 for
 	// model1, 9002 for model2) look like ports are handed out in sorted
 	// model-ID order starting at startPort — confirm against LoadConfigFromReader.
 	sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "/path/to/server -p 9001 -hf model1", strings.Join(sanitizedCmd, " "))
 	// the macro definition itself must stay unexpanded; only the per-model
 	// cmd/cmdStop get the model's ID substituted
 	assert.Equal(t, "docker stop ${MODEL_ID}", config.Macros["docker-stop"])
 	// ${MODEL_ID} introduced through the user-defined docker-llama macro
 	sanitizedCmd2, err := SanitizeCommand(config.Models["model2"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "docker run --name model2 -p 9002:8080 docker_img", strings.Join(sanitizedCmd2, " "))
 	// ${MODEL_ID} introduced through the user-defined docker-stop macro
 	sanitizedCmdStop, err := SanitizeCommand(config.Models["model2"].CmdStop)
 	assert.NoError(t, err)
 	assert.Equal(t, "docker stop model2", strings.Join(sanitizedCmdStop, " "))
 	// model IDs containing '/' and ':' are substituted verbatim
 	sanitizedCmd3, err := SanitizeCommand(config.Models["author/model:F16"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "/path/to/server -p 9000 -hf author/model:F16", strings.Join(sanitizedCmd3, " "))
 }
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
 	"net"
 	"net/http"
 	"net/url"
 	"os/exec"
@@ -363,8 +364,18 @@ func (p *Process) stopCommand() {
 }
 func (p *Process) checkHealthEndpoint(healthURL string) error {
 	client := &http.Client{
-		Timeout: 500 * time.Millisecond,
+		// wait a short time for a tcp connection to be established
 		Transport: &http.Transport{
 			DialContext: (&net.Dialer{
 				Timeout: 500 * time.Millisecond,
 			}).DialContext,
 		},
 		// give a long time to respond to the health check endpoint
 		// after the connection is established. See issue: 276
 		Timeout: 5000 * time.Millisecond,
 	}
 	req, err := http.NewRequest("GET", healthURL, nil)
@@ -60,10 +60,20 @@ func (pg *ProcessGroup) ProxyRequest(modelID string, writer http.ResponseWriter,
 	if pg.swap {
 		pg.Lock()
 		if pg.lastUsedProcess != modelID {
 			// is there something already running?
 			if pg.lastUsedProcess != "" {
 				pg.processes[pg.lastUsedProcess].Stop()
 			}
 			// wait for the request to the new model to be fully handled
 			// and prevent race conditions see issue #277
 			pg.processes[modelID].ProxyRequest(writer, request)
 			pg.lastUsedProcess = modelID
 			// short circuit and exit
 			pg.Unlock()
 			return nil
 		}
 		pg.Unlock()
 	}
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"net/http"
 	"net/http/httptest"
 	"sync"
 	"testing"
 	"github.com/stretchr/testify/assert"
@@ -44,32 +45,49 @@ func TestProcessGroup_HasMember(t *testing.T) {
 	assert.False(t, pg.HasMember("model3"))
 }
-func TestProcessGroup_ProxyRequestSwapIsTrue(t *testing.T) {
+// TestProcessGroup_ProxyRequestSwapIsTrueParallel tests that when swap is true
 // and multiple requests are made in parallel, only one process is running at a time.
 func TestProcessGroup_ProxyRequestSwapIsTrueParallel(t *testing.T) {
 	var processGroupTestConfig = AddDefaultGroupToConfig(Config{
 		HealthCheckTimeout: 15,
 		Models: map[string]ModelConfig{
 			// use the same listening port so if a model is already running, it will fail
 			// this is a way to test that swap isolation is working
 			// properly when there are parallel requests made at the
 			// same time.
 			"model1": getTestSimpleResponderConfigPort("model1", 9832),
 			"model2": getTestSimpleResponderConfigPort("model2", 9832),
 			"model3": getTestSimpleResponderConfigPort("model3", 9832),
 			"model4": getTestSimpleResponderConfigPort("model4", 9832),
 			"model5": getTestSimpleResponderConfigPort("model5", 9832),
 		},
 		Groups: map[string]GroupConfig{
 			"G1": {
 				Swap:    true,
 				Members: []string{"model1", "model2", "model3", "model4", "model5"},
 			},
 		},
 	})
 	pg := NewProcessGroup("G1", processGroupTestConfig, testLogger, testLogger)
 	defer pg.StopProcesses(StopWaitForInflightRequest)
-	tests := []string{"model1", "model2"}
+	tests := []string{"model1", "model2", "model3", "model4", "model5"}
 	var wg sync.WaitGroup
 	wg.Add(len(tests))
 	for _, modelName := range tests {
-		t.Run(modelName, func(t *testing.T) {
+		go func(modelName string) {
-			reqBody := `{"x", "y"}`
+			defer wg.Done()
-			req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
+			req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
 			w := httptest.NewRecorder()
 			assert.NoError(t, pg.ProxyRequest(modelName, w, req))
 			assert.Equal(t, http.StatusOK, w.Code)
 			assert.Contains(t, w.Body.String(), modelName)
-
+		}(modelName)
 			// make sure only one process is in the running state
 			count := 0
 			for _, process := range pg.processes {
 				if process.CurrentState() == StateReady {
 					count++
 				}
 			}
 			assert.Equal(t, 1, count)
 		})
 	}
 	wg.Wait()
 }
 func TestProcessGroup_ProxyRequestSwapIsFalse(t *testing.T) {
@@ -203,6 +203,9 @@ func (pm *ProxyManager) setupGinEngine() {
 	// llama-server's /infill endpoint for code infilling
 	pm.ginEngine.POST("/infill", mm, pm.proxyOAIHandler)
 	// llama-server's /completion endpoint
 	pm.ginEngine.POST("/completion", mm, pm.proxyOAIHandler)
 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
 	pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler)
@@ -42,7 +42,6 @@ func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
 		assert.Contains(t, w.Body.String(), modelName)
 	}
 }
 func TestProxyManager_SwapMultiProcess(t *testing.T) {
 	config := AddDefaultGroupToConfig(Config{
 		HealthCheckTimeout: 15,
@@ -834,6 +833,28 @@ func TestProxyManager_HealthEndpoint(t *testing.T) {
 	assert.Equal(t, "OK", rec.Body.String())
 }
 // Ensure the custom llama-server /completion endpoint proxies correctly
 func TestProxyManager_CompletionEndpoint(t *testing.T) {
 	config := AddDefaultGroupToConfig(Config{
 		HealthCheckTimeout: 15,
 		Models: map[string]ModelConfig{
 			"model1": getTestSimpleResponderConfig("model1"),
 		},
 		LogLevel: "error",
 	})
 	proxy := New(config)
 	defer proxy.StopProcesses(StopWaitForInflightRequest)
 	reqBody := `{"model":"model1"}`
 	req := httptest.NewRequest("POST", "/completion", bytes.NewBufferString(reqBody))
 	w := httptest.NewRecorder()
 	proxy.ServeHTTP(w, req)
 	assert.Equal(t, http.StatusOK, w.Code)
 	assert.Contains(t, w.Body.String(), "model1")
 }
 func TestProxyManager_StartupHooks(t *testing.T) {
 	// using real YAML as the configuration has gotten more complex
Author	SHA1	Message	Date
Benson Wong	a533aec736	small tweak to example config	2025-09-01 21:26:58 -07:00
Brett Profitt	97b17fc47d	Add ${MODEL_ID} macro (#226 ) The automatic ${MODEL_ID} macro includes the name of the model and can be used in Cmd and CmdStop.	2025-09-01 21:21:37 -07:00
Benson Wong	2457840698	Update README.md [skip ci]	2025-08-28 23:44:37 -07:00
Benson Wong	7f55494151	Update README.md [skip ci]	2025-08-28 22:47:28 -07:00
Benson Wong	831a90d3b0	Add different timeout scenarios to Process.checkHealthEndpoint #276 (#278 ) - add a TCP connection timeout of 500ms - increase HTTP client timeout to 5000ms In this new behaviour the upstream has 500ms to accept a tcp connection and 5000ms to respond to the HTTP request.	2025-08-28 22:03:14 -07:00
Yandrik	977f1856bb	add /completion endpoint (#275 ) * feat: add /completion endpoint * chore: reformat using gofmt	2025-08-28 21:41:02 -07:00
Benson Wong	52b329f7bc	Fix #277 race condition in ProcessGroup.ProxyRequest when swap=true	2025-08-28 21:38:40 -07:00