Support llama.cpp's cache_n in timings info (#287 )

Capture prompt cache metrics and surface them on Activities page in UI
Remove cmdStart from README [skip ci]
2025-09-06 13:58:02 -07:00 · 2025-09-04 11:57:28 -07:00 · 2025-09-01 21:26:58 -07:00 · 2025-09-01 21:21:37 -07:00 · 2025-08-28 23:44:37 -07:00 · 2025-08-28 22:47:28 -07:00
14 changed files with 226 additions and 38 deletions
@@ -7,7 +7,7 @@

 llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.

-Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary, use one of the provided docker images, or install with Homebrew.

 ## Features:

@@ -23,6 +23,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
 - ✅ llama-server (llama.cpp) supported endpoints:
  - `v1/rerank`, `v1/reranking`, `/rerank`
  - `/infill` - for code infilling
+  - `/completion` - for completion endpoint
 - ✅ llama-swap custom API endpoints
  - `/ui` - web UI
  - `/log` - remote log monitoring
@@ -33,7 +34,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
 - ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
 - ✅ Automatic unloading of models after timeout by setting a `ttl`
 - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
- ✅ Reliable Docker and Podman support with `cmdStart` and `cmdStop`
+- ✅ Reliable Docker and Podman support using `cmd` and `cmdStop` together
 - ✅ Full control over server settings per model
 - ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))

@@ -206,4 +207,7 @@ For Python based inference servers like vllm or tabbyAPI it is recommended to ru

 ## Star History

+> [!NOTE]
+> ⭐️ Star this project to help others discover it! 
+
 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -49,8 +49,8 @@ macros:
 # - required
 # - each key is the model's ID, used in API requests
 # - model settings have default values that are used if they are not defined here
-# - below are examples of the various settings a model can have:
-# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
+# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
+# - below are examples of all the settings a model can have
 models:

  # keys are the model names used in API requests
@@ -148,12 +148,12 @@ models:
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
-  # container run times like Docker and Podman can be used reliably with a
-  # a combination of cmd and cmdStop.
+  # container runtimes like Docker and Podman can be used reliably with
+  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
-      docker run --name dockertest
+      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
@@ -167,7 +167,7 @@ models:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
-    cmdStop: docker stop dockertest
+    cmdStop: docker stop ${MODEL_ID}

 # groups: a dictionary of group settings
 # - optional, default: empty dictionary
@@ -153,6 +153,19 @@ func main() {

 	})

+	// llama-server compatibility: /completion
+	r.POST("/completion", func(c *gin.Context) {
+		c.Header("Content-Type", "application/json")
+		c.JSON(http.StatusOK, gin.H{
+			"responseMessage": *responseMessage,
+			"usage": gin.H{
+				"completion_tokens": 10,
+				"prompt_tokens":     25,
+				"total_tokens":      35,
+			},
+		})
+	})
+
 	// issue #41
 	r.POST("/v1/audio/transcriptions", func(c *gin.Context) {
 		// Parse the multipart form
@@ -237,7 +237,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {

 	- name must fit the regex ^[a-zA-Z0-9_-]+$
 	- names must be less than 64 characters (no reason, just cause)
-	- name can not be any reserved macros: PORT
+	- name can not be any reserved macros: PORT, MODEL_ID
 	- macro values must be less than 1024 characters
 	*/
 	macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
@@ -253,6 +253,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 		switch macroName {
-		case "PORT":
+		// Go switch cases never fall through implicitly, so both reserved
+		// names must share one case; a bare `case "PORT":` would silently
+		// accept PORT as a user-defined macro.
+		case "PORT", "MODEL_ID":
 			return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
 		}
 	}
@@ -296,6 +297,11 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			nextPort++
 		}

+		if strings.Contains(modelConfig.Cmd, "${MODEL_ID}") || strings.Contains(modelConfig.CmdStop, "${MODEL_ID}") {
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${MODEL_ID}", modelId)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${MODEL_ID}", modelId)
+		}
+
 		// make sure there are no unknown macros that have not been replaced
 		macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
 		fieldMap := map[string]string{
@@ -440,3 +440,44 @@ models:
 	expectedCmd := "/user/llama.cpp/build/bin/llama-server --port 9990 --model /path/to/model.gguf -ngl 99"
 	assert.Equal(t, expectedCmd, cmdStr, "Final command does not match expected structure")
 }
+
+func TestConfig_MacroModelId(t *testing.T) {
+	content := `
+startPort: 9000
+macros:
+  "docker-llama": docker run --name ${MODEL_ID} -p ${PORT}:8080 docker_img
+  "docker-stop": docker stop ${MODEL_ID}
+
+models:
+  model1:
+    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
+
+  model2:
+    cmd: ${docker-llama}
+    cmdStop: ${docker-stop}
+
+  author/model:F16:
+    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
+    cmdStop: stop
+`
+
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
+	assert.NoError(t, err)
+	assert.Equal(t, "/path/to/server -p 9001 -hf model1", strings.Join(sanitizedCmd, " "))
+
+	assert.Equal(t, "docker stop ${MODEL_ID}", config.Macros["docker-stop"])
+
+	sanitizedCmd2, err := SanitizeCommand(config.Models["model2"].Cmd)
+	assert.NoError(t, err)
+	assert.Equal(t, "docker run --name model2 -p 9002:8080 docker_img", strings.Join(sanitizedCmd2, " "))
+
+	sanitizedCmdStop, err := SanitizeCommand(config.Models["model2"].CmdStop)
+	assert.NoError(t, err)
+	assert.Equal(t, "docker stop model2", strings.Join(sanitizedCmdStop, " "))
+
+	sanitizedCmd3, err := SanitizeCommand(config.Models["author/model:F16"].Cmd)
+	assert.NoError(t, err)
+	assert.Equal(t, "/path/to/server -p 9000 -hf author/model:F16", strings.Join(sanitizedCmd3, " "))
+}
@@ -61,7 +61,6 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		} else {
 			writer.metricsRecorder.processNonStreamingResponse(writer.body)
 		}
-
 	}
 }

@@ -73,6 +72,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 	}

 	// default values
+	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
 	inputTokens := 0

@@ -93,11 +93,16 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 		promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
 		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
 		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
+
+		if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
+			cachedTokens = int(cachedValue.Int())
+		}
 	}

 	rec.metricsMonitor.addMetrics(TokenMetrics{
 		Timestamp:       time.Now(),
 		Model:           rec.realModelName,
+		CachedTokens:    cachedTokens,
 		InputTokens:     inputTokens,
 		OutputTokens:    outputTokens,
 		PromptPerSecond: promptPerSecond,
@@ -13,6 +13,7 @@ type TokenMetrics struct {
 	ID              int       `json:"id"`
 	Timestamp       time.Time `json:"timestamp"`
 	Model           string    `json:"model"`
+	CachedTokens    int       `json:"cache_tokens"`
 	InputTokens     int       `json:"input_tokens"`
 	OutputTokens    int       `json:"output_tokens"`
 	PromptPerSecond float64   `json:"prompt_per_second"`
@@ -61,7 +62,6 @@ func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
 	if len(mp.metrics) > mp.maxMetrics {
 		mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
 	}
-
 	event.Emit(TokenMetricsEvent{Metrics: metric})
 }

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"net"
 	"net/http"
 	"net/url"
 	"os/exec"
@@ -363,8 +364,18 @@ func (p *Process) stopCommand() {
 }

 func (p *Process) checkHealthEndpoint(healthURL string) error {
+
 	client := &http.Client{
-		Timeout: 500 * time.Millisecond,
+		// wait a short time for a tcp connection to be established
+		Transport: &http.Transport{
+			DialContext: (&net.Dialer{
+				Timeout: 500 * time.Millisecond,
+			}).DialContext,
+
+			// this Transport is built on every call, so a kept-alive
+			// connection would sit idle in a pool nothing else uses
+			// (IdleConnTimeout defaults to no limit). Close the
+			// connection once the response has been handled instead.
+			DisableKeepAlives: true,
+		},
+
+		// give a long time to respond to the health check endpoint
+		// after the connection is established. See issue: 276
+		Timeout: 5000 * time.Millisecond,
 	}

 	req, err := http.NewRequest("GET", healthURL, nil)
@@ -60,10 +60,20 @@ func (pg *ProcessGroup) ProxyRequest(modelID string, writer http.ResponseWriter,
 	if pg.swap {
 		pg.Lock()
 		if pg.lastUsedProcess != modelID {
+
+			// is there something already running?
 			if pg.lastUsedProcess != "" {
 				pg.processes[pg.lastUsedProcess].Stop()
 			}
+
+			// wait for the request to the new model to be fully handled
+			// and prevent race conditions see issue #277
+			pg.processes[modelID].ProxyRequest(writer, request)
 			pg.lastUsedProcess = modelID
+
+			// short circuit and exit
+			pg.Unlock()
+			return nil
 		}
 		pg.Unlock()
 	}
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"net/http"
 	"net/http/httptest"
+	"sync"
 	"testing"

 	"github.com/stretchr/testify/assert"
@@ -44,32 +45,49 @@ func TestProcessGroup_HasMember(t *testing.T) {
 	assert.False(t, pg.HasMember("model3"))
 }

-func TestProcessGroup_ProxyRequestSwapIsTrue(t *testing.T) {
+// TestProcessGroup_ProxyRequestSwapIsTrueParallel tests that when swap is true
+// and multiple requests are made in parallel, only one process is running at a time.
+func TestProcessGroup_ProxyRequestSwapIsTrueParallel(t *testing.T) {
+	var processGroupTestConfig = AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		Models: map[string]ModelConfig{
+			// use the same listening port for every model so that starting one
+			// while another is still running will fail. This is a way to test
+			// that swap isolation is working properly when there are parallel
+			// requests made at the same time.
+			"model1": getTestSimpleResponderConfigPort("model1", 9832),
+			"model2": getTestSimpleResponderConfigPort("model2", 9832),
+			"model3": getTestSimpleResponderConfigPort("model3", 9832),
+			"model4": getTestSimpleResponderConfigPort("model4", 9832),
+			"model5": getTestSimpleResponderConfigPort("model5", 9832),
+		},
+		Groups: map[string]GroupConfig{
+			"G1": {
+				Swap:    true,
+				Members: []string{"model1", "model2", "model3", "model4", "model5"},
+			},
+		},
+	})
+
 	pg := NewProcessGroup("G1", processGroupTestConfig, testLogger, testLogger)
 	defer pg.StopProcesses(StopWaitForInflightRequest)

-	tests := []string{"model1", "model2"}
+	tests := []string{"model1", "model2", "model3", "model4", "model5"}

+	var wg sync.WaitGroup
+
+	wg.Add(len(tests))
 	for _, modelName := range tests {
-		t.Run(modelName, func(t *testing.T) {
-			reqBody := `{"x", "y"}`
-			req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
+		go func(modelName string) {
+			defer wg.Done()
+			req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
 			w := httptest.NewRecorder()
-
 			assert.NoError(t, pg.ProxyRequest(modelName, w, req))
 			assert.Equal(t, http.StatusOK, w.Code)
 			assert.Contains(t, w.Body.String(), modelName)
-
-			// make sure only one process is in the running state
-			count := 0
-			for _, process := range pg.processes {
-				if process.CurrentState() == StateReady {
-					count++
-				}
-			}
-			assert.Equal(t, 1, count)
-		})
+		}(modelName)
 	}
+	wg.Wait()
 }

 func TestProcessGroup_ProxyRequestSwapIsFalse(t *testing.T) {
@@ -203,6 +203,9 @@ func (pm *ProxyManager) setupGinEngine() {
 	// llama-server's /infill endpoint for code infilling
 	pm.ginEngine.POST("/infill", mm, pm.proxyOAIHandler)

+	// llama-server's /completion endpoint
+	pm.ginEngine.POST("/completion", mm, pm.proxyOAIHandler)
+
 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
 	pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler)
@@ -42,7 +42,6 @@ func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
 		assert.Contains(t, w.Body.String(), modelName)
 	}
 }
-
 func TestProxyManager_SwapMultiProcess(t *testing.T) {
 	config := AddDefaultGroupToConfig(Config{
 		HealthCheckTimeout: 15,
@@ -834,6 +833,28 @@ func TestProxyManager_HealthEndpoint(t *testing.T) {
 	assert.Equal(t, "OK", rec.Body.String())
 }

+// Ensure the custom llama-server /completion endpoint proxies correctly
+func TestProxyManager_CompletionEndpoint(t *testing.T) {
+	config := AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		Models: map[string]ModelConfig{
+			"model1": getTestSimpleResponderConfig("model1"),
+		},
+		LogLevel: "error",
+	})
+
+	proxy := New(config)
+	defer proxy.StopProcesses(StopWaitForInflightRequest)
+
+	reqBody := `{"model":"model1"}`
+	req := httptest.NewRequest("POST", "/completion", bytes.NewBufferString(reqBody))
+	w := httptest.NewRecorder()
+
+	proxy.ServeHTTP(w, req)
+	assert.Equal(t, http.StatusOK, w.Code)
+	assert.Contains(t, w.Body.String(), "model1")
+}
+
 func TestProxyManager_StartupHooks(t *testing.T) {

 	// using real YAML as the configuration has gotten more complex
@@ -28,6 +28,7 @@ interface Metrics {
  id: number;
  timestamp: string;
  model: string;
+  cache_tokens: number;
  input_tokens: number;
  output_tokens: number;
  prompt_per_second: number;
@@ -1,10 +1,6 @@
 import { useMemo } from "react";
 import { useAPI } from "../contexts/APIProvider";

-const formatTimestamp = (timestamp: string): string => {
-  return new Date(timestamp).toLocaleString();
-};
-
 const formatSpeed = (speed: number): string => {
  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
 };
@@ -13,6 +9,33 @@ const formatDuration = (ms: number): string => {
  return (ms / 1000).toFixed(2) + "s";
 };

+const formatRelativeTime = (timestamp: string): string => {
+  const now = new Date();
+  const date = new Date(timestamp);
+  const diffInSeconds = Math.floor((now.getTime() - date.getTime()) / 1000);
+
+  // Anything under 5 seconds old, including future timestamps (negative diff), is shown as "now"
+  if (diffInSeconds < 5) {
+    return "now";
+  }
+
+  if (diffInSeconds < 60) {
+    return `${diffInSeconds}s ago`;
+  }
+
+  const diffInMinutes = Math.floor(diffInSeconds / 60);
+  if (diffInMinutes < 60) {
+    return `${diffInMinutes}m ago`;
+  }
+
+  const diffInHours = Math.floor(diffInMinutes / 60);
+  if (diffInHours < 24) {
+    return `${diffInHours}h ago`;
+  }
+
+  return "a while ago";
+};
+
 const ActivityPage = () => {
  const { metrics } = useAPI();
  const sortedMetrics = useMemo(() => {
@@ -32,11 +55,16 @@ const ActivityPage = () => {
          <table className="min-w-full divide-y">
            <thead>
              <tr>
-                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">Id</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Timestamp</th>
+                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">ID</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Time</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
+                  Cached <Tooltip content="prompt tokens from cache" />
+                </th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
+                  Prompt <Tooltip content="new prompt tokens processed" />
+                </th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generated</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
@@ -46,8 +74,11 @@ const ActivityPage = () => {
              {sortedMetrics.map((metric) => (
                <tr key={`metric_${metric.id}`}>
                  <td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
-                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatTimestamp(metric.timestamp)}</td>
+                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatRelativeTime(metric.timestamp)}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
+                  <td className="px-6 py-4 whitespace-nowrap text-sm">
+                    {metric.cache_tokens > 0 ? metric.cache_tokens.toLocaleString() : "-"}
+                  </td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
@@ -63,4 +94,28 @@ const ActivityPage = () => {
  );
 };

+interface TooltipProps {
+  content: string;
+}
+
+const Tooltip: React.FC<TooltipProps> = ({ content }) => {
+  return (
+    <div className="relative group inline-block">
+      ⓘ
+      <div
+        className="absolute top-full left-1/2 transform -translate-x-1/2 mt-2
+                     px-3 py-2 bg-gray-900 text-white text-sm rounded-md
+                     opacity-0 group-hover:opacity-100 transition-opacity
+                     duration-200 pointer-events-none whitespace-nowrap z-50 normal-case"
+      >
+        {content}
+        <div
+          className="absolute bottom-full left-1/2 transform -translate-x-1/2
+                       border-4 border-transparent border-b-gray-900"
+        ></div>
+      </div>
+    </div>
+  );
+};
+
 export default ActivityPage;
Author	SHA1	Message	Date
Benson Wong	f58c8c8ec5	Support llama.cpp's cache_n in timings info (#287 ) Capture prompt cache metrics and surface them on Activities page in UI	2025-09-06 13:58:02 -07:00
Benson Wong	954e2dee73	Remove `cmdStart` from README [skip ci] cmdStart was in the README but it doesn't exist. Fixed the typo. Oops.	2025-09-04 11:57:28 -07:00
Benson Wong	a533aec736	small tweak to example config	2025-09-01 21:26:58 -07:00
Brett Profitt	97b17fc47d	Add ${MODEL_ID} macro (#226 ) The automatic ${MODEL_ID} macro includes the name of the model and can be used in Cmd and CmdStop.	2025-09-01 21:21:37 -07:00
Benson Wong	2457840698	Update README.md [skip ci]	2025-08-28 23:44:37 -07:00
Benson Wong	7f55494151	Update README.md [skip ci]	2025-08-28 22:47:28 -07:00
Benson Wong	831a90d3b0	Add different timeout scenarios to Process.checkHealthEndpoint #276 (#278 ) - add a TCP connection timeout of 500ms - increase HTTP client timeout to 5000ms In this new behaviour the upstream has 500ms to accept a tcp connection and 5000ms to respond to the HTTP request.	2025-08-28 22:03:14 -07:00
Yandrik	977f1856bb	add /completion endpoint (#275 ) * feat: add /completion endpoint * chore: reformat using gofmt	2025-08-28 21:41:02 -07:00
Benson Wong	52b329f7bc	Fix #277 race condition in ProcessGroup.ProxyRequest when swap=true	2025-08-28 21:38:40 -07:00