Support llama.cpp's cache_n in timings info (#287 )

Capture prompt cache metrics and surface them on Activities page in UI
Remove cmdStart from README [skip ci]
2025-09-06 13:58:02 -07:00 · 2025-09-04 11:57:28 -07:00 · 2025-09-01 21:26:58 -07:00 · 2025-09-01 21:21:37 -07:00 · 2025-08-28 23:44:37 -07:00 · 2025-08-28 22:47:28 -07:00
8 changed files with 131 additions and 20 deletions
@@ -7,7 +7,7 @@
 llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary, use one of the provided docker images, or install via Homebrew.
 ## Features:
@@ -34,7 +34,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
 - ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
 - ✅ Automatic unloading of models after timeout by setting a `ttl`
 - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
- ✅ Reliable Docker and Podman support with `cmdStart` and `cmdStop`
+- ✅ Reliable Docker and Podman support using `cmd` and `cmdStop` together
 - ✅ Full control over server settings per model
 - ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
@@ -207,4 +207,7 @@ For Python based inference servers like vllm or tabbyAPI it is recommended to ru
 ## Star History
 > [!NOTE]
 > ⭐️ Star this project to help others discover it! 
 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -49,8 +49,8 @@ macros:
 # - required
 # - each key is the model's ID, used in API requests
 # - model settings have default values that are used if they are not defined here
-# - below are examples of the various settings a model can have:
+# - the model's ID is available in the ${MODEL_ID} macro, which can also be used inside the macros defined above
-# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
+# - below are examples of all the settings a model can have
 models:
  # keys are the model names used in API requests
@@ -148,12 +148,12 @@ models:
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
  # Docker example:
-  # container run times like Docker and Podman can be used reliably with a
+  # container runtimes like Docker and Podman can be used reliably with
-  # a combination of cmd and cmdStop.
+  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
-      docker run --name dockertest
+      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
@@ -167,7 +167,7 @@ models:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
-    cmdStop: docker stop dockertest
+    cmdStop: docker stop ${MODEL_ID}
 # groups: a dictionary of group settings
 # - optional, default: empty dictionary
@@ -237,7 +237,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	- name must fit the regex ^[a-zA-Z0-9_-]+$
 	- names must be less than 64 characters (no reason, just cause)
-	- name can not be any reserved macros: PORT
+	- name can not be any reserved macros: PORT, MODEL_ID
 	- macro values must be less than 1024 characters
 	*/
 	macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
@@ -253,6 +253,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 		// PORT and MODEL_ID are reserved macro names and may not be defined by
 		// the user. Go switch cases never fall through implicitly, so both
 		// names must share one case clause; a separate empty `case "PORT":`
 		// would silently accept PORT.
 		switch macroName {
 		case "PORT", "MODEL_ID":
 			return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
 		}
 	}
@@ -296,6 +297,11 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			nextPort++
 		}
 		if strings.Contains(modelConfig.Cmd, "${MODEL_ID}") || strings.Contains(modelConfig.CmdStop, "${MODEL_ID}") {
 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${MODEL_ID}", modelId)
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${MODEL_ID}", modelId)
 		}
 		// make sure there are no unknown macros that have not been replaced
 		macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
 		fieldMap := map[string]string{
@@ -440,3 +440,44 @@ models:
 	expectedCmd := "/user/llama.cpp/build/bin/llama-server --port 9990 --model /path/to/model.gguf -ngl 99"
 	assert.Equal(t, expectedCmd, cmdStr, "Final command does not match expected structure")
 }
 // TestConfig_MacroModelId checks that ${MODEL_ID} is replaced with the model's
 // key in cmd and cmdStop, both when it is written directly in the model and
 // when it arrives through a user-defined macro.
 func TestConfig_MacroModelId(t *testing.T) {
 	content := `
 startPort: 9000
 macros:
  "docker-llama": docker run --name ${MODEL_ID} -p ${PORT}:8080 docker_img
  "docker-stop": docker stop ${MODEL_ID}
 models:
  model1:
    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
  model2:
    cmd: ${docker-llama}
    cmdStop: ${docker-stop}
  author/model:F16:
    cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
    cmdStop: stop
 `
 	config, err := LoadConfigFromReader(strings.NewReader(content))
 	assert.NoError(t, err)
 	// ${MODEL_ID} used directly in a model's cmd.
 	// NOTE(review): the expected ports (9000 for author/model:F16, 9001 for
 	// model1, 9002 for model2) suggest ports are assigned in sorted model-ID
 	// order — confirm against LoadConfigFromReader.
 	sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "/path/to/server -p 9001 -hf model1", strings.Join(sanitizedCmd, " "))
 	// The macro definition itself is expected to keep the unexpanded ${MODEL_ID}.
 	assert.Equal(t, "docker stop ${MODEL_ID}", config.Macros["docker-stop"])
 	// ${MODEL_ID} reaching cmd and cmdStop through user-defined macros.
 	sanitizedCmd2, err := SanitizeCommand(config.Models["model2"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "docker run --name model2 -p 9002:8080 docker_img", strings.Join(sanitizedCmd2, " "))
 	sanitizedCmdStop, err := SanitizeCommand(config.Models["model2"].CmdStop)
 	assert.NoError(t, err)
 	assert.Equal(t, "docker stop model2", strings.Join(sanitizedCmdStop, " "))
 	// A model ID containing '/' and ':' is substituted verbatim.
 	sanitizedCmd3, err := SanitizeCommand(config.Models["author/model:F16"].Cmd)
 	assert.NoError(t, err)
 	assert.Equal(t, "/path/to/server -p 9000 -hf author/model:F16", strings.Join(sanitizedCmd3, " "))
 }
@@ -61,7 +61,6 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		} else {
 			writer.metricsRecorder.processNonStreamingResponse(writer.body)
 		}
 	}
 }
@@ -73,6 +72,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 	}
 	// default values
 	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
 	inputTokens := 0
@@ -93,11 +93,16 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 		promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
 		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
 		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
 		if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
 			cachedTokens = int(cachedValue.Int())
 		}
 	}
 	rec.metricsMonitor.addMetrics(TokenMetrics{
 		Timestamp:       time.Now(),
 		Model:           rec.realModelName,
 		CachedTokens:    cachedTokens,
 		InputTokens:     inputTokens,
 		OutputTokens:    outputTokens,
 		PromptPerSecond: promptPerSecond,
@@ -13,6 +13,7 @@ type TokenMetrics struct {
 	ID              int       `json:"id"`
 	Timestamp       time.Time `json:"timestamp"`
 	Model           string    `json:"model"`
 	CachedTokens    int       `json:"cache_tokens"`
 	InputTokens     int       `json:"input_tokens"`
 	OutputTokens    int       `json:"output_tokens"`
 	PromptPerSecond float64   `json:"prompt_per_second"`
@@ -61,7 +62,6 @@ func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
 	if len(mp.metrics) > mp.maxMetrics {
 		mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
 	}
 	event.Emit(TokenMetricsEvent{Metrics: metric})
 }
@@ -28,6 +28,7 @@ interface Metrics {
  id: number;
  timestamp: string;
  model: string;
  cache_tokens: number;
  input_tokens: number;
  output_tokens: number;
  prompt_per_second: number;
@@ -1,10 +1,6 @@
 import { useMemo } from "react";
 import { useAPI } from "../contexts/APIProvider";
 /** Formats a timestamp string as a date/time string via Date.prototype.toLocaleString. */
 const formatTimestamp = (timestamp: string): string => {
  return new Date(timestamp).toLocaleString();
 };
 /**
  * Formats a speed with two decimals followed by " t/s".
  * A negative value is rendered as "unknown".
  */
 const formatSpeed = (speed: number): string => {
  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
 };
@@ -13,6 +9,33 @@ const formatDuration = (ms: number): string => {
  return (ms / 1000).toFixed(2) + "s";
 };
 /**
  * Renders a timestamp as a coarse age relative to the current time:
  * "now" (under 5 seconds), "Ns ago", "Nm ago", "Nh ago", or "a while ago"
  * once 24 hours or more have elapsed.
  */
 const formatRelativeTime = (timestamp: string): string => {
  const now = new Date();
  const date = new Date(timestamp);
  const diffInSeconds = Math.floor((now.getTime() - date.getTime()) / 1000);
  // Anything less than 5 seconds old is shown as "now"; this also covers
  // timestamps in the future, where the difference is negative.
  if (diffInSeconds < 5) {
    return "now";
  }
  if (diffInSeconds < 60) {
    return `${diffInSeconds}s ago`;
  }
  const diffInMinutes = Math.floor(diffInSeconds / 60);
  if (diffInMinutes < 60) {
    return `${diffInMinutes}m ago`;
  }
  const diffInHours = Math.floor(diffInMinutes / 60);
  if (diffInHours < 24) {
    return `${diffInHours}h ago`;
  }
  return "a while ago";
 };
 const ActivityPage = () => {
  const { metrics } = useAPI();
  const sortedMetrics = useMemo(() => {
@@ -32,11 +55,16 @@ const ActivityPage = () => {
          <table className="min-w-full divide-y">
            <thead>
              <tr>
-                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">Id</th>
+                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">ID</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Timestamp</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Time</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
+                  Cached <Tooltip content="prompt tokens from cache" />
                </th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
                  Prompt <Tooltip content="new prompt tokens processed" />
                </th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generated</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
@@ -46,8 +74,11 @@ const ActivityPage = () => {
              {sortedMetrics.map((metric) => (
                <tr key={`metric_${metric.id}`}>
                  <td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
-                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatTimestamp(metric.timestamp)}</td>
+                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatRelativeTime(metric.timestamp)}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">
                    {metric.cache_tokens > 0 ? metric.cache_tokens.toLocaleString() : "-"}
                  </td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
@@ -63,4 +94,28 @@ const ActivityPage = () => {
  );
 };
 /** Props for Tooltip: `content` is the text rendered inside the tooltip bubble. */
 interface TooltipProps {
  content: string;
 }
 /**
  * Renders an ⓘ marker with a nested element holding `content`.
  * NOTE(review): the class names look like Tailwind utilities that keep the
  * nested element transparent until the wrapper is hovered (opacity-0 /
  * group-hover:opacity-100), with the innermost div acting as the arrow —
  * confirm against the project's CSS setup.
  */
 const Tooltip: React.FC<TooltipProps> = ({ content }) => {
  return (
    <div className="relative group inline-block">
      ⓘ
      <div
        className="absolute top-full left-1/2 transform -translate-x-1/2 mt-2
                     px-3 py-2 bg-gray-900 text-white text-sm rounded-md
                     opacity-0 group-hover:opacity-100 transition-opacity
                     duration-200 pointer-events-none whitespace-nowrap z-50 normal-case"
      >
        {content}
        <div
          className="absolute bottom-full left-1/2 transform -translate-x-1/2
                       border-4 border-transparent border-b-gray-900"
        ></div>
      </div>
    </div>
  );
 };
 export default ActivityPage;
Author	SHA1	Message	Date
Benson Wong	f58c8c8ec5	Support llama.cpp's cache_n in timings info (#287 ) Capture prompt cache metrics and surface them on Activities page in UI	2025-09-06 13:58:02 -07:00
Benson Wong	954e2dee73	Remove `cmdStart` from README [skip ci] cmdStart was in the README but it doesn't exist. Fixed the typo. Oops.	2025-09-04 11:57:28 -07:00
Benson Wong	a533aec736	small tweak to example config	2025-09-01 21:26:58 -07:00
Brett Profitt	97b17fc47d	Add ${MODEL_ID} macro (#226 ) The automatic ${MODEL_ID} macro includes the name of the model and can be used in Cmd and CmdStop.	2025-09-01 21:21:37 -07:00
Benson Wong	2457840698	Update README.md [skip ci]	2025-08-28 23:44:37 -07:00
Benson Wong	7f55494151	Update README.md [skip ci]	2025-08-28 22:47:28 -07:00