Fix token metrics parsing (#199)

Fix #198

- use llama-server's `timings` info if available in the response body
- send "-1" for tokens/sec when performance cannot be calculated accurately
- optimize the streaming body search for metrics information
add contributors to README [skip ci]
2025-07-22 23:10:14 -07:00 · 2025-07-21 23:16:48 -07:00 · 2025-07-21 23:08:19 -07:00
5 changed files with 89 additions and 47 deletions
@@ -70,9 +70,10 @@ See the [configuration documentation](https://github.com/mostlygeek/llama-swap/w

 ## Web UI

-llama-swap ships with a web based interface to make it easier to monitor logs and check the status of models. 
+llama-swap ships with a real-time web interface for monitoring logs and the status of models:
+
+<img width="1786" height="1334" alt="image" src="https://github.com/user-attachments/assets/d6258cb9-1dad-40db-828f-2be860aec8fe" />

-<img width="1758" alt="image" src="https://github.com/user-attachments/assets/31ae5bcd-5efd-46b0-b64b-6db9e60196d3" />

 ## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

@@ -173,6 +174,13 @@ Any OpenAI compatible server would work. llama-swap was originally designed for

 For Python based inference servers like vllm or tabbyAPI it is recommended to run them via podman or docker. This provides clean environment isolation as well as responding correctly to `SIGTERM` signals to shutdown.

+## Contributors 
+<a href="https://github.com/mostlygeek/llama-swap/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=mostlygeek/llama-swap" />
+</a>
+
+Made with [contrib.rocks](https://contrib.rocks).
+
 ## Star History

 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -78,6 +78,14 @@ func main() {
 					"prompt_tokens":     25,
 					"total_tokens":      35,
 				},
+				// add timings to simulate llama.cpp
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
 			}
 			c.SSEvent("message", finalData)
 			c.Writer.Flush()
@@ -102,6 +110,13 @@ func main() {
 					"prompt_tokens":     25,
 					"total_tokens":      35,
 				},
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
 			})
 		}
 	})
@@ -67,51 +67,66 @@ func (rec *MetricsRecorder) processBody(body []byte) {
 	}
 }

-func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) {
-	if !jsonData.Get("usage").Exists() {
-		return
+func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
+	usage := jsonData.Get("usage")
+	if !usage.Exists() {
+		return false
 	}

+	// default values
 	outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
 	inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
+	tokensPerSecond := -1.0
+	durationMs := int(time.Since(rec.startTime).Milliseconds())

-	if outputTokens > 0 {
-		duration := time.Since(rec.startTime)
-		tokensPerSecond := float64(inputTokens+outputTokens) / duration.Seconds()
-
-		metrics := TokenMetrics{
-			Timestamp:       time.Now(),
-			Model:           rec.realModelName,
-			InputTokens:     inputTokens,
-			OutputTokens:    outputTokens,
-			TokensPerSecond: tokensPerSecond,
-			DurationMs:      int(duration.Milliseconds()),
-		}
-		rec.metricsMonitor.addMetrics(metrics)
+	// use llama-server's timing data for tok/sec and duration as it is more accurate
+	if timings := jsonData.Get("timings"); timings.Exists() {
+		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
+		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
 	}
+
+	rec.metricsMonitor.addMetrics(TokenMetrics{
+		Timestamp:       time.Now(),
+		Model:           rec.realModelName,
+		InputTokens:     inputTokens,
+		OutputTokens:    outputTokens,
+		TokensPerSecond: tokensPerSecond,
+		DurationMs:      durationMs,
+	})
+
+	return true
 }

 func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
+	// Iterate **backwards** through the lines looking for the data payload with
+	// usage data
 	lines := bytes.Split(body, []byte("\n"))
-	for _, line := range lines {
-		line = bytes.TrimSpace(line)
+
+	for i := len(lines) - 1; i >= 0; i-- {
+		line := bytes.TrimSpace(lines[i])
 		if len(line) == 0 {
 			continue
 		}

-		// Check for SSE data prefix
-		if data, found := bytes.CutPrefix(line, []byte("data:")); found {
-			data = bytes.TrimSpace(data)
-			if len(data) == 0 {
-				continue
-			}
-			if bytes.Equal(data, []byte("[DONE]")) {
-				break
-			}
+		// SSE payload always follows "data:"
+		prefix := []byte("data:")
+		if !bytes.HasPrefix(line, prefix) {
+			continue
+		}
+		data := bytes.TrimSpace(line[len(prefix):])

-			// Parse JSON to look for usage data
-			if gjson.ValidBytes(data) {
-				rec.parseAndRecordMetrics(gjson.ParseBytes(data))
+		if len(data) == 0 {
+			continue
+		}
+
+		if bytes.Equal(data, []byte("[DONE]")) {
+			// [DONE] line itself contains nothing of interest.
+			continue
+		}
+
+		if gjson.ValidBytes(data) {
+			if rec.parseAndRecordMetrics(gjson.ParseBytes(data)) {
+				return // short circuit if a metric was recorded
 			}
 		}
 	}
@@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {

 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
+		return
+	}

 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {

 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
+		return
+	}

 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -1,6 +1,18 @@
 import { useState, useEffect } from "react";
 import { useAPI } from "../contexts/APIProvider";

+const formatTimestamp = (timestamp: string): string => {
+  return new Date(timestamp).toLocaleString();
+};
+
+const formatSpeed = (speed: number): string => {
+  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
+};
+
+const formatDuration = (ms: number): string => {
+  return (ms / 1000).toFixed(2) + "s";
+};
+
 const ActivityPage = () => {
  const { metrics } = useAPI();
  const [error, setError] = useState<string | null>(null);
@@ -11,18 +23,6 @@ const ActivityPage = () => {
    }
  }, [metrics]);

-  const formatTimestamp = (timestamp: string) => {
-    return new Date(timestamp).toLocaleString();
-  };
-
-  const formatSpeed = (speed: number) => {
-    return speed.toFixed(2) + " t/s";
-  };
-
-  const formatDuration = (ms: number) => {
-    return (ms / 1000).toFixed(2) + "s";
-  };
-
  if (error) {
    return (
      <div className="p-6">
@@ -51,7 +51,7 @@ const ActivityPage = () => {
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Processing Speed</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
              </tr>
            </thead>
Author	SHA1	Message	Date
Benson Wong	01d4838fb3	Fix token metrics parsing (#199 ) Fix #198 - use llama-server's `timings` info if available in response body - send "-1" for token/sec when not able to accurately calculate performance - optimize streaming body search for metrics information	2025-07-22 23:10:14 -07:00
Benson Wong	accd65294b	add contributors to README [skip ci]	2025-07-21 23:16:48 -07:00
Benson Wong	7472a25864	Update README.md [skip ci] update screenshot for web UI	2025-07-21 23:08:19 -07:00