Update github actions for notifying homebrew build (#212 )

Combine homebrew-llama-swap event with the release action
add /health (#211 )
2025-07-30 11:29:03 -07:00 · 2025-07-30 10:37:10 -07:00 · 2025-07-30 10:13:49 -07:00 · 2025-07-30 10:12:21 -07:00 · 2025-07-27 10:36:06 -07:00 · 2025-07-24 08:32:47 -07:00
8 changed files with 182 additions and 61 deletions
@@ -7,6 +7,10 @@ on:

  # Allows manual triggering of the workflow
  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Tag version to release (e.g. v144)'
+        required: true

 permissions:
  contents: write
@@ -20,15 +24,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
+          ref: ${{ github.event.inputs.tag || github.ref }}
      -
        name: Set up Go
        uses: actions/setup-go@v5
-
      -
        name: Set up Node.js
        uses: actions/setup-node@v4
        with:
-          node-version: '23'  # or your preferred version
+          node-version: '23'
      -
        name: Install dependencies and build UI
        run: |
@@ -46,4 +50,30 @@ jobs:
          version: '~> v2'
          args: release --clean
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  trigger-tap-update:
+    runs-on: ubuntu-latest
+    needs: goreleaser
+    steps:
+      - name: "Resolve tag to dispatch"
+        id: tag
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "tag=${{ github.event.inputs.tag }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "tag=${{ github.ref_name }}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: "Trigger tap repository update"
+        uses: peter-evans/repository-dispatch@v2
+        with:
+          token: ${{ secrets.TAP_REPO_PAT }}
+          repository: mostlygeek/homebrew-llama-swap
+          event-type: new-release
+          client-payload: |
+            {
+              "release": {
+                "tag_name": "${{ steps.tag.outputs.tag }}"
+              }
+            }
@@ -18,7 +18,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
  - `v1/completions`
  - `v1/chat/completions`
  - `v1/embeddings`
-  - `v1/rerank`
+  - `v1/rerank`, `v1/reranking`, `rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
  - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
 - ✅ llama-swap custom API endpoints
@@ -70,9 +70,10 @@ See the [configuration documentation](https://github.com/mostlygeek/llama-swap/w

 ## Web UI

-llama-swap ships with a web based interface to make it easier to monitor logs and check the status of models. 
+llama-swap ships with a real-time web interface for monitoring logs and the status of models:
+
+<img width="1786" height="1334" alt="image" src="https://github.com/user-attachments/assets/d6258cb9-1dad-40db-828f-2be860aec8fe" />

-<img width="1758" alt="image" src="https://github.com/user-attachments/assets/31ae5bcd-5efd-46b0-b64b-6db9e60196d3" />

 ## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

@@ -121,6 +122,20 @@ $ docker run -it --rm --runtime nvidia -p 9292:8080 \

 </details>

+## Homebrew Install (macOS/Linux)
+
+For macOS & Linux users, `llama-swap` can be installed via [Homebrew](https://brew.sh):
+
+```shell
+# Set up tap and install formula 
+brew tap mostlygeek/llama-swap
+brew install llama-swap
+# Run llama-swap
+llama-swap --config path/to/config.yaml --listen localhost:8080
+```
+
+This will install the `llama-swap` binary and make it available in your path. See the [configuration documentation](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for details on writing the config file.
+
 ## Bare metal Install ([download](https://github.com/mostlygeek/llama-swap/releases))

 Pre-built binaries are available for Linux, Mac, Windows and FreeBSD. These are automatically published and are likely a few hours ahead of the docker releases. The baremetal install works with any OpenAI compatible server, not just llama-server.
@@ -173,6 +188,13 @@ Any OpenAI compatible server would work. llama-swap was originally designed for

 For Python-based inference servers like vllm or tabbyAPI, it is recommended to run them via podman or docker. This provides clean environment isolation and ensures they respond correctly to `SIGTERM` signals to shut down.

+## Contributors 
+<a href="https://github.com/mostlygeek/llama-swap/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=mostlygeek/llama-swap" />
+</a>
+
+Made with [contrib.rocks](https://contrib.rocks).
+
 ## Star History

 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -78,6 +78,14 @@ func main() {
 					"prompt_tokens":     25,
 					"total_tokens":      35,
 				},
+				// add timings to simulate llama.cpp
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
 			}
 			c.SSEvent("message", finalData)
 			c.Writer.Flush()
@@ -102,6 +110,13 @@ func main() {
 					"prompt_tokens":     25,
 					"total_tokens":      35,
 				},
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
 			})
 		}
 	})
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		bodyBytes, err := io.ReadAll(c.Request.Body)
 		if err != nil {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
+			c.Abort()
 			return
 		}
 		c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		requestedModel := gjson.GetBytes(bodyBytes, "model").String()
 		if requestedModel == "" {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
+			c.Abort()
 			return
 		}

 		realModelName, found := pm.config.RealModelName(requestedModel)
 		if !found {
 			pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
+			c.Abort()
 			return
 		}
-		c.Set("ls-real-model-name", realModelName)

 		writer := &MetricsResponseWriter{
 			ResponseWriter: c.Writer,
@@ -67,51 +69,66 @@ func (rec *MetricsRecorder) processBody(body []byte) {
 	}
 }

-func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) {
-	if !jsonData.Get("usage").Exists() {
-		return
+func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
+	usage := jsonData.Get("usage")
+	if !usage.Exists() {
+		return false
 	}

+	// default values
 	outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
 	inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
+	tokensPerSecond := -1.0
+	durationMs := int(time.Since(rec.startTime).Milliseconds())

-	if outputTokens > 0 {
-		duration := time.Since(rec.startTime)
-		tokensPerSecond := float64(inputTokens+outputTokens) / duration.Seconds()
-
-		metrics := TokenMetrics{
-			Timestamp:       time.Now(),
-			Model:           rec.realModelName,
-			InputTokens:     inputTokens,
-			OutputTokens:    outputTokens,
-			TokensPerSecond: tokensPerSecond,
-			DurationMs:      int(duration.Milliseconds()),
-		}
-		rec.metricsMonitor.addMetrics(metrics)
+	// use llama-server's timing data for tok/sec and duration as it is more accurate
+	if timings := jsonData.Get("timings"); timings.Exists() {
+		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
+		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
 	}
+
+	rec.metricsMonitor.addMetrics(TokenMetrics{
+		Timestamp:       time.Now(),
+		Model:           rec.realModelName,
+		InputTokens:     inputTokens,
+		OutputTokens:    outputTokens,
+		TokensPerSecond: tokensPerSecond,
+		DurationMs:      durationMs,
+	})
+
+	return true
 }

 func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
+	// Iterate **backwards** through the lines looking for the data payload with
+	// usage data
 	lines := bytes.Split(body, []byte("\n"))
-	for _, line := range lines {
-		line = bytes.TrimSpace(line)
+
+	for i := len(lines) - 1; i >= 0; i-- {
+		line := bytes.TrimSpace(lines[i])
 		if len(line) == 0 {
 			continue
 		}

-		// Check for SSE data prefix
-		if data, found := bytes.CutPrefix(line, []byte("data:")); found {
-			data = bytes.TrimSpace(data)
-			if len(data) == 0 {
-				continue
-			}
-			if bytes.Equal(data, []byte("[DONE]")) {
-				break
-			}
+		// SSE payload always follows "data:"
+		prefix := []byte("data:")
+		if !bytes.HasPrefix(line, prefix) {
+			continue
+		}
+		data := bytes.TrimSpace(line[len(prefix):])

-			// Parse JSON to look for usage data
-			if gjson.ValidBytes(data) {
-				rec.parseAndRecordMetrics(gjson.ParseBytes(data))
+		if len(data) == 0 {
+			continue
+		}
+
+		if bytes.Equal(data, []byte("[DONE]")) {
+			// [DONE] line itself contains nothing of interest.
+			continue
+		}
+
+		if gjson.ValidBytes(data) {
+			if rec.parseAndRecordMetrics(gjson.ParseBytes(data)) {
+				return // short circuit if a metric was recorded
 			}
 		}
 	}
@@ -14,6 +14,7 @@ import (
 	"time"

 	"github.com/gin-gonic/gin"
+	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
 )

@@ -160,8 +161,10 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)

 	// Support embeddings
-	pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)

 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
@@ -188,6 +191,9 @@ func (pm *ProxyManager) setupGinEngine() {

 	pm.ginEngine.GET("/unload", pm.unloadAllModelsHandler)
 	pm.ginEngine.GET("/running", pm.listRunningProcessesHandler)
+	pm.ginEngine.GET("/health", func(c *gin.Context) {
+		c.String(http.StatusOK, "OK")
+	})

 	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
 		if data, err := reactStaticFS.ReadFile("ui_dist/favicon.ico"); err == nil {
@@ -365,9 +371,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 		return
 	}

-	realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware
-	if realModelName == "" {
-		pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set")
+	requestedModel := gjson.GetBytes(bodyBytes, "model").String()
+	if requestedModel == "" {
+		pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
+		return
+	}
+
+	realModelName, found := pm.config.RealModelName(requestedModel)
+	if !found {
+		pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
 		return
 	}

@@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {

 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
+		return
+	}

 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {

 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
+		return
+	}

 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -751,3 +755,21 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
 	assert.Greater(t, lastMetric.TokensPerSecond, 0.0, "tokens per second should be greater than 0")
 	assert.Greater(t, lastMetric.DurationMs, 0, "duration should be greater than 0")
 }
+
+func TestProxyManager_HealthEndpoint(t *testing.T) {
+	config := AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		Models: map[string]ModelConfig{
+			"model1": getTestSimpleResponderConfig("model1"),
+		},
+		LogLevel: "error",
+	})
+
+	proxy := New(config)
+	defer proxy.StopProcesses(StopWaitForInflightRequest)
+	req := httptest.NewRequest("GET", "/health", nil)
+	rec := httptest.NewRecorder()
+	proxy.ServeHTTP(rec, req)
+	assert.Equal(t, http.StatusOK, rec.Code)
+	assert.Equal(t, "OK", rec.Body.String())
+}
@@ -1,6 +1,18 @@
 import { useState, useEffect } from "react";
 import { useAPI } from "../contexts/APIProvider";

+const formatTimestamp = (timestamp: string): string => {
+  return new Date(timestamp).toLocaleString();
+};
+
+const formatSpeed = (speed: number): string => {
+  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
+};
+
+const formatDuration = (ms: number): string => {
+  return (ms / 1000).toFixed(2) + "s";
+};
+
 const ActivityPage = () => {
  const { metrics } = useAPI();
  const [error, setError] = useState<string | null>(null);
@@ -11,18 +23,6 @@ const ActivityPage = () => {
    }
  }, [metrics]);

-  const formatTimestamp = (timestamp: string) => {
-    return new Date(timestamp).toLocaleString();
-  };
-
-  const formatSpeed = (speed: number) => {
-    return speed.toFixed(2) + " t/s";
-  };
-
-  const formatDuration = (ms: number) => {
-    return (ms / 1000).toFixed(2) + "s";
-  };
-
  if (error) {
    return (
      <div className="p-6">
@@ -51,7 +51,7 @@ const ActivityPage = () => {
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Processing Speed</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
              </tr>
            </thead>
@@ -27,10 +27,13 @@ export default function ModelsPage() {
  }, []);

  const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
-    const totalTokens = metrics.reduce((sum, m) => sum + m.input_tokens + m.output_tokens, 0);
-    const totalSeconds = metrics.reduce((sum, m) => sum + m.duration_ms / 1000, 0);
-    const avgTokensPerSecond = totalSeconds > 0 ? totalTokens / totalSeconds : 0;
-    return [metrics.length, totalTokens, avgTokensPerSecond.toFixed(2)];
+    const totalRequests = metrics.length;
+    if (totalRequests === 0) {
+      return [0, 0, 0];
+    }
+    const totalTokens = metrics.reduce((sum, m) => sum + m.output_tokens, 0);
+    const avgTokensPerSecond = (metrics.reduce((sum, m) => sum + m.tokens_per_second, 0) / totalRequests).toFixed(2);
+    return [totalRequests, totalTokens, avgTokensPerSecond];
  }, [metrics]);

  return (
Author	SHA1	Message	Date
Benson Wong	5672cb03fd	Update github actions for notifying homebrew build (#212 ) Combine homebrew-llama-swap event with the release action	2025-07-30 11:29:03 -07:00
Benson Wong	0f583163f7	add /health (#211 )	2025-07-30 10:37:10 -07:00
Benson Wong	7905fa9ea3	Update trigger-homebrew-update.yml [skip ci]	2025-07-30 10:13:49 -07:00
Ian Sebastian Mathew	bbaf172956	add trigger to rebuild homebrew formula (#210 )	2025-07-30 10:12:21 -07:00
Benson Wong	fd50932dbc	Decouple MetricsMiddleware from downstream handlers (#206 ) * Decouple MetricsMiddleware from downstream handlers Remove ls-real-model-name optimization. Within proxyOAIHandler the request body's bytes are required for various rewriting features anyways. This negated any benefits from trying not to parse it twice.	2025-07-27 10:36:06 -07:00
Gaël James	8c693e7fcf	Add endpoint aliases for reranking models (#201 ) * Add endpoint aliases for reranking models * Add MetricsMiddleware to the previous reranking endpoint * Fix the embeddings endpoint not having model set	2025-07-24 08:32:47 -07:00
Benson Wong	8f2af26a41	fix stats on model page	2025-07-23 13:57:33 -07:00
Benson Wong	01d4838fb3	Fix token metrics parsing (#199 ) Fix #198 - use llama-server's `timings` info if available in response body - send "-1" for token/sec when not able to accurately calculate performance - optimize streaming body search for metrics information	2025-07-22 23:10:14 -07:00
Benson Wong	accd65294b	add contributors to README [skip ci]	2025-07-21 23:16:48 -07:00
Benson Wong	7472a25864	Update README.md [skip ci] update screenshot for web UI	2025-07-21 23:08:19 -07:00