add /health (#211 )

Update trigger-homebrew-update.yml [skip ci]
add trigger to rebuild homebrew formula (#210 )
2025-07-30 10:37:10 -07:00 · 2025-07-30 10:13:49 -07:00 · 2025-07-30 10:12:21 -07:00 · 2025-07-27 10:36:06 -07:00 · 2025-07-24 08:32:47 -07:00 · 2025-07-23 13:57:33 -07:00
6 changed files with 84 additions and 11 deletions
@@ -0,0 +1,24 @@
 # Notifies the Homebrew tap repository that a new release exists by sending
 # it a repository_dispatch event carrying the release tag name.
 name: Trigger Homebrew Tap Update
 on:
  # Runs whenever a release is published in this repository.
  release:
    types: [published]
  # Allows manual triggering of the workflow
  # NOTE(review): a manual run has no `release` event payload, so
  # `github.event.release.tag_name` below is presumably empty in that case.
  # Confirm the tap repository copes with an empty tag_name.
  workflow_dispatch:
 jobs:
  trigger-tap-update:
    runs-on: ubuntu-latest
    steps:
      # Sends the `new-release` event to mostlygeek/homebrew-llama-swap.
      # The token comes from the TAP_REPO_PAT secret.
      - name: "Trigger tap repository update"
        uses: peter-evans/repository-dispatch@v2
        with:
          token: ${{ secrets.TAP_REPO_PAT }}
          repository: mostlygeek/homebrew-llama-swap
          event-type: new-release
          client-payload: |-
            {
              "release": {
                  "tag_name": "${{ github.event.release.tag_name }}"
              }
            }
@@ -18,7 +18,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
  - `v1/completions`
  - `v1/chat/completions`
  - `v1/embeddings`
-  - `v1/rerank`
+  - `v1/rerank`, `v1/reranking`, `rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
  - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
 - ✅ llama-swap custom API endpoints
@@ -122,6 +122,20 @@ $ docker run -it --rm --runtime nvidia -p 9292:8080 \
 </details>
 ## Homebrew Install (macOS/Linux)
 For macOS & Linux users, `llama-swap` can be installed via [Homebrew](https://brew.sh):
 ```shell
 # Set up tap and install formula 
 brew tap mostlygeek/llama-swap
 brew install llama-swap
 # Run llama-swap
 llama-swap --config path/to/config.yaml --listen localhost:8080
 ```
 This will install the `llama-swap` binary and make it available on your `PATH`. See the [configuration documentation](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for details on writing the config file.
 ## Bare metal Install ([download](https://github.com/mostlygeek/llama-swap/releases))
 Pre-built binaries are available for Linux, macOS, Windows and FreeBSD. These are automatically published and are likely a few hours ahead of the Docker releases. The bare metal install works with any OpenAI compatible server, not just llama-server.
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		bodyBytes, err := io.ReadAll(c.Request.Body)
 		if err != nil {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
 			c.Abort()
 			return
 		}
 		c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		requestedModel := gjson.GetBytes(bodyBytes, "model").String()
 		if requestedModel == "" {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
 			c.Abort()
 			return
 		}
 		realModelName, found := pm.config.RealModelName(requestedModel)
 		if !found {
 			pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
 			c.Abort()
 			return
 		}
 		c.Set("ls-real-model-name", realModelName)
 		writer := &MetricsResponseWriter{
 			ResponseWriter: c.Writer,
@@ -14,6 +14,7 @@ import (
 	"time"
 	"github.com/gin-gonic/gin"
 	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
 )
@@ -160,8 +161,10 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
 	// Support embeddings
-	pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
 	pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
 	pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
@@ -188,6 +191,9 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.GET("/unload", pm.unloadAllModelsHandler)
 	pm.ginEngine.GET("/running", pm.listRunningProcessesHandler)
 	pm.ginEngine.GET("/health", func(c *gin.Context) {
 		c.String(http.StatusOK, "OK")
 	})
 	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
 		if data, err := reactStaticFS.ReadFile("ui_dist/favicon.ico"); err == nil {
@@ -365,9 +371,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 		return
 	}
-	realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware
+	requestedModel := gjson.GetBytes(bodyBytes, "model").String()
-	if realModelName == "" {
+	if requestedModel == "" {
-		pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set")
+		pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
 		return
 	}
 	realModelName, found := pm.config.RealModelName(requestedModel)
 	if !found {
 		pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
 		return
 	}
@@ -755,3 +755,21 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
 	assert.Greater(t, lastMetric.TokensPerSecond, 0.0, "tokens per second should be greater than 0")
 	assert.Greater(t, lastMetric.DurationMs, 0, "duration should be greater than 0")
 }
 func TestProxyManager_HealthEndpoint(t *testing.T) {
 	config := AddDefaultGroupToConfig(Config{
 		HealthCheckTimeout: 15,
 		Models: map[string]ModelConfig{
 			"model1": getTestSimpleResponderConfig("model1"),
 		},
 		LogLevel: "error",
 	})
 	proxy := New(config)
 	defer proxy.StopProcesses(StopWaitForInflightRequest)
 	req := httptest.NewRequest("GET", "/health", nil)
 	rec := httptest.NewRecorder()
 	proxy.ServeHTTP(rec, req)
 	assert.Equal(t, http.StatusOK, rec.Code)
 	assert.Equal(t, "OK", rec.Body.String())
 }
@@ -27,10 +27,13 @@ export default function ModelsPage() {
  }, []);
  const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
-    const totalTokens = metrics.reduce((sum, m) => sum + m.input_tokens + m.output_tokens, 0);
+    const totalRequests = metrics.length;
-    const totalSeconds = metrics.reduce((sum, m) => sum + m.duration_ms / 1000, 0);
+    if (totalRequests === 0) {
-    const avgTokensPerSecond = totalSeconds > 0 ? totalTokens / totalSeconds : 0;
+      return [0, 0, 0];
-    return [metrics.length, totalTokens, avgTokensPerSecond.toFixed(2)];
+    }
    const totalTokens = metrics.reduce((sum, m) => sum + m.output_tokens, 0);
    const avgTokensPerSecond = (metrics.reduce((sum, m) => sum + m.tokens_per_second, 0) / totalRequests).toFixed(2);
    return [totalRequests, totalTokens, avgTokensPerSecond];
  }, [metrics]);
  return (
Author	SHA1	Message	Date
Benson Wong	0f583163f7	add /health (#211 )	2025-07-30 10:37:10 -07:00
Benson Wong	7905fa9ea3	Update trigger-homebrew-update.yml [skip ci]	2025-07-30 10:13:49 -07:00
Ian Sebastian Mathew	bbaf172956	add trigger to rebuild homebrew formula (#210 )	2025-07-30 10:12:21 -07:00
Benson Wong	fd50932dbc	Decouple MetricsMiddleware from downstream handlers (#206 ) * Decouple MetricsMiddleware from downstream handlers Remove ls-real-model-name optimization. Within proxyOAIHandler the request body's bytes are required for various rewriting features anyways. This negated any benefits from trying not to parse it twice.	2025-07-27 10:36:06 -07:00
Gaël James	8c693e7fcf	Add endpoint aliases for reranking models (#201 ) * Add endpoint aliases for reranking models * Add MetricsMiddleware to the previous reranking endpoint * Fix the embeddings endpoint not having model set	2025-07-24 08:32:47 -07:00
Benson Wong	8f2af26a41	fix stats on model page	2025-07-23 13:57:33 -07:00