Decouple MetricsMiddleware from downstream handlers (#206)

* Decouple MetricsMiddleware from downstream handlers. Remove the ls-real-model-name optimization: within proxyOAIHandler the request body's bytes are required for various rewriting features anyway, which negated any benefit from trying not to parse the body twice.
Add endpoint aliases for reranking models (#201)
2025-07-27 10:36:06 -07:00 · 2025-07-24 08:32:47 -07:00 · 2025-07-23 13:57:33 -07:00
4 changed files with 25 additions and 11 deletions
@@ -18,7 +18,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
  - `v1/completions`
  - `v1/chat/completions`
  - `v1/embeddings`
-  - `v1/rerank`
+  - `v1/rerank`, `v1/reranking`, `rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
  - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
 - ✅ llama-swap custom API endpoints
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		bodyBytes, err := io.ReadAll(c.Request.Body)
 		if err != nil {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
+			c.Abort()
 			return
 		}
 		c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		requestedModel := gjson.GetBytes(bodyBytes, "model").String()
 		if requestedModel == "" {
 			pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
+			c.Abort()
 			return
 		}

 		realModelName, found := pm.config.RealModelName(requestedModel)
 		if !found {
 			pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
+			c.Abort()
 			return
 		}
-		c.Set("ls-real-model-name", realModelName)

 		writer := &MetricsResponseWriter{
 			ResponseWriter: c.Writer,
@@ -14,6 +14,7 @@ import (
 	"time"

 	"github.com/gin-gonic/gin"
+	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
 )

@@ -160,8 +161,10 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)

 	// Support embeddings
-	pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)

 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
@@ -365,9 +368,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 		return
 	}

-	realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware
-	if realModelName == "" {
-		pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set")
+	requestedModel := gjson.GetBytes(bodyBytes, "model").String()
+	if requestedModel == "" {
+		pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
+		return
+	}
+
+	realModelName, found := pm.config.RealModelName(requestedModel)
+	if !found {
+		pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
 		return
 	}

@@ -27,10 +27,13 @@ export default function ModelsPage() {
  }, []);

  const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
-    const totalTokens = metrics.reduce((sum, m) => sum + m.input_tokens + m.output_tokens, 0);
-    const totalSeconds = metrics.reduce((sum, m) => sum + m.duration_ms / 1000, 0);
-    const avgTokensPerSecond = totalSeconds > 0 ? totalTokens / totalSeconds : 0;
-    return [metrics.length, totalTokens, avgTokensPerSecond.toFixed(2)];
+    const totalRequests = metrics.length;
+    if (totalRequests === 0) {
+      return [0, 0, 0];
+    }
+    const totalTokens = metrics.reduce((sum, m) => sum + m.output_tokens, 0);
+    const avgTokensPerSecond = (metrics.reduce((sum, m) => sum + m.tokens_per_second, 0) / totalRequests).toFixed(2);
+    return [totalRequests, totalTokens, avgTokensPerSecond];
  }, [metrics]);

  return (