Compare commits

..

3 Commits

Author SHA1 Message Date
Benson Wong fd50932dbc Decouple MetricsMiddleware from downstream handlers (#206)
* Decouple MetricsMiddleware from downstream handlers

Remove the ls-real-model-name optimization. Within proxyOAIHandler the
request body's bytes are required for various rewriting features
anyway, which negated any benefit from trying not to parse it twice.
2025-07-27 10:36:06 -07:00
Gaël James 8c693e7fcf Add endpoint aliases for reranking models (#201)
* Add endpoint aliases for reranking models
* Add MetricsMiddleware to the previous reranking endpoint
* Fix the embeddings endpoint not having model set
2025-07-24 08:32:47 -07:00
Benson Wong 8f2af26a41 fix stats on model page 2025-07-23 13:57:33 -07:00
4 changed files with 25 additions and 11 deletions
+1 -1
View File
@@ -18,7 +18,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
- `v1/completions` - `v1/completions`
- `v1/chat/completions` - `v1/chat/completions`
- `v1/embeddings` - `v1/embeddings`
- `v1/rerank` - `v1/rerank`, `v1/reranking`, `rerank`
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36)) - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867)) - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
- ✅ llama-swap custom API endpoints - ✅ llama-swap custom API endpoints
+3 -1
View File
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
bodyBytes, err := io.ReadAll(c.Request.Body) bodyBytes, err := io.ReadAll(c.Request.Body)
if err != nil { if err != nil {
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body") pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
c.Abort()
return return
} }
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes)) c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
requestedModel := gjson.GetBytes(bodyBytes, "model").String() requestedModel := gjson.GetBytes(bodyBytes, "model").String()
if requestedModel == "" { if requestedModel == "" {
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key") pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
c.Abort()
return return
} }
realModelName, found := pm.config.RealModelName(requestedModel) realModelName, found := pm.config.RealModelName(requestedModel)
if !found { if !found {
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel)) pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
c.Abort()
return return
} }
c.Set("ls-real-model-name", realModelName)
writer := &MetricsResponseWriter{ writer := &MetricsResponseWriter{
ResponseWriter: c.Writer, ResponseWriter: c.Writer,
+14 -5
View File
@@ -14,6 +14,7 @@ import (
"time" "time"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/tidwall/gjson"
"github.com/tidwall/sjson" "github.com/tidwall/sjson"
) )
@@ -160,8 +161,10 @@ func (pm *ProxyManager) setupGinEngine() {
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler) pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
// Support embeddings // Support embeddings
pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
// Support audio/speech endpoint // Support audio/speech endpoint
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
@@ -365,9 +368,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
return return
} }
realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware requestedModel := gjson.GetBytes(bodyBytes, "model").String()
if realModelName == "" { if requestedModel == "" {
pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set") pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
return
}
realModelName, found := pm.config.RealModelName(requestedModel)
if !found {
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
return return
} }
+7 -4
View File
@@ -27,10 +27,13 @@ export default function ModelsPage() {
}, []); }, []);
const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => { const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
const totalTokens = metrics.reduce((sum, m) => sum + m.input_tokens + m.output_tokens, 0); const totalRequests = metrics.length;
const totalSeconds = metrics.reduce((sum, m) => sum + m.duration_ms / 1000, 0); if (totalRequests === 0) {
const avgTokensPerSecond = totalSeconds > 0 ? totalTokens / totalSeconds : 0; return [0, 0, 0];
return [metrics.length, totalTokens, avgTokensPerSecond.toFixed(2)]; }
const totalTokens = metrics.reduce((sum, m) => sum + m.output_tokens, 0);
const avgTokensPerSecond = (metrics.reduce((sum, m) => sum + m.tokens_per_second, 0) / totalRequests).toFixed(2);
return [totalRequests, totalTokens, avgTokensPerSecond];
}, [metrics]); }, [metrics]);
return ( return (