Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | fd50932dbc | | |
| | 8c693e7fcf | | |
| | 8f2af26a41 | | |
@@ -18,7 +18,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
|
|||||||
- `v1/completions`
|
- `v1/completions`
|
||||||
- `v1/chat/completions`
|
- `v1/chat/completions`
|
||||||
- `v1/embeddings`
|
- `v1/embeddings`
|
||||||
- `v1/rerank`
|
- `v1/rerank`, `v1/reranking`, `rerank`
|
||||||
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
||||||
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
|
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
|
||||||
- ✅ llama-swap custom API endpoints
|
- ✅ llama-swap custom API endpoints
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
|||||||
bodyBytes, err := io.ReadAll(c.Request.Body)
|
bodyBytes, err := io.ReadAll(c.Request.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
|
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
|
||||||
|
c.Abort()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||||
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
|||||||
requestedModel := gjson.GetBytes(bodyBytes, "model").String()
|
requestedModel := gjson.GetBytes(bodyBytes, "model").String()
|
||||||
if requestedModel == "" {
|
if requestedModel == "" {
|
||||||
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
|
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
|
||||||
|
c.Abort()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
realModelName, found := pm.config.RealModelName(requestedModel)
|
realModelName, found := pm.config.RealModelName(requestedModel)
|
||||||
if !found {
|
if !found {
|
||||||
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
|
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
|
||||||
|
c.Abort()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
c.Set("ls-real-model-name", realModelName)
|
|
||||||
|
|
||||||
writer := &MetricsResponseWriter{
|
writer := &MetricsResponseWriter{
|
||||||
ResponseWriter: c.Writer,
|
ResponseWriter: c.Writer,
|
||||||
|
|||||||
+14
-5
@@ -14,6 +14,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/tidwall/gjson"
|
||||||
"github.com/tidwall/sjson"
|
"github.com/tidwall/sjson"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -160,8 +161,10 @@ func (pm *ProxyManager) setupGinEngine() {
|
|||||||
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
// Support embeddings
|
// Support embeddings
|
||||||
pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
|
||||||
pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
|
||||||
|
pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
|
||||||
|
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
// Support audio/speech endpoint
|
// Support audio/speech endpoint
|
||||||
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
|
||||||
@@ -365,9 +368,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware
|
requestedModel := gjson.GetBytes(bodyBytes, "model").String()
|
||||||
if realModelName == "" {
|
if requestedModel == "" {
|
||||||
pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set")
|
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
realModelName, found := pm.config.RealModelName(requestedModel)
|
||||||
|
if !found {
|
||||||
|
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -27,10 +27,13 @@ export default function ModelsPage() {
|
|||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
|
const [totalRequests, totalTokens, avgTokensPerSecond] = useMemo(() => {
|
||||||
const totalTokens = metrics.reduce((sum, m) => sum + m.input_tokens + m.output_tokens, 0);
|
const totalRequests = metrics.length;
|
||||||
const totalSeconds = metrics.reduce((sum, m) => sum + m.duration_ms / 1000, 0);
|
if (totalRequests === 0) {
|
||||||
const avgTokensPerSecond = totalSeconds > 0 ? totalTokens / totalSeconds : 0;
|
return [0, 0, 0];
|
||||||
return [metrics.length, totalTokens, avgTokensPerSecond.toFixed(2)];
|
}
|
||||||
|
const totalTokens = metrics.reduce((sum, m) => sum + m.output_tokens, 0);
|
||||||
|
const avgTokensPerSecond = (metrics.reduce((sum, m) => sum + m.tokens_per_second, 0) / totalRequests).toFixed(2);
|
||||||
|
return [totalRequests, totalTokens, avgTokensPerSecond];
|
||||||
}, [metrics]);
|
}, [metrics]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
|||||||
Reference in New Issue
Block a user