From 1688bdd1e984cfa532a22106c7ce69ecea3f51d5 Mon Sep 17 00:00:00 2001 From: Brian Mendonca <166055087+author-brian-mendonca@users.noreply.github.com> Date: Mon, 16 Feb 2026 09:41:15 -0800 Subject: [PATCH] proxy, ui: add pending requests count to the main dashboard (#516) add a real time counter of pending (inflight) requests to the UI. --- proxy/events.go | 9 +++ proxy/proxymanager.go | 84 +++++++++++++++++----- proxy/proxymanager_api.go | 21 ++++++ ui-svelte/src/components/StatsPanel.svelte | 28 ++++++-- ui-svelte/src/lib/types.ts | 6 +- ui-svelte/src/stores/api.ts | 10 ++- 6 files changed, 133 insertions(+), 25 deletions(-) diff --git a/proxy/events.go b/proxy/events.go index 11403fc8..e35ee627 100644 --- a/proxy/events.go +++ b/proxy/events.go @@ -8,6 +8,7 @@ const ConfigFileChangedEventID = 0x03 const LogDataEventID = 0x04 const TokenMetricsEventID = 0x05 const ModelPreloadedEventID = 0x06 +const InFlightRequestsEventID = 0x07 type ProcessStateChangeEvent struct { ProcessName string @@ -58,3 +59,11 @@ type ModelPreloadedEvent struct { func (e ModelPreloadedEvent) Type() uint32 { return ModelPreloadedEventID } + +type InFlightRequestsEvent struct { + Total int +} + +func (e InFlightRequestsEvent) Type() uint32 { + return InFlightRequestsEventID +} diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go index 20b14602..358d23df 100644 --- a/proxy/proxymanager.go +++ b/proxy/proxymanager.go @@ -28,6 +28,40 @@ const ( type proxyCtxKey string +type InflightCounter struct { + mu sync.Mutex + total int +} + +func newInflightCounter() *InflightCounter { + return &InflightCounter{} +} + +func (ic *InflightCounter) Current() int { + ic.mu.Lock() + total := ic.total + ic.mu.Unlock() + return total +} + +func (ic *InflightCounter) Increment() int { + ic.mu.Lock() + ic.total++ + total := ic.total + ic.mu.Unlock() + return total +} + +func (ic *InflightCounter) Decrement() int { + ic.mu.Lock() + if ic.total > 0 { + ic.total-- + } + total := ic.total + ic.mu.Unlock() + return total +} + type ProxyManager struct { sync.Mutex @@ -43,6 +77,8 @@ type ProxyManager struct { processGroups map[string]*ProcessGroup + inFlightCounter *InflightCounter + // shutdown signaling shutdownCtx context.Context shutdownCancel context.CancelFunc @@ -155,6 +191,8 @@ func New(proxyConfig config.Config) *ProxyManager { processGroups: make(map[string]*ProcessGroup), + inFlightCounter: newInflightCounter(), + shutdownCtx: shutdownCtx, shutdownCancel: shutdownCancel, @@ -276,37 +314,37 @@ func (pm *ProxyManager) setupGinEngine() { // Set up routes using the Gin engine // Protected routes use pm.apiKeyAuth() middleware - pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // Support legacy /v1/completions api, see issue #12 - pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570) - pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // Support anthropic count_tokens API (Also added in the above PR) - pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // Support embeddings and reranking - pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // llama-server's /reranking endpoint + aliases - pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // llama-server's /infill endpoint for code infilling - pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // llama-server's /completion endpoint - pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) // Support audio/speech endpoint - pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyGETModelHandler) - pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler) - pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.proxyInferenceHandler) - pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler) + pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyGETModelHandler) + pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler) + pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler) + pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler) pm.ginEngine.GET("/v1/models", pm.apiKeyAuth(), pm.listModelsHandler) @@ -325,7 +363,7 @@ func (pm *ProxyManager) setupGinEngine() { pm.ginEngine.GET("/upstream", func(c *gin.Context) { c.Redirect(http.StatusFound, "/ui/models") }) - pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.proxyToUpstream) + pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyToUpstream) pm.ginEngine.GET("/unload", pm.apiKeyAuth(), pm.unloadAllModelsHandler) pm.ginEngine.GET("/running", pm.apiKeyAuth(), pm.listRunningProcessesHandler) pm.ginEngine.GET("/health", func(c *gin.Context) { @@ -389,6 +427,14 @@ func (pm *ProxyManager) setupGinEngine() { gin.DisableConsoleColor() } +func (pm *ProxyManager) trackInflight() gin.HandlerFunc { + return func(c *gin.Context) { + event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Increment()}) + defer event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Decrement()}) + c.Next() + } +} + // ServeHTTP implements http.Handler interface func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) { pm.ginEngine.ServeHTTP(w, r) diff --git a/proxy/proxymanager_api.go b/proxy/proxymanager_api.go index 4354ea4a..0e660d03 100644 --- a/proxy/proxymanager_api.go +++ b/proxy/proxymanager_api.go @@ -107,6 +107,7 @@ const ( msgTypeModelStatus messageType = "modelStatus" msgTypeLogData messageType = "logData" msgTypeMetrics messageType = "metrics" + msgTypeInFlight messageType = "inflight" ) type messageEnvelope struct { @@ -166,6 +167,18 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) { } } + sendInFlight := func(total int) { + jsonData, err := json.Marshal(gin.H{"total": total}) + if err == nil { + select { + case sendBuffer <- messageEnvelope{Type: msgTypeInFlight, Data: string(jsonData)}: + case <-ctx.Done(): + return + default: + } + } + } + /** * Send updated models list */ @@ -193,11 +206,19 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) { sendMetrics([]TokenMetrics{e.Metrics}) })() + /** + * Send in-flight request stats related to token stats "Waiting: N" count. + */ + defer event.On(func(e InFlightRequestsEvent) { + sendInFlight(e.Total) + })() + // send initial batch of data sendLogData("proxy", pm.proxyLogger.GetHistory()) sendLogData("upstream", pm.upstreamLogger.GetHistory()) sendModels() sendMetrics(pm.metricsMonitor.getMetrics()) + sendInFlight(pm.inFlightCounter.Current()) for { select { diff --git a/ui-svelte/src/components/StatsPanel.svelte b/ui-svelte/src/components/StatsPanel.svelte index 2ef869a5..33075e13 100644 --- a/ui-svelte/src/components/StatsPanel.svelte +++ b/ui-svelte/src/components/StatsPanel.svelte @@ -1,5 +1,5 @@