proxy, ui: add pending requests count to the main dashboard (#516)

add a real time counter of pending (inflight) requests to the UI.
This commit is contained in:
Brian Mendonca
2026-02-16 09:41:15 -08:00
committed by GitHub
parent d33d51fa75
commit 1688bdd1e9
6 changed files with 133 additions and 25 deletions
+9
View File
@@ -8,6 +8,7 @@ const ConfigFileChangedEventID = 0x03
const LogDataEventID = 0x04 const LogDataEventID = 0x04
const TokenMetricsEventID = 0x05 const TokenMetricsEventID = 0x05
const ModelPreloadedEventID = 0x06 const ModelPreloadedEventID = 0x06
const InFlightRequestsEventID = 0x07
type ProcessStateChangeEvent struct { type ProcessStateChangeEvent struct {
ProcessName string ProcessName string
@@ -58,3 +59,11 @@ type ModelPreloadedEvent struct {
func (e ModelPreloadedEvent) Type() uint32 { func (e ModelPreloadedEvent) Type() uint32 {
return ModelPreloadedEventID return ModelPreloadedEventID
} }
type InFlightRequestsEvent struct {
Total int
}
func (e InFlightRequestsEvent) Type() uint32 {
return InFlightRequestsEventID
}
+65 -19
View File
@@ -28,6 +28,40 @@ const (
type proxyCtxKey string type proxyCtxKey string
type InflightCounter struct {
mu sync.Mutex
total int
}
func newInflightCounter() *InflightCounter {
return &InflightCounter{}
}
func (ic *InflightCounter) Current() int {
ic.mu.Lock()
total := ic.total
ic.mu.Unlock()
return total
}
func (ic *InflightCounter) Increment() int {
ic.mu.Lock()
ic.total++
total := ic.total
ic.mu.Unlock()
return total
}
func (ic *InflightCounter) Decrement() int {
ic.mu.Lock()
if ic.total > 0 {
ic.total--
}
total := ic.total
ic.mu.Unlock()
return total
}
type ProxyManager struct { type ProxyManager struct {
sync.Mutex sync.Mutex
@@ -43,6 +77,8 @@ type ProxyManager struct {
processGroups map[string]*ProcessGroup processGroups map[string]*ProcessGroup
inFlightCounter *InflightCounter
// shutdown signaling // shutdown signaling
shutdownCtx context.Context shutdownCtx context.Context
shutdownCancel context.CancelFunc shutdownCancel context.CancelFunc
@@ -155,6 +191,8 @@ func New(proxyConfig config.Config) *ProxyManager {
processGroups: make(map[string]*ProcessGroup), processGroups: make(map[string]*ProcessGroup),
inFlightCounter: newInflightCounter(),
shutdownCtx: shutdownCtx, shutdownCtx: shutdownCtx,
shutdownCancel: shutdownCancel, shutdownCancel: shutdownCancel,
@@ -276,37 +314,37 @@ func (pm *ProxyManager) setupGinEngine() {
// Set up routes using the Gin engine // Set up routes using the Gin engine
// Protected routes use pm.apiKeyAuth() middleware // Protected routes use pm.apiKeyAuth() middleware
pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// Support legacy /v1/completions api, see issue #12 // Support legacy /v1/completions api, see issue #12
pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570) // Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// Support anthropic count_tokens API (Also added in the above PR) // Support anthropic count_tokens API (Also added in the above PR)
pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// Support embeddings and reranking // Support embeddings and reranking
pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// llama-server's /reranking endpoint + aliases // llama-server's /reranking endpoint + aliases
pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// llama-server's /infill endpoint for code infilling // llama-server's /infill endpoint for code infilling
pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// llama-server's /completion endpoint // llama-server's /completion endpoint
pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
// Support audio/speech endpoint // Support audio/speech endpoint
pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyGETModelHandler) pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyGETModelHandler)
pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler) pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.proxyInferenceHandler) pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler) pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
pm.ginEngine.GET("/v1/models", pm.apiKeyAuth(), pm.listModelsHandler) pm.ginEngine.GET("/v1/models", pm.apiKeyAuth(), pm.listModelsHandler)
@@ -325,7 +363,7 @@ func (pm *ProxyManager) setupGinEngine() {
pm.ginEngine.GET("/upstream", func(c *gin.Context) { pm.ginEngine.GET("/upstream", func(c *gin.Context) {
c.Redirect(http.StatusFound, "/ui/models") c.Redirect(http.StatusFound, "/ui/models")
}) })
pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.proxyToUpstream) pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyToUpstream)
pm.ginEngine.GET("/unload", pm.apiKeyAuth(), pm.unloadAllModelsHandler) pm.ginEngine.GET("/unload", pm.apiKeyAuth(), pm.unloadAllModelsHandler)
pm.ginEngine.GET("/running", pm.apiKeyAuth(), pm.listRunningProcessesHandler) pm.ginEngine.GET("/running", pm.apiKeyAuth(), pm.listRunningProcessesHandler)
pm.ginEngine.GET("/health", func(c *gin.Context) { pm.ginEngine.GET("/health", func(c *gin.Context) {
@@ -389,6 +427,14 @@ func (pm *ProxyManager) setupGinEngine() {
gin.DisableConsoleColor() gin.DisableConsoleColor()
} }
func (pm *ProxyManager) trackInflight() gin.HandlerFunc {
return func(c *gin.Context) {
event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Increment()})
defer event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Decrement()})
c.Next()
}
}
// ServeHTTP implements http.Handler interface // ServeHTTP implements http.Handler interface
func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) { func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
pm.ginEngine.ServeHTTP(w, r) pm.ginEngine.ServeHTTP(w, r)
+21
View File
@@ -107,6 +107,7 @@ const (
msgTypeModelStatus messageType = "modelStatus" msgTypeModelStatus messageType = "modelStatus"
msgTypeLogData messageType = "logData" msgTypeLogData messageType = "logData"
msgTypeMetrics messageType = "metrics" msgTypeMetrics messageType = "metrics"
msgTypeInFlight messageType = "inflight"
) )
type messageEnvelope struct { type messageEnvelope struct {
@@ -166,6 +167,18 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
} }
} }
sendInFlight := func(total int) {
jsonData, err := json.Marshal(gin.H{"total": total})
if err == nil {
select {
case sendBuffer <- messageEnvelope{Type: msgTypeInFlight, Data: string(jsonData)}:
case <-ctx.Done():
return
default:
}
}
}
/** /**
* Send updated models list * Send updated models list
*/ */
@@ -193,11 +206,19 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
sendMetrics([]TokenMetrics{e.Metrics}) sendMetrics([]TokenMetrics{e.Metrics})
})() })()
/**
* Send in-flight request stats related to token stats "Waiting: N" count.
*/
defer event.On(func(e InFlightRequestsEvent) {
sendInFlight(e.Total)
})()
// send initial batch of data // send initial batch of data
sendLogData("proxy", pm.proxyLogger.GetHistory()) sendLogData("proxy", pm.proxyLogger.GetHistory())
sendLogData("upstream", pm.upstreamLogger.GetHistory()) sendLogData("upstream", pm.upstreamLogger.GetHistory())
sendModels() sendModels()
sendMetrics(pm.metricsMonitor.getMetrics()) sendMetrics(pm.metricsMonitor.getMetrics())
sendInFlight(pm.inFlightCounter.Current())
for { for {
select { select {
+24 -4
View File
@@ -1,5 +1,5 @@
<script lang="ts"> <script lang="ts">
import { metrics } from "../stores/api"; import { inFlightRequests, metrics } from "../stores/api";
import TokenHistogram from "./TokenHistogram.svelte"; import TokenHistogram from "./TokenHistogram.svelte";
interface HistogramData { interface HistogramData {
@@ -15,7 +15,14 @@
let stats = $derived.by(() => { let stats = $derived.by(() => {
const totalRequests = $metrics.length; const totalRequests = $metrics.length;
if (totalRequests === 0) { if (totalRequests === 0) {
return { totalRequests: 0, totalInputTokens: 0, totalOutputTokens: 0, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null }; return {
totalRequests: 0,
totalInputTokens: 0,
totalOutputTokens: 0,
inFlightRequests: $inFlightRequests,
tokenStats: { p99: "0", p95: "0", p50: "0" },
histogramData: null,
};
} }
const totalInputTokens = $metrics.reduce((sum, m) => sum + m.input_tokens, 0); const totalInputTokens = $metrics.reduce((sum, m) => sum + m.input_tokens, 0);
@@ -24,7 +31,14 @@
// Calculate token statistics using output_tokens and duration_ms // Calculate token statistics using output_tokens and duration_ms
const validMetrics = $metrics.filter((m) => m.duration_ms > 0 && m.output_tokens > 0); const validMetrics = $metrics.filter((m) => m.duration_ms > 0 && m.output_tokens > 0);
if (validMetrics.length === 0) { if (validMetrics.length === 0) {
return { totalRequests, totalInputTokens, totalOutputTokens, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null }; return {
totalRequests,
totalInputTokens,
totalOutputTokens,
inFlightRequests: $inFlightRequests,
tokenStats: { p99: "0", p95: "0", p50: "0" },
histogramData: null,
};
} }
// Calculate tokens/second for each valid metric // Calculate tokens/second for each valid metric
@@ -63,6 +77,7 @@
totalRequests, totalRequests,
totalInputTokens, totalInputTokens,
totalOutputTokens, totalOutputTokens,
inFlightRequests: $inFlightRequests,
tokenStats: { tokenStats: {
p99: p99.toFixed(2), p99: p99.toFixed(2),
p95: p95.toFixed(2), p95: p95.toFixed(2),
@@ -95,7 +110,12 @@
<tbody class="bg-surface divide-y divide-card-border-inner"> <tbody class="bg-surface divide-y divide-card-border-inner">
<tr class="hover:bg-secondary"> <tr class="hover:bg-secondary">
<td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">{stats.totalRequests}</td> <td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">
<div class="flex flex-col gap-1">
<span class="text-xs font-medium text-gray-500 dark:text-gray-400">Completed: {nf.format(stats.totalRequests)}</span>
<span class="text-xs font-medium text-gray-500 dark:text-gray-400">Waiting: {nf.format(stats.inFlightRequests)}</span>
</div>
</td>
<td class="px-4 py-4 text-sm text-gray-700 dark:text-gray-300 border-l border-gray-200 dark:border-white/10"> <td class="px-4 py-4 text-sm text-gray-700 dark:text-gray-300 border-l border-gray-200 dark:border-white/10">
<div class="flex items-center gap-2"> <div class="flex items-center gap-2">
+5 -1
View File
@@ -38,8 +38,12 @@ export interface LogData {
data: string; data: string;
} }
export interface InFlightStats {
total: number;
}
export interface APIEventEnvelope { export interface APIEventEnvelope {
type: "modelStatus" | "logData" | "metrics"; type: "modelStatus" | "logData" | "metrics" | "inflight";
data: string; data: string;
} }
+9 -1
View File
@@ -1,5 +1,5 @@
import { writable } from "svelte/store"; import { writable } from "svelte/store";
import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture } from "../lib/types"; import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture, InFlightStats } from "../lib/types";
import { connectionState } from "./theme"; import { connectionState } from "./theme";
const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */ const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
@@ -9,6 +9,7 @@ export const models = writable<Model[]>([]);
export const proxyLogs = writable<string>(""); export const proxyLogs = writable<string>("");
export const upstreamLogs = writable<string>(""); export const upstreamLogs = writable<string>("");
export const metrics = writable<Metrics[]>([]); export const metrics = writable<Metrics[]>([]);
export const inFlightRequests = writable<number>(0);
export const versionInfo = writable<VersionInfo>({ export const versionInfo = writable<VersionInfo>({
build_date: "unknown", build_date: "unknown",
commit: "unknown", commit: "unknown",
@@ -29,6 +30,7 @@ export function enableAPIEvents(enabled: boolean): void {
apiEventSource?.close(); apiEventSource?.close();
apiEventSource = null; apiEventSource = null;
metrics.set([]); metrics.set([]);
inFlightRequests.set(0);
return; return;
} }
@@ -46,6 +48,7 @@ export function enableAPIEvents(enabled: boolean): void {
proxyLogs.set(""); proxyLogs.set("");
upstreamLogs.set(""); upstreamLogs.set("");
metrics.set([]); metrics.set([]);
inFlightRequests.set(0);
models.set([]); models.set([]);
retryCount = 0; retryCount = 0;
connectionState.set("connected"); connectionState.set("connected");
@@ -83,6 +86,11 @@ export function enableAPIEvents(enabled: boolean): void {
metrics.update((prevMetrics) => [...newMetrics, ...prevMetrics]); metrics.update((prevMetrics) => [...newMetrics, ...prevMetrics]);
break; break;
} }
case "inflight": {
const stats = JSON.parse(message.data) as InFlightStats;
inFlightRequests.set(stats.total ?? 0);
break;
}
} }
} catch (err) { } catch (err) {
console.error(e.data, err); console.error(e.data, err);