proxy,ui-svelte: improve support for v1/messages and v1/responses (#758)

This improves the support for activity logging from the v1/responses and
v1/messages endpoints.

- add chat endpoint selection to Playground > Chat > Settings
- improve metrics extraction for streaming v1/messages and v1/responses
endpoints (tested with llama-server)

Fixes #742
This commit is contained in:
Benson Wong
2026-05-14 21:53:57 -07:00
committed by GitHub
parent aac7b8745a
commit fe71e8a6ea
4 changed files with 537 additions and 119 deletions
+124 -75
View File
@@ -424,110 +424,159 @@ func (mp *metricsMonitor) wrapHandler(
return nil
}
// usagePaths lists the JSON paths where a per-event usage object can live.
// v1/chat/completions puts it at top-level "usage"; v1/responses nests under
// "response.usage"; v1/messages emits it at "message.usage" on message_start
// and at "usage" on message_delta.
var usagePaths = []string{"usage", "response.usage", "message.usage"}
// extractUsageTokens reads input/output/cached token counts from a usage
// gjson.Result, handling the field-name differences across endpoints.
// cached returns -1 when the field is absent. ok is true when at least one
// field was present.
func extractUsageTokens(usage gjson.Result) (input, output, cached int64, ok bool) {
cached = -1
if !usage.Exists() {
return
}
if v := usage.Get("prompt_tokens"); v.Exists() {
// v1/chat/completions
input = v.Int()
ok = true
} else if v := usage.Get("input_tokens"); v.Exists() {
// v1/messages, v1/responses
input = v.Int()
ok = true
}
if v := usage.Get("completion_tokens"); v.Exists() {
// v1/chat/completions
output = v.Int()
ok = true
} else if v := usage.Get("output_tokens"); v.Exists() {
// v1/messages, v1/responses
output = v.Int()
ok = true
}
if v := usage.Get("cache_read_input_tokens"); v.Exists() {
// v1/messages (Anthropic)
cached = v.Int()
ok = true
} else if v := usage.Get("input_tokens_details.cached_tokens"); v.Exists() {
// v1/responses (OpenAI Responses API)
cached = v.Int()
ok = true
} else if v := usage.Get("prompt_tokens_details.cached_tokens"); v.Exists() {
// v1/chat/completions (OpenAI cache hits)
cached = v.Int()
ok = true
}
return
}
func processStreamingResponse(modelID string, start time.Time, body []byte) (ActivityLogEntry, error) {
// Iterate **backwards** through the body looking for the data payload with
// usage data. This avoids allocating a slice of all lines via bytes.Split.
// Walk SSE "data:" lines forward, merging usage info from every event.
// Different endpoints split usage across events:
// - v1/chat/completions: usage on the final chunk before [DONE]
// - v1/responses: usage on response.completed (response.usage)
// - v1/messages: input + cache on message_start (message.usage),
// output_tokens on message_delta (usage)
// We take the latest informative value per field so all three are covered.
// Start from the end of the body and scan backwards for newlines
pos := len(body)
for pos > 0 {
// Find the previous newline (or start of body)
lineStart := bytes.LastIndexByte(body[:pos], '\n')
if lineStart == -1 {
lineStart = 0
var (
inputTokens, outputTokens int64
cachedTokens int64 = -1
hasAny bool
timings gjson.Result
)
prefix := []byte("data:")
for offset := 0; offset < len(body); {
nl := bytes.IndexByte(body[offset:], '\n')
var line []byte
if nl == -1 {
line = body[offset:]
offset = len(body)
} else {
lineStart++ // Move past the newline
line = body[offset : offset+nl]
offset += nl + 1
}
line := bytes.TrimSpace(body[lineStart:pos])
pos = lineStart - 1 // Move position before the newline for next iteration
if len(line) == 0 {
continue
}
// SSE payload always follows "data:"
prefix := []byte("data:")
if !bytes.HasPrefix(line, prefix) {
line = bytes.TrimSpace(line)
if len(line) == 0 || !bytes.HasPrefix(line, prefix) {
continue
}
data := bytes.TrimSpace(line[len(prefix):])
if len(data) == 0 {
if len(data) == 0 || bytes.Equal(data, []byte("[DONE]")) {
continue
}
if bytes.Equal(data, []byte("[DONE]")) {
// [DONE] line itself contains nothing of interest.
if !gjson.ValidBytes(data) {
continue
}
parsed := gjson.ParseBytes(data)
if gjson.ValidBytes(data) {
parsed := gjson.ParseBytes(data)
usage := parsed.Get("usage")
timings := parsed.Get("timings")
// v1/responses format nests usage under response.usage
if !usage.Exists() {
usage = parsed.Get("response.usage")
for _, path := range usagePaths {
u := parsed.Get(path)
if !u.Exists() {
continue
}
if usage.Exists() || timings.Exists() {
return parseMetrics(modelID, start, usage, timings)
i, o, c, ok := extractUsageTokens(u)
if !ok {
continue
}
hasAny = true
// Take the latest non-zero value so message_start's input_tokens
// is preserved when message_delta's usage omits it, and vice versa
// for output_tokens.
if i > 0 {
inputTokens = i
}
if o > 0 {
outputTokens = o
}
if c >= 0 {
cachedTokens = c
}
}
if t := parsed.Get("timings"); t.Exists() {
timings = t
hasAny = true
}
}
return ActivityLogEntry{}, fmt.Errorf("no valid JSON data found in stream")
if !hasAny {
return ActivityLogEntry{}, fmt.Errorf("no valid JSON data found in stream")
}
return buildMetrics(modelID, start, inputTokens, outputTokens, cachedTokens, timings), nil
}
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (ActivityLogEntry, error) {
input, output, cached, _ := extractUsageTokens(usage)
return buildMetrics(modelID, start, input, output, cached, timings), nil
}
// buildMetrics composes an ActivityLogEntry from accumulated token counts and
// optional llama-server timings (which override input/output and provide rates).
func buildMetrics(modelID string, start time.Time, inputTokens, outputTokens, cachedTokens int64, timings gjson.Result) ActivityLogEntry {
wallDurationMs := int(time.Since(start).Milliseconds())
// default values
cachedTokens := -1 // unknown or missing data
outputTokens := 0
inputTokens := 0
// timings data
durationMs := wallDurationMs
tokensPerSecond := -1.0
promptPerSecond := -1.0
durationMs := wallDurationMs
if usage.Exists() {
if pt := usage.Get("prompt_tokens"); pt.Exists() {
// v1/chat/completions
inputTokens = int(pt.Int())
} else if it := usage.Get("input_tokens"); it.Exists() {
// v1/messages
inputTokens = int(it.Int())
}
if ct := usage.Get("completion_tokens"); ct.Exists() {
// v1/chat/completions
outputTokens = int(ct.Int())
} else if ot := usage.Get("output_tokens"); ot.Exists() {
outputTokens = int(ot.Int())
}
if ct := usage.Get("cache_read_input_tokens"); ct.Exists() {
cachedTokens = int(ct.Int())
}
}
// use llama-server's timing data for tok/sec and duration as it is more accurate
if timings.Exists() {
inputTokens = int(timings.Get("prompt_n").Int())
outputTokens = int(timings.Get("predicted_n").Int())
inputTokens = timings.Get("prompt_n").Int()
outputTokens = timings.Get("predicted_n").Int()
promptPerSecond = timings.Get("prompt_per_second").Float()
tokensPerSecond = timings.Get("predicted_per_second").Float()
timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
if timingsDurationMs > durationMs {
durationMs = timingsDurationMs
}
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
cachedTokens = int(cachedValue.Int())
cachedTokens = cachedValue.Int()
}
}
@@ -535,14 +584,14 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
Timestamp: time.Now(),
Model: modelID,
Tokens: TokenMetrics{
CachedTokens: cachedTokens,
InputTokens: inputTokens,
OutputTokens: outputTokens,
CachedTokens: int(cachedTokens),
InputTokens: int(inputTokens),
OutputTokens: int(outputTokens),
PromptPerSecond: promptPerSecond,
TokensPerSecond: tokensPerSecond,
},
DurationMs: durationMs,
}, nil
}
}
// decompressBody decompresses the body based on Content-Encoding header