proxy: preserve wall-clock duration in metrics (#629)
Keep request duration from being underreported when upstream timings only cover part of the full request lifecycle. - compare wall-clock and upstream timing durations - keep token and throughput values from timings - add regression coverage for underreported timings fixes #602
This commit is contained in:
@@ -365,6 +365,8 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok
|
|||||||
}
|
}
|
||||||
|
|
||||||
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
|
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
|
||||||
|
wallDurationMs := int(time.Since(start).Milliseconds())
|
||||||
|
|
||||||
// default values
|
// default values
|
||||||
cachedTokens := -1 // unknown or missing data
|
cachedTokens := -1 // unknown or missing data
|
||||||
outputTokens := 0
|
outputTokens := 0
|
||||||
@@ -373,7 +375,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
|
|||||||
// timings data
|
// timings data
|
||||||
tokensPerSecond := -1.0
|
tokensPerSecond := -1.0
|
||||||
promptPerSecond := -1.0
|
promptPerSecond := -1.0
|
||||||
durationMs := int(time.Since(start).Milliseconds())
|
durationMs := wallDurationMs
|
||||||
|
|
||||||
if usage.Exists() {
|
if usage.Exists() {
|
||||||
if pt := usage.Get("prompt_tokens"); pt.Exists() {
|
if pt := usage.Get("prompt_tokens"); pt.Exists() {
|
||||||
@@ -402,7 +404,10 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
|
|||||||
outputTokens = int(timings.Get("predicted_n").Int())
|
outputTokens = int(timings.Get("predicted_n").Int())
|
||||||
promptPerSecond = timings.Get("prompt_per_second").Float()
|
promptPerSecond = timings.Get("prompt_per_second").Float()
|
||||||
tokensPerSecond = timings.Get("predicted_per_second").Float()
|
tokensPerSecond = timings.Get("predicted_per_second").Float()
|
||||||
durationMs = int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
|
timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
|
||||||
|
if timingsDurationMs > durationMs {
|
||||||
|
durationMs = timingsDurationMs
|
||||||
|
}
|
||||||
|
|
||||||
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
|
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
|
||||||
cachedTokens = int(cachedValue.Int())
|
cachedTokens = int(cachedValue.Int())
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import (
|
|||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/mostlygeek/llama-swap/event"
|
"github.com/mostlygeek/llama-swap/event"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/tidwall/gjson"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMetricsMonitor_AddMetrics(t *testing.T) {
|
func TestMetricsMonitor_AddMetrics(t *testing.T) {
|
||||||
@@ -570,6 +571,27 @@ func TestMetricsMonitor_Concurrent(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestMetricsMonitor_ParseMetrics(t *testing.T) {
|
func TestMetricsMonitor_ParseMetrics(t *testing.T) {
|
||||||
|
t.Run("keeps wall clock duration when timings underreport request time", func(t *testing.T) {
|
||||||
|
start := time.Now().Add(-5 * time.Second)
|
||||||
|
usage := gjson.Parse(`{"prompt_tokens": 5, "completion_tokens": 1}`)
|
||||||
|
timings := gjson.Parse(`{
|
||||||
|
"prompt_n": 5,
|
||||||
|
"predicted_n": 1,
|
||||||
|
"prompt_per_second": 10.0,
|
||||||
|
"predicted_per_second": 2.0,
|
||||||
|
"prompt_ms": 5.0,
|
||||||
|
"predicted_ms": 15.0
|
||||||
|
}`)
|
||||||
|
|
||||||
|
metrics, err := parseMetrics("test-model", start, usage, timings)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, 5, metrics.InputTokens)
|
||||||
|
assert.Equal(t, 1, metrics.OutputTokens)
|
||||||
|
assert.Equal(t, 10.0, metrics.PromptPerSecond)
|
||||||
|
assert.Equal(t, 2.0, metrics.TokensPerSecond)
|
||||||
|
assert.GreaterOrEqual(t, metrics.DurationMs, 5000)
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("prefers timings over usage data", func(t *testing.T) {
|
t.Run("prefers timings over usage data", func(t *testing.T) {
|
||||||
mm := newMetricsMonitor(testLogger, 10, 0)
|
mm := newMetricsMonitor(testLogger, 10, 0)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user