From d9b44387f5482829489b5906525f6de1ea7a90d3 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Fri, 26 Jun 2026 20:32:58 -0400 Subject: [PATCH] fix: address gadfly P1 review (3 low-risk findings) Triaged gadfly's P1 review (advisory). Fixed the three clearly-correct, low-risk items; the rest were pre-existing mort behavior or theoretical: - model/call.go: recordUsage dropped fully-cached responses (input==0 && output==0 early-return missed CacheRead/CacheWrite-only usage, which Anthropic/OpenAI prompt-caching bills). Guard now also checks cache tokens. - llmmeta/helper.go: recordLedger swallowed Storage.RecordMetaCall errors; now logs them (slog.Warn) so a non-logging Storage impl can't silently drop audit rows. - model/cloud_sync.go: the ollama.com limit-cache used unbounded io.ReadAll; wrapped both reads in io.LimitReader(1 MiB) so a misbehaving endpoint can't exhaust memory before the 15s timeout. Noted-not-fixed (follow-ups / pre-existing mort semantics): tier_not_allowed ledger label on resolution failure, unknown-model usage attribution, the cloud_sync https scheme allowlist, and several theoretical/cosmetic items. Co-Authored-By: Claude Opus 4.8 (1M context) --- llmmeta/helper.go | 5 ++++- model/call.go | 2 +- model/cloud_sync.go | 9 +++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/llmmeta/helper.go b/llmmeta/helper.go index 3fabdd0..f5773e2 100644 --- a/llmmeta/helper.go +++ b/llmmeta/helper.go @@ -34,6 +34,7 @@ import ( "context" "encoding/json" "fmt" + "log/slog" "strings" "time" @@ -574,7 +575,9 @@ func (h *Helper) recordLedger(ctx context.Context, call MetaCall) { if h.storage == nil { return } - _ = h.storage.RecordMetaCall(ctx, call) + if err := h.storage.RecordMetaCall(ctx, call); err != nil { + slog.Warn("llmmeta: failed to record ledger row", "err", err) + } } // tryParseJSON attempts to decode text as JSON. Returns the parsed diff --git a/model/call.go b/model/call.go index 30bb964..7e07c07 100644 --- a/model/call.go +++ b/model/call.go @@ -237,7 +237,7 @@ func recordUsage(ctx context.Context, resp *llm.Response) { return } u := resp.Usage - if u.InputTokens == 0 && u.OutputTokens == 0 { + if u.InputTokens == 0 && u.OutputTokens == 0 && u.CacheReadTokens == 0 && u.CacheWriteTokens == 0 { return } model := resolvedModelName(ctx, resp) diff --git a/model/cloud_sync.go b/model/cloud_sync.go index ae28512..c214ca1 100644 --- a/model/cloud_sync.go +++ b/model/cloud_sync.go @@ -314,7 +314,7 @@ func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error) return nil, err } defer resp.Body.Close() - body, err := io.ReadAll(resp.Body) + body, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes)) if err != nil { return nil, err } @@ -367,7 +367,7 @@ func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelNam return 0, err } defer resp.Body.Close() - respBody, err := io.ReadAll(resp.Body) + respBody, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes)) if err != nil { return 0, err } @@ -451,3 +451,8 @@ func truncate(b []byte, n int) string { } return string(b[:n]) + "...(truncated)" } + +// maxLimitCacheResponseBytes bounds the ollama.com limit-cache HTTP responses +// (/api/tags, /api/show) so a misbehaving endpoint can't stream an unbounded +// body before the 15s timeout fires. 1 MiB is far above any real response. +const maxLimitCacheResponseBytes = 1 << 20