From d9b44387f5482829489b5906525f6de1ea7a90d3 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
Date: Fri, 26 Jun 2026 20:32:58 -0400
Subject: [PATCH] fix: address gadfly P1 review (3 low-risk findings)

Triaged gadfly's P1 review (advisory). Fixed the three clearly-correct,
low-risk items; the rest were pre-existing mort behavior or theoretical:

- model/call.go: recordUsage dropped fully-cached responses (input==0 &&
  output==0 early-return missed CacheRead/CacheWrite-only usage, which
  Anthropic/OpenAI prompt-caching bills). Guard now also checks cache tokens.
- llmmeta/helper.go: recordLedger swallowed Storage.RecordMetaCall errors;
  now logs them (slog.Warn) so a non-logging Storage impl can't silently drop
  audit rows.
- model/cloud_sync.go: the ollama.com limit-cache used unbounded io.ReadAll;
  wrapped both reads in io.LimitReader(1 MiB) so a misbehaving endpoint can't
  exhaust memory before the 15s timeout.

Noted-not-fixed (follow-ups / pre-existing mort semantics): tier_not_allowed
ledger label on resolution failure, unknown-model usage attribution, the
cloud_sync https scheme allowlist, and several theoretical/cosmetic items.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 llmmeta/helper.go   | 5 ++++-
 model/call.go       | 2 +-
 model/cloud_sync.go | 9 +++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/llmmeta/helper.go b/llmmeta/helper.go
index 3fabdd0..f5773e2 100644
--- a/llmmeta/helper.go
+++ b/llmmeta/helper.go
@@ -34,6 +34,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"strings"
 	"time"
 
@@ -574,7 +575,9 @@ func (h *Helper) recordLedger(ctx context.Context, call MetaCall) {
 	if h.storage == nil {
 		return
 	}
-	_ = h.storage.RecordMetaCall(ctx, call)
+	if err := h.storage.RecordMetaCall(ctx, call); err != nil {
+		slog.Warn("llmmeta: failed to record ledger row", "err", err)
+	}
 }
 
 // tryParseJSON attempts to decode text as JSON. Returns the parsed
diff --git a/model/call.go b/model/call.go
index 30bb964..7e07c07 100644
--- a/model/call.go
+++ b/model/call.go
@@ -237,7 +237,7 @@ func recordUsage(ctx context.Context, resp *llm.Response) {
 		return
 	}
 	u := resp.Usage
-	if u.InputTokens == 0 && u.OutputTokens == 0 {
+	if u.InputTokens == 0 && u.OutputTokens == 0 && u.CacheReadTokens == 0 && u.CacheWriteTokens == 0 {
 		return
 	}
 	model := resolvedModelName(ctx, resp)
diff --git a/model/cloud_sync.go b/model/cloud_sync.go
index ae28512..c214ca1 100644
--- a/model/cloud_sync.go
+++ b/model/cloud_sync.go
@@ -314,7 +314,7 @@ func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error)
 		return nil, err
 	}
 	defer resp.Body.Close()
-	body, err := io.ReadAll(resp.Body)
+	body, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes))
 	if err != nil {
 		return nil, err
 	}
@@ -367,7 +367,7 @@ func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelNam
 		return 0, err
 	}
 	defer resp.Body.Close()
-	respBody, err := io.ReadAll(resp.Body)
+	respBody, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes))
 	if err != nil {
 		return 0, err
 	}
@@ -451,3 +451,8 @@ func truncate(b []byte, n int) string {
 	}
 	return string(b[:n]) + "...(truncated)"
 }
+
+// maxLimitCacheResponseBytes bounds the ollama.com limit-cache HTTP responses
+// (/api/tags, /api/show) so a misbehaving endpoint can't stream an unbounded
+// body before the 15s timeout fires. 1 MiB is far above any real response.
+const maxLimitCacheResponseBytes = 1 << 20