fix: address verified gadfly P2 findings (9 real of 18)

Independently verified all 18 gadfly findings against the code (18-agent fan-out). Fixed the 9 real ones; the other 9 were false-positive / hallucinated / valid-tradeoff (no change). High: - F1 nil model: a Models resolver returning (ctx,nil,nil) flowed into the agent loop and nil-panicked. Now a clean error (Run never panics). +test. - F9 compactor data-leak: renderTranscript sent tool-call args verbatim to the summarizer (a possibly-different provider/tier); secret-bearing tool args (mcp_call/email_send/http_*/webhook_*) are now redacted, with a doc note that result bodies still flow (summary needs them). Medium/minor: - F2 compactor error path returned the folded slice, not the original msgs (contradicting the documented non-fatal contract) -> return msgs. - F3 RunStats.Status only ok/error; now timeout (DeadlineExceeded) / cancelled (Canceled) via statusFor. +test. - F4 step-zip emitted empty-name "ghost" steps when results>calls; now pairs min(calls,results) only. - F5 SetIteration was never called -> RunState.Iteration always 0; the step observer now updates it each loop. - F6 matchPending fallback was LIFO; now FIFO (matches the per-key queue). - F7 estimateTokens had no default arm (future Part kinds counted as 0); unknown parts now counted conservatively. - F8 cloud_sync silently truncated >1MiB responses -> opaque JSON error; now a clear "response exceeded N bytes" via readCapped. - F12 step observer captured the caller ctx; now the merged runCtx. - F13 compaction onFire was nil (doc claimed it logged); now wired to audit LogEvent("compaction_fired"). - F11 (no pre-dispatch hook in majordomo) documented honestly as a known limitation; F18 UsageSink doc clarified cache tokens are subsets of input. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 21:42:46 -04:00
parent dfbc5a42b9
commit 7b3da87c08
6 changed files with 193 additions and 42 deletions
@@ -2,6 +2,7 @@ package run

 import (
 	"context"
+	"errors"
 	"fmt"
 	"time"

@@ -130,11 +131,18 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 		res.Err = fmt.Errorf("resolve model %q: %w", tier, err)
 		return res
 	}
+	if model == nil {
+		// A resolver returning (ctx, nil, nil) would otherwise nil-panic inside
+		// the agent loop; surface it as a clean error (Run never panics out).
+		res.Err = fmt.Errorf("resolve model %q: resolver returned a nil model", tier)
+		return res
+	}
 	ctx = modelCtx

 	// Audit start (optional). The recorder satisfies RunTally; stamp it on the
 	// invocation so a self-status tool can read live progress.
 	var rec RunRecorder
+	var stateAcc *RunStateAccessor
 	if e.cfg.Ports.Audit != nil {
 		rec = e.cfg.Ports.Audit.StartRun(ctx, RunInfo{
 			RunID:       inv.RunID,
@@ -148,7 +156,8 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 		})
 	}
 	if rec != nil {
-		inv.RunState = NewRunStateAccessor(rec, maxIter, 0, started)
+		stateAcc = NewRunStateAccessor(rec, maxIter, 0, started)
+		inv.RunState = stateAcc
 	}

 	// Build the toolbox from the agent's low-level tools.
@@ -159,11 +168,27 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 		return res
 	}

-	// Step instrumentation: accumulate Result.Steps + fire inv.OnStep, and feed
-	// the audit recorder. majordomo's step observer hands us each completed
-	// iteration; we zip the model's tool calls with their executed results.
+	// Run context: bound by MaxRuntime, detached from the caller's deadline so a
+	// lane/queue wait doesn't eat the run budget (mort's V10 lesson). Caller
+	// cancellation still propagates via MergeCancellation. Created BEFORE the
+	// step observer so the observer forwards the merged run context (not a
+	// possibly-cancelled caller ctx) to OnStep consumers.
+	runCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime)
+	defer cancel()
+	runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
+	defer mergeCancel()
+
+	// Step instrumentation: accumulate Result.Steps + fire inv.OnStep, feed the
+	// audit recorder, and keep the live iteration counter fresh. majordomo's
+	// step observer hands us each completed iteration; we zip the model's tool
+	// calls with their executed results PAIRWISE — a result without a matching
+	// call (or a call without a result) is skipped rather than recorded as an
+	// empty-name "ghost" step.
 	emitter := newStepEmitter(inv.OnStep)
 	stepObserver := func(s agent.Step) {
+		if stateAcc != nil {
+			stateAcc.SetIteration(s.Index)
+		}
 		if rec != nil {
 			rec.OnStep(s.Index, s.Response)
 		}
@@ -171,27 +196,20 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 		if s.Response != nil {
 			calls = s.Response.ToolCalls
 		}
-		for i, r := range s.Results {
-			var call llm.ToolCall
-			if i < len(calls) {
-				call = calls[i]
-			}
-			emitter.toolStart(ctx, call.Name, call.Arguments)
-			emitter.toolEnd(ctx, call, r.Content, r.IsError)
+		n := len(s.Results)
+		if len(calls) < n {
+			n = len(calls)
+		}
+		for i := 0; i < n; i++ {
+			call, r := calls[i], s.Results[i]
+			emitter.toolStart(runCtx, call.Name, call.Arguments)
+			emitter.toolEnd(runCtx, call, r.Content, r.IsError)
 			if rec != nil {
 				rec.OnTool(call, r.Content)
 			}
 		}
 	}

-	// Run context: bound by MaxRuntime, detached from the caller's deadline so a
-	// lane/queue wait doesn't eat the run budget (mort's V10 lesson). Caller
-	// cancellation still propagates via MergeCancellation.
-	runCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime)
-	defer cancel()
-	runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
-	defer mergeCancel()
-
 	opts := []agent.Option{
 		agent.WithToolbox(toolbox),
 		agent.WithMaxSteps(maxIter),
@@ -200,17 +218,27 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 	}
 	if e.cfg.Compactor != nil && e.cfg.ContextTokens != nil {
 		if threshold := e.compactionThreshold(tier); threshold > 0 {
-			opts = append(opts, agent.WithCompactor(e.cfg.Compactor(threshold, nil)))
+			// Forward compaction events to the audit log (makes the
+			// CompactionEvent doc's "logged to the run trace" promise true).
+			var onFire func(compact.CompactionEvent)
+			if rec != nil {
+				onFire = func(ev compact.CompactionEvent) {
+					rec.LogEvent("compaction_fired", map[string]any{
+						"messages_before": ev.MessagesBefore,
+						"messages_after":  ev.MessagesAfter,
+						"tokens_before":   ev.TokensBefore,
+						"tokens_after":    ev.TokensAfter,
+					})
+				}
+			}
+			opts = append(opts, agent.WithCompactor(e.cfg.Compactor(threshold, onFire)))
 		}
 	}

 	ag := agent.New(model, e.systemPrompt(ra), opts...)
 	runRes, runErr := ag.Run(runCtx, input)

-	status := "ok"
-	if runErr != nil {
-		status = "error"
-	}
+	status := statusFor(runErr)
 	if runRes != nil {
 		res.Output = runRes.Output
 		res.Usage = runRes.Usage
@@ -225,6 +253,22 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
 	return res
 }

+// statusFor maps a run error to a RunStats.Status string so audit consumers
+// can tell outcomes apart: nil is "ok", a deadline is "timeout", a cancellation
+// is "cancelled", anything else is "error". errors.Is also matches wrapped errors.
+func statusFor(runErr error) string {
+	switch {
+	case runErr == nil:
+		return "ok"
+	case errors.Is(runErr, context.DeadlineExceeded): // tested before Canceled, so it wins if both match
+		return "timeout"
+	case errors.Is(runErr, context.Canceled):
+		return "cancelled"
+	default:
+		return "error"
+	}
+}
+
 // finishAudit writes the terminal roll-up on a detached context so a cancelled
 // run still records (mort's CleanupContextTimeout lesson).
 func (e *Executor) finishAudit(ctx context.Context, rec RunRecorder, status string, res Result, started time.Time, runErr error) {