fix: address verified gadfly P4/#4 findings (audit/budget/persona)

Security (all 3 models — HIGH): audit OnTool persisted raw tool args + results verbatim for the very tools the OnStep narration-redaction flags as secret (mcp_call/email_send/http_*) — the args/results are what CARRY the secret, so they landed in skill_run_logs unredacted. Factored the predicate into isSecretTool() (single source of truth) and OnTool now emits args_redacted/result_redacted (+ lengths) for secret tools. Test asserts no secret reaches the log. (persona) webhook_ip_allowlist entries are now CIDR/IP-validated at load (malformed dropped + warned) instead of accepted raw. Contract correctness (glm-5.2 + deepseek) — audit Memory now honors its documented Storage contract: ListChildrenByParent/ListFinishedRunsBefore return oldest-first; WalkParentChain returns root-first and honors MaxParentChainDepth; ListRunsFiltered clamps limit (<=0 or >500 -> 50); ListFinishedRunsBefore with limit<=0 returns none; an explicit RunFilter.Status (incl. "dry_run") matches regardless of IncludeDryRun; LastRunBySkills counts only status=="ok" unless includeFailed. (PurgeOlderThan's FinishedAt key is the SAFE behavior — in-flight runs retained — so the doc was aligned to it, not the impl.) Error-handling: appendLog now uses a bounded context (auditAppendTimeout=3s) so a hung backend can't block the run goroutine on the hot path; Sink.StartRun logs its (still best-effort) failure instead of swallowing it; budget Memory.Get uses RLock (RWMutex); budget package doc fixed (was skillexec's); Check uses the budgetWindow constant, not a duplicated literal. Triaged false-positive: NewNoOpBudget returning BudgetTracker is assignable to run.Budget (identical method sets) — no change needed. Core go.sum still free of host/DB deps. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:44:34 -04:00
parent 2260480c81
commit d82cef46b4
8 changed files with 197 additions and 24 deletions
@@ -99,6 +99,19 @@ func (m *Memory) newestFirst(keep func(SkillRun) bool) []SkillRun {
 	return out
 }

+// oldestFirst returns the retained runs in insertion (oldest-first) order,
+// optionally filtered. Caller holds at least RLock.
+func (m *Memory) oldestFirst(keep func(SkillRun) bool) []SkillRun {
+	out := make([]SkillRun, 0, len(m.order))
+	for _, id := range m.order {
+		r := m.runs[id]
+		if keep == nil || keep(r) {
+			out = append(out, r)
+		}
+	}
+	return out
+}
+
 func page(rs []SkillRun, offset, limit int) []SkillRun {
 	if offset < 0 {
 		offset = 0
@@ -142,10 +155,12 @@ func (m *Memory) ListRunsByCaller(_ context.Context, callerID string, limit int)
 }

 func (m *Memory) matchesFilter(r SkillRun, f RunFilter) bool {
-	if !f.IncludeDryRun && r.Status == "dry_run" {
-		return false
-	}
-	if f.Status != "" && r.Status != f.Status {
+	if f.Status != "" {
+		if r.Status != f.Status {
+			return false
+		}
+		// An explicit Status (even "dry_run") matches regardless of IncludeDryRun.
+	} else if !f.IncludeDryRun && r.Status == "dry_run" {
 		return false
 	}
 	if f.SkillID != "" && r.SkillID != f.SkillID {
@@ -170,6 +185,9 @@ func (m *Memory) matchesFilter(r SkillRun, f RunFilter) bool {
 }

 func (m *Memory) ListRunsFiltered(_ context.Context, f RunFilter, offset, limit int) ([]SkillRun, error) {
+	if limit <= 0 || limit > 500 {
+		limit = 50 // bound admin scans, per the Storage contract
+	}
 	m.mu.RLock()
 	defer m.mu.RUnlock()
 	return page(m.newestFirst(func(r SkillRun) bool { return m.matchesFilter(r, f) }), offset, limit), nil
@@ -203,7 +221,7 @@ func (m *Memory) PurgeOlderThan(_ context.Context, t time.Time) (int64, error) {
 func (m *Memory) ListChildrenByParent(_ context.Context, parentRunID string) ([]SkillRun, error) {
 	m.mu.RLock()
 	defer m.mu.RUnlock()
-	return m.newestFirst(func(r SkillRun) bool { return r.ParentRunID == parentRunID }), nil
+	return m.oldestFirst(func(r SkillRun) bool { return r.ParentRunID == parentRunID }), nil
 }

 func (m *Memory) WalkParentChain(_ context.Context, runID string) ([]SkillRun, error) {
@@ -211,7 +229,7 @@ func (m *Memory) WalkParentChain(_ context.Context, runID string) ([]SkillRun, e
 	defer m.mu.RUnlock()
 	var chain []SkillRun
 	seen := map[string]bool{}
-	for id := runID; id != ""; {
+	for id := runID; id != "" && len(chain) < MaxParentChainDepth; {
 		r, ok := m.runs[id]
 		if !ok || seen[id] {
 			break
@@ -220,13 +238,20 @@ func (m *Memory) WalkParentChain(_ context.Context, runID string) ([]SkillRun, e
 		chain = append(chain, r)
 		id = r.ParentRunID
 	}
+	// Contract: root first, the queried run last. We walked child→root, so reverse.
+	for i, j := 0, len(chain)-1; i < j; i, j = i+1, j-1 {
+		chain[i], chain[j] = chain[j], chain[i]
+	}
 	return chain, nil
 }

 func (m *Memory) ListFinishedRunsBefore(_ context.Context, cutoff time.Time, limit int) ([]SkillRun, error) {
+	if limit <= 0 {
+		return nil, nil // contract: a real bound is required
+	}
 	m.mu.RLock()
 	defer m.mu.RUnlock()
-	return page(m.newestFirst(func(r SkillRun) bool {
+	return page(m.oldestFirst(func(r SkillRun) bool {
 		return r.FinishedAt != nil && r.FinishedAt.Before(cutoff)
 	}), 0, limit), nil
 }
@@ -244,8 +269,8 @@ func (m *Memory) LastRunBySkills(_ context.Context, skillIDs []string, includeFa
 		if !want[r.SkillID] {
 			continue
 		}
-		if !includeFailed && (r.Status == "error" || r.Status == "timeout") {
-			continue
+		if !includeFailed && r.Status != "ok" {
+			continue // contract: only status=="ok" counts unless includeFailed
 		}
 		if r.StartedAt.After(out[r.SkillID]) {
 			out[r.SkillID] = r.StartedAt
@@ -0,0 +1,58 @@
+package audit
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// TestOnToolRedactsSecretTools: a secret-bearing tool's args/result must NOT be
+// persisted verbatim in the audit log.
+func TestOnToolRedactsSecretTools(t *testing.T) {
+	ctx := context.Background()
+	mem := NewMemory()
+	mem.StartRun(ctx, SkillRun{ID: "r1"})
+	w := NewWriter(mem, "r1")
+
+	secret := `{"url":"https://x","headers":{"Authorization":"Bearer SUPERSECRET"}}`
+	w.OnTool(llm.ToolCall{Name: "http_get", ID: "1", Arguments: []byte(secret)}, "TOPSECRETBODY")
+	// a non-secret tool is logged verbatim
+	w.OnTool(llm.ToolCall{Name: "think", ID: "2", Arguments: []byte(`{"thought":"hi"}`)}, "ok")
+
+	logs, _ := mem.ListLogsByRun(ctx, "r1")
+	var dump strings.Builder
+	for _, l := range logs {
+		for k, v := range l.Payload {
+			dump.WriteString(k)
+			dump.WriteString("=")
+			if s, ok := v.(string); ok {
+				dump.WriteString(s)
+			}
+			dump.WriteString(" ")
+		}
+	}
+	all := dump.String()
+	if strings.Contains(all, "SUPERSECRET") || strings.Contains(all, "TOPSECRETBODY") {
+		t.Fatalf("secret leaked into audit log: %s", all)
+	}
+	// the redaction marker is present, and the non-secret tool's args survive
+	foundRedacted, foundThink := false, false
+	for _, l := range logs {
+		if l.EventType == "tool_call" {
+			if r, _ := l.Payload["args_redacted"].(bool); r {
+				foundRedacted = true
+			}
+			if a, _ := l.Payload["args"].(string); strings.Contains(a, "thought") {
+				foundThink = true
+			}
+		}
+	}
+	if !foundRedacted {
+		t.Error("secret tool_call should carry args_redacted=true")
+	}
+	if !foundThink {
+		t.Error("non-secret tool args should be logged verbatim")
+	}
+}
@@ -2,6 +2,7 @@ package audit

 import (
 	"context"
+	"log/slog"
 	"time"

 	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
@@ -36,8 +37,9 @@ func (s *Sink) StartRun(ctx context.Context, info run.RunInfo) run.RunRecorder {
 	if started.IsZero() {
 		started = time.Now()
 	}
-	// Best-effort: a failed StartRun must not break the user-visible run.
-	_ = s.storage.StartRun(ctx, SkillRun{
+	// Best-effort: a failed StartRun must not break the user-visible run, but we
+	// surface it (a swallowed failure leaves orphan log events with no run row).
+	if err := s.storage.StartRun(ctx, SkillRun{
 		ID:          info.RunID,
 		SkillID:     info.SubjectID,
 		CallerID:    info.CallerID,
@@ -46,7 +48,10 @@ func (s *Sink) StartRun(ctx context.Context, info run.RunInfo) run.RunRecorder {
 		Inputs:      info.Inputs,
 		StartedAt:   started,
 		Status:      "running",
-	})
+	}); err != nil {
+		slog.Warn("audit: StartRun failed; the run row is missing so its log events will orphan",
+			"run_id", info.RunID, "error", err)
+	}
 	return &recorder{w: NewWriter(s.storage, info.RunID)}
 }

@@ -168,16 +168,26 @@ func (w *Writer) OnStep(iter int, resp *llm.Response) {
 // surrounding narration could leak a secret (MCP args, email body/
 // recipients, raw HTTP request). Mirrors the steps.go redaction list so
 // the audit trace never persists secret-adjacent assistant text.
+// isSecretTool reports whether a tool's arguments/results may carry secrets
+// (MCP args, email bodies/recipients, HTTP auth headers/bodies) and so must be
+// redacted from the persisted audit log. Single source of truth for both the
+// step-narration redaction and the OnTool arg/result redaction. NOTE: this is
+// a name-prefix allowlist — a NEW secret-bearing tool must be added here or its
+// args/results will be logged verbatim.
+func isSecretTool(name string) bool {
+	switch name {
+	case "mcp_call", "email_send":
+		return true
+	}
+	return strings.HasPrefix(name, "http_")
+}
+
 func stepHasSecretTool(resp *llm.Response) bool {
 	if resp == nil {
 		return false
 	}
 	for _, c := range resp.ToolCalls {
-		switch c.Name {
-		case "mcp_call", "email_send":
-			return true
-		}
-		if strings.HasPrefix(c.Name, "http_") {
+		if isSecretTool(c.Name) {
 			return true
 		}
 	}
@@ -211,6 +221,24 @@ func (w *Writer) OnTool(call llm.ToolCall, result string) {
 		return
 	}
 	w.calls.Add(1)
+	// Redact the args/result of secret-bearing tools — these fields actually
+	// CARRY the secret (MCP args, email body/recipients, HTTP auth/body), so
+	// logging them verbatim would defeat the OnStep narration redaction.
+	if isSecretTool(call.Name) {
+		w.appendLog("tool_call", map[string]any{
+			"name":          call.Name,
+			"id":            call.ID,
+			"args_redacted": true,
+			"args_len":      len(call.Arguments),
+		})
+		w.appendLog("tool_result", map[string]any{
+			"name":            call.Name,
+			"id":              call.ID,
+			"result_redacted": true,
+			"result_len":      len(result),
+		})
+		return
+	}
 	w.appendLog("tool_call", map[string]any{
 		"name": call.Name,
 		"args": string(call.Arguments),
@@ -296,6 +324,10 @@ func (w *Writer) Close(ctx context.Context, stats RunStats) {
 // hung connection that the run goroutine shouldn't keep waiting on.
 const auditFinishTimeout = 10 * time.Second

+// auditAppendTimeout bounds each per-event AppendLog on the hot path so a hung
+// storage backend can't block the run goroutine.
+const auditAppendTimeout = 3 * time.Second
+
 // ToolCallsCount returns how many tool invocations OnTool has seen so
 // far. Useful for budget enforcement.
 func (w *Writer) ToolCallsCount() int { return int(w.calls.Load()) }
@@ -309,7 +341,12 @@ func (w *Writer) appendLog(eventType string, payload map[string]any) {
 		Payload:   payload,
 		CreatedAt: time.Now(),
 	}
-	if err := w.storage.AppendLog(context.Background(), log); err != nil {
+	// Bound the write: a hung storage backend must not block the run goroutine
+	// on the hot path (every step/tool event flows through here). Detached from
+	// any caller deadline — the log write is independent of the run's context.
+	ctx, cancel := context.WithTimeout(context.Background(), auditAppendTimeout)
+	defer cancel()
+	if err := w.storage.AppendLog(ctx, log); err != nil {
 		slog.Warn("skillaudit: AppendLog failed", "run_id", w.runID, "seq", seq, "type", eventType, "error", err)
 	}
 }