// executus/audit/storage.go

// Package audit persists skill execution traces: per-run summary rows
// (skill_runs) and per-step event logs (skill_run_logs). The executor in
// pkg/logic/skillexec emits events through a Writer; the storage layer is
// kept separate so tests can mock it and so retention pruning has a clear
// home.
//
// Why: agentic runs can be long, multi-tool affairs. Without a structured
// audit trail, debugging "why did the LLM do that?" is impossible. The
// log table is keyed by (run_id, sequence) so insert order is preserved.
package audit

import (
	"context"
	"errors"
	"time"
)

// ErrNotFound is the sentinel error returned when a run lookup fails;
// Storage.GetRun documents it as its not-found result. Test for it with
// errors.Is rather than ==, so the check keeps working if an
// implementation wraps it with extra context.
var ErrNotFound = errors.New("skill run not found")

// SkillRun is the per-invocation summary row: one is recorded per call to
// Executor.Run. Status starts at running and ends in one of the terminal
// values ok / error / timeout / budget_exceeded / dry_run.
//
// NOTE(review): the LastRunBySkills doc on Storage also names preempted
// and lane_busy as statuses; confirm whether the list here is complete.
type SkillRun struct {
	ID             string         // run id (what ParentRunID and SkillRunLog.RunID point at)
	SkillID        string         // skill id; RunFilter notes it holds the agent UUID for agent runs
	CallerID       string         // caller id (a Discord member id, per RunFilter)
	ChannelID      string         // channel id associated with the run
	Inputs         map[string]any // presumably the invocation's input arguments — confirm
	StartedAt      time.Time      // start time; PurgeOlderThan and RunFilter.Since/Until key off it
	FinishedAt     *time.Time     // presumably nil until FinishRun records a terminal state — confirm
	Status         string         // running|ok|error|timeout|budget_exceeded|dry_run
	Output         string         // final output, written by FinishRun (RunStats.Output)
	Error          string         // error message, written by FinishRun (RunStats.Error)
	ToolCallsCount int            // tool-call total, written by FinishRun (RunStats.ToolCalls)
	RuntimeSeconds float64        // wall-clock duration in seconds (RunStats.RuntimeSeconds)
	// ParentRunID is the run_id of the parent skill that invoked this
	// run via skill_invoke. It is empty for top-level invocations, so ""
	// doubles as the "root run" sentinel. Indexed in the gorm model so
	// call-tree queries (ListChildrenByParent + WalkParentChain) are
	// cheap.
	ParentRunID string

	// Token roll-ups, summed across all model completions in this run
	// (one Usage per OnStep). All default to 0 when the provider did
	// not expose token usage, so 0 can mean either "none used" or
	// "not reported".
	TotalInputTokens    int64
	TotalOutputTokens   int64
	TotalThinkingTokens int64
}

// RunStats captures the terminal state of a run and is the payload of
// Storage.FinishRun. Bundling these values into one struct (instead of a
// long positional argument list) keeps callers readable, and future
// fields slot in here without touching every call site.
//
// Status only ever holds a terminal value here; "running" is set by
// StartRun and is not a valid finish state.
//
// Why: FinishRun originally took six positional args; adding token
// columns would push it higher. A struct is the idiomatic Go way to
// avoid the positional-arg explosion.
type RunStats struct {
	Status         string  // ok|error|timeout|budget_exceeded|dry_run
	Output         string  // final agent output (empty on error)
	Error          string  // error message (empty on success)
	ToolCalls      int     // total OnTool count
	RuntimeSeconds float64 // wall-clock duration

	// Token roll-ups (all default to 0 when token usage was not
	// exposed by the provider). Presumably persisted as the
	// SkillRun.Total*Tokens fields — confirm against the FinishRun
	// implementation.
	InputTokens    int64
	OutputTokens   int64
	ThinkingTokens int64
}

// SkillRunLog is one event recorded during a run. EventType ∈
// step|tool_call|tool_result|error. Payload is opaque JSON the writer
// emits; the storage layer does not interpret it.
//
// Rows are identified by (RunID, Sequence): Storage.AppendLog requires
// Sequence to be unique per run and leaves monotonic ordering to the
// writer, which is what lets ListLogsByRun replay events in order.
type SkillRunLog struct {
	RunID     string         // id of the run this event belongs to
	Sequence  int            // per-run ordering key; must be unique within RunID
	EventType string         // step|tool_call|tool_result|error
	Payload   map[string]any // event-specific data, opaque to storage
	CreatedAt time.Time      // presumably when the event was recorded — confirm who sets it
}

// RunFilter is the predicate bundle for the cross-surface "recent runs"
// query (ListRunsFiltered / CountRunsFiltered). Every field is optional;
// the zero value matches the most recent runs across ALL audited surfaces
// (agents + skills). This powers the admin agent-trace debug view and the
// Claude debug API's /runs list.
//
// NOTE(review): set fields are presumably ANDed together — confirm
// against the query builder.
//
// Why a struct (vs positional args): the debug list filters along several
// independent axes and more will be added; bundling avoids a positional
// explosion and keeps call sites readable.
type RunFilter struct {
	Status    string // exact status match; "" = all (dry_run excluded unless IncludeDryRun)
	SkillID   string // exact skill_id (holds the agent UUID for agent runs)
	CallerID  string // exact caller (Discord member id)
	ChannelID string // exact channel id

	// TopLevelOnly restricts to root runs (parent_run_id = ''), hiding
	// nested sub-agent / sub-skill runs from the firehose. The debug list
	// defaults this on; an "include nested" toggle clears it.
	TopLevelOnly bool

	// IncludeDryRun surfaces status="dry_run" sandbox rows, which are
	// excluded by default. Ignored when Status is set explicitly (an
	// explicit Status=="dry_run" still matches).
	IncludeDryRun bool

	// Since / Until bound started_at as a half-open window:
	// started_at >= Since (inclusive; zero = no lower bound) and
	// started_at < Until (exclusive; zero = no upper bound).
	Since time.Time
	Until time.Time
}

// Storage is the persistence interface for skill runs and per-step logs.
//
// Why: tests substitute fake implementations; production wires
// NewGormStorage. Keep the interface narrow — run lifecycle writes, the
// read/list/count queries the debug and call-tree views need, and the
// retention prune helper.
type Storage interface {
	// Initialize prepares the backing store for use.
	//
	// NOTE(review): presumably creates or migrates the skill_runs and
	// skill_run_logs tables — confirm against the implementation.
	Initialize(ctx context.Context) error

	// StartRun inserts the run with status=running. The caller MUST
	// invoke FinishRun later (or the row stays in running indefinitely
	// — operationally that signals a crash mid-run, which is a useful
	// signal).
	StartRun(ctx context.Context, run SkillRun) error

	// FinishRun updates the running row identified by runID with the
	// terminal status, output and stats carried in stats. Idempotent on
	// second call (last write wins).
	//
	// V5: takes a RunStats struct so token columns (and any later
	// additions) can be written alongside the legacy fields without
	// changing the signature for every future addition.
	FinishRun(ctx context.Context, runID string, stats RunStats) error

	// AppendLog adds one event to the run's log. Sequence numbers must
	// be unique per run; the writer is responsible for monotonic
	// ordering.
	AppendLog(ctx context.Context, log SkillRunLog) error

	// GetRun returns the run summary for runID, or ErrNotFound when no
	// such run exists.
	GetRun(ctx context.Context, runID string) (*SkillRun, error)

	// ListLogsByRun returns all logs for a run in sequence order.
	ListLogsByRun(ctx context.Context, runID string) ([]SkillRunLog, error)

	// ListRunsBySkill returns recent runs for a skill, newest first,
	// capped at limit. Excludes dry-run rows by default — use
	// ListRunsBySkillPaginated with includeDryRun=true to see them.
	ListRunsBySkill(ctx context.Context, skillID string, limit int) ([]SkillRun, error)

	// ListRunsBySkillPaginated returns recent runs for a skill, newest
	// first, with offset+limit. When includeDryRun is false, rows with
	// status="dry_run" are excluded (matches the wizard's sandbox
	// status; see the Writer / wizardtools docs).
	//
	// Why a separate paginated method vs. expanding ListRunsBySkill:
	// callers that need the legacy "last N" view (Discord .skill runs,
	// chatbot tool result) want the simpler signature; the paginated
	// view is webui-specific.
	ListRunsBySkillPaginated(ctx context.Context, skillID string,
		offset, limit int, includeDryRun bool) ([]SkillRun, error)

	// CountRunsBySkill returns the total number of runs for a skill.
	// When includeDryRun is false, dry-run rows are excluded so the
	// count matches the default ListRunsBySkillPaginated result.
	CountRunsBySkill(ctx context.Context, skillID string, includeDryRun bool) (int64, error)

	// ListRunsByCaller returns recent runs by a caller, newest first,
	// capped at limit.
	ListRunsByCaller(ctx context.Context, callerID string, limit int) ([]SkillRun, error)

	// ListRunsFiltered returns runs matching f, newest first
	// (started_at DESC), with offset+limit. With an all-zero filter it
	// returns the most recent runs across EVERY audited surface (agents +
	// skills) — the cross-surface feed behind the admin agent-trace debug
	// view and the Claude debug API. dry_run rows are excluded unless
	// f.IncludeDryRun or f.Status=="dry_run". An out-of-range limit
	// (<=0 or >500) falls back to 50 to bound admin scans.
	ListRunsFiltered(ctx context.Context, f RunFilter, offset, limit int) ([]SkillRun, error)

	// CountRunsFiltered returns the total rows matching f (ignoring
	// offset/limit), for pagination math.
	CountRunsFiltered(ctx context.Context, f RunFilter) (int64, error)

	// PurgeOlderThan deletes runs (and their logs) whose StartedAt is
	// strictly before t. Returns the number of runs deleted.
	PurgeOlderThan(ctx context.Context, t time.Time) (int64, error)

	// ListChildrenByParent returns all SkillRun rows where
	// parent_run_id == parentRunID, oldest first. Used for the
	// call-tree view (skill_invoke trace section) and as a building
	// block for WalkParentChain.
	//
	// Returns an empty slice when parentRunID has no children. An
	// empty parentRunID never matches anything: "" is the top-level
	// sentinel (root runs carry an empty ParentRunID), not the id of a
	// real parent, so it must not be treated as "list all root runs".
	ListChildrenByParent(ctx context.Context, parentRunID string) ([]SkillRun, error)

	// WalkParentChain walks from runID up via parent_run_id, returning
	// the chain of SkillRun summaries (oldest = root first, newest =
	// runID last). Used by the loop guard in skill_invoke.
	//
	// Walk depth is capped at MaxParentChainDepth (32) to prevent
	// pathological loops in the data itself: if the parent_run_id
	// chain has been corrupted (e.g. by a bad migration) and forms a
	// cycle, we want a bounded result rather than an infinite loop.
	WalkParentChain(ctx context.Context, runID string) ([]SkillRun, error)

	// ListFinishedRunsBefore returns runs whose FinishedAt is strictly
	// before cutoff, oldest first, capped at limit. limit <= 0 yields
	// no rows (the caller is expected to specify a real bound).
	//
	// Why: skills.StorageSweeper drives the run-scope storage purge from
	// this query. The sweeper picks up only finished runs so an
	// in-flight run's run-scope KV/files cannot be deleted out from
	// under it.
	//
	// Test: storage_test.go covers the include/exclude boundaries
	// (running rows excluded; finished-after-cutoff excluded; finished-
	// before-cutoff included).
	ListFinishedRunsBefore(ctx context.Context, cutoff time.Time, limit int) ([]SkillRun, error)

	// LastRunBySkills returns the most recent StartedAt timestamp per
	// skill in the input ID list. Skills with no rows simply have no
	// entry in the result map (caller distinguishes "never run" from
	// "run but no timestamp" by map key presence).
	//
	// When includeFailed is true, all non-dry-run statuses count
	// (ok / error / timeout / budget_exceeded / preempted / lane_busy).
	// When false, only status="ok" rows count — useful for "last
	// successful run" semantics on dashboards where errored runs
	// shouldn't surface as recent activity.
	//
	// Empty skillIDs short-circuits to an empty map without touching
	// the DB.
	LastRunBySkills(ctx context.Context, skillIDs []string, includeFailed bool) (map[string]time.Time, error)
}

// MaxParentChainDepth is the safety cap on how many parent_run_id hops
// Storage.WalkParentChain will follow. The loop guard in skill_invoke
// enforces a separate (smaller) MaxInvokeDepth at the tool layer; this
// cap exists only to bound the walk in the presence of corrupted data
// (e.g. a parent chain that forms a cycle).
const MaxParentChainDepth = 32