feat(run): durable checkpoint + resume (wire Ports.Checkpointer)

The kernel defined run.Ports.Checkpointer and the checkpoint battery but never drove them (the documented "P2 follow-up"). This commit wires durable recovery into the run loop so that a run interrupted by shutdown can resume on the next boot instead of being lost — the executus-side half of mort's durable-agent-recovery parity (mort #1355).

Kernel (run/):
- Ports.Checkpointer is now a CheckpointerFactory (Begin per run → a per-run Checkpointer, or nil for a non-durable run). The single per-instance Checkpointer couldn't distinguish runs; a factory mints one per run, matching mort's agentexec.CheckpointerFactory.
- RunInfo gains GuildID + ModelTier (so the factory can build resume meta); RunCheckpointState gains CompletedPhases + ActivePhase (+ PhaseOutput).
- run/checkpoint.go: ResumeState + WithResumeState / WithExistingCheckpointer context carriers, classifyCheckpointOutcome (success→Complete, shutdown→leave for boot recovery, else→Fail using run.ErrShutdown), and finalizeCheckpoint.
- run/executor.go: resolve the per-run checkpointer (existing-from-ctx on a recovery re-run, else factory.Begin); the single-loop path wraps the step observer to accumulate the transcript and Save on each step (the host throttles), and a recovered run seeds the saved transcript via WithHistory and continues with no new input; finalize on exit.
- run/phases.go: phase-boundary checkpointing — record completed phases after each phase; a resumed run skips already-completed phases (the interrupted phase re-runs from its start — boundary-granular, documented; only the single-loop path resumes mid-loop).

Battery (checkpoint/): NewFactory wires the battery into the factory port (per-run handle, meta derived from RunInfo); RunCheckpoint + handle.Save carry the phase fields.

Tests (run/checkpoint_test.go): the finalize decision matrix; single-loop Save+Complete; terminal-error Fail; resume seeds history; phase-boundary Saves completed phases; resume skips completed phases. Full ./... is green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 16:04:06 -04:00
parent c071ed4996
commit 899059a791
7 changed files with 477 additions and 27 deletions
@@ -64,6 +64,13 @@ type phaseDeps struct {
 	stepObserver func(agent.Step)
 	steer        func() []llm.Message
 	rec          RunRecorder
+	// checkpointer records phase-boundary progress (completed phases) for durable
+	// recovery; nil = non-durable. resume carries a recovered run's completed
+	// phases so they are skipped on re-run. Phase recovery is boundary-granular:
+	// the interrupted (active) phase re-runs from its start (its mid-phase
+	// transcript is NOT resumed — only the single-loop path resumes mid-loop).
+	checkpointer Checkpointer
+	resume       *ResumeState
 }

 // runPhases executes ra.Phases sequentially and returns a synthetic agent.Result
@@ -73,10 +80,22 @@ type phaseDeps struct {
 // deadline/critic-kill — returns the error.
 func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phaseDeps, query string, images []llm.ImagePart) (*agent.Result, error) {
 	outputs := make(map[string]string, len(ra.Phases))
+	var completed []PhaseOutput
 	var lastResult *agent.Result
 	var lastOutput string
 	var totalUsage llm.Usage

+	// Resume: pre-populate from the saved checkpoint so already-finished phases are
+	// skipped. The interrupted (active) phase is NOT pre-populated, so it re-runs
+	// from its start (boundary-granular recovery).
+	if deps.resume != nil {
+		for _, pc := range deps.resume.CompletedPhases {
+			outputs[pc.Name] = pc.Output
+			completed = append(completed, pc)
+			lastOutput = pc.Output
+		}
+	}
+
 	// finish stamps the aggregated usage + final output onto the synthetic result.
 	finish := func(err error) (*agent.Result, error) {
 		if lastResult == nil {
@@ -90,6 +109,12 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
 	}

 	for i, phase := range ra.Phases {
+		// Skip phases already completed on a resumed run (key presence, not output
+		// emptiness — a legitimately-empty phase output still counts as done).
+		if _, done := outputs[phase.Name]; done {
+			lastOutput = outputs[phase.Name]
+			continue
+		}
 		// A killed/timed-out/cancelled run must not start its next phase.
 		if err := runCtx.Err(); err != nil {
 			return finish(err)
@@ -151,6 +176,16 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas

 		outputs[phase.Name] = output
 		lastOutput = output
+		// Checkpoint the phase boundary: this phase is done, so a resumed run skips
+		// it and continues from the next. (Copy the slice — the checkpointer may
+		// hold/serialize it asynchronously.)
+		completed = append(completed, PhaseOutput{Name: phase.Name, Output: output})
+		if deps.checkpointer != nil {
+			_ = deps.checkpointer.Save(runCtx, RunCheckpointState{
+				CompletedPhases: append([]PhaseOutput(nil), completed...),
+				ActivePhase:     "",
+			})
+		}
 	}

 	return finish(nil)
@@ -192,11 +227,13 @@ func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps ph
 		maxIter = deps.baseMaxIter
 	}
 	// Per-phase opts: a fixed step ceiling for this phase (the critic's dynamic
-	// ceiling is intentionally not propagated to phases) + the phase toolbox, on
-	// top of the shared opts (tool-error limits, step observer, compactor).
+	// ceiling is intentionally not propagated to phases) + the phase toolbox + the
+	// shared step observer (audit/steps/critic), on top of the shared opts
+	// (tool-error limits, compactor).
 	opts := append([]agent.Option{
 		agent.WithToolbox(toolbox),
 		agent.WithMaxSteps(maxIter),
+		agent.WithStepObserver(deps.stepObserver),
 	}, deps.sharedOpts...)
 	ag := agent.New(model, system, opts...)