feat(run): durable checkpoint + resume (wire Ports.Checkpointer)
The kernel defined run.Ports.Checkpointer + the checkpoint battery but never drove them (the documented "P2 follow-up"). This wires durable recovery into the run loop so a run interrupted by shutdown can resume on the next boot instead of being lost — the executus-side half of mort's durable-agent-recovery parity (mort #1355).

Kernel (run/):
- Ports.Checkpointer is now a CheckpointerFactory (Begin per run → a per-run Checkpointer, or nil for a non-durable run). The single per-instance Checkpointer couldn't distinguish runs; a factory mints one per run, matching mort's agentexec.CheckpointerFactory.
- RunInfo gains GuildID + ModelTier (so the factory can build resume meta); RunCheckpointState gains CompletedPhases + ActivePhase (+ PhaseOutput).
- run/checkpoint.go: ResumeState + WithResumeState / WithExistingCheckpointer context carriers, classifyCheckpointOutcome (success→Complete, shutdown→leave for boot recovery, else→Fail using run.ErrShutdown), and finalizeCheckpoint.
- run/executor.go: resolve the per-run checkpointer (existing-from-ctx on a recovery re-run, else factory.Begin); single-loop wraps the step observer to accumulate the transcript + Save each step (host throttles), and a recovered run seeds the saved transcript via WithHistory and continues with no new input; finalize on exit.
- run/phases.go: phase-boundary checkpointing — record completed phases after each phase; a resumed run skips already-completed phases (the interrupted phase re-runs from its start — boundary-granular, documented; only the single-loop path resumes mid-loop).

Battery (checkpoint/): NewFactory wires the battery into the factory port (per-run handle, meta derived from RunInfo); RunCheckpoint + handle.Save carry the phase fields.

Tests (run/checkpoint_test.go): the finalize decision matrix; single-loop Save+Complete; terminal-error Fail; resume seeds history; phase-boundary Saves completed phases; resume skips completed phases.

Full ./... green.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+39
-2
@@ -64,6 +64,13 @@ type phaseDeps struct {
|
||||
stepObserver func(agent.Step)
|
||||
steer func() []llm.Message
|
||||
rec RunRecorder
|
||||
// checkpointer records phase-boundary progress (completed phases) for durable
|
||||
// recovery; nil = non-durable. resume carries a recovered run's completed
|
||||
// phases so they are skipped on re-run. Phase recovery is boundary-granular:
|
||||
// the interrupted (active) phase re-runs from its start (its mid-phase
|
||||
// transcript is NOT resumed — only the single-loop path resumes mid-loop).
|
||||
checkpointer Checkpointer
|
||||
resume *ResumeState
|
||||
}
|
||||
|
||||
// runPhases executes ra.Phases sequentially and returns a synthetic agent.Result
|
||||
@@ -73,10 +80,22 @@ type phaseDeps struct {
|
||||
// deadline/critic-kill — returns the error.
|
||||
func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phaseDeps, query string, images []llm.ImagePart) (*agent.Result, error) {
|
||||
outputs := make(map[string]string, len(ra.Phases))
|
||||
var completed []PhaseOutput
|
||||
var lastResult *agent.Result
|
||||
var lastOutput string
|
||||
var totalUsage llm.Usage
|
||||
|
||||
// Resume: pre-populate from the saved checkpoint so already-finished phases are
|
||||
// skipped. The interrupted (active) phase is NOT pre-populated, so it re-runs
|
||||
// from its start (boundary-granular recovery).
|
||||
if deps.resume != nil {
|
||||
for _, pc := range deps.resume.CompletedPhases {
|
||||
outputs[pc.Name] = pc.Output
|
||||
completed = append(completed, pc)
|
||||
lastOutput = pc.Output
|
||||
}
|
||||
}
|
||||
|
||||
// finish stamps the aggregated usage + final output onto the synthetic result.
|
||||
finish := func(err error) (*agent.Result, error) {
|
||||
if lastResult == nil {
|
||||
@@ -90,6 +109,12 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
|
||||
}
|
||||
|
||||
for i, phase := range ra.Phases {
|
||||
// Skip phases already completed on a resumed run (key presence, not output
|
||||
// emptiness — a legitimately-empty phase output still counts as done).
|
||||
if _, done := outputs[phase.Name]; done {
|
||||
lastOutput = outputs[phase.Name]
|
||||
continue
|
||||
}
|
||||
// A killed/timed-out/cancelled run must not start its next phase.
|
||||
if err := runCtx.Err(); err != nil {
|
||||
return finish(err)
|
||||
@@ -151,6 +176,16 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
|
||||
|
||||
outputs[phase.Name] = output
|
||||
lastOutput = output
|
||||
// Checkpoint the phase boundary: this phase is done, so a resumed run skips
|
||||
// it and continues from the next. (Copy the slice — the checkpointer may
|
||||
// hold/serialize it asynchronously.)
|
||||
completed = append(completed, PhaseOutput{Name: phase.Name, Output: output})
|
||||
if deps.checkpointer != nil {
|
||||
_ = deps.checkpointer.Save(runCtx, RunCheckpointState{
|
||||
CompletedPhases: append([]PhaseOutput(nil), completed...),
|
||||
ActivePhase: "",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return finish(nil)
|
||||
@@ -192,11 +227,13 @@ func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps ph
|
||||
maxIter = deps.baseMaxIter
|
||||
}
|
||||
// Per-phase opts: a fixed step ceiling for this phase (the critic's dynamic
|
||||
// ceiling is intentionally not propagated to phases) + the phase toolbox, on
|
||||
// top of the shared opts (tool-error limits, step observer, compactor).
|
||||
// ceiling is intentionally not propagated to phases) + the phase toolbox + the
|
||||
// shared step observer (audit/steps/critic), on top of the shared opts
|
||||
// (tool-error limits, compactor).
|
||||
opts := append([]agent.Option{
|
||||
agent.WithToolbox(toolbox),
|
||||
agent.WithMaxSteps(maxIter),
|
||||
agent.WithStepObserver(deps.stepObserver),
|
||||
}, deps.sharedOpts...)
|
||||
ag := agent.New(model, system, opts...)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user