feat(run): durable checkpoint + resume (wire Ports.Checkpointer)
The kernel defined run.Ports.Checkpointer + the checkpoint battery but never drove them (the documented "P2 follow-up"). This wires durable recovery into the run loop so a run interrupted by shutdown can resume on the next boot instead of being lost — the executus-side half of mort's durable-agent-recovery parity (mort #1355).

Kernel (run/):
- Ports.Checkpointer is now a CheckpointerFactory (Begin per run → a per-run Checkpointer, or nil for a non-durable run). The single per-instance Checkpointer couldn't distinguish runs; a factory mints one per run, matching mort's agentexec.CheckpointerFactory.
- RunInfo gains GuildID + ModelTier (so the factory can build resume meta); RunCheckpointState gains CompletedPhases + ActivePhase (+ PhaseOutput).
- run/checkpoint.go: ResumeState + WithResumeState / WithExistingCheckpointer context carriers, classifyCheckpointOutcome (success→Complete, shutdown→leave for boot recovery, else→Fail using run.ErrShutdown), and finalizeCheckpoint.
- run/executor.go: resolve the per-run checkpointer (existing-from-ctx on a recovery re-run, else factory.Begin); single-loop wraps the step observer to accumulate the transcript + Save each step (host throttles), and a recovered run seeds the saved transcript via WithHistory and continues with no new input; finalize on exit.
- run/phases.go: phase-boundary checkpointing — record completed phases after each phase; a resumed run skips already-completed phases (the interrupted phase re-runs from its start — boundary-granular, documented; only the single-loop path resumes mid-loop).

Battery (checkpoint/): NewFactory wires the battery into the factory port (per-run handle, meta derived from RunInfo); RunCheckpoint + handle.Save carry the phase fields.

Tests (run/checkpoint_test.go): the finalize decision matrix; single-loop Save+Complete; terminal-error Fail; resume seeds history; phase-boundary Saves completed phases; resume skips completed phases.

Full ./... green.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,9 +4,9 @@
|
||||
// run.Ports.Checkpointer.
|
||||
//
|
||||
// Mort backs CheckpointStore with its durable-job table; Memory() is the
|
||||
// zero-dependency default; contrib/store can add a SQLite one. NOTE: the
|
||||
// executor's call into run.Ports.Checkpointer is a P2 follow-up — this battery
|
||||
// provides the seam + impls ahead of that wiring.
|
||||
// zero-dependency default; contrib/store can add a SQLite one. The executor calls
|
||||
// run.Ports.Checkpointer (a CheckpointerFactory) during the run loop; NewFactory
|
||||
// wires this battery into that seam.
|
||||
package checkpoint
|
||||
|
||||
import (
|
||||
@@ -14,6 +14,8 @@ import (
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
||||
)
|
||||
|
||||
// RunCheckpointMeta is the run attribution needed to resume a run from scratch
|
||||
@@ -32,11 +34,12 @@ type RunCheckpointMeta struct {
|
||||
|
||||
// RunCheckpoint is one persisted snapshot of a run's resumable progress.
|
||||
type RunCheckpoint struct {
|
||||
Meta RunCheckpointMeta
|
||||
Messages []llm.Message // conversation so far
|
||||
Iteration int // completed agent-loop iterations
|
||||
ActivePhase string // current phase name (multi-phase agents); "" otherwise
|
||||
UpdatedAt time.Time
|
||||
Meta RunCheckpointMeta
|
||||
Messages []llm.Message // conversation so far (single-loop or active phase)
|
||||
Iteration int // completed agent-loop iterations
|
||||
CompletedPhases []run.PhaseOutput // finished phases, in order (multi-phase agents)
|
||||
ActivePhase string // current phase name (multi-phase agents); "" otherwise
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
// CheckpointStore persists run checkpoints keyed by run id. A live checkpoint
|
||||
|
||||
+44
-4
@@ -54,10 +54,12 @@ func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
|
||||
// caller believes was saved. (A run drives one Save goroutine, so the brief
|
||||
// unguarded window here can't double-write.)
|
||||
if err := h.store.Save(ctx, RunCheckpoint{
|
||||
Meta: h.meta,
|
||||
Messages: st.Messages,
|
||||
Iteration: st.Iteration,
|
||||
UpdatedAt: now,
|
||||
Meta: h.meta,
|
||||
Messages: st.Messages,
|
||||
Iteration: st.Iteration,
|
||||
CompletedPhases: st.CompletedPhases,
|
||||
ActivePhase: st.ActivePhase,
|
||||
UpdatedAt: now,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -81,3 +83,41 @@ var _ run.Checkpointer = noop{}
|
||||
func (noop) Save(context.Context, run.RunCheckpointState) error { return nil }
|
||||
func (noop) Complete(context.Context) error { return nil }
|
||||
func (noop) Fail(context.Context, error) error { return nil }
|
||||
|
||||
// factory is a run.CheckpointerFactory that mints a per-run handle over store,
|
||||
// deriving the per-run meta from the kernel's RunInfo. It is the battery's glue
|
||||
// for the Ports.Checkpointer (factory) seam: every run becomes durable (the
|
||||
// store persists snapshots; a host wanting lazy/short-run skipping uses its own
|
||||
// factory, as mort does over its durable-job table).
|
||||
type factory struct {
|
||||
store CheckpointStore
|
||||
throttle time.Duration
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
var _ run.CheckpointerFactory = (*factory)(nil)
|
||||
|
||||
// NewFactory returns a run.CheckpointerFactory backed by store: each run gets a
|
||||
// per-run Checkpointer (throttled to at most once per throttle). A nil store
|
||||
// yields factory.Begin returning a no-op Checkpointer.
|
||||
func NewFactory(store CheckpointStore, throttle time.Duration) run.CheckpointerFactory {
|
||||
return &factory{store: store, throttle: throttle}
|
||||
}
|
||||
|
||||
// Begin mints the per-run Checkpointer. The prompt is read from
|
||||
// info.Inputs["prompt"] when present so a recovered run can re-dispatch.
|
||||
func (f *factory) Begin(_ context.Context, info run.RunInfo) (run.Checkpointer, error) {
|
||||
prompt, _ := info.Inputs["prompt"].(string)
|
||||
meta := RunCheckpointMeta{
|
||||
RunID: info.RunID,
|
||||
AgentID: info.SubjectID,
|
||||
AgentName: info.Name,
|
||||
CallerID: info.CallerID,
|
||||
ChannelID: info.ChannelID,
|
||||
GuildID: info.GuildID,
|
||||
Prompt: prompt,
|
||||
ModelTier: info.ModelTier,
|
||||
ParentRunID: info.ParentRunID,
|
||||
}
|
||||
return New(f.store, meta, f.throttle, f.now), nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user