899059a791
The kernel defined run.Ports.Checkpointer + the checkpoint battery but never drove them (the documented "P2 follow-up"). This wires durable recovery into the run loop so a run interrupted by shutdown can resume on the next boot instead of being lost — the executus-side half of mort's durable-agent-recovery parity (mort #1355). Kernel (run/): - Ports.Checkpointer is now a CheckpointerFactory (Begin per run → a per-run Checkpointer, or nil for a non-durable run). The single per-instance Checkpointer couldn't distinguish runs; a factory mints one per run, matching mort's agentexec.CheckpointerFactory. - RunInfo gains GuildID + ModelTier (so the factory can build resume meta); RunCheckpointState gains CompletedPhases + ActivePhase (+ PhaseOutput). - run/checkpoint.go: ResumeState + WithResumeState / WithExistingCheckpointer context carriers, classifyCheckpointOutcome (success→Complete, shutdown→leave for boot recovery, else→Fail using run.ErrShutdown), and finalizeCheckpoint. - run/executor.go: resolve the per-run checkpointer (existing-from-ctx on a recovery re-run, else factory.Begin); single-loop wraps the step observer to accumulate the transcript + Save each step (host throttles), and a recovered run seeds the saved transcript via WithHistory and continues with no new input; finalize on exit. - run/phases.go: phase-boundary checkpointing — record completed phases after each phase; a resumed run skips already-completed phases (the interrupted phase re-runs from its start — boundary-granular, documented; only the single-loop path resumes mid-loop). Battery (checkpoint/): NewFactory wires the battery into the factory port (per-run handle, meta derived from RunInfo); RunCheckpoint + handle.Save carry the phase fields. Tests (run/checkpoint_test.go): the finalize decision matrix; single-loop Save+Complete; terminal-error Fail; resume seeds history; phase-boundary Saves completed phases; resume skips completed phases. Full ./... green. 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
124 lines
4.2 KiB
Go
124 lines
4.2 KiB
Go
package checkpoint
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
|
)
|
|
|
|
// handle is a per-run run.Checkpointer bound to one run's id + meta. Save writes
|
|
// a fresh snapshot (throttled), Complete/Fail delete the checkpoint (a cleanly
|
|
// finished or terminally failed run is NOT a recovery candidate). A run
|
|
// interrupted by shutdown never calls Complete/Fail, so its checkpoint survives
|
|
// for ListInterrupted at boot.
|
|
type handle struct {
|
|
store CheckpointStore
|
|
meta RunCheckpointMeta
|
|
throttle time.Duration
|
|
now func() time.Time
|
|
|
|
mu sync.Mutex
|
|
lastSave time.Time
|
|
}
|
|
|
|
// Compile-time assertion that *handle implements run.Checkpointer.
var _ run.Checkpointer = (*handle)(nil)
|
|
|
|
// New returns a run.Checkpointer that persists snapshots of the run identified
|
|
// by meta.RunID to store, no more often than throttle (Save calls inside the
|
|
// window are skipped). A nil store yields a no-op Checkpointer. throttle <= 0
|
|
// saves every call; now defaults to time.Now.
|
|
func New(store CheckpointStore, meta RunCheckpointMeta, throttle time.Duration, now func() time.Time) run.Checkpointer {
|
|
if store == nil {
|
|
return noop{}
|
|
}
|
|
if now == nil {
|
|
now = time.Now
|
|
}
|
|
return &handle{store: store, meta: meta, throttle: throttle, now: now}
|
|
}
|
|
|
|
func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
|
|
h.mu.Lock()
|
|
now := h.now()
|
|
if h.throttle > 0 && !h.lastSave.IsZero() && now.Sub(h.lastSave) < h.throttle {
|
|
h.mu.Unlock()
|
|
return nil // throttled — a more recent snapshot will land shortly
|
|
}
|
|
h.mu.Unlock()
|
|
|
|
// Advance the throttle clock only AFTER a successful persist. If the store
|
|
// write fails, lastSave stays put so the next Save isn't throttled away —
|
|
// otherwise a transient store error would silently drop the snapshot the
|
|
// caller believes was saved. (A run drives one Save goroutine, so the brief
|
|
// unguarded window here can't double-write.)
|
|
if err := h.store.Save(ctx, RunCheckpoint{
|
|
Meta: h.meta,
|
|
Messages: st.Messages,
|
|
Iteration: st.Iteration,
|
|
CompletedPhases: st.CompletedPhases,
|
|
ActivePhase: st.ActivePhase,
|
|
UpdatedAt: now,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
h.mu.Lock()
|
|
if now.After(h.lastSave) {
|
|
h.lastSave = now
|
|
}
|
|
h.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// Complete deletes the checkpoint stored under this run's id and returns the
// store's Delete result. A cleanly finished run is not a recovery candidate,
// so nothing is kept.
func (h *handle) Complete(ctx context.Context) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
|
|
|
|
// Fail deletes the checkpoint stored under this run's id and returns the
// store's Delete result. The error that ended the run is not used: a
// terminally failed run is not a recovery candidate, so nothing is kept.
func (h *handle) Fail(ctx context.Context, _ error) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
|
|
|
|
// noop is the nil-store Checkpointer: every method is a successful no-op.
|
|
type noop struct{}
|
|
|
|
var _ run.Checkpointer = noop{}
|
|
|
|
func (noop) Save(context.Context, run.RunCheckpointState) error { return nil }
|
|
func (noop) Complete(context.Context) error { return nil }
|
|
func (noop) Fail(context.Context, error) error { return nil }
|
|
|
|
// factory is a run.CheckpointerFactory that mints a per-run handle over store,
|
|
// deriving the per-run meta from the kernel's RunInfo. It is the battery's glue
|
|
// for the Ports.Checkpointer (factory) seam: every run becomes durable (the
|
|
// store persists snapshots; a host wanting lazy/short-run skipping uses its own
|
|
// factory, as mort does over its durable-job table).
|
|
type factory struct {
|
|
store CheckpointStore
|
|
throttle time.Duration
|
|
now func() time.Time
|
|
}
|
|
|
|
// Compile-time assertion that *factory implements run.CheckpointerFactory.
var _ run.CheckpointerFactory = (*factory)(nil)
|
|
|
|
// NewFactory returns a run.CheckpointerFactory backed by store: each run gets a
|
|
// per-run Checkpointer (throttled to at most once per throttle). A nil store
|
|
// yields factory.Begin returning a no-op Checkpointer.
|
|
func NewFactory(store CheckpointStore, throttle time.Duration) run.CheckpointerFactory {
|
|
return &factory{store: store, throttle: throttle}
|
|
}
|
|
|
|
// Begin mints the per-run Checkpointer. The prompt is read from
|
|
// info.Inputs["prompt"] when present so a recovered run can re-dispatch.
|
|
func (f *factory) Begin(_ context.Context, info run.RunInfo) (run.Checkpointer, error) {
|
|
prompt, _ := info.Inputs["prompt"].(string)
|
|
meta := RunCheckpointMeta{
|
|
RunID: info.RunID,
|
|
AgentID: info.SubjectID,
|
|
AgentName: info.Name,
|
|
CallerID: info.CallerID,
|
|
ChannelID: info.ChannelID,
|
|
GuildID: info.GuildID,
|
|
Prompt: prompt,
|
|
ModelTier: info.ModelTier,
|
|
ParentRunID: info.ParentRunID,
|
|
}
|
|
return New(f.store, meta, f.throttle, f.now), nil
|
|
}
|