Files
executus/checkpoint/handle.go
T
steve 38d656ec71
executus CI / test (pull_request) Successful in 45s
fix(run): address gadfly review of the checkpoint PR
Real findings from the consensus review (44 raw; heavy devstral noise):

- finalizeCheckpoint is now fired from the top-of-Run defer, so it runs on
  EVERY exit: a panic, an early build-error return (before the run loop), AND
  normal completion. Previously an early return on a recovered run left its
  durable record unfinalized → boot recovery would retry it forever on a
  persistent build error. (opus + glm)
- Removed the dead ActivePhase field from run.RunCheckpointState +
  run.ResumeState (and the battery RunCheckpoint) — phase recovery is
  boundary-granular (skip completed phases; the interrupted phase re-runs from
  its start), so ActivePhase was never written nor read. Docs across
  ports/checkpoint/phases now state this plainly (5-model consensus that the
  field + docs over-promised mid-phase resume).
- CheckpointerFactory.Begin error is now logged (WARN) before degrading to
  non-durable, per the documented contract (was silently swallowed). (4 models)
- finalizeCheckpoint logs Complete/Fail errors (was silent).
- Resume phase-skip now keys off a SEPARATE resumeSkip set, not the live
  outputs map — a fresh run with two same-named phases no longer skips the
  second (the outputs map fills as phases run). (opus:max) + regression test.
- Removed the dead checkpoint.factory.now field (never set). (opus + glm)
- Fixed the stale phaseDeps doc (the step observer moved out of sharedOpts to
  per-path). Hoisted the resume guard to a local; dropped the wasted acc
  allocation on the resume path; documented that Save throttling is the
  Checkpointer's responsibility and the accumulated transcript is pre-compaction
  (host size-caps it).

Note (carried from the PR): classifyCheckpointOutcome keys shutdown on
run.ErrShutdown; mort stamps its own runengine.ErrShutdown — the mort wiring PR
aliases them so errors.Is matches.

New test: duplicate phase names both run on a fresh run. Full ./... green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 16:34:42 -04:00

122 lines
4.2 KiB
Go

package checkpoint
import (
"context"
"sync"
"time"
"gitea.stevedudenhoeffer.com/steve/executus/run"
)
// handle is a per-run run.Checkpointer bound to one run's id + meta. Save writes
// a fresh snapshot (throttled), Complete/Fail delete the checkpoint (a cleanly
// finished or terminally failed run is NOT a recovery candidate). A run
// interrupted by shutdown never calls Complete/Fail, so its checkpoint survives
// for ListInterrupted at boot.
type handle struct {
store CheckpointStore
meta RunCheckpointMeta
throttle time.Duration
now func() time.Time
mu sync.Mutex
lastSave time.Time
}
var _ run.Checkpointer = (*handle)(nil)
// New returns a run.Checkpointer that persists snapshots of the run identified
// by meta.RunID to store, no more often than throttle (Save calls inside the
// window are skipped). A nil store yields a no-op Checkpointer. throttle <= 0
// saves every call; now defaults to time.Now.
func New(store CheckpointStore, meta RunCheckpointMeta, throttle time.Duration, now func() time.Time) run.Checkpointer {
if store == nil {
return noop{}
}
if now == nil {
now = time.Now
}
return &handle{store: store, meta: meta, throttle: throttle, now: now}
}
func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
h.mu.Lock()
now := h.now()
if h.throttle > 0 && !h.lastSave.IsZero() && now.Sub(h.lastSave) < h.throttle {
h.mu.Unlock()
return nil // throttled — a more recent snapshot will land shortly
}
h.mu.Unlock()
// Advance the throttle clock only AFTER a successful persist. If the store
// write fails, lastSave stays put so the next Save isn't throttled away —
// otherwise a transient store error would silently drop the snapshot the
// caller believes was saved. (A run drives one Save goroutine, so the brief
// unguarded window here can't double-write.)
if err := h.store.Save(ctx, RunCheckpoint{
Meta: h.meta,
Messages: st.Messages,
Iteration: st.Iteration,
CompletedPhases: st.CompletedPhases,
UpdatedAt: now,
}); err != nil {
return err
}
h.mu.Lock()
if now.After(h.lastSave) {
h.lastSave = now
}
h.mu.Unlock()
return nil
}
func (h *handle) Complete(ctx context.Context) error { return h.store.Delete(ctx, h.meta.RunID) }
func (h *handle) Fail(ctx context.Context, _ error) error { return h.store.Delete(ctx, h.meta.RunID) }
// noop is the nil-store Checkpointer: every method is a successful no-op.
type noop struct{}
var _ run.Checkpointer = noop{}
func (noop) Save(context.Context, run.RunCheckpointState) error { return nil }
func (noop) Complete(context.Context) error { return nil }
func (noop) Fail(context.Context, error) error { return nil }
// factory is a run.CheckpointerFactory that mints a per-run handle over store,
// deriving the per-run meta from the kernel's RunInfo. It is the battery's glue
// for the Ports.Checkpointer (factory) seam: every run becomes durable (the
// store persists snapshots; a host wanting lazy/short-run skipping uses its own
// factory, as mort does over its durable-job table).
type factory struct {
store CheckpointStore
throttle time.Duration
}
var _ run.CheckpointerFactory = (*factory)(nil)
// NewFactory returns a run.CheckpointerFactory backed by store: each run gets a
// per-run Checkpointer (throttled to at most once per throttle). A nil store
// yields factory.Begin returning a no-op Checkpointer.
func NewFactory(store CheckpointStore, throttle time.Duration) run.CheckpointerFactory {
return &factory{store: store, throttle: throttle}
}
// Begin mints the per-run Checkpointer. The prompt is read from
// info.Inputs["prompt"] when present so a recovered run can re-dispatch.
func (f *factory) Begin(_ context.Context, info run.RunInfo) (run.Checkpointer, error) {
prompt, _ := info.Inputs["prompt"].(string)
meta := RunCheckpointMeta{
RunID: info.RunID,
AgentID: info.SubjectID,
AgentName: info.Name,
CallerID: info.CallerID,
ChannelID: info.ChannelID,
GuildID: info.GuildID,
Prompt: prompt,
ModelTier: info.ModelTier,
ParentRunID: info.ParentRunID,
}
return New(f.store, meta, f.throttle, nil /* now defaults to time.Now */), nil
}