package checkpoint

import (
	"context"
	"sync"
	"time"

	"gitea.stevedudenhoeffer.com/steve/executus/run"
)

// handle is the run.Checkpointer for a single run; it carries that run's meta
// (and therefore its id) alongside the store it writes to.
//
//   - Save persists a new snapshot, subject to the throttle.
//   - Complete and Fail both delete the checkpoint: a run that finished cleanly
//     or failed terminally is NOT a recovery candidate.
//
// A run cut short by shutdown reaches neither Complete nor Fail, so its
// checkpoint is left in the store for ListInterrupted to find at boot.
type handle struct {
	store    CheckpointStore
	meta     RunCheckpointMeta
	throttle time.Duration
	now      func() time.Time

	mu       sync.Mutex
	lastSave time.Time // read and written under mu
}

var _ run.Checkpointer = (*handle)(nil)

// New builds a run.Checkpointer for the run named by meta.RunID, writing its
// snapshots to store. Saves are rate-limited: a Save arriving less than
// throttle after the last successful one is skipped. throttle <= 0 disables
// the limit so every Save is written. A nil now falls back to time.Now, and a
// nil store produces a Checkpointer whose methods all do nothing.
func New(store CheckpointStore, meta RunCheckpointMeta, throttle time.Duration, now func() time.Time) run.Checkpointer {
	if store == nil {
		return noop{}
	}
	clock := now
	if clock == nil {
		clock = time.Now
	}
	return &handle{
		store:    store,
		meta:     meta,
		throttle: throttle,
		now:      clock,
	}
}

// Save writes st to the store as this run's latest checkpoint unless the
// throttle window since the last successful save is still open, in which case
// it returns nil without touching the store. A store error is returned as-is.
func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
	stamp, skip := h.admit()
	if skip {
		return nil // throttled — a more recent snapshot will land shortly
	}

	// The mutex is deliberately not held across the store write, and the
	// throttle clock moves forward only once that write has succeeded. Were it
	// advanced up front, a transient store failure would cause the following
	// Save to be throttled away, silently losing a snapshot the caller thinks
	// was persisted. (A run drives one Save goroutine, so the short unlocked
	// window cannot produce a double write.)
	snapshot := RunCheckpoint{
		Meta:            h.meta,
		Messages:        st.Messages,
		Iteration:       st.Iteration,
		CompletedPhases: st.CompletedPhases,
		ActivePhase:     st.ActivePhase,
		UpdatedAt:       stamp,
	}
	if err := h.store.Save(ctx, snapshot); err != nil {
		return err
	}

	h.markSaved(stamp)
	return nil
}

// admit samples the clock under the lock and reports that timestamp together
// with whether the Save it belongs to falls inside the throttle window.
func (h *handle) admit() (time.Time, bool) {
	h.mu.Lock()
	defer h.mu.Unlock()

	stamp := h.now()
	if h.throttle <= 0 || h.lastSave.IsZero() {
		return stamp, false
	}
	return stamp, stamp.Sub(h.lastSave) < h.throttle
}

// markSaved moves lastSave forward to stamp; it never moves it backwards.
func (h *handle) markSaved(stamp time.Time) {
	h.mu.Lock()
	defer h.mu.Unlock()

	if stamp.After(h.lastSave) {
		h.lastSave = stamp
	}
}

// Complete deletes the run's checkpoint from the store.
func (h *handle) Complete(ctx context.Context) error {
	return h.discard(ctx)
}

// Fail deletes the run's checkpoint from the store; the failure cause is not
// used.
func (h *handle) Fail(ctx context.Context, _ error) error {
	return h.discard(ctx)
}

// discard deletes the checkpoint stored under this run's id.
func (h *handle) discard(ctx context.Context) error {
	return h.store.Delete(ctx, h.meta.RunID)
}

// noop is the Checkpointer handed out when there is no store: each method
// succeeds without doing anything.
type noop struct{}

var _ run.Checkpointer = noop{}

func (noop) Save(context.Context, run.RunCheckpointState) error {
	return nil
}

func (noop) Complete(context.Context) error {
	return nil
}

func (noop) Fail(context.Context, error) error {
	return nil
}

// factory is a run.CheckpointerFactory that creates one handle per run over a
// shared store, building each run's meta from the kernel's RunInfo. It is the
// battery's glue for the Ports.Checkpointer (factory) seam: every run becomes
// durable because the store persists its snapshots. A host that wants lazy or
// short-run skipping supplies its own factory instead, as mort does over its
// durable-job table.
type factory struct {
	store    CheckpointStore
	throttle time.Duration
	now      func() time.Time
}

var _ run.CheckpointerFactory = (*factory)(nil)

// NewFactory returns a run.CheckpointerFactory over store. Every run it begins
// receives its own Checkpointer, throttled to at most one save per throttle.
// With a nil store, factory.Begin returns a no-op Checkpointer.
func NewFactory(store CheckpointStore, throttle time.Duration) run.CheckpointerFactory {
	f := new(factory)
	f.store = store
	f.throttle = throttle
	return f
}

// Begin mints the per-run Checkpointer. The prompt is read from
// info.Inputs["prompt"] when present so a recovered run can re-dispatch.
func (f *factory) Begin(_ context.Context, info run.RunInfo) (run.Checkpointer, error) { prompt, _ := info.Inputs["prompt"].(string) meta := RunCheckpointMeta{ RunID: info.RunID, AgentID: info.SubjectID, AgentName: info.Name, CallerID: info.CallerID, ChannelID: info.ChannelID, GuildID: info.GuildID, Prompt: prompt, ModelTier: info.ModelTier, ParentRunID: info.ParentRunID, } return New(f.store, meta, f.throttle, f.now), nil }