Files
executus/checkpoint/handle.go
T
steve eea84e6e2c
executus CI / test (pull_request) Successful in 1m39s
fix: address verified gadfly P4c findings (3-cloud fleet)
critic (all 3 models — HIGH):
- ExtendOnce was a single global one-shot shared across every run a System
  monitors, so only the FIRST run to stall got its extension and all others
  were killed by the backstop. Key the fired-state per run (RunInfo.RunID).
- Kill is now sticky: a `killed` flag short-circuits later ticks so a wavering
  Escalator returning ExtendBy after a Kill can't un-collapse the deadline; a
  Kill paired with Nudge/ExtendBy ignores the latter.
- watch() recovers panics from a misbehaving Escalator (logs; the run falls
  back to its existing deadline) instead of silently killing the watch goroutine.

checkpoint (deepseek — HIGH): handle.Save advanced the throttle clock BEFORE
the store write, so after a failed write the next Save inside the throttle
window was silently skipped (it returned nil, so the caller believed the
snapshot had persisted). Advance lastSave only after a successful persist.

schedule (all 3): compute Next BEFORE Run — a permanently-unparseable cron now
skips the job entirely instead of re-running it every tick forever; nil required
callbacks return a validate() error instead of a first-tick nil panic; Loop
recovers tick panics; the Mark-failure => possible-re-run trade-off is documented
(Run must be idempotent). + tests for each.

Triaged-but-kept: critic backstopMul<=1 floor (it's a total-runtime multiple, so
a floor >1 is intentional, not the reported footgun); checkpoint Load (nil,nil)
on miss (documented convention).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:32:27 -04:00

84 lines
2.6 KiB
Go

package checkpoint
import (
"context"
"sync"
"time"
"gitea.stevedudenhoeffer.com/steve/executus/run"
)
// handle is a per-run run.Checkpointer bound to one run's id + meta. Save writes
// a fresh snapshot (throttled), Complete/Fail delete the checkpoint (a cleanly
// finished or terminally failed run is NOT a recovery candidate). A run
// interrupted by shutdown never calls Complete/Fail, so its checkpoint survives
// for ListInterrupted at boot.
type handle struct {
store CheckpointStore
meta RunCheckpointMeta
throttle time.Duration
now func() time.Time
mu sync.Mutex
lastSave time.Time
}
var _ run.Checkpointer = (*handle)(nil)
// New builds the run.Checkpointer for the run identified by meta.RunID.
//
// Snapshots are persisted to store at most once per throttle interval; Save
// calls that arrive inside the window are skipped. A throttle <= 0 disables
// throttling so every Save is written. A nil store yields a no-op
// Checkpointer whose methods all succeed without doing anything. A nil now
// falls back to time.Now.
func New(store CheckpointStore, meta RunCheckpointMeta, throttle time.Duration, now func() time.Time) run.Checkpointer {
	if store == nil {
		return noop{}
	}

	// Start from the wall clock and override it only when the caller injected
	// its own clock.
	h := &handle{
		store:    store,
		meta:     meta,
		throttle: throttle,
		now:      time.Now,
	}
	if now != nil {
		h.now = now
	}
	return h
}
// Save persists st as this run's latest checkpoint.
//
// When throttling is enabled (throttle > 0) and a previous save succeeded less
// than throttle ago, the call is skipped and nil is returned without touching
// the store. Otherwise a snapshot made of the handle's meta, st.Messages and
// st.Iteration, timestamped with the handle's clock, is handed to the store;
// a store error is returned unchanged.
//
// The throttle clock moves forward only after the store accepts the write, so
// a failed save never causes the next attempt to be throttled away.
func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
	// Read the clock and the last-save mark together under the lock, then
	// decide on the copies so the lock is not held across the store write.
	h.mu.Lock()
	stamp := h.now()
	previous := h.lastSave
	h.mu.Unlock()

	if h.throttle > 0 && !previous.IsZero() && stamp.Sub(previous) < h.throttle {
		return nil // throttled — a more recent snapshot will land shortly
	}

	// The write runs unlocked. The original author's note is that a run drives
	// a single Save goroutine, so this window cannot double-write.
	// NOTE(review): concurrent callers on one handle could both pass the
	// throttle check above — confirm the single-goroutine assumption holds.
	snapshot := RunCheckpoint{
		Meta:      h.meta,
		Messages:  st.Messages,
		Iteration: st.Iteration,
		UpdatedAt: stamp,
	}
	if err := h.store.Save(ctx, snapshot); err != nil {
		// lastSave is deliberately left untouched so a retry is not throttled.
		return err
	}

	h.mu.Lock()
	defer h.mu.Unlock()
	// Never move the mark backwards if a later timestamp was already recorded.
	if h.lastSave.Before(stamp) {
		h.lastSave = stamp
	}
	return nil
}
// Complete removes this run's checkpoint from the store, keyed by meta.RunID,
// and returns whatever error the store's Delete reports. A cleanly finished
// run is not a recovery candidate, so its snapshot is dropped.
func (h *handle) Complete(ctx context.Context) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
// Fail removes this run's checkpoint from the store, keyed by meta.RunID, and
// returns whatever error the store's Delete reports. The failure cause passed
// in is ignored: a terminally failed run is not a recovery candidate, so its
// snapshot is dropped regardless of why it failed.
func (h *handle) Fail(ctx context.Context, _ error) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
// noop is the Checkpointer New returns when it is given a nil store. It holds
// no state, and every method reports success without doing any work.
type noop struct{}

// Compile-time check that noop satisfies run.Checkpointer.
var _ run.Checkpointer = noop{}

// Save discards the snapshot and reports success.
func (noop) Save(_ context.Context, _ run.RunCheckpointState) error {
	return nil
}

// Complete does nothing and reports success.
func (noop) Complete(_ context.Context) error {
	return nil
}

// Fail ignores the failure cause and reports success.
func (noop) Fail(_ context.Context, _ error) error {
	return nil
}