P4c: remaining batteries — checkpoint + schedule + critic
executus CI / test (push) Failing after 1m6s
executus CI / test (push) Failing after 1m6s
Completes the P4 battery set (squashed onto main from phase-4c-batteries).

- checkpoint/: run.Checkpointer durable-resume (CheckpointStore + throttled handle + Memory).
- schedule/: generic cron Runner (Tick/Loop; no cron grammar of its own).
- critic/: two-tier timeout watchdog (run.Critic) + Escalator policy seam + ExtendOnce default.

Includes the verified gadfly #6 fixes (ExtendOnce per-run, Kill-sticky, watch panic-recovery; checkpoint throttle-after-success; schedule Next-before-Run + nil-guard + Loop recovery).

P4 battery set complete: audit, budget, persona, skill, checkpoint, schedule, critic — each nil-safe, each with a default, each core-import-clean. Executor wiring for Critic/Checkpointer remains a P2 follow-up.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
package checkpoint
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
||||
)
|
||||
|
||||
// handle is a per-run run.Checkpointer bound to one run's id + meta. Save writes
|
||||
// a fresh snapshot (throttled), Complete/Fail delete the checkpoint (a cleanly
|
||||
// finished or terminally failed run is NOT a recovery candidate). A run
|
||||
// interrupted by shutdown never calls Complete/Fail, so its checkpoint survives
|
||||
// for ListInterrupted at boot.
|
||||
type handle struct {
|
||||
store CheckpointStore
|
||||
meta RunCheckpointMeta
|
||||
throttle time.Duration
|
||||
now func() time.Time
|
||||
|
||||
mu sync.Mutex
|
||||
lastSave time.Time
|
||||
}
|
||||
|
||||
var _ run.Checkpointer = (*handle)(nil)
|
||||
|
||||
// New returns a run.Checkpointer that persists snapshots of the run identified
|
||||
// by meta.RunID to store, no more often than throttle (Save calls inside the
|
||||
// window are skipped). A nil store yields a no-op Checkpointer. throttle <= 0
|
||||
// saves every call; now defaults to time.Now.
|
||||
func New(store CheckpointStore, meta RunCheckpointMeta, throttle time.Duration, now func() time.Time) run.Checkpointer {
|
||||
if store == nil {
|
||||
return noop{}
|
||||
}
|
||||
if now == nil {
|
||||
now = time.Now
|
||||
}
|
||||
return &handle{store: store, meta: meta, throttle: throttle, now: now}
|
||||
}
|
||||
|
||||
func (h *handle) Save(ctx context.Context, st run.RunCheckpointState) error {
|
||||
h.mu.Lock()
|
||||
now := h.now()
|
||||
if h.throttle > 0 && !h.lastSave.IsZero() && now.Sub(h.lastSave) < h.throttle {
|
||||
h.mu.Unlock()
|
||||
return nil // throttled — a more recent snapshot will land shortly
|
||||
}
|
||||
h.mu.Unlock()
|
||||
|
||||
// Advance the throttle clock only AFTER a successful persist. If the store
|
||||
// write fails, lastSave stays put so the next Save isn't throttled away —
|
||||
// otherwise a transient store error would silently drop the snapshot the
|
||||
// caller believes was saved. (A run drives one Save goroutine, so the brief
|
||||
// unguarded window here can't double-write.)
|
||||
if err := h.store.Save(ctx, RunCheckpoint{
|
||||
Meta: h.meta,
|
||||
Messages: st.Messages,
|
||||
Iteration: st.Iteration,
|
||||
UpdatedAt: now,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
h.mu.Lock()
|
||||
if now.After(h.lastSave) {
|
||||
h.lastSave = now
|
||||
}
|
||||
h.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Complete removes this run's checkpoint from the store, keyed by meta.RunID,
// and returns whatever error the store's Delete reports.
func (h *handle) Complete(ctx context.Context) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
|
||||
|
||||
// Fail removes this run's checkpoint from the store, keyed by meta.RunID, and
// returns whatever error the store's Delete reports. The run's own failure
// error is accepted to satisfy the interface but is not used.
func (h *handle) Fail(ctx context.Context, _ error) error {
	runID := h.meta.RunID
	return h.store.Delete(ctx, runID)
}
|
||||
|
||||
// noop is the nil-store Checkpointer: every method is a successful no-op.
|
||||
type noop struct{}
|
||||
|
||||
var _ run.Checkpointer = noop{}
|
||||
|
||||
func (noop) Save(context.Context, run.RunCheckpointState) error { return nil }
|
||||
func (noop) Complete(context.Context) error { return nil }
|
||||
func (noop) Fail(context.Context, error) error { return nil }
|
||||
Reference in New Issue
Block a user