P4c: remaining batteries — checkpoint + schedule + critic
executus CI / test (push) Failing after 1m6s
executus CI / test (push) Failing after 1m6s
Completes the P4 battery set (squashed onto main from phase-4c-batteries). - checkpoint/: run.Checkpointer durable-resume (CheckpointStore + throttled handle + Memory). - schedule/: generic cron Runner (Tick/Loop; no cron grammar of its own). - critic/: two-tier timeout watchdog (run.Critic) + Escalator policy seam + ExtendOnce default. Includes the verified gadfly #6 fixes (ExtendOnce per-run, Kill-sticky, watch panic-recovery; checkpoint throttle-after-success; schedule Next-before-Run + nil-guard + Loop recovery). P4 battery set complete: audit, budget, persona, skill, checkpoint, schedule, critic — each nil-safe, each with a default, each core-import-clean. Executor wiring for Critic/Checkpointer remains a P2 follow-up. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,132 @@
|
||||
// Package schedule is the cron-runner battery: a generic ticker that, each
// interval, asks a store for the jobs whose next-run time has passed, runs each
// one, and stamps its next fire time. It is host-agnostic orchestration — the
// host wires the store (skill.SkillStore.ListDueScheduled /
// persona.Storage.ListScheduledAgents), the run (run.Executor), and the cron
// "next fire" function (a cron library, or skill's schedule parser). The
// battery owns no cron grammar of its own, so it never duplicates the parser.
|
||||
package schedule
|
||||
|
||||
import (
	"context"
	"errors"
	"log/slog"
	"runtime/debug"
	"time"
)
|
||||
|
||||
// Due describes a single job that is ready to be scheduled.
type Due struct {
	// ID identifies the job; Cron is the job's cron expression.
	ID, Cron string
}
|
||||
|
||||
// Runner periodically fires due jobs. Every func field is required except Now
|
||||
// (defaults to time.Now) and Logger (defaults to slog.Default). Construct the
|
||||
// struct directly and call Loop (or Tick for a single pass / tests).
|
||||
type Runner struct {
|
||||
// Interval is how often Loop checks for due jobs. <= 0 defaults to 1m.
|
||||
Interval time.Duration
|
||||
// Due lists the jobs due at now.
|
||||
Due func(ctx context.Context, now time.Time) ([]Due, error)
|
||||
// Run executes one job by id.
|
||||
Run func(ctx context.Context, id string) error
|
||||
// Mark records that a job ran at ranAt and is next due at nextAt.
|
||||
Mark func(ctx context.Context, id string, ranAt, nextAt time.Time) error
|
||||
// Next computes a cron expression's next fire after a given time.
|
||||
Next func(cron string, after time.Time) (time.Time, error)
|
||||
|
||||
Now func() time.Time
|
||||
Logger *slog.Logger
|
||||
}
|
||||
|
||||
func (r *Runner) now() time.Time {
|
||||
if r.Now != nil {
|
||||
return r.Now()
|
||||
}
|
||||
return time.Now()
|
||||
}
|
||||
|
||||
func (r *Runner) log() *slog.Logger {
|
||||
if r.Logger != nil {
|
||||
return r.Logger
|
||||
}
|
||||
return slog.Default()
|
||||
}
|
||||
|
||||
// Tick runs one pass: every currently-due job is run, then stamped with its
|
||||
// next fire time. A job whose Run or Next errors is logged and skipped (its
|
||||
// next-run time is left unchanged so it stays due and retries next tick) — one
|
||||
// bad job never stalls the others. Returns the error from Due (the only
|
||||
// pass-fatal step).
|
||||
func (r *Runner) Tick(ctx context.Context) error {
|
||||
if err := r.validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
now := r.now()
|
||||
due, err := r.Due(ctx, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, j := range due {
|
||||
// Compute the next fire BEFORE running. A permanently-unparseable cron
|
||||
// then skips the job entirely (logged) rather than running it — an
|
||||
// unstamped job stays due, so checking Next first avoids a hot-loop of
|
||||
// real Run executions every tick.
|
||||
next, err := r.Next(j.Cron, now)
|
||||
if err != nil {
|
||||
r.log().Warn("scheduled job has an unparseable cron; skipping (not run, not rescheduled)", "job", j.ID, "cron", j.Cron, "error", err)
|
||||
continue
|
||||
}
|
||||
if err := r.Run(ctx, j.ID); err != nil {
|
||||
r.log().Warn("scheduled job failed; stays due, will retry next tick", "job", j.ID, "error", err)
|
||||
continue
|
||||
}
|
||||
// A Mark failure leaves the job due, so it re-runs next tick — Run must
|
||||
// be idempotent (there is no atomic run+stamp across two host callbacks).
|
||||
if err := r.Mark(ctx, j.ID, now, next); err != nil {
|
||||
r.log().Warn("failed to stamp next run; job may re-execute next tick (Run must be idempotent)", "job", j.ID, "error", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validate reports a misconfigured Runner (a required callback left nil) as a
|
||||
// clear error rather than a nil-deref panic on first tick.
|
||||
func (r *Runner) validate() error {
|
||||
if r.Due == nil || r.Run == nil || r.Mark == nil || r.Next == nil {
|
||||
return errors.New("schedule: Runner requires non-nil Due, Run, Mark, and Next")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Loop ticks every Interval until ctx is cancelled. A Tick error (the Due
|
||||
// lister failing) is logged and the loop continues — a transient store hiccup
|
||||
// shouldn't kill the scheduler — and a panic from any host callback is
|
||||
// recovered so one bad tick can't silently kill the scheduler goroutine.
|
||||
func (r *Runner) Loop(ctx context.Context) {
|
||||
interval := r.Interval
|
||||
if interval <= 0 {
|
||||
interval = time.Minute
|
||||
}
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
r.safeTick(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Runner) safeTick(ctx context.Context) {
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
r.log().Error("schedule tick panicked; scheduler continues", "panic", rec)
|
||||
}
|
||||
}()
|
||||
if err := r.Tick(ctx); err != nil {
|
||||
r.log().Warn("schedule tick failed", "error", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user