Files
executus/schedule/runner.go
T
steve eea84e6e2c
executus CI / test (pull_request) Successful in 1m39s
fix: address verified gadfly P4c findings (3-cloud fleet)
critic (all 3 models — HIGH):
- ExtendOnce was a single global one-shot shared across every run a System
  monitors, so only the FIRST run to stall got its extension and all others
  were killed by the backstop. Key the fired-state per run (RunInfo.RunID).
- Kill is now sticky: a `killed` flag short-circuits later ticks so a wavering
  Escalator returning ExtendBy after a Kill can't un-collapse the deadline; a
  Kill paired with Nudge/ExtendBy ignores the latter.
- watch() recovers panics from a misbehaving Escalator (logs; the run falls
  back to its existing deadline) instead of silently killing the watch goroutine.

checkpoint (deepseek — HIGH): handle.Save advanced the throttle clock BEFORE
the store write, so a failed save was silently throttled away (caller believes
it persisted). Advance lastSave only after a successful persist.

schedule (all 3): compute Next BEFORE Run — a permanently-unparseable cron now
skips the job entirely instead of re-running it every tick forever; nil required
callbacks return a validate() error instead of a first-tick nil panic; Loop
recovers tick panics; the Mark-failure => possible-re-run trade-off is documented
(Run must be idempotent). + tests for each.

Triaged-but-kept: critic backstopMul<=1 floor (it's a total-runtime multiple, so
a floor >1 is intentional, not the reported footgun); checkpoint Load (nil,nil)
on miss (documented convention).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:32:27 -04:00

133 lines
4.4 KiB
Go

// Package schedule is the cron-runner battery: a generic ticker that, each
// interval, asks a store for the jobs whose next-run time has passed, runs each
// one, and stamps its next fire time. It is host-agnostic orchestration — the
// host wires the store (skill.SkillStore.ListDueScheduled /
// persona.Storage.ListScheduledAgents), the run (run.Executor), and the cron
// "next fire" function (a cron library, or skill's schedule parser). The
// battery owns no cron grammar of its own, so it never duplicates the parser.
package schedule
import (
"context"
"errors"
"log/slog"
"time"
)
// Due identifies a single job that is ready to fire. The scheduler treats both
// fields as opaque values: ID is handed to Runner.Run and Runner.Mark, and
// Cron is handed to Runner.Next to work out the following fire time.
type Due struct {
	// ID is the job identifier.
	ID string
	// Cron is the job's cron expression; this package never parses it itself.
	Cron string
}
// Runner is the scheduler: on every pass it lists the due jobs, runs them, and
// records when each should fire next. There is no constructor — fill in the
// struct and call Loop for a long-running scheduler, or Tick for a single pass
// (handy in tests). Due, Run, Mark, and Next are mandatory; Interval, Now, and
// Logger fall back to defaults when left at their zero values.
type Runner struct {
	// Interval is the period between passes in Loop. Values <= 0 mean 1m.
	Interval time.Duration

	// Due returns the jobs that are due as of now.
	Due func(ctx context.Context, now time.Time) ([]Due, error)

	// Run executes the job with the given id.
	Run func(ctx context.Context, id string) error

	// Mark records that job id ran at ranAt and is next due at nextAt.
	Mark func(ctx context.Context, id string, ranAt time.Time, nextAt time.Time) error

	// Next returns the next fire time of a cron expression after the given
	// instant, or an error if the expression cannot be evaluated.
	Next func(cron string, after time.Time) (time.Time, error)

	// Now supplies the current time. Nil means time.Now.
	Now func() time.Time

	// Logger receives the scheduler's warnings and errors. Nil means
	// slog.Default().
	Logger *slog.Logger
}
// now reports the current time from the injected Now clock, falling back to
// time.Now when no clock was configured.
func (r *Runner) now() time.Time {
	clock := r.Now
	if clock == nil {
		clock = time.Now
	}
	return clock()
}
// log returns the configured Logger, or slog.Default() when none was set.
func (r *Runner) log() *slog.Logger {
	logger := r.Logger
	if logger == nil {
		logger = slog.Default()
	}
	return logger
}
// Tick runs one pass over the jobs that are due right now. For each job it
// computes the next fire time, runs the job, and then stamps it via Mark.
//
// Per-job failures are logged and never abort the pass — one bad job must not
// stall the others:
//   - Next errors: the job is skipped without being run or rescheduled.
//   - Run errors: the job is left unstamped, so it stays due and retries on
//     the next tick.
//   - Mark errors: the job has already run but stays due, so it may run again
//     on the next tick (Run must be idempotent).
//   - A panic in Next, Run, or Mark is recovered and logged for that job only,
//     so the jobs listed after it still get their turn.
//
// Tick returns an error when the Runner is misconfigured (a required callback
// is nil), when Due fails, or when ctx is cancelled part-way through the pass.
// In the last case the jobs that had not been started are not touched.
func (r *Runner) Tick(ctx context.Context) error {
	if err := r.validate(); err != nil {
		return err
	}
	now := r.now()
	due, err := r.Due(ctx, now)
	if err != nil {
		return err
	}
	for _, j := range due {
		// Stop dispatching once the caller has given up: starting further jobs
		// on a dead context only delays shutdown and produces a failure log
		// per remaining job.
		if err := ctx.Err(); err != nil {
			return err
		}
		r.runJob(ctx, j, now)
	}
	return nil
}

// runJob processes a single due job on behalf of Tick. It is a separate
// function so that the deferred recover is scoped to this one job: a panicking
// host callback is logged and Tick moves on to the next job, instead of the
// panic unwinding out of the loop and skipping every job after it.
func (r *Runner) runJob(ctx context.Context, j Due, now time.Time) {
	defer func() {
		if rec := recover(); rec != nil {
			r.log().Error("scheduled job panicked; continuing with remaining jobs", "job", j.ID, "panic", rec)
		}
	}()

	// Compute the next fire BEFORE running. A permanently-unparseable cron
	// then skips the job entirely (logged) rather than running it — an
	// unstamped job stays due, so checking Next first avoids a hot-loop of
	// real Run executions every tick.
	next, err := r.Next(j.Cron, now)
	if err != nil {
		r.log().Warn("scheduled job has an unparseable cron; skipping (not run, not rescheduled)", "job", j.ID, "cron", j.Cron, "error", err)
		return
	}
	if err := r.Run(ctx, j.ID); err != nil {
		r.log().Warn("scheduled job failed; stays due, will retry next tick", "job", j.ID, "error", err)
		return
	}
	// A Mark failure leaves the job due, so it re-runs next tick — Run must
	// be idempotent (there is no atomic run+stamp across two host callbacks).
	if err := r.Mark(ctx, j.ID, now, next); err != nil {
		r.log().Warn("failed to stamp next run; job may re-execute next tick (Run must be idempotent)", "job", j.ID, "error", err)
	}
}
// validate checks that every mandatory callback (Due, Run, Mark, Next) is set,
// so a half-configured Runner surfaces as an ordinary error instead of a
// nil-function panic the first time Tick tries to call it.
func (r *Runner) validate() error {
	if r.Due != nil && r.Run != nil && r.Mark != nil && r.Next != nil {
		return nil
	}
	return errors.New("schedule: Runner requires non-nil Due, Run, Mark, and Next")
}
// Loop runs a pass once per Interval (one minute when Interval <= 0) and
// blocks until ctx is cancelled. The first pass happens one Interval after
// Loop starts, not immediately.
//
// Each pass goes through safeTick, so a Tick error (for example the Due lister
// failing) is logged and the loop continues — a transient store hiccup
// shouldn't kill the scheduler — and a panic from a host callback is recovered
// so one bad tick can't silently kill the scheduler goroutine.
//
// Once ctx is cancelled no new pass is started, even if a tick is already
// pending.
func (r *Runner) Loop(ctx context.Context) {
	interval := r.Interval
	if interval <= 0 {
		interval = time.Minute
	}
	t := time.NewTicker(interval)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			// When both channels are ready select picks one at random, and a
			// tick is usually already queued after a pass that outlasted the
			// interval. Re-check so cancellation always wins over starting
			// another pass.
			if ctx.Err() != nil {
				return
			}
			r.safeTick(ctx)
		}
	}
}
// safeTick runs one Tick on behalf of Loop and makes sure nothing escapes it:
// an error returned by Tick is logged as a warning, and a panic raised during
// the pass is recovered and logged as an error, so the caller keeps running
// either way.
func (r *Runner) safeTick(ctx context.Context) {
	defer func() {
		cause := recover()
		if cause == nil {
			return
		}
		r.log().Error("schedule tick panicked; scheduler continues", "panic", cause)
	}()

	err := r.Tick(ctx)
	if err == nil {
		return
	}
	r.log().Warn("schedule tick failed", "error", err)
}