P4c: remaining batteries — checkpoint + schedule + critic

Completes the P4 battery set (squashed onto main from phase-4c-batteries). - checkpoint/: run.Checkpointer durable-resume (CheckpointStore + throttled handle + Memory). - schedule/: generic cron Runner (Tick/Loop; no cron grammar of its own). - critic/: two-tier timeout watchdog (run.Critic) + Escalator policy seam + ExtendOnce default. Includes the verified gadfly #6 fixes (ExtendOnce per-run, Kill-sticky, watch panic-recovery; checkpoint throttle-after-success; schedule Next-before-Run + nil-guard + Loop recovery). P4 battery set complete: audit, budget, persona, skill, checkpoint, schedule, critic — each nil-safe, each with a default, each core-import-clean. Executor wiring for Critic/Checkpointer remains a P2 follow-up. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 00:15:32 -04:00
parent c8559676ed
commit dc2d4ec425
8 changed files with 861 additions and 0 deletions
@@ -0,0 +1,132 @@
+// Package schedule is the cron-runner battery: a generic ticker that, each
+// interval, asks a store for the jobs whose next-run time has passed, runs each
+// one, and stamps its next fire time. It is host-agnostic orchestration — the
+// host wires the store (skill.SkillStore.ListDueScheduled /
+// persona.Storage.ListScheduledAgents), the run (run.Executor), and the cron
+// "next fire" function (a cron library, or skill's schedule parser). The
+// battery owns no cron grammar of its own, so it never duplicates the parser.
+package schedule
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"time"
+)
+
+// Due describes a single job handed back by a Runner's Due lister.
+type Due struct {
+	// ID identifies the job; Tick passes it to Run and Mark.
+	ID string
+	// Cron is the job's schedule expression; Tick passes it verbatim to Next.
+	Cron string
+}
+
+// Runner fires due jobs on a fixed interval. Build it as a struct literal and
+// call Loop to run until cancellation, or Tick for one pass (handy in tests).
+// Due, Run, Mark and Next must all be set; Now and Logger are optional and
+// fall back to time.Now and slog.Default respectively.
+type Runner struct {
+	// Interval is the period between passes in Loop. A value <= 0 means 1m.
+	Interval time.Duration
+	// Due returns the jobs that are due as of now.
+	Due func(ctx context.Context, now time.Time) ([]Due, error)
+	// Run executes the job with the given id.
+	Run func(ctx context.Context, id string) error
+	// Mark stamps a job as having run at ranAt, with nextAt as its next due time.
+	Mark func(ctx context.Context, id string, ranAt, nextAt time.Time) error
+	// Next returns the first fire time of the cron expression after the given time.
+	Next func(cron string, after time.Time) (time.Time, error)
+
+	// Now is the clock used to timestamp a pass; nil means time.Now.
+	Now func() time.Time
+	// Logger receives per-job and per-tick diagnostics; nil means slog.Default.
+	Logger *slog.Logger
+}
+
+// now returns the current time from the injected clock, or from time.Now when
+// no clock was configured.
+func (r *Runner) now() time.Time {
+	clock := r.Now
+	if clock == nil {
+		clock = time.Now
+	}
+	return clock()
+}
+
+// log returns the configured Logger, or slog.Default when none was set.
+func (r *Runner) log() *slog.Logger {
+	if r.Logger == nil {
+		return slog.Default()
+	}
+	return r.Logger
+}
+
+// Tick runs one pass: every currently-due job is run, then stamped with its
+// next fire time. Jobs are isolated from one another: a job whose Next or Run
+// errors, or whose callbacks panic, is logged and skipped (its next-run time
+// is left unchanged so it stays due and retries next tick) — one bad job
+// never stalls the others.
+//
+// Tick returns the validation error for a Runner with a nil required
+// callback, or the error from Due (the only pass-fatal step). Per-job
+// failures are logged, never returned. A panic raised by Due itself is not
+// recovered here and propagates to the caller.
+func (r *Runner) Tick(ctx context.Context) error {
+	if err := r.validate(); err != nil {
+		return err
+	}
+	now := r.now()
+	due, err := r.Due(ctx, now)
+	if err != nil {
+		return err
+	}
+	for _, j := range due {
+		r.tickJob(ctx, j, now)
+	}
+	return nil
+}
+
+// tickJob fires one due job: Next, then Run, then Mark, all relative to the
+// pass timestamp now. Every failure is logged rather than returned. A panic
+// from any of the three callbacks is recovered per job, so it cannot unwind
+// the pass and starve the jobs listed after it on this and every later tick.
+func (r *Runner) tickJob(ctx context.Context, j Due, now time.Time) {
+	defer func() {
+		if rec := recover(); rec != nil {
+			r.log().Error("scheduled job panicked; pass continues with remaining jobs", "job", j.ID, "panic", rec)
+		}
+	}()
+	// Compute the next fire BEFORE running. A permanently-unparseable cron
+	// then skips the job entirely (logged) rather than running it — an
+	// unstamped job stays due, so checking Next first avoids a hot-loop of
+	// real Run executions every tick.
+	next, err := r.Next(j.Cron, now)
+	if err != nil {
+		r.log().Warn("scheduled job has an unparseable cron; skipping (not run, not rescheduled)", "job", j.ID, "cron", j.Cron, "error", err)
+		return
+	}
+	if err := r.Run(ctx, j.ID); err != nil {
+		r.log().Warn("scheduled job failed; stays due, will retry next tick", "job", j.ID, "error", err)
+		return
+	}
+	// A Mark failure leaves the job due, so it re-runs next tick — Run must
+	// be idempotent (there is no atomic run+stamp across two host callbacks).
+	if err := r.Mark(ctx, j.ID, now, next); err != nil {
+		r.log().Warn("failed to stamp next run; job may re-execute next tick (Run must be idempotent)", "job", j.ID, "error", err)
+	}
+}
+
+// validate returns an error when any required callback (Due, Run, Mark, Next)
+// is nil, so a misconfigured Runner fails with a clear message instead of a
+// nil-deref panic on its first tick.
+func (r *Runner) validate() error {
+	wired := r.Due != nil && r.Run != nil && r.Mark != nil && r.Next != nil
+	if wired {
+		return nil
+	}
+	return errors.New("schedule: Runner requires non-nil Due, Run, Mark, and Next")
+}
+
+// Loop runs a pass every Interval (one minute when Interval <= 0) and returns
+// once ctx is cancelled. Each pass goes through safeTick, so a Tick error is
+// logged and a panic from a host callback is recovered — neither one stops
+// the scheduler goroutine.
+func (r *Runner) Loop(ctx context.Context) {
+	every := time.Minute
+	if r.Interval > 0 {
+		every = r.Interval
+	}
+	ticker := time.NewTicker(every)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			r.safeTick(ctx)
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// safeTick runs one Tick on behalf of Loop. It logs a returned error as a
+// warning and recovers (and logs) any panic, so the caller's loop keeps going.
+func (r *Runner) safeTick(ctx context.Context) {
+	defer func() {
+		rec := recover()
+		if rec == nil {
+			return
+		}
+		r.log().Error("schedule tick panicked; scheduler continues", "panic", rec)
+	}()
+	err := r.Tick(ctx)
+	if err == nil {
+		return
+	}
+	r.log().Warn("schedule tick failed", "error", err)
+}
@@ -0,0 +1,111 @@
+package schedule
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+)
+
+// TestTickRunsDueAndStampsNext checks the happy path of a pass: a job with a
+// parseable cron is run and stamped with (ranAt = pass time, nextAt = Next's
+// result), while a job whose cron Next rejects is neither run nor stamped.
+func TestTickRunsDueAndStampsNext(t *testing.T) {
+	ctx := context.Background()
+	now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)
+	var ran []string
+	// stamp captures both times handed to Mark so each can be asserted.
+	type stamp struct{ ranAt, nextAt time.Time }
+	marked := map[string]stamp{}
+
+	r := &Runner{
+		Now: func() time.Time { return now },
+		Due: func(_ context.Context, _ time.Time) ([]Due, error) {
+			return []Due{{ID: "a", Cron: "hourly"}, {ID: "b", Cron: "bad"}}, nil
+		},
+		Run: func(_ context.Context, id string) error { ran = append(ran, id); return nil },
+		Mark: func(_ context.Context, id string, ranAt, next time.Time) error {
+			marked[id] = stamp{ranAt: ranAt, nextAt: next}
+			return nil
+		},
+		Next: func(cron string, after time.Time) (time.Time, error) {
+			if cron == "bad" {
+				return time.Time{}, errors.New("unparseable")
+			}
+			return after.Add(time.Hour), nil
+		},
+	}
+	if err := r.Tick(ctx); err != nil {
+		t.Fatal(err)
+	}
+	// Next is checked first, so the bad-cron job is skipped BEFORE Run — only
+	// the parseable job runs and gets stamped (no hot-loop of a bad-cron Run).
+	if len(ran) != 1 || ran[0] != "a" {
+		t.Errorf("ran = %v, want only [a] (bad-cron b skipped before Run)", ran)
+	}
+	a, ok := marked["a"]
+	if !ok {
+		t.Fatal("a should be stamped")
+	}
+	// Compare instants with Equal: == on time.Time also compares the location
+	// and monotonic reading, which is not what this test means to pin.
+	if want := now.Add(time.Hour); !a.nextAt.Equal(want) {
+		t.Errorf("a next = %v, want +1h", a.nextAt)
+	}
+	if !a.ranAt.Equal(now) {
+		t.Errorf("a ranAt = %v, want the pass time %v", a.ranAt, now)
+	}
+	if got, ok := marked["b"]; ok {
+		t.Errorf("b should not be stamped (bad cron), got %v", got.nextAt)
+	}
+}
+
+// TestTickRunFailureDoesNotStampOrStall checks that a job whose Run fails is
+// left unstamped, and that the job listed after it is still run and stamped.
+func TestTickRunFailureDoesNotStampOrStall(t *testing.T) {
+	var attempted []string
+	stamped := make(map[string]bool)
+
+	listDue := func(_ context.Context, _ time.Time) ([]Due, error) {
+		return []Due{{ID: "x", Cron: "h"}, {ID: "y", Cron: "h"}}, nil
+	}
+	runJob := func(_ context.Context, id string) error {
+		attempted = append(attempted, id)
+		if id != "x" {
+			return nil
+		}
+		return errors.New("boom")
+	}
+	stampJob := func(_ context.Context, id string, _, _ time.Time) error {
+		stamped[id] = true
+		return nil
+	}
+	nextFire := func(string, time.Time) (time.Time, error) { return time.Now(), nil }
+
+	r := &Runner{Due: listDue, Run: runJob, Mark: stampJob, Next: nextFire}
+	err := r.Tick(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	// y must still be attempted even though x failed before it.
+	if len(attempted) != 2 {
+		t.Errorf("ran = %v, want both attempted", attempted)
+	}
+	// The failed job is not stamped, so it stays due and retries.
+	if stamped["x"] {
+		t.Error("failed job x should not be stamped")
+	}
+	if !stamped["y"] {
+		t.Error("y should be stamped")
+	}
+}
+
+// TestTickDueErrorIsFatalToPass checks that an error from the Due lister is
+// returned by Tick.
+func TestTickDueErrorIsFatalToPass(t *testing.T) {
+	failingDue := func(context.Context, time.Time) ([]Due, error) {
+		return nil, errors.New("store down")
+	}
+	r := &Runner{
+		Due:  failingDue,
+		Run:  func(context.Context, string) error { return nil },
+		Mark: func(context.Context, string, time.Time, time.Time) error { return nil },
+		Next: func(string, time.Time) (time.Time, error) { return time.Now(), nil },
+	}
+	err := r.Tick(context.Background())
+	if err != nil {
+		return
+	}
+	t.Error("Tick should surface the Due lister error")
+}
+
+// TestUnparseableCronSkipsRunEntirely checks that a job whose cron Next
+// rejects is never handed to Run, and that Tick still returns nil.
+func TestUnparseableCronSkipsRunEntirely(t *testing.T) {
+	var executed []string
+	r := &Runner{
+		Due: func(context.Context, time.Time) ([]Due, error) {
+			return []Due{{ID: "z", Cron: "bad"}}, nil
+		},
+		Run: func(_ context.Context, id string) error {
+			executed = append(executed, id)
+			return nil
+		},
+		Mark: func(context.Context, string, time.Time, time.Time) error { return nil },
+		Next: func(string, time.Time) (time.Time, error) {
+			return time.Time{}, errors.New("bad cron")
+		},
+	}
+	err := r.Tick(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(executed) > 0 {
+		t.Errorf("a job with an unparseable cron must NOT be run (avoids hot-loop), ran=%v", executed)
+	}
+}
+
+// TestValidateRejectsNilCallbacks checks that Tick on a Runner with only Due
+// wired (Run, Mark and Next left nil) returns an error rather than panicking.
+func TestValidateRejectsNilCallbacks(t *testing.T) {
+	onlyDue := func(context.Context, time.Time) ([]Due, error) { return nil, nil }
+	r := &Runner{Due: onlyDue}
+	err := r.Tick(context.Background())
+	if err != nil {
+		return
+	}
+	t.Error("Tick should return a validation error for a partially-wired Runner, not panic")
+}