fix: address verified gadfly P4c findings (3-cloud fleet)

critic (all 3 models — HIGH):
- ExtendOnce was a single global one-shot shared across every run a System monitors, so only the FIRST run to stall got its extension and all others were killed by the backstop. Key the fired-state per run (RunInfo.RunID).
- Kill is now sticky: a `killed` flag short-circuits later ticks so a wavering Escalator returning ExtendBy after a Kill can't un-collapse the deadline; a Kill paired with Nudge/ExtendBy ignores the latter.
- watch() recovers panics from a misbehaving Escalator (logs; the run falls back to its existing deadline) instead of silently killing the watch goroutine.

checkpoint (deepseek — HIGH): handle.Save advanced the throttle clock BEFORE the store write, so a failed save was silently throttled away (caller believes it persisted). Advance lastSave only after a successful persist.

schedule (all 3): compute Next BEFORE Run — a permanently-unparseable cron now skips the job entirely instead of re-running it every tick forever; nil required callbacks return a validate() error instead of a first-tick nil panic; Loop recovers tick panics; the Mark-failure => possible-re-run trade-off is documented (Run must be idempotent).

+ tests for each.

Triaged-but-kept: critic backstopMul<=1 floor (it's a total-runtime multiple, so a floor >1 is intentional, not the reported footgun); checkpoint Load (nil,nil) on miss (documented convention).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:32:27 -04:00
parent e5cab5525e
commit eea84e6e2c
5 changed files with 145 additions and 32 deletions
@@ -9,6 +9,7 @@ package schedule

 import (
 	"context"
+	"errors"
 	"log/slog"
 	"time"
 )
@@ -58,31 +59,50 @@ func (r *Runner) log() *slog.Logger {
 // bad job never stalls the others. Returns the error from Due (the only
 // pass-fatal step).
 func (r *Runner) Tick(ctx context.Context) error {
+	if err := r.validate(); err != nil {
+		return err
+	}
 	now := r.now()
 	due, err := r.Due(ctx, now)
 	if err != nil {
 		return err
 	}
 	for _, j := range due {
-		if err := r.Run(ctx, j.ID); err != nil {
-			r.log().Warn("scheduled job failed; will retry next tick", "job", j.ID, "error", err)
-			continue
-		}
+		// Compute the next fire BEFORE running. A permanently-unparseable cron
+		// then skips the job entirely (logged) rather than running it — an
+		// unstamped job stays due, so checking Next first avoids a hot-loop of
+		// real Run executions every tick.
 		next, err := r.Next(j.Cron, now)
 		if err != nil {
-			r.log().Warn("scheduled job has an unparseable cron; not rescheduling", "job", j.ID, "cron", j.Cron, "error", err)
+			r.log().Warn("scheduled job has an unparseable cron; skipping (not run, not rescheduled)", "job", j.ID, "cron", j.Cron, "error", err)
 			continue
 		}
-		if err := r.Mark(ctx, j.ID, now, next); err != nil {
-			r.log().Warn("failed to stamp scheduled job's next run", "job", j.ID, "error", err)
+		if err := r.Run(ctx, j.ID); err != nil {
+			r.log().Warn("scheduled job failed; stays due, will retry next tick", "job", j.ID, "error", err)
+			continue
 		}
+		// A Mark failure leaves the job due, so it re-runs next tick — Run must
+		// be idempotent (there is no atomic run+stamp across two host callbacks).
+		if err := r.Mark(ctx, j.ID, now, next); err != nil {
+			r.log().Warn("failed to stamp next run; job may re-execute next tick (Run must be idempotent)", "job", j.ID, "error", err)
+		}
+	}
+	return nil
+}
+
+// validate reports a misconfigured Runner (a required callback left nil) as a
+// clear error rather than a nil-deref panic on first tick.
+func (r *Runner) validate() error {
+	if r.Due == nil || r.Run == nil || r.Mark == nil || r.Next == nil {
+		return errors.New("schedule: Runner requires non-nil Due, Run, Mark, and Next")
 	}
 	return nil
 }

 // Loop ticks every Interval until ctx is cancelled. A Tick error (the Due
 // lister failing) is logged and the loop continues — a transient store hiccup
-// shouldn't kill the scheduler.
+// shouldn't kill the scheduler — and a panic from any host callback is
+// recovered so one bad tick can't silently kill the scheduler goroutine.
 func (r *Runner) Loop(ctx context.Context) {
 	interval := r.Interval
 	if interval <= 0 {
@@ -95,9 +115,18 @@ func (r *Runner) Loop(ctx context.Context) {
 		case <-ctx.Done():
 			return
 		case <-t.C:
-			if err := r.Tick(ctx); err != nil {
-				r.log().Warn("schedule tick failed", "error", err)
-			}
+			r.safeTick(ctx)
 		}
 	}
 }
+
+func (r *Runner) safeTick(ctx context.Context) {
+	defer func() {
+		if rec := recover(); rec != nil {
+			r.log().Error("schedule tick panicked; scheduler continues", "panic", rec)
+		}
+	}()
+	if err := r.Tick(ctx); err != nil {
+		r.log().Warn("schedule tick failed", "error", err)
+	}
+}