fix: address verified gadfly P4c findings (3-cloud fleet)
executus CI / test (pull_request) Successful in 1m39s

critic (all 3 models — HIGH):
- ExtendOnce was a single global one-shot shared across every run a System
  monitors, so only the FIRST run to stall got its extension and all others
  were killed by the backstop. Key the fired-state per run (RunInfo.RunID).
- Kill is now sticky: a `killed` flag short-circuits later ticks so a wavering
  Escalator returning ExtendBy after a Kill can't un-collapse the deadline; a
  Kill paired with Nudge/ExtendBy ignores the latter.
- watch() recovers panics from a misbehaving Escalator (logs; the run falls
  back to its existing deadline) instead of silently killing the watch goroutine.

checkpoint (deepseek — HIGH): handle.Save advanced the throttle clock BEFORE
the store write, so a failed save was silently throttled away (caller believes
it persisted). Advance lastSave only after a successful persist.

schedule (all 3): compute Next BEFORE Run, so a job with a permanently
unparseable cron is now skipped entirely instead of being re-run on every tick
forever. A Runner with nil required callbacks now gets a validate() error
instead of a nil panic on the first tick. Loop recovers from tick panics. The
Mark-failure => possible-re-run trade-off is documented (Run must be
idempotent). Tests added for each.

Triaged-but-kept: critic backstopMul<=1 floor (it's a total-runtime multiple, so
a floor >1 is intentional, not the reported footgun); checkpoint Load (nil,nil)
on miss (documented convention).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 23:32:27 -04:00
parent e5cab5525e
commit eea84e6e2c
5 changed files with 145 additions and 32 deletions
+33 -4
View File
@@ -30,9 +30,10 @@ func TestTickRunsDueAndStampsNext(t *testing.T) {
if err := r.Tick(ctx); err != nil {
t.Fatal(err)
}
// Both ran; only the parseable one got a next stamp.
if len(ran) != 2 {
t.Errorf("ran = %v, want both", ran)
// Next is checked first, so the bad-cron job is skipped BEFORE Run — only
// the parseable job runs and gets stamped (no hot-loop of a bad-cron Run).
if len(ran) != 1 || ran[0] != "a" {
t.Errorf("ran = %v, want only [a] (bad-cron b skipped before Run)", ran)
}
if marked["a"] != now.Add(time.Hour) {
t.Errorf("a next = %v, want +1h", marked["a"])
@@ -75,8 +76,36 @@ func TestTickRunFailureDoesNotStampOrStall(t *testing.T) {
}
// TestTickDueErrorIsFatalToPass checks that Tick returns a non-nil error when
// the Due lister itself fails ("store down"). Run, Mark and Next are wired with
// trivial succeeding stubs, so the lister error is the only failure in play.
func TestTickDueErrorIsFatalToPass(t *testing.T) {
// NOTE(review): the single-line Runner literal below and the multi-line one
// after it both declare r; this looks like the removed and added sides of a
// diff rendered together — confirm which one the real file keeps.
r := &Runner{Due: func(context.Context, time.Time) ([]Due, error) { return nil, errors.New("store down") }}
r := &Runner{
Due: func(context.Context, time.Time) ([]Due, error) { return nil, errors.New("store down") },
Run: func(context.Context, string) error { return nil },
Mark: func(context.Context, string, time.Time, time.Time) error { return nil },
Next: func(string, time.Time) (time.Time, error) { return time.Now(), nil },
}
// A nil error here would mean the lister failure was swallowed.
if err := r.Tick(context.Background()); err == nil {
t.Error("Tick should surface the Due lister error")
}
}
func TestUnparseableCronSkipsRunEntirely(t *testing.T) {
var ran []string
r := &Runner{
Due: func(context.Context, time.Time) ([]Due, error) { return []Due{{ID: "z", Cron: "bad"}}, nil },
Run: func(_ context.Context, id string) error { ran = append(ran, id); return nil },
Mark: func(context.Context, string, time.Time, time.Time) error { return nil },
Next: func(string, time.Time) (time.Time, error) { return time.Time{}, errors.New("bad cron") },
}
if err := r.Tick(context.Background()); err != nil {
t.Fatal(err)
}
if len(ran) != 0 {
t.Errorf("a job with an unparseable cron must NOT be run (avoids hot-loop), ran=%v", ran)
}
}
// TestValidateRejectsNilCallbacks builds a Runner with only Due set and expects
// Tick to return a non-nil error for it.
func TestValidateRejectsNilCallbacks(t *testing.T) {
	// Only the lister is wired; Run, Mark and Next are deliberately left nil.
	partial := &Runner{
		Due: func(context.Context, time.Time) ([]Due, error) { return nil, nil },
	}

	err := partial.Tick(context.Background())
	if err == nil {
		t.Error("Tick should return a validation error for a partially-wired Runner, not panic")
	}
}