diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml index 9643e1d..fda1f94 100644 --- a/.gitea/workflows/adversarial-review.yml +++ b/.gitea/workflows/adversarial-review.yml @@ -51,13 +51,12 @@ jobs: GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} OLLAMA_CLOUD_API_KEY: ${{ secrets.OLLAMA_CLOUD_API_KEY }} # Local Macs, reached through their foreman queues (native Ollama on the - # wire). GADFLY_ENDPOINT_M1 registers provider "m1", _M5 registers "m5", + # wire). GADFLY_ENDPOINT_M5 registers provider "m5", # each a foreman-preset Ollama client at the secret's URL, of the form: # foreman|https://| # Needs an image with foreman provider-type support (this one). If a Mac # is offline that model's comment shows an error and the others still post. # (Gitea secrets aren't auto-exposed — map each explicitly.) - GADFLY_ENDPOINT_M1: ${{ secrets.GADFLY_ENDPOINT_M1 }} GADFLY_ENDPOINT_M5: ${{ secrets.GADFLY_ENDPOINT_M5 }} # Full fleet: 3 cloud + M1 Pro + M5 Max. The Macs are back so the # gadfly-reports scoreboard can quantify whether they earn their keep @@ -66,8 +65,8 @@ jobs: # (ollama-cloud=1) with its 3 lenses concurrent (LENS ollama-cloud=3) so # its comment lands sooner; each Mac runs one model, lenses serial (its # foreman queue serializes anyway). All three provider lanes run parallel. - GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,kimi-k2.7-code:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,gpt-oss:120b-cloud,qwen3-coder:480b-cloud,gemma4:cloud,m1/qwen3:14b,m5/qwen3.6:35b-mlx" - GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,m1=1,m5=1" + GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,kimi-k2.7-code:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,gpt-oss:120b-cloud,qwen3-coder:480b-cloud,gemma4:cloud,m5/qwen3.6:35b-mlx" + GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,m5=1" GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3" # Default => the 3-lens suite (security, correctness, error-handling). # Set the repo var GADFLY_SPECIALISTS to override (csv / "all" / "auto"). diff --git a/critic/critic.go b/critic/critic.go index ceb5445..0c948a5 100644 --- a/critic/critic.go +++ b/critic/critic.go @@ -10,13 +10,16 @@ // Mort plugs its LLM critic-agent in as an Escalator; ExtendOnce is the // zero-dependency default. // -// NOTE: the executor's call into run.Ports.Critic is a P2 follow-up; this -// battery provides the seam + impl ahead of that wiring. +// The executor wires run.Ports.Critic (C0b): it feeds the handle activity, +// binds the run context to its extendable Deadline, drains its Steer, and polls +// MaxSteps each step so an Escalator can also raise a long run's step ceiling +// (Decision.RaiseStepsBy). package critic import ( "context" "log/slog" + "math" "sync" "time" @@ -36,10 +39,11 @@ type Progress struct { // Decision is the Escalator's verdict for a stalled run. Zero value = do // nothing (let the hard backstop eventually kill a truly hung run). type Decision struct { - Nudge []llm.Message // injected before the agent's next turn (a steer) - ExtendBy time.Duration // push the hard deadline out by this much - Kill bool // cancel the run now - KillReason string + Nudge []llm.Message // injected before the agent's next turn (a steer) + ExtendBy time.Duration // push the hard deadline out by this much + RaiseStepsBy int // raise the run's tool-dispatch step ceiling by this + Kill bool // cancel the run now + KillReason string } // Escalator decides what to do when a run crosses its soft timeout. It is @@ -136,6 +140,7 @@ func (s *System) Monitor(ctx context.Context, info run.RunInfo, softTimeout time now: s.now, lastActivity: now, deadline: now.Add(time.Duration(float64(softTimeout) * s.backstopMul)), + maxSteps: info.MaxIterations, // base ceiling; an Escalator may RaiseStepsBy stopCh: make(chan struct{}), } go h.watch(ctx, check) @@ -155,6 +160,7 @@ type handle struct { deadline time.Time steer []llm.Message iterations int + maxSteps int // current tool-dispatch ceiling (base MaxIterations, raised by RaiseStepsBy) lastTool string killed bool // sticky: once an Escalator kills, no later decision un-kills it stopped bool @@ -192,6 +198,12 @@ func (h *handle) Deadline() time.Time { return h.deadline } +func (h *handle) MaxSteps() int { + h.mu.Lock() + defer h.mu.Unlock() + return h.maxSteps +} + func (h *handle) Stop() { h.mu.Lock() if !h.stopped { @@ -263,4 +275,13 @@ func (h *handle) tick(ctx context.Context) { if d.ExtendBy > 0 { h.deadline = h.deadline.Add(d.ExtendBy) } + if d.RaiseStepsBy > 0 { + // Overflow-safe: a buggy Escalator returning a huge delta must not wrap + // maxSteps negative (which the executor would read as "defer to base"). + if d.RaiseStepsBy > math.MaxInt-h.maxSteps { + h.maxSteps = math.MaxInt + } else { + h.maxSteps += d.RaiseStepsBy + } + } } diff --git a/go.sum b/go.sum index 7f34796..dacea4d 100644 --- a/go.sum +++ b/go.sum @@ -125,6 +125,7 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/run/critic.go b/run/critic.go index bf0ff7b..420a476 100644 --- a/run/critic.go +++ b/run/critic.go @@ -88,6 +88,22 @@ func (b *criticBinding) recordToolStart(name, args string) { } } +// maxStepsOption returns the agent step-ceiling Option. With no critic it's a +// fixed WithMaxSteps(base); with a critic it's a DYNAMIC WithMaxStepsFunc that +// polls the handle each step (so the critic can raise a long run's budget), +// falling back to base when the handle defers (MaxSteps() <= 0). +func (b *criticBinding) maxStepsOption(base int) agent.Option { + if b == nil { + return agent.WithMaxSteps(base) + } + return agent.WithMaxStepsFunc(func() int { + if n := b.h.MaxSteps(); n > 0 { + return n + } + return base + }) +} + // steerOptions returns the agent RunOptions that drain the critic's steer // messages into the loop. Empty when there is no critic. func (b *criticBinding) steerOptions() []agent.RunOption { diff --git a/run/critic_test.go b/run/critic_test.go index cc197b8..cd4793f 100644 --- a/run/critic_test.go +++ b/run/critic_test.go @@ -23,6 +23,7 @@ type fakeCriticHandle struct { mu sync.Mutex steps, tools, stops int steered int + maxSteps int // 0 => defer to the run's base MaxIterations } func (h *fakeCriticHandle) RecordStep(int) { h.mu.Lock(); h.steps++; h.mu.Unlock() } @@ -33,8 +34,41 @@ func (h *fakeCriticHandle) RecordToolStart(string, string) { } func (h *fakeCriticHandle) Steer() []llm.Message { h.mu.Lock(); h.steered++; h.mu.Unlock(); return nil } func (h *fakeCriticHandle) Deadline() time.Time { return time.Time{} } // no hard deadline +func (h *fakeCriticHandle) MaxSteps() int { h.mu.Lock(); defer h.mu.Unlock(); return h.maxSteps } func (h *fakeCriticHandle) Stop() { h.mu.Lock(); h.stops++; h.mu.Unlock() } +// TestCriticRaisesStepCeiling: a critic returning a higher MaxSteps lets the agent +// run PAST its base MaxIterations (the dynamic step ceiling). With base=1 and no +// critic the run would hit ErrMaxSteps after the first tool-dispatch step; the +// critic raises it to 5 so the run completes. +func TestCriticRaisesStepCeiling(t *testing.T) { + h := &fakeCriticHandle{maxSteps: 5} + fp := fake.New("fake") + fp.Enqueue("m", + // two tool-call steps (unknown tool → tolerated error results), then answer + fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c1", Name: "noop", Arguments: []byte(`{}`)}}}), + fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c2", Name: "noop", Arguments: []byte(`{}`)}}}), + fake.Reply("done after 2 tool steps"), + ) + m, _ := fp.Model("m") + ex := run.New(run.Config{ + Registry: tool.NewRegistry(), + Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil }, + Ports: run.Ports{Critic: &fakeCritic{h: h}}, + // large soft timeout so the deadline-watch never interferes in the test + Defaults: run.Defaults{CriticSoftTimeout: time.Hour}, + }) + res := ex.Run(context.Background(), + run.RunnableAgent{Name: "x", ModelTier: "m", MaxIterations: 1, Critic: run.CriticConfig{Enabled: true}}, + tool.Invocation{RunID: "r"}, "go") + if res.Err != nil { + t.Fatalf("critic raised the ceiling to 5, run should complete past base=1: %v", res.Err) + } + if res.Output != "done after 2 tool steps" { + t.Errorf("output = %q", res.Output) + } +} + // TestCriticWired: an agent with Critic.Enabled gets monitored — Monitor returns // a handle the executor feeds (RecordStep), drains (Steer), and stops. func TestCriticWired(t *testing.T) { diff --git a/run/executor.go b/run/executor.go index 2dab0bd..acd24af 100644 --- a/run/executor.go +++ b/run/executor.go @@ -156,14 +156,15 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio // Audit start (optional). The recorder satisfies RunTally; stamp it on the // invocation so a self-status tool can read live progress. info := RunInfo{ - RunID: inv.RunID, - SubjectID: ra.ID, - Name: ra.Name, - CallerID: inv.CallerID, - ChannelID: inv.ChannelID, - ParentRunID: inv.ParentRunID, - Inputs: inv.SkillInputs, - StartedAt: started, + RunID: inv.RunID, + SubjectID: ra.ID, + Name: ra.Name, + CallerID: inv.CallerID, + ChannelID: inv.ChannelID, + ParentRunID: inv.ParentRunID, + Inputs: inv.SkillInputs, + StartedAt: started, + MaxIterations: maxIter, } var rec RunRecorder var stateAcc *RunStateAccessor @@ -243,7 +244,10 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio opts := []agent.Option{ agent.WithToolbox(toolbox), - agent.WithMaxSteps(maxIter), + // Step ceiling: a fixed WithMaxSteps(maxIter) normally, but when a critic is + // active it owns a DYNAMIC ceiling (WithMaxStepsFunc) so it can raise a + // healthy-but-long run's budget mid-flight. Falls back to maxIter. + critic.maxStepsOption(maxIter), agent.WithToolErrorLimits(e.cfg.Defaults.MaxConsecutiveToolErrors, e.cfg.Defaults.MaxSameToolCallRepeats), agent.WithStepObserver(stepObserver), } diff --git a/run/ports.go b/run/ports.go index 0a5dfe9..fe66c32 100644 --- a/run/ports.go +++ b/run/ports.go @@ -48,6 +48,9 @@ type RunInfo struct { ParentRunID string Inputs map[string]any StartedAt time.Time + // MaxIterations is the run's base tool-dispatch step ceiling, so a critic can + // raise it relative to the baseline (see CriticHandle.MaxSteps). + MaxIterations int } // RunStats is the terminal roll-up a recorder's Close writes. Mirrors mort's @@ -129,6 +132,11 @@ type CriticHandle interface { // Deadline returns the current hard-kill deadline (the critic may extend // it); the executor binds the run context to it. Zero = no hard deadline. Deadline() time.Time + // MaxSteps returns the current tool-dispatch step ceiling, polled by the + // executor each step (via majordomo WithMaxStepsFunc) so a critic can raise a + // healthy-but-long run's iteration budget mid-flight. Return <= 0 to defer to + // the run's base MaxIterations. + MaxSteps() int // Stop ends monitoring when the run finishes. Stop() }