// executus/run/critic_test.go

package run_test

import (
	"context"
	"sync"
	"testing"
	"time"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"

	"gitea.stevedudenhoeffer.com/steve/executus/run"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// fakeCritic is a test double for the critic port: it carries one pre-built
// handle and hands that same handle out on every Monitor call.
type fakeCritic struct {
	h *fakeCriticHandle
}

// Monitor disregards all of its arguments and returns the stored handle.
func (fc *fakeCritic) Monitor(context.Context, run.RunInfo, time.Duration) run.CriticHandle {
	return fc.h
}

// fakeCriticHandle is the handle returned by fakeCritic. It counts the calls
// made on it and carries two knobs (maxSteps, killCause) that tests preset to
// shape what the handle reports back.
type fakeCriticHandle struct {
	// mu is held by the handle's methods and by the tests around every access
	// to the fields below.
	mu sync.Mutex

	// Call counters, one per recording method.
	steps   int // bumped by RecordStep
	tools   int // bumped by RecordToolStart
	stops   int // bumped by Stop
	steered int // bumped by Steer

	// maxSteps is what MaxSteps reports; 0 means "defer to the run's base
	// MaxIterations".
	maxSteps int
	// killCause is what KillCause reports; a non-nil value simulates a critic
	// kill.
	killCause error
}

// RecordStep counts one step; both arguments are ignored.
func (h *fakeCriticHandle) RecordStep(int, *llm.Response) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steps++
}

// KillCause reports the preset killCause (nil unless a test set one).
func (h *fakeCriticHandle) KillCause() error {
	h.mu.Lock()
	cause := h.killCause
	h.mu.Unlock()
	return cause
}

// RecordToolStart counts one tool start; both arguments are ignored.
func (h *fakeCriticHandle) RecordToolStart(string, string) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.tools++
}

// Steer counts the call and never has any messages to inject.
func (h *fakeCriticHandle) Steer() []llm.Message {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steered++
	return nil
}

// Deadline always returns the zero time.Time, i.e. no hard deadline.
func (h *fakeCriticHandle) Deadline() time.Time {
	return time.Time{}
}

// MaxSteps reports the preset maxSteps value.
func (h *fakeCriticHandle) MaxSteps() int {
	h.mu.Lock()
	limit := h.maxSteps
	h.mu.Unlock()
	return limit
}

// Stop counts the call so tests can assert how many times it happened.
func (h *fakeCriticHandle) Stop() {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.stops++
}

// TestCriticRaisesStepCeiling: a critic returning a higher MaxSteps lets the agent
// run PAST its base MaxIterations (the dynamic step ceiling). With base=1 and no
// critic the run would hit ErrMaxSteps after the first tool-dispatch step; the
// critic raises it to 5 so the run completes.
func TestCriticRaisesStepCeiling(t *testing.T) {
	h := &fakeCriticHandle{maxSteps: 5}
	fp := fake.New("fake")
	fp.Enqueue("m",
		// two tool-call steps (unknown tool → tolerated error results), then answer
		fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c1", Name: "noop", Arguments: []byte(`{}`)}}}),
		fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c2", Name: "noop", Arguments: []byte(`{}`)}}}),
		fake.Reply("done after 2 tool steps"),
	)
	m, _ := fp.Model("m")
	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
		Ports:    run.Ports{Critic: &fakeCritic{h: h}},
		// large soft timeout so the deadline-watch never interferes in the test
		Defaults: run.Defaults{CriticSoftTimeout: time.Hour},
	})
	res := ex.Run(context.Background(),
		run.RunnableAgent{Name: "x", ModelTier: "m", MaxIterations: 1, Critic: run.CriticConfig{Enabled: true}},
		tool.Invocation{RunID: "r"}, "go")
	if res.Err != nil {
		t.Fatalf("critic raised the ceiling to 5, run should complete past base=1: %v", res.Err)
	}
	if res.Output != "done after 2 tool steps" {
		t.Errorf("output = %q", res.Output)
	}
}

// TestCriticWired: an agent with Critic.Enabled gets monitored — Monitor returns
// a handle the executor feeds (RecordStep), drains (Steer), and stops.
func TestCriticWired(t *testing.T) {
	h := &fakeCriticHandle{}
	fp := fake.New("fake")
	fp.Enqueue("m", fake.Reply("done"))
	m, _ := fp.Model("m")
	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
		Ports:    run.Ports{Critic: &fakeCritic{h: h}},
	})
	res := ex.Run(context.Background(),
		run.RunnableAgent{Name: "watched", ModelTier: "m", Critic: run.CriticConfig{Enabled: true}},
		tool.Invocation{RunID: "r"}, "go")
	if res.Err != nil {
		t.Fatalf("run error: %v", res.Err)
	}
	h.mu.Lock()
	defer h.mu.Unlock()
	if h.steps < 1 {
		t.Errorf("critic should have seen >=1 step, got %d", h.steps)
	}
	if h.steered < 1 {
		t.Errorf("critic Steer should be drained at least once, got %d", h.steered)
	}
	if h.stops != 1 {
		t.Errorf("critic Stop should be called exactly once, got %d", h.stops)
	}
}

// TestCriticDisabledNotMonitored: with Critic.Enabled=false the executor must
// not monitor the run even though a critic port is configured. fakeCritic only
// exposes its handle through Monitor, so any activity recorded on the handle
// means the critic was engaged.
func TestCriticDisabledNotMonitored(t *testing.T) {
	h := &fakeCriticHandle{}
	fp := fake.New("fake")
	fp.Enqueue("m", fake.Reply("done"))
	m, _ := fp.Model("m")
	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
		Ports:    run.Ports{Critic: &fakeCritic{h: h}},
	})
	res := ex.Run(context.Background(),
		run.RunnableAgent{Name: "x", ModelTier: "m"}, // Critic.Enabled=false
		tool.Invocation{RunID: "r"}, "go")
	// A run that failed before doing any work would leave every counter at
	// zero and let the assertion below pass vacuously, so require success.
	if res.Err != nil {
		t.Fatalf("run error: %v", res.Err)
	}
	h.mu.Lock()
	defer h.mu.Unlock()
	// Check every counter, not just steps/stops: no method of the handle may
	// have been invoked at all.
	if h.steps != 0 || h.tools != 0 || h.steered != 0 || h.stops != 0 {
		t.Errorf("disabled critic should not be monitored: steps=%d tools=%d steered=%d stops=%d",
			h.steps, h.tools, h.steered, h.stops)
	}
}