// executus/run/critic_deadline_test.go

package run_test

import (
	"context"
	"sync"
	"testing"
	"time"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"

	"gitea.stevedudenhoeffer.com/steve/executus/run"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// slowToolInvocation returns an Invocation for run runID whose session tool
// factory contributes a single tool named "slow". The tool blocks for d and then
// yields "ok"; if its context is cancelled first it returns the context's error
// instead. Paired with a model script that calls "slow" once and then answers, d
// dominates the run's wall-clock, so a test can choose a tiny MaxRuntime and
// observe whether MaxRuntime hard-cancels the run.
func slowToolInvocation(runID string, d time.Duration) tool.Invocation {
	// sleep is the tool handler: wait out d unless ctx ends earlier.
	sleep := func(ctx context.Context, _ struct{}) (any, error) {
		timer := time.NewTimer(d)
		defer timer.Stop()

		select {
		case <-timer.C:
			return "ok", nil
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}
	slowTool := llm.DefineTool("slow", "sleeps for a while", sleep)

	// The factory ignores the session and always hands back the same tool.
	factory := func(_ tool.AgentSession) tool.SessionTools {
		return tool.SessionTools{Tools: []llm.Tool{slowTool}}
	}

	return tool.Invocation{RunID: runID, SessionToolFactory: factory}
}

// slowModel returns model "m" from a fake provider that has been pre-loaded with
// a two-step script: first a response carrying a single call to the "slow" tool
// (ID "c1", empty JSON object as arguments), then the plain reply "done".
func slowModel() llm.Model {
	callSlow := llm.Response{
		ToolCalls: []llm.ToolCall{
			{ID: "c1", Name: "slow", Arguments: []byte(`{}`)},
		},
	}

	provider := fake.New("fake")
	provider.Enqueue("m", fake.ReplyWith(callSlow), fake.Reply("done"))

	// NOTE(review): the second value returned by Model is discarded here;
	// presumably it is an error — confirm it cannot be non-nil for "m".
	model, _ := provider.Model("m")
	return model
}

// TestNoCritic_MaxRuntimeIsHardCap pins the legacy contract: with no critic
// configured, MaxRuntime is a literal WithTimeout that kills a run whose work
// outlasts it. Here the slow tool needs 200ms while MaxRuntime is only 20ms, so
// runCtx is cancelled mid-tool and the run must finish with an error (timeout).
func TestNoCritic_MaxRuntimeIsHardCap(t *testing.T) {
	const (
		budget   = 20 * time.Millisecond  // the agent's MaxRuntime
		toolTime = 200 * time.Millisecond // how long the slow tool sleeps
	)

	model := slowModel()
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
	})
	agent := run.RunnableAgent{
		Name:          "x",
		ModelTier:     "m",
		MaxIterations: 5,
		MaxRuntime:    budget,
	}

	result := executor.Run(context.Background(), agent, slowToolInvocation("r", toolTime), "go")
	if result.Err != nil {
		return // timed out as required
	}
	t.Fatalf("non-critic run should hard-timeout at MaxRuntime; got output=%q err=nil", result.Output)
}

// TestCriticOwnsDeadline_SurvivesPastMaxRuntime covers the fix: once the critic
// owns the deadline (Ports.Critic set and Critic.Enabled on the agent),
// MaxRuntime is only the SOFT trigger and no longer a hard cap. The fake critic
// handle exposes no hard deadline (Deadline()==zero, no kill), which leaves
// CriticAbsoluteMax (10s here) as the only hard ceiling. The slow tool (200ms)
// outlasts the tiny MaxRuntime (20ms), yet the run must still complete — showing
// the old agentexec two-tier semantics are restored.
func TestCriticOwnsDeadline_SurvivesPastMaxRuntime(t *testing.T) {
	const (
		budget   = 20 * time.Millisecond  // soft trigger (MaxRuntime)
		toolTime = 200 * time.Millisecond // slow tool's sleep
		ceiling  = 10 * time.Second       // CriticAbsoluteMax
	)

	model := slowModel()
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}
	handle := &fakeCriticHandle{} // Deadline()==zero → no hard deadline, no kill
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
		Ports:    run.Ports{Critic: &fakeCritic{h: handle}},
		Defaults: run.Defaults{CriticAbsoluteMax: ceiling},
	})
	agent := run.RunnableAgent{
		Name:          "watched",
		ModelTier:     "m",
		MaxIterations: 5,
		MaxRuntime:    budget,
		Critic:        run.CriticConfig{Enabled: true},
	}

	result := executor.Run(context.Background(), agent, slowToolInvocation("r", toolTime), "go")
	if err := result.Err; err != nil {
		t.Fatalf("critic-owned run must survive past MaxRuntime (soft trigger); got err=%v", err)
	}
	if got := result.Output; got != "done" {
		t.Errorf("output = %q, want %q", got, "done")
	}
}

// capturingCritic is a critic test double that remembers the soft trigger the
// executor hands to Monitor and replies with a preconfigured handle.
type capturingCritic struct {
	mu   sync.Mutex       // guards soft
	soft time.Duration    // soft trigger from the most recent Monitor call
	h    run.CriticHandle // returned by Monitor; may be the nil interface
}

// Monitor stores the soft trigger under the mutex and returns the configured
// handle. The context and run info arguments are ignored.
func (c *capturingCritic) Monitor(_ context.Context, _ run.RunInfo, soft time.Duration) run.CriticHandle {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.soft = soft
	return c.h
}

// TestCriticSoftTriggerIsMaxRuntime checks that the soft trigger passed to the
// host critic is the run's resolved MaxRuntime (mort's two-tier model — the
// critic first wakes once the run exceeds its nominal budget) rather than some
// global/default value.
func TestCriticSoftTriggerIsMaxRuntime(t *testing.T) {
	const wantSoft = 7 * time.Minute

	// A one-step script is enough: the model answers immediately.
	provider := fake.New("fake")
	provider.Enqueue("m", fake.Reply("done"))
	model, _ := provider.Model("m")

	critic := &capturingCritic{h: &fakeCriticHandle{}}
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
		Ports:    run.Ports{Critic: critic},
	})
	agent := run.RunnableAgent{
		Name:       "x",
		ModelTier:  "m",
		MaxRuntime: wantSoft,
		Critic:     run.CriticConfig{Enabled: true},
	}

	// Only the value captured by the critic matters; the run result is unused.
	executor.Run(context.Background(), agent, tool.Invocation{RunID: "r"}, "go")

	critic.mu.Lock()
	gotSoft := critic.soft
	critic.mu.Unlock()

	if gotSoft == wantSoft {
		return
	}
	t.Errorf("soft trigger = %v, want the agent's MaxRuntime %v", gotSoft, wantSoft)
}

// TestCriticOwnsDeadline_NilHandleFallsBackToMaxRuntime covers an agent that
// enables the critic while the host Monitor returns NO handle (nil). With no
// deadline-watch the run is unsupervised, so it must fall back to the nominal
// MaxRuntime hard cap — the slow 200ms tool outlasts the 20ms MaxRuntime and the
// run errors — and must NOT run free up to the generous CriticAbsoluteMax
// runaway ceiling.
func TestCriticOwnsDeadline_NilHandleFallsBackToMaxRuntime(t *testing.T) {
	const (
		budget   = 20 * time.Millisecond  // MaxRuntime, expected to act as the hard cap
		toolTime = 200 * time.Millisecond // slow tool's sleep
	)

	model := slowModel()
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}
	critic := &capturingCritic{} // h is the nil interface → Monitor returns a nil handle
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
		Ports:    run.Ports{Critic: critic},
		// Generous ceiling; it must NOT be what bounds the run.
		Defaults: run.Defaults{CriticAbsoluteMax: time.Hour},
	})
	agent := run.RunnableAgent{
		Name:          "x",
		ModelTier:     "m",
		MaxIterations: 5,
		MaxRuntime:    budget,
		Critic:        run.CriticConfig{Enabled: true},
	}

	result := executor.Run(context.Background(), agent, slowToolInvocation("r", toolTime), "go")
	if result.Err != nil {
		return // fell back to the MaxRuntime hard cap as required
	}
	t.Fatalf("critic-enabled run with a nil Monitor handle must fall back to the MaxRuntime hard cap; got output=%q err=nil", result.Output)
}