// executus/run/critic.go

package run

import (
	"context"
	"fmt"
	"time"

	"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// criticDeadlineCheck is the polling interval of the deadline-watch goroutine:
// once per tick it compares the critic's hard deadline against the wall clock.
// It is intended to be small relative to any realistic soft timeout.
const criticDeadlineCheck time.Duration = time.Second

// criticBinding wires a CriticHandle into a run. Through it the executor
// forwards activity to the critic (recordStep, recordToolStart), obtains the
// agent step-ceiling Option (maxStepsOption), and drains the critic's queued
// steer messages (drainSteer). The hard-deadline watch that binds the run's
// cancellation to the critic's extendable deadline is set up by startCritic,
// which is also what constructs the binding.
//
// All methods are nil-safe so the executor can call them unconditionally when
// no critic is configured.
type criticBinding struct {
	// h is the handle returned by the Critic port's Monitor call for this run.
	h CriticHandle
}

// criticOwnsDeadline is the single predicate that selects the two-tier-timeout
// path: it is true only when a Critic port is configured on the executor AND
// the agent has its critic enabled.
//
// Both Run (choosing the generous runaway ceiling instead of the literal
// MaxRuntime cap) and startCritic (the arm/no-op gate) consult it, so the two
// decisions cannot drift apart.
func (e *Executor) criticOwnsDeadline(ra RunnableAgent) bool {
	if e.cfg.Ports.Critic == nil {
		return false
	}
	return ra.Critic.Enabled
}

// startCritic arms critic monitoring for a run, provided a critic is configured
// and the agent enables it (see criticOwnsDeadline).
//
// On success it asks the Critic port to Monitor the run and starts a watcher
// goroutine that polls the handle's hard deadline every criticDeadlineCheck.
// The critic may push that deadline out, which gives a healthy-but-slow run
// room while a hung one is still killed. When the deadline is reached the
// watcher cancels runCtx through cancelCause with one of two causes:
//
//   - ErrCriticKill (wrapped, with the critic's reason appended) when
//     KillCause() is non-nil, i.e. the critic killed the run (→ status "killed");
//   - context.DeadlineExceeded when the backstop simply expired (→ "timeout").
//
// softTrigger is the run's resolved MaxRuntime. For a critic-owned run it is
// the soft wake (mort's two-tier semantics: the critic first reviews once the
// run exceeds its nominal budget, and its backstop = softTrigger × multiplier).
// Run always passes the resolved MaxRuntime, which withFallbacks guarantees is
// > 0; the 90s substitution below only guards a hypothetical caller passing a
// non-positive value.
//
// It returns (nil, no-op stop) when no critic applies or Monitor yields a nil
// handle. Otherwise it returns the binding plus a stop function that ends the
// watcher and stops the handle. The caller MUST defer the returned stop.
func (e *Executor) startCritic(runCtx context.Context, cancelCause context.CancelCauseFunc, ra RunnableAgent, info RunInfo, softTrigger time.Duration) (*criticBinding, func()) {
	if !e.criticOwnsDeadline(ra) {
		return nil, func() {}
	}

	wake := softTrigger
	if wake <= 0 {
		// Defensive only; the sole caller passes MaxRuntime (>0).
		wake = 90 * time.Second
	}

	handle := e.cfg.Ports.Critic.Monitor(runCtx, info, wake)
	if handle == nil {
		return nil, func() {}
	}

	// hardDeadlinePassed reports whether the critic's hard deadline has been
	// reached. A zero deadline means "no hard cap set yet" and never trips.
	hardDeadlinePassed := func() bool {
		deadline := handle.Deadline()
		if deadline.IsZero() {
			return false
		}
		return !time.Now().Before(deadline)
	}

	// cancelRun cancels the run with the cause that yields the right status:
	// an explicit critic kill versus a natural backstop expiry.
	cancelRun := func() {
		killErr := handle.KillCause()
		if killErr == nil {
			cancelCause(context.DeadlineExceeded)
			return
		}
		cancelCause(fmt.Errorf("%w: %s", ErrCriticKill, killErr.Error()))
	}

	stopWatch := make(chan struct{})
	watch := func() {
		// This runs on its own goroutine, out of reach of the executor's
		// top-level recover, so a panicking host CriticHandle must be contained
		// here or it would crash the process. Log-free best-effort: the
		// watcher simply stops watching.
		defer func() { _ = recover() }()

		ticker := time.NewTicker(criticDeadlineCheck)
		defer ticker.Stop()

		for {
			select {
			case <-stopWatch:
				return
			case <-runCtx.Done():
				return
			case <-ticker.C:
			}
			if hardDeadlinePassed() {
				cancelRun()
				return
			}
		}
	}
	go watch()

	stop := func() {
		close(stopWatch)
		handle.Stop()
	}
	return &criticBinding{h: handle}, stop
}

// recordStep forwards a completed step (its iteration index and the model
// response) to the critic handle. It is a no-op on a nil binding.
func (b *criticBinding) recordStep(iter int, resp *llm.Response) {
	if b == nil {
		return
	}
	b.h.RecordStep(iter, resp)
}

// recordToolStart forwards a tool call (name and arguments) to the critic
// handle. It is a no-op on a nil binding.
//
// NOTE: majordomo's step observer only fires AFTER an iteration completes, so
// this currently lands post-tool rather than at dispatch: the activity clock is
// refreshed once per iteration, not mid-tool. A single very long tool call
// (e.g. a 30-min render) therefore won't refresh the clock until it returns. A
// host that runs such tools should feed interim progress to its Critic (mort's
// InstallProgressBridge pattern). A true pre-dispatch refresh needs a majordomo
// hook (follow-up).
func (b *criticBinding) recordToolStart(name, args string) {
	if b == nil {
		return
	}
	b.h.RecordToolStart(name, args)
}

// maxStepsOption builds the agent step-ceiling Option for a run.
//
// Without a critic (nil binding) the ceiling is the fixed WithMaxSteps(base).
// With a critic it is a DYNAMIC WithMaxStepsFunc whose callback asks the handle
// for MaxSteps() each time it is invoked, so the critic can raise a long run's
// budget. Whenever the handle defers (MaxSteps() <= 0) the callback falls back
// to base.
func (b *criticBinding) maxStepsOption(base int) agent.Option {
	if b != nil {
		ceiling := func() int {
			limit := b.h.MaxSteps()
			if limit <= 0 {
				return base
			}
			return limit
		}
		return agent.WithMaxStepsFunc(ceiling)
	}
	return agent.WithMaxSteps(base)
}

// drainSteer hands back whatever steer messages the critic handle's Steer()
// returns, or nil for a nil binding. The executor merges these with the session
// steer mailbox into a single WithSteer.
func (b *criticBinding) drainSteer() []llm.Message {
	if b != nil {
		return b.h.Steer()
	}
	return nil
}