// executus/run/runengine.go

// Package run is executus's run kernel: the shared run-loop mechanics around
// majordomo's agent loop, plus the host seams (run.Ports / RunnableAgent) that
// let one executor serve every surface — a light host's bounded one-shot run,
// a heavy host's persona agent or saved skill — without the kernel importing a
// battery.
//
// This file holds the genuinely-identical scaffolding both run shapes need:
// context cancellation merging, the detached-cleanup timeout, the per-run
// progress accessor the self-status tool reads, the legacy `submit`
// compatibility tool (submit.go), the ancestor progress bridge (progress.go),
// and the run-finalizer machinery — one source of truth.
//
// The kernel depends only on majordomo + executus/tool + the run.Ports
// interfaces; persistence, audit, the persona/skill nouns, and the critic are
// host-supplied via Ports (see ports.go) so importing the kernel never drags in
// a store or a battery.
package run

import (
	"context"
	"errors"
	"log/slog"
	"sync/atomic"
	"time"

	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// ErrShutdown is the cancellation cause set on mort's base lifecycle context
// when the process is shutting down (SIGTERM after the drain window). The
// agent executor uses it to distinguish a run interrupted by shutdown (which
// should be left durable-recoverable) from a run that errored or hit its own
// deadline (terminal).
//
// It is a sentinel value: detect it with
// errors.Is(context.Cause(ctx), ErrShutdown) rather than by matching the
// message text.
//
// NOTE(review): the message carries a "mort:" prefix although this package
// lives under executus/run. It is runtime text, so confirm nothing matches
// on it before changing it.
var ErrShutdown = errors.New("mort: shutting down")

// CleanupContextTimeout caps how long a run's post-completion cleanup ops
// (budget commit, audit Close, attachment bookkeeping) may wait on
// storage after detaching from the caller's — possibly already
// cancelled — context. 10s is generous for a single-row UPDATE against
// MySQL; longer suggests a hung connection the run goroutine shouldn't
// keep waiting on. Both executors derive their cleanup contexts as
// context.WithTimeout(context.WithoutCancel(ctx), CleanupContextTimeout).
//
// The constant is a time.Duration, so it can be passed to
// context.WithTimeout without conversion.
const CleanupContextTimeout = 10 * time.Second

// Reserved state-react lifecycle event keys, shared so both nouns surface
// the same UX shape. Namespaced with double-underscores to make accidental
// collision with a tool name near-impossible.
//
// NOTE(review): the per-key descriptions below are read off the constant
// names only; confirm them against the code that emits these events.
const (
	// StateReactStart is the key for the run-start event.
	StateReactStart          = "__start__"
	// StateReactEnd is the key for the run-end event.
	StateReactEnd            = "__end__"
	// StateReactError is the key for the run-error event.
	StateReactError          = "__error__"
	// StateReactBudgetExceeded is the key for the budget-exceeded event.
	StateReactBudgetExceeded = "__budget_exceeded__"
)

// MergeCancellation returns a context cancelled when EITHER input is
// cancelled, propagating the cancellation Cause from whichever fired. Used
// by the lane preemption path (the lane's per-job ctx.Cause flows into the
// run context) and by the runtime-detach path (process shutdown still
// reaches a run whose deadline was reset after a lane wait).
//
// The returned context is derived from parent, so parent's values, deadline,
// and cancellation Cause are inherited directly. secondary contributes only
// its cancellation and Cause.
//
// If secondary is already cancelled when MergeCancellation is called, the
// returned context is cancelled before the function returns, so a caller that
// checks Err or Cause immediately sees the cancellation rather than racing a
// background watcher. If parent is also already cancelled, parent's Cause
// wins because the first recorded cause is kept.
//
// Always call the returned cancel to release the watcher goroutine; the
// watcher also exits on its own once either input fires. Calling cancel
// after a cause has been recorded does not overwrite that cause.
func MergeCancellation(parent, secondary context.Context) (context.Context, context.CancelFunc) {
	merged, cancel := context.WithCancelCause(parent)
	release := func() { cancel(nil) }

	// Fast path: secondary has already fired. Cancel synchronously so the
	// result is deterministic, and skip spawning a watcher that would have
	// nothing left to wait for.
	if secondary.Err() != nil {
		cancel(context.Cause(secondary))
		return merged, release
	}

	go func() {
		select {
		case <-merged.Done():
			// Parent fired or the caller released us; nothing to forward.
		case <-secondary.Done():
			cancel(context.Cause(secondary))
		}
	}()
	return merged, release
}

// RunFinalizer is invoked at run finish so per-run tool state (open HTTP
// streams, per-run code_exec counters, per-run search budgets) is released
// and the process-lifetime maps keyed by run id don't grow unbounded.
// Both executors fire their registered finalizers via FireFinalizers.
type RunFinalizer interface {
	// FinalizeRun releases whatever state the implementation holds for
	// runID. It returns nothing, so an implementation has no way to report
	// a failure to the caller through this interface.
	FinalizeRun(runID string)
}

// FireFinalizers calls FinalizeRun(runID) on each entry of fs, in slice
// order. Nil entries are skipped. Every call is wrapped in its own recover,
// so a finalizer that panics is logged via slog.Error and the remaining
// finalizers still run; the panic never reaches the caller. A nil or empty
// fs is a no-op.
func FireFinalizers(fs []RunFinalizer, runID string) {
	// guarded runs a single finalizer behind its own deferred recover so a
	// panic is contained to that one call.
	guarded := func(fin RunFinalizer) {
		defer func() {
			recovered := recover()
			if recovered == nil {
				return
			}
			slog.Error("runengine: run finalizer panicked",
				"run_id", runID, "panic", recovered)
		}()
		fin.FinalizeRun(runID)
	}

	for i := range fs {
		if fs[i] != nil {
			guarded(fs[i])
		}
	}
}

// RunTally is the narrow live-progress source the RunStateAccessor reads —
// the running token and tool-call counts for the in-flight run. The audit
// battery's writer satisfies it; this interface is how the run kernel reads
// live tallies without importing the audit package (the inversion of mort's
// direct *skillaudit.Writer dependency).
//
// NOTE(review): RunStateAccessor.RunState calls both methods on whatever
// goroutine requests the run state; implementations presumably need to be
// safe for use concurrently with the run that updates them — confirm.
type RunTally interface {
	// TokenStats returns the running input, output, and thinking token totals.
	TokenStats() (in, out, thinking int64)
	// ToolCallsCount returns the number of tool calls executed so far.
	ToolCallsCount() int
}

// RunStateAccessor is the per-run live-progress accessor the executor
// stamps on Invocation.RunState before building the toolbox, so the
// self-status tool can report iteration / tool-calls / tokens / elapsed for
// the in-flight run. Construct with NewRunStateAccessor; the executor's step
// observer calls SetIteration each loop.
//
// The struct embeds an atomic.Int32 by value, so it must be used through a
// pointer and never copied after first use.
type RunStateAccessor struct {
	// tally supplies the live token and tool-call counts read by RunState.
	tally     RunTally
	// iter is the current agent-loop iteration; written by SetIteration and
	// read by RunState, atomically so no lock is needed between them.
	iter      atomic.Int32
	// maxIter is the iteration cap reported as-is (0 = uncapped).
	maxIter   int
	// maxCalls is the tool-call cap reported as-is (0 = uncapped).
	maxCalls  int
	// startedAt anchors the elapsed-seconds figure computed in RunState.
	startedAt time.Time
}

// NewRunStateAccessor returns a ready-to-use accessor for one run.
//
//   - tally is the source of the live token and tool-call counts.
//   - maxIter and maxCalls are the caps reported back verbatim (0 = uncapped).
//   - startedAt is the instant the elapsed clock is measured from.
//
// The iteration counter starts at zero until SetIteration is called.
func NewRunStateAccessor(tally RunTally, maxIter, maxCalls int, startedAt time.Time) *RunStateAccessor {
	acc := new(RunStateAccessor)
	acc.tally = tally
	acc.maxIter = maxIter
	acc.maxCalls = maxCalls
	acc.startedAt = startedAt
	return acc
}

// SetIteration publishes the agent loop's current iteration number so a
// later RunState call can report it. The executor's step observer is the
// intended caller. The value is narrowed to int32 before being stored
// atomically.
func (a *RunStateAccessor) SetIteration(iter int) {
	current := int32(iter)
	a.iter.Store(current)
}

// RunState satisfies tool.RunStateAccessor. It returns a snapshot of the
// in-flight run: the last iteration recorded by SetIteration, the configured
// iteration and tool-call caps, the live token and tool-call tallies, and the
// whole seconds elapsed since startedAt.
//
// An accessor built with a nil tally reports zero for the token and
// tool-call figures instead of panicking, so a host that wires no tally
// source can still answer a status query with iteration and elapsed time.
func (a *RunStateAccessor) RunState() tool.RunState {
	state := tool.RunState{
		Iteration:      int(a.iter.Load()),
		MaxIterations:  a.maxIter,
		MaxToolCalls:   a.maxCalls,
		ElapsedSeconds: int(time.Since(a.startedAt).Seconds()),
	}
	// Calling a method on a nil interface value panics; leave the tallies at
	// their zero values when no source was supplied.
	if a.tally == nil {
		return state
	}
	state.InputTokens, state.OutputTokens, state.ThinkingTokens = a.tally.TokenStats()
	state.ToolCalls = a.tally.ToolCallsCount()
	return state
}