// executus/run/phases.go

package run

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"text/template"
	"unicode/utf8"

	"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// The multi-step phase runner. A phased RunnableAgent (ra.Phases non-empty) runs
// its phases in order; each phase is a fresh majordomo agent loop (or a single
// bare LLM call for IsRunFunc phases) with its own template-expanded system
// prompt, model tier, step cap, and tool subset. Phase outputs feed later phases
// through {{.<PhaseName>}} template variables; {{.Query}} is the original input.
// The final phase's output is the run's output.
//
// Ported from mort's agentexec pipeline so the executus kernel — which already
// carries RunnableAgent.Phases as a DTO — actually EXECUTES them (it previously
// ignored the slice and ran a single loop with the base prompt). It reuses the
// shared run machinery built once in Run: the same stepObserver (so audit/steps/
// critic-activity accumulate across every phase, including IsRunFunc bare calls),
// the same critic steer, and the same compaction option.
//
// Semantics preserved from mort's pipeline:
//   - phases run sequentially; ctx cancellation/deadline/critic-kill aborts the
//     run (even mid-phase and even for an Optional phase).
//   - IsRunFunc = one bare LLM call, no tools, no loop.
//   - Optional phases swallow NON-context errors and substitute FallbackMessage
//     (or a generic "proceeding without its results" placeholder when it is empty).
//   - a non-optional phase that merely exhausts its step/tool budget is NOT fatal:
//     its partial transcript is salvaged and the pipeline continues — EXCEPT a
//     final phase that salvaged nothing, which is a genuine empty-result failure.
//   - per-phase ModelTier resolve failures fall back to the base model with a WARN.
//
// Deliberately NOT carried over (kernel is leaner than mort's legacy pipeline):
// the legacy `submit` capture tool (the kernel relies on majordomo's
// no-tool-call-is-final-answer termination, like its single-loop path), and the
// critic's dynamic iteration ceiling (per-phase caps are fixed at phase start —
// the run-level critic's steer + hard deadline still apply across phases).
//
// NOTE on phase names: {{.<PhaseName>}} resolves a map key, so a phase whose name
// is not a Go-template identifier (hyphens, spaces, leading digit) cannot be
// referenced as {{.my-phase}} — authors must use {{index . "my-phase"}}. A
// template that fails to parse/execute is logged (WARN) and passed through
// unchanged rather than silently dropped (see expandPhaseTemplate). Avoid naming
// a phase "Query" — it shadows the original-input variable.

// phaseDeps bundles the per-run state that Run builds once and the phase runner
// reuses for every phase:
//   - baseModel: the model used when a phase sets no ModelTier, or when its tier
//     fails to resolve.
//   - baseToolbox: the full decorated toolbox, narrowed per phase by the phase's
//     tool list.
//   - baseMaxIter: the step cap applied when a phase sets no positive
//     MaxIterations.
//   - sharedOpts: agent options common to every phase loop (tool-error limits +
//     compactor). The step observer is added per phase, NOT in sharedOpts, so
//     checkpointing can vary per path.
//   - stepObserver: wired into each phase's loop AND invoked directly for
//     IsRunFunc bare calls.
//   - steer: the critic/session steer passed to each phase loop.
//   - rec: the audit recorder that receives phase events; nil disables them.
type phaseDeps struct {
	baseModel    llm.Model
	baseToolbox  *llm.Toolbox
	baseMaxIter  int
	sharedOpts   []agent.Option
	stepObserver func(agent.Step)
	steer        func() []llm.Message
	rec          RunRecorder
	// checkpointer records phase-boundary progress (the list of completed phases)
	// for durable recovery; nil = non-durable. resume carries a recovered run's
	// completed phases so they are skipped on re-run. Phase recovery is
	// boundary-granular: the interrupted (active) phase re-runs from its start
	// (its mid-phase transcript is NOT resumed — only the single-loop path
	// resumes mid-loop).
	checkpointer Checkpointer
	resume       *ResumeState
}

// runPhases executes ra.Phases sequentially and returns a synthetic agent.Result
// whose Output is the final phase's output, with Usage aggregated across phases
// and Messages set to the last phase's transcript (for the PostRun hook).
//
// Parameters: runCtx bounds the whole run and is checked before every phase;
// deps carries the shared per-run state plus the resume/checkpoint hooks; query
// is the original input, handed to every phase as its user message and exposed
// to prompts as {{.Query}}; images are forwarded to every phase with the query.
//
// Errors: a hard (non-optional, non-budget) phase failure is returned wrapped as
// `pipeline phase "<name>": ...`; any context cancellation/deadline/critic-kill
// is returned as-is. On error the returned result is still non-nil and carries
// the usage accumulated so far. A failed checkpoint save is logged (WARN) and
// never fails the run.
func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phaseDeps, query string, images []llm.ImagePart) (*agent.Result, error) {
	outputs := make(map[string]string, len(ra.Phases))
	var completed []PhaseOutput
	var lastResult *agent.Result
	var lastOutput string
	var totalUsage llm.Usage

	// resumeSkip is the set of phases already finished on a RECOVERED run — kept
	// SEPARATE from the live `outputs` map (which fills as phases run this time) so
	// the skip guard only skips RESUME-completed phases, never a fresh run's own
	// phases. (Reusing `outputs` would make a second phase with a duplicate name
	// skip itself.) Pre-populate outputs + completed so a resumed run threads the
	// saved outputs into later phases. The interrupted (active) phase is NOT
	// pre-populated, so it re-runs from its start (boundary-granular recovery).
	resumeSkip := map[string]bool{}
	if deps.resume != nil {
		for _, pc := range deps.resume.CompletedPhases {
			outputs[pc.Name] = pc.Output
			resumeSkip[pc.Name] = true
			completed = append(completed, pc)
			lastOutput = pc.Output
		}
	}

	// finish stamps the aggregated usage + final output onto the synthetic result.
	// The output is only stamped on success, so a failed run keeps whatever the
	// last phase's own result carried.
	finish := func(err error) (*agent.Result, error) {
		if lastResult == nil {
			lastResult = &agent.Result{}
		}
		lastResult.Usage = totalUsage
		if err == nil {
			lastResult.Output = lastOutput
		}
		return lastResult, err
	}

	for i, phase := range ra.Phases {
		// Skip phases already completed on a resumed run.
		if resumeSkip[phase.Name] {
			continue
		}
		// A killed/timed-out/cancelled run must not start its next phase.
		if err := runCtx.Err(); err != nil {
			return finish(err)
		}

		instructions := expandPhaseTemplate(phase.SystemPrompt, query, outputs)
		if deps.rec != nil {
			deps.rec.LogEvent("phase_start", map[string]any{"phase": phase.Name})
		}

		output, res, err := e.runOnePhase(runCtx, ra, deps, phase, instructions, query, images)
		if res != nil {
			lastResult = res
			totalUsage = addUsage(totalUsage, res.Usage)
		}
		if err != nil {
			// A context cancellation / deadline / critic-kill is NEVER swallowed by
			// the Optional or budget-salvage branches — the run genuinely ended and
			// must surface as cancelled/timeout/killed (statusFor classifies it).
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				return finish(err)
			}
			isLast := i == len(ra.Phases)-1
			trimmed := strings.TrimSpace(output)
			switch {
			case phase.Optional:
				output = phase.FallbackMessage
				if output == "" {
					output = fmt.Sprintf("(Phase %q encountered an error -- proceeding without its results)", phase.Name)
				}
				slog.Warn("run: optional pipeline phase failed",
					"agent", ra.Name, "phase", phase.Name, "error", err)
				if deps.rec != nil {
					deps.rec.LogEvent("phase_failed_optional", map[string]any{"phase": phase.Name, "error": err.Error()})
				}

			case isPhaseBudgetExhaustion(err) && (!isLast || trimmed != ""):
				// Soft stop: the phase ran out of its step/tool budget before
				// composing a final answer. Not fatal — it did real work (runOnePhase
				// salvaged its partial transcript into output), and aborting would
				// discard every completed phase before it. Degrade and continue.
				// (A FINAL phase that salvaged nothing falls through to the hard error
				// below: there is no result to return.)
				if trimmed == "" {
					output = fmt.Sprintf("(Phase %q reached its step budget before producing a consolidated result; continuing with its partial findings.)", phase.Name)
				} else {
					output += fmt.Sprintf("\n\n(Note: phase %q reached its step budget before fully completing; the above is its partial output.)", phase.Name)
				}
				slog.Warn("run: pipeline phase exhausted its budget; salvaging partial output and continuing",
					"agent", ra.Name, "phase", phase.Name, "last_phase", isLast, "error", err)
				if deps.rec != nil {
					deps.rec.LogEvent("phase_budget_exhausted", map[string]any{"phase": phase.Name, "error": err.Error(), "last_phase": isLast})
				}

			default:
				return finish(fmt.Errorf("pipeline phase %q: %w", phase.Name, err))
			}
		}

		outputs[phase.Name] = output
		lastOutput = output
		// Checkpoint the phase boundary: this phase is done, so a resumed run skips
		// it and continues from the next. (Copy the slice — the checkpointer may
		// hold/serialize it asynchronously.)
		completed = append(completed, PhaseOutput{Name: phase.Name, Output: output})
		if deps.checkpointer != nil {
			state := RunCheckpointState{
				CompletedPhases: append([]PhaseOutput(nil), completed...),
			}
			// Checkpointing is best-effort: a failed save must not fail a run whose
			// phase just finished, but it must be visible — otherwise a later
			// recovery may silently not see this phase as completed.
			if saveErr := deps.checkpointer.Save(runCtx, state); saveErr != nil {
				slog.Warn("run: pipeline phase checkpoint save failed; continuing",
					"agent", ra.Name, "phase", phase.Name, "error", saveErr)
			}
		}
	}

	return finish(nil)
}

// runOnePhase executes exactly one phase and reports (output, result, error).
//
// An IsRunFunc phase is a single bare model call; every other phase gets a fresh
// agent loop. The result is nil only when a bare call fails. When a loop stops
// on a budget/guard error, its narrated partial transcript (if any) replaces the
// empty final answer in the returned output; the error is still returned so the
// caller decides whether to continue.
func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps phaseDeps, phase Phase, instructions, query string, images []llm.ImagePart) (string, *agent.Result, error) {
	phaseCtx, model := e.phaseModel(runCtx, deps, ra, phase)
	// System prompt = the phase's expanded instructions (with the platform header
	// so tools keep their run ids); the user message is always the original query.
	system := e.systemPromptWithBody(instructions)

	if !phase.IsRunFunc {
		phaseTools := filterToolbox(deps.baseToolbox, phase.Tools)
		stepCap := deps.baseMaxIter
		if phase.MaxIterations > 0 {
			stepCap = phase.MaxIterations
		}
		// Phase-specific options come first — toolbox, a fixed step ceiling (the
		// critic's dynamic ceiling is intentionally not propagated to phases), and
		// the shared step observer (audit/steps/critic) — followed by the shared
		// options (tool-error limits, compactor).
		loopOpts := make([]agent.Option, 0, 3+len(deps.sharedOpts))
		loopOpts = append(loopOpts,
			agent.WithToolbox(phaseTools),
			agent.WithMaxSteps(stepCap),
			agent.WithStepObserver(deps.stepObserver),
		)
		loopOpts = append(loopOpts, deps.sharedOpts...)
		loop := agent.New(model, system, loopOpts...)

		result, loopErr := runAgent(phaseCtx, loop, query, images, agent.WithSteer(deps.steer))
		var answer string
		if result != nil {
			answer = result.Output
		}
		if loopErr == nil || !isPhaseBudgetExhaustion(loopErr) {
			return answer, result, loopErr
		}
		// Budget/guard exhaustion: the final answer is empty but the transcript is
		// usable, so carry the narrated work forward instead.
		if partial := salvagePhaseTranscript(result); partial != "" {
			answer = partial
		}
		return answer, result, loopErr
	}

	// Bare LLM call: no tool loop and no tools array (some models 400 on an empty
	// tools list).
	userTurn := []llm.Message{multimodalUserMessage(query, images)}
	resp, callErr := model.Generate(phaseCtx, llm.Request{System: system, Messages: userTurn})
	if callErr != nil {
		return "", nil, fmt.Errorf("phase %q model call: %w", phase.Name, callErr)
	}
	// Report the response through the SAME step observer a loop step would use, so
	// the audit token tally, Result.Steps, AND the critic's activity clock all see
	// it (a long synthesize phase must not look idle to the critic).
	if observe := deps.stepObserver; observe != nil {
		observe(agent.Step{Index: 0, Response: resp})
	}
	answer := resp.Text()
	bare := &agent.Result{
		Output:   answer,
		Usage:    resp.Usage,
		Messages: append(userTurn, resp.Message()),
	}
	return answer, bare, nil
}

// phaseModel picks the model (and the context to call it with) for one phase.
//
// With no ModelTier the base model and the incoming ctx are used. Otherwise the
// configured resolver is asked for the tier; on success its enriched context
// (usage attribution) is returned with the model, mirroring the single-loop
// path, which adopts ctx = modelCtx. A resolver error or a nil model is logged
// at WARN and degrades to the base model + the incoming ctx — visible, never
// fatal.
func (e *Executor) phaseModel(ctx context.Context, deps phaseDeps, ra RunnableAgent, phase Phase) (context.Context, llm.Model) {
	tier := phase.ModelTier
	if tier == "" {
		return ctx, deps.baseModel
	}

	tierCtx, resolved, err := e.cfg.Models(ctx, tier)
	if err == nil && resolved != nil {
		return tierCtx, resolved
	}

	var reason string
	switch {
	case err != nil:
		reason = err.Error()
	default:
		reason = "resolver returned a nil model"
	}
	slog.Warn("run: pipeline phase model resolve failed; using base model",
		"agent", ra.Name, "phase", phase.Name, "tier", tier, "reason", reason)
	return ctx, deps.baseModel
}

// isPhaseBudgetExhaustion reports whether err is a soft budget/guard stop: its
// chain contains agent.ErrMaxSteps (the loop hit its step cap) or
// agent.ErrToolLoop (a tool-error guard tripped). Such a stop leaves a usable
// partial transcript, unlike a hard error (cancellation, model failure). A nil
// err is not a budget stop.
func isPhaseBudgetExhaustion(err error) bool {
	switch {
	case errors.Is(err, agent.ErrMaxSteps):
		return true
	case errors.Is(err, agent.ErrToolLoop):
		return true
	default:
		return false
	}
}

// maxSalvageBytes bounds a salvaged partial transcript so a long phase's narrated
// reasoning doesn't blow up the next phase's prompt (the tail is the most recent,
// most relevant reasoning). The cap applies to the kept tail in BYTES (not
// runes); the "earlier reasoning trimmed" marker that salvagePhaseTranscript
// prepends is not counted against it. Matches mort's pipeline cap.
const maxSalvageBytes = 8000

// salvagePhaseTranscript builds a best-effort phase output for a loop that ended
// without a final answer. It gathers the non-blank assistant text of every step
// (steps with a nil response are ignored), joins the pieces with blank lines,
// and — when the result is longer than maxSalvageBytes — keeps only the tail,
// starting on a rune boundary and prefixed with a trim marker. A nil result or a
// transcript with no prose yields "".
func salvagePhaseTranscript(res *agent.Result) string {
	if res == nil {
		return ""
	}

	var prose []string
	for _, step := range res.Steps {
		if step.Response == nil {
			continue
		}
		text := strings.TrimSpace(step.Response.Text())
		if text == "" {
			continue
		}
		prose = append(prose, text)
	}

	joined := strings.TrimSpace(strings.Join(prose, "\n\n"))
	if len(joined) <= maxSalvageBytes {
		return joined
	}

	// Keep the last maxSalvageBytes bytes, then move the cut forward until it sits
	// on the first byte of a rune so no UTF-8 sequence is split.
	cut := len(joined) - maxSalvageBytes
	for cut < len(joined) && !utf8.RuneStart(joined[cut]) {
		cut++
	}
	return "...(earlier reasoning trimmed)...\n" + joined[cut:]
}

// multimodalUserMessage assembles the user turn from text plus inline images.
// It is shared by the phase runner and runAgent so image folding lives in one
// place. Without images the result is a plain text message; with images the
// parts are the text (omitted when blank, so no empty text part is sent)
// followed by every image in order.
func multimodalUserMessage(text string, images []llm.ImagePart) llm.Message {
	if len(images) > 0 {
		parts := make([]llm.Part, 0, len(images)+1)
		hasText := strings.TrimSpace(text) != ""
		if hasText {
			parts = append(parts, llm.Text(text))
		}
		for i := range images {
			parts = append(parts, images[i])
		}
		return llm.UserParts(parts...)
	}
	return llm.UserText(text)
}

// expandPhaseTemplate renders a phase prompt with Go text/template. The template
// data is a string map holding "Query" (the original query) plus one entry per
// prior phase output, keyed by phase name; a prior output named "Query"
// overrides the original query. Unknown keys render as the empty string.
//
// Expansion is best-effort: if the template fails to parse or to execute, a WARN
// is logged and tmpl is returned unchanged, so a misconfigured prompt stays
// visible instead of being silently masked.
func expandPhaseTemplate(tmpl, query string, priorOutputs map[string]string) string {
	parsed, parseErr := template.New("phase").Option("missingkey=zero").Parse(tmpl)
	if parseErr != nil {
		slog.Warn("run: pipeline phase template parse failed; using it unexpanded", "error", parseErr)
		return tmpl
	}

	vars := make(map[string]string, len(priorOutputs)+1)
	vars["Query"] = query
	for name, out := range priorOutputs {
		vars[name] = out
	}

	rendered := new(bytes.Buffer)
	execErr := parsed.Execute(rendered, vars)
	if execErr != nil {
		// Any partial output already written to the buffer is discarded.
		slog.Warn("run: pipeline phase template execute failed; using it unexpanded", "error", execErr)
		return tmpl
	}
	return rendered.String()
}

// filterToolbox narrows box to the tools listed in names. An empty names list
// means "the full palette": box itself is returned, not a copy. Otherwise a new
// toolbox with the same name is built, with tools added in the order names
// lists them. A name that box does not know, or a tool the new toolbox refuses
// to add (e.g. a duplicate), is skipped with a WARN — a typo'd phase tool list
// should not abort a run.
func filterToolbox(box *llm.Toolbox, names []string) *llm.Toolbox {
	if len(names) == 0 {
		return box
	}

	subset := llm.NewToolbox(box.Name())
	for _, wanted := range names {
		tool, found := box.Get(wanted)
		if !found {
			slog.Warn("run: pipeline phase references unknown tool; skipping", "tool", wanted)
			continue
		}
		addErr := subset.Add(tool)
		if addErr == nil {
			continue
		}
		slog.Warn("run: pipeline phase tool duplicated; skipping", "tool", wanted, "error", addErr)
	}
	return subset
}

// addUsage returns the field-by-field sum of two llm.Usage tallies, so a phased
// run can report the tokens spent across all of its phases. The result starts
// as a copy of a, so any field not summed below keeps a's value.
//
// NOTE: if llm.Usage gains a field, add it here too — the audit recorder (rec)
// is the authoritative per-run token source; this is only the secondary
// Result.Usage roll-up.
func addUsage(a, b llm.Usage) llm.Usage {
	total := a
	total.InputTokens = a.InputTokens + b.InputTokens
	total.OutputTokens = a.OutputTokens + b.OutputTokens
	total.CacheReadTokens = a.CacheReadTokens + b.CacheReadTokens
	total.CacheWriteTokens = a.CacheWriteTokens + b.CacheWriteTokens
	total.ReasoningTokens = a.ReasoningTokens + b.ReasoningTokens
	return total
}