feat(run): execute multi-phase pipelines (RunnableAgent.Phases) #19

Merged
steve merged 2 commits from feat/kernel-phases into main 2026-06-29 19:52:52 +00:00
4 changed files with 677 additions and 25 deletions
+14 -2
View File
@@ -55,15 +55,27 @@ type RunnableAgent struct {
}
// Phase is one step of a multi-step run: its own system prompt, model tier,
// iteration cap, and tool subset. Optional phases may be skipped by the
// pipeline when their precondition isn't met.
// iteration cap, and tool subset. Phase prompts are Go text/template strings
// expanded against {{.Query}} (the original input) and {{.<PhaseName>}} (a
// prior phase's output) before the phase runs, so a phase can consume earlier
// work. The final phase's output is the run's output.
type Phase struct {
Name string
SystemPrompt string
ModelTier string
MaxIterations int
Tools []string
// Optional swallows a phase's error and substitutes FallbackMessage (or a
// generated note) as its output, so a non-critical phase failing does not
// abort the pipeline.
Optional bool
// FallbackMessage is the substitute output when an Optional phase fails.
// Empty → a generated "(phase %q encountered an error…)" note.
FallbackMessage string
// IsRunFunc marks a phase as a single bare LLM call (no tool loop, no tools
// array) — a deterministic transform step (plan/synthesize) rather than an
// agentic loop. Its Tools/MaxIterations are ignored.
IsRunFunc bool
}
// CriticConfig configures the optional run-critic. Enabled gates whether a
+47 -22
View File
@@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
@@ -289,12 +288,10 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
}
}
opts := []agent.Option{
agent.WithToolbox(toolbox),
// Step ceiling: a fixed WithMaxSteps(maxIter) normally, but when a critic is
// active it owns a DYNAMIC ceiling (WithMaxStepsFunc) so it can raise a
// healthy-but-long run's budget mid-flight. Falls back to maxIter.
critic.maxStepsOption(maxIter),
// Shared agent options used by BOTH the single-loop path and every phase: the
// tool-error guards, the step observer, and optional compaction. The toolbox +
// step ceiling are NOT shared (they vary per phase), so they're added per path.
sharedOpts := []agent.Option{
agent.WithToolErrorLimits(e.cfg.Defaults.MaxConsecutiveToolErrors, e.cfg.Defaults.MaxSameToolCallRepeats),
agent.WithStepObserver(stepObserver),
}
@@ -313,11 +310,10 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
})
}
}
opts = append(opts, agent.WithCompactor(e.cfg.Compactor(threshold, onFire)))
sharedOpts = append(sharedOpts, agent.WithCompactor(e.cfg.Compactor(threshold, onFire)))
}
}
ag := agent.New(model, e.systemPrompt(ra), opts...)
// Stage non-image input attachments (audio/PDF/binary) into the host file
// store and fold an [ATTACHED FILES] descriptor into the prompt so the agent
// can reach them by file_id. No-op when Ports.InputFiles is nil or there are
@@ -327,7 +323,35 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
// One WithSteer drains BOTH the session mailbox (a tool's AttachImages) and
// the critic's nudges before each step.
steer := func() []llm.Message { return append(mailbox.drain(), critic.drainSteer()...) }
runRes, runErr := runAgent(runCtx, ag, input, inv.Images, agent.WithSteer(steer))
var runRes *agent.Result
var runErr error
if len(ra.Phases) == 0 {
// Single-loop run: the agent's base prompt + full toolbox, with the
// critic's DYNAMIC step ceiling (WithMaxStepsFunc, so it can raise a
// healthy-but-long run's budget mid-flight; falls back to maxIter).
opts := append([]agent.Option{
agent.WithToolbox(toolbox),
critic.maxStepsOption(maxIter),
}, sharedOpts...)
ag := agent.New(model, e.systemPrompt(ra), opts...)
runRes, runErr = runAgent(runCtx, ag, input, inv.Images, agent.WithSteer(steer))
} else {
// Multi-phase pipeline: each phase runs its own prompt/tier/tools/step-cap
// sequentially, threading outputs through {{.<PhaseName>}} templates. Reuses
// the shared opts so audit/steps/critic-steer accumulate across every phase.
// (Per-phase step caps are fixed — the critic's dynamic ceiling is not
// propagated to phases — but its steer + hard deadline still apply.)
runRes, runErr = e.runPhases(runCtx, ra, phaseDeps{
baseModel: model,
baseToolbox: toolbox,
baseMaxIter: maxIter,
sharedOpts: sharedOpts,
stepObserver: stepObserver,
steer: steer,
rec: rec,
}, input, inv.Images)
}
status := statusFor(runCtx, runErr)
if runRes != nil {
@@ -403,13 +427,20 @@ func (e *Executor) finishAudit(ctx context.Context, rec RunRecorder, status stri
}
func (e *Executor) systemPrompt(ra RunnableAgent) string {
return e.systemPromptWithBody(ra.SystemPrompt)
}
// systemPromptWithBody composes the optional platform header with an arbitrary
// body. The single-loop path passes ra.SystemPrompt; the phase runner passes a
// phase's expanded instructions, so each phase keeps the platform header.
func (e *Executor) systemPromptWithBody(body string) string {
if e.cfg.SystemHeader == "" {
return ra.SystemPrompt
return body
}
if ra.SystemPrompt == "" {
if body == "" {
return e.cfg.SystemHeader
}
return e.cfg.SystemHeader + "\n\n" + ra.SystemPrompt
return e.cfg.SystemHeader + "\n\n" + body
}
// compactionThreshold returns the token threshold for the tier's model context
@@ -460,15 +491,9 @@ func runAgent(ctx context.Context, ag *agent.Agent, input string, images []llm.I
if len(images) == 0 {
return ag.Run(ctx, input, opts...)
}
parts := make([]llm.Part, 0, len(images)+1)
if strings.TrimSpace(input) != "" {
parts = append(parts, llm.Text(input))
}
for _, img := range images {
parts = append(parts, img)
}
// Copy opts before appending so a caller-supplied backing array is never
// mutated/aliased (the variadic slice can have spare capacity).
opts = append(opts[:len(opts):len(opts)], agent.WithHistory([]llm.Message{llm.UserParts(parts...)}))
// mutated/aliased (the variadic slice can have spare capacity). The multimodal
// opening turn (text + image parts) is built by the shared helper.
opts = append(opts[:len(opts):len(opts)], agent.WithHistory([]llm.Message{multimodalUserMessage(input, images)}))
return ag.Run(ctx, "", opts...)
}
+357
View File
@@ -0,0 +1,357 @@
package run
import (
"bytes"
"context"
"errors"
"fmt"
"log/slog"
"strings"
"text/template"
"unicode/utf8"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// The multi-step phase runner. A phased RunnableAgent (ra.Phases non-empty) runs
// its phases in order; each phase is a fresh majordomo agent loop (or a single
// bare LLM call for IsRunFunc phases) with its own template-expanded system
// prompt, model tier, step cap, and tool subset. Phase outputs feed later phases
// through {{.<PhaseName>}} template variables; {{.Query}} is the original input.
// The final phase's output is the run's output.
//
// Ported from mort's agentexec pipeline so the executus kernel — which already
// carries RunnableAgent.Phases as a DTO — actually EXECUTES them (it previously
// ignored the slice and ran a single loop with the base prompt). It reuses the
// shared run machinery built once in Run: the same stepObserver (so audit/steps/
// critic-activity accumulate across every phase, including IsRunFunc bare calls),
// the same critic steer, and the same compaction option.
//
// Semantics preserved from mort's pipeline:
// - phases run sequentially; ctx cancellation/deadline/critic-kill aborts the
// run (even mid-phase and even for an Optional phase).
// - IsRunFunc = one bare LLM call, no tools, no loop.
// - Optional phases swallow NON-context errors and substitute FallbackMessage.
// - a non-optional phase that merely exhausts its step/tool budget is NOT fatal:
// its partial transcript is salvaged and the pipeline continues — EXCEPT a
// final phase that salvaged nothing, which is a genuine empty-result failure.
// - per-phase ModelTier resolve failures fall back to the base model with a WARN.
//
// Deliberately NOT carried over (kernel is leaner than mort's legacy pipeline):
// the legacy `submit` capture tool (the kernel relies on majordomo's
// no-tool-call-is-final-answer termination, like its single-loop path), and the
// critic's dynamic iteration ceiling (per-phase caps are fixed at phase start —
// the run-level critic's steer + hard deadline still apply across phases).
//
Outdated
Review

🟡 Redundant comments in phaseDeps

maintainability · flagged by 1 model

  1. Redundant Comments - Location: run/phases.go:46-58 - Issue: The phaseDeps struct has a comment that explains its purpose, but the comment is somewhat redundant with the struct's name and fields. The comment could be more concise. - Suggestion: Shorten the comment to focus on the unique aspects of phaseDeps that aren't immediately obvious from the struct definition.

🪰 Gadfly · advisory

🟡 **Redundant comments in phaseDeps** _maintainability · flagged by 1 model_ 3. **Redundant Comments** - **Location**: `run/phases.go:46-58` - **Issue**: The `phaseDeps` struct has a comment that explains its purpose, but the comment is somewhat redundant with the struct's name and fields. The comment could be more concise. - **Suggestion**: Shorten the comment to focus on the unique aspects of `phaseDeps` that aren't immediately obvious from the struct definition. <sub>🪰 Gadfly · advisory</sub>
// NOTE on phase names: {{.<PhaseName>}} resolves a map key, so a phase whose name
// is not a Go-template identifier (hyphens, spaces, leading digit) cannot be
// referenced as {{.my-phase}} — authors must use {{index . "my-phase"}}. A
// template that fails to parse/execute is logged (WARN) and passed through
// unchanged rather than silently dropped (see expandPhaseTemplate). Avoid naming
// a phase "Query" — it shadows the original-input variable.
Outdated
Review

🟠 Dead field baseTier in phaseDeps struct — populated but never read

maintainability · flagged by 4 models

  • phaseDeps.baseTier is dead code (run/phases.go:52). The field is populated in executor.go:348 (baseTier: tier) but never read anywhere in the package. grep confirms deps.baseTier has zero usages. This is a dead struct field that will confuse future readers. Remove it from the struct and the construction site.

🪰 Gadfly · advisory

🟠 **Dead field baseTier in phaseDeps struct — populated but never read** _maintainability · flagged by 4 models_ - **`phaseDeps.baseTier` is dead code** (`run/phases.go:52`). The field is populated in `executor.go:348` (`baseTier: tier`) but never read anywhere in the package. `grep` confirms `deps.baseTier` has zero usages. This is a dead struct field that will confuse future readers. Remove it from the struct and the construction site. <sub>🪰 Gadfly · advisory</sub>
// phaseDeps carries the per-run state the phase runner shares with Run: the base
// model, the full decorated toolbox (filtered per phase), the base step cap, the
// shared agent options (tool-error limits + step observer + compactor), the
// shared step observer (also fed by IsRunFunc bare calls), the critic/session
// steer, and the audit recorder (phase events).
type phaseDeps struct {
baseModel llm.Model
baseToolbox *llm.Toolbox
baseMaxIter int
sharedOpts []agent.Option
stepObserver func(agent.Step)
Outdated
Review

🟡 Long function runPhases

maintainability · flagged by 1 model

  1. Long Function in runPhases - Location: run/phases.go:64-140 - Issue: The runPhases function is quite long and handles multiple responsibilities, including iterating over phases, handling errors, and managing outputs. This makes it harder to read and maintain. - Suggestion: Break this function into smaller, focused functions. For example, separate the error handling logic into a dedicated function.

🪰 Gadfly · advisory

🟡 **Long function runPhases** _maintainability · flagged by 1 model_ 1. **Long Function in `runPhases`** - **Location**: `run/phases.go:64-140` - **Issue**: The `runPhases` function is quite long and handles multiple responsibilities, including iterating over phases, handling errors, and managing outputs. This makes it harder to read and maintain. - **Suggestion**: Break this function into smaller, focused functions. For example, separate the error handling logic into a dedicated function. <sub>🪰 Gadfly · advisory</sub>
steer func() []llm.Message
rec RunRecorder
}
// runPhases executes ra.Phases sequentially and returns a synthetic agent.Result
// whose Output is the final phase's output, with Usage aggregated across phases
// and Messages set to the last phase's transcript (for the PostRun hook). A hard
// (non-optional, non-budget) phase failure — and any context cancellation/
// deadline/critic-kill — returns the error.
func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phaseDeps, query string, images []llm.ImagePart) (*agent.Result, error) {
outputs := make(map[string]string, len(ra.Phases))
var lastResult *agent.Result
Review

🟠 Untrusted model outputs (prior phase results) are injected into later phases' system prompts via {{.}}, creating a prompt-injection escalation path where adversarial content gains system-prompt privilege.

security · flagged by 1 model

  • Untrusted model outputs are injected into later phases' system prompts (run/phases.go:76, run/phases.go:122, run/phases.go:144, run/phases.go:262-275)expandPhaseTemplate substitutes prior-phase outputs (model-generated, influenced by adversarial query/tool content) via {{.<PhaseName>}}; the expanded instructions become the next phase's System: prompt (systemPromptWithBodyagent.New(model, system, …) / model.Generate{System: system}). In the single-loop path model…

🪰 Gadfly · advisory

🟠 **Untrusted model outputs (prior phase results) are injected into later phases' system prompts via {{.<PhaseName>}}, creating a prompt-injection escalation path where adversarial content gains system-prompt privilege.** _security · flagged by 1 model_ - **Untrusted model outputs are injected into later phases' system prompts (`run/phases.go:76`, `run/phases.go:122`, `run/phases.go:144`, `run/phases.go:262-275`)** — `expandPhaseTemplate` substitutes prior-phase outputs (model-generated, influenced by adversarial query/tool content) via `{{.<PhaseName>}}`; the expanded `instructions` become the next phase's `System:` prompt (`systemPromptWithBody` → `agent.New(model, system, …)` / `model.Generate{System: system}`). In the single-loop path model… <sub>🪰 Gadfly · advisory</sub>
var lastOutput string
var totalUsage llm.Usage
// finish stamps the aggregated usage + final output onto the synthetic result.
finish := func(err error) (*agent.Result, error) {
if lastResult == nil {
lastResult = &agent.Result{}
}
lastResult.Usage = totalUsage
if err == nil {
lastResult.Output = lastOutput
}
return lastResult, err
Outdated
Review

🔴 Optional-phase branch swallows context cancellation/deadline/critic-kill; a killed/timed-out run with an Optional final phase reports status "ok" with a fallback message

correctness, error-handling · flagged by 3 models

1. Optional phase handling swallows context cancellation / deadline / critic-kill — a killed or timed-out run whose final phase is Optional is reported as a successful runrun/phases.go:89

🪰 Gadfly · advisory

🔴 **Optional-phase branch swallows context cancellation/deadline/critic-kill; a killed/timed-out run with an Optional final phase reports status "ok" with a fallback message** _correctness, error-handling · flagged by 3 models_ **1. `Optional` phase handling swallows context cancellation / deadline / critic-kill — a killed or timed-out run whose final phase is `Optional` is reported as a successful run** — `run/phases.go:89` <sub>🪰 Gadfly · advisory</sub>
}
for i, phase := range ra.Phases {
// A killed/timed-out/cancelled run must not start its next phase.
if err := runCtx.Err(); err != nil {
return finish(err)
}
instructions := expandPhaseTemplate(phase.SystemPrompt, query, outputs)
if deps.rec != nil {
deps.rec.LogEvent("phase_start", map[string]any{"phase": phase.Name})
Review

🟠 Last non-optional phase exhausting its budget with empty salvage aborts the run with a hard error and empty output, discarding all prior phases' completed work — contradicting the documented 'NOT fatal / never discards earlier work' semantics

correctness, error-handling, maintainability, performance · flagged by 3 models

  • run/phases.go:100-118 — a final non-optional phase that exhausts its step/tool budget with no salvageable narrated text aborts the whole run with a hard error and empty output, discarding all earlier phases' completed work — contradicting the file's own documented "NOT fatal" guarantee. Confirmed by tracing the switch at lines 88–119. runOnePhase (lines 178–190) only sets output to a non-empty value when salvagePhaseTranscript(res) finds assistant prose; otherwise output stays `"…

🪰 Gadfly · advisory

🟠 **Last non-optional phase exhausting its budget with empty salvage aborts the run with a hard error and empty output, discarding all prior phases' completed work — contradicting the documented 'NOT fatal / never discards earlier work' semantics** _correctness, error-handling, maintainability, performance · flagged by 3 models_ - **`run/phases.go:100-118` — a final non-optional phase that exhausts its step/tool budget with no salvageable narrated text aborts the whole run with a hard error and empty output, discarding all earlier phases' completed work — contradicting the file's own documented "NOT fatal" guarantee.** Confirmed by tracing the switch at lines 88–119. `runOnePhase` (lines 178–190) only sets `output` to a non-empty value when `salvagePhaseTranscript(res)` finds assistant prose; otherwise `output` stays `"… <sub>🪰 Gadfly · advisory</sub>
}
output, res, err := e.runOnePhase(runCtx, ra, deps, phase, instructions, query, images)
if res != nil {
lastResult = res
totalUsage = addUsage(totalUsage, res.Usage)
}
if err != nil {
// A context cancellation / deadline / critic-kill is NEVER swallowed by
// the Optional or budget-salvage branches — the run genuinely ended and
// must surface as cancelled/timeout/killed (statusFor classifies it).
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return finish(err)
}
isLast := i == len(ra.Phases)-1
trimmed := strings.TrimSpace(output)
switch {
case phase.Optional:
Review

🟡 totalUsage not applied to lastResult on hard-error return; prior phases' token usage lost from run result

correctness · flagged by 1 model

  • run/phases.go:118totalUsage not applied to lastResult on hard-error return (verified by tracing lines 82-84, 118, and executor.go:358-360). When a non-optional phase fails hard and runOnePhase returns a non-nil partial res, line 83 overwrites lastResult = res and line 84 accumulates totalUsage correctly. But the error return on line 118 sends lastResult as-is — its .Usage field still holds only the failed phase's partial usage, not the accumulated totalUsage that i…

🪰 Gadfly · advisory

🟡 **totalUsage not applied to lastResult on hard-error return; prior phases' token usage lost from run result** _correctness · flagged by 1 model_ - **`run/phases.go:118` — `totalUsage` not applied to `lastResult` on hard-error return** (verified by tracing lines 82-84, 118, and `executor.go:358-360`). When a non-optional phase fails hard and `runOnePhase` returns a non-nil partial `res`, line 83 overwrites `lastResult = res` and line 84 accumulates `totalUsage` correctly. But the error return on line 118 sends `lastResult` as-is — its `.Usage` field still holds only the failed phase's partial usage, not the accumulated `totalUsage` that i… <sub>🪰 Gadfly · advisory</sub>
output = phase.FallbackMessage
if output == "" {
output = fmt.Sprintf("(Phase %q encountered an error -- proceeding without its results)", phase.Name)
}
slog.Warn("run: optional pipeline phase failed",
"agent", ra.Name, "phase", phase.Name, "error", err)
if deps.rec != nil {
deps.rec.LogEvent("phase_failed_optional", map[string]any{"phase": phase.Name, "error": err.Error()})
}
Review

🟡 When the last phase fails as Optional, synthesized lastResult carries a prior phase's transcript for the PostRun hook

error-handling · flagged by 1 model

  • run/phases.go:128 runPhases — final synthesized lastResult may carry a prior phase's transcript when the last phase fails optionally. When the last phase is Optional and fails with a nil res, lastResult is not updated and retains the previous phase's agent.Result; the code then overwrites Output/Usage with the fallback/accumulated usage, but Messages remains the prior phase's transcript (passed to the PostRun hook). The doc says "last phase's transcript for the PostRu…

🪰 Gadfly · advisory

🟡 **When the last phase fails as Optional, synthesized lastResult carries a prior phase's transcript for the PostRun hook** _error-handling · flagged by 1 model_ - **`run/phases.go:128` `runPhases` — final synthesized `lastResult` may carry a *prior* phase's transcript when the last phase fails optionally.** When the last phase is `Optional` and fails with a nil `res`, `lastResult` is not updated and retains the previous phase's `agent.Result`; the code then overwrites `Output`/`Usage` with the fallback/accumulated usage, but `Messages` remains the prior phase's transcript (passed to the PostRun hook). The doc says "last phase's transcript for the PostRu… <sub>🪰 Gadfly · advisory</sub>
case isPhaseBudgetExhaustion(err) && (!isLast || trimmed != ""):
// Soft stop: the phase ran out of its step/tool budget before
// composing a final answer. Not fatal — it did real work (runOnePhase
// salvaged its partial transcript into output), and aborting would
// discard every completed phase before it. Degrade and continue.
// (A FINAL phase that salvaged nothing falls through to the hard error
// below: there is no result to return.)
if trimmed == "" {
output = fmt.Sprintf("(Phase %q reached its step budget before producing a consolidated result; continuing with its partial findings.)", phase.Name)
} else {
output += fmt.Sprintf("\n\n(Note: phase %q reached its step budget before fully completing; the above is its partial output.)", phase.Name)
}
slog.Warn("run: pipeline phase exhausted its budget; salvaging partial output and continuing",
"agent", ra.Name, "phase", phase.Name, "last_phase", isLast, "error", err)
if deps.rec != nil {
Review

🟡 Inconsistent error handling in runOnePhase

maintainability · flagged by 1 model

  1. Inconsistent Error Handling in runOnePhase - Location: run/phases.go:143-200 - Issue: The runOnePhase function handles errors in a nested manner, which can make it difficult to follow the flow of execution. The error handling logic is intertwined with the main logic. - Suggestion: Extract error handling into separate functions or use a more structured approach to improve readability.

🪰 Gadfly · advisory

🟡 **Inconsistent error handling in runOnePhase** _maintainability · flagged by 1 model_ 4. **Inconsistent Error Handling in `runOnePhase`** - **Location**: `run/phases.go:143-200` - **Issue**: The `runOnePhase` function handles errors in a nested manner, which can make it difficult to follow the flow of execution. The error handling logic is intertwined with the main logic. - **Suggestion**: Extract error handling into separate functions or use a more structured approach to improve readability. <sub>🪰 Gadfly · advisory</sub>
deps.rec.LogEvent("phase_budget_exhausted", map[string]any{"phase": phase.Name, "error": err.Error(), "last_phase": isLast})
}
default:
return finish(fmt.Errorf("pipeline phase %q: %w", phase.Name, err))
}
}
outputs[phase.Name] = output
lastOutput = output
}
Review

🟠 IsRunFunc phases bypass shared stepObserver; critic clock not refreshed, steps missing from emitter

correctness · flagged by 2 models

🪰 Gadfly · advisory

🟠 **IsRunFunc phases bypass shared stepObserver; critic clock not refreshed, steps missing from emitter** _correctness · flagged by 2 models_ <sub>🪰 Gadfly · advisory</sub>
return finish(nil)
}
// runOnePhase runs a single phase: a bare LLM call for IsRunFunc phases, a fresh
// agent loop otherwise. Returns the phase output, the loop result (nil for a
// failed bare call), and any error. On a budget-exhaustion error the loop's
// partial transcript is salvaged into the returned output.
func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps phaseDeps, phase Phase, instructions, query string, images []llm.ImagePart) (string, *agent.Result, error) {
phaseCtx, model := e.phaseModel(runCtx, deps, ra, phase)
// The phase's expanded instructions are the system prompt (with the platform
// header so tools keep their run ids); the original query is the user message.
system := e.systemPromptWithBody(instructions)
if phase.IsRunFunc {
// Bare LLM call: no tool loop, no tools array (some models 400 on an empty
// tools list). The response is fed through the SAME step observer as a loop
// step so the audit token tally, Result.Steps, AND the critic's activity
// clock all see it (a long synthesize phase must not look idle to the critic).
msgs := []llm.Message{multimodalUserMessage(query, images)}
resp, err := model.Generate(phaseCtx, llm.Request{System: system, Messages: msgs})
if err != nil {
return "", nil, fmt.Errorf("phase %q model call: %w", phase.Name, err)
}
if deps.stepObserver != nil {
deps.stepObserver(agent.Step{Index: 0, Response: resp})
}
return resp.Text(), &agent.Result{
Output: resp.Text(),
Usage: resp.Usage,
Messages: append(msgs, resp.Message()),
}, nil
Review

🟠 Missing nil check for res.Steps

error-handling · flagged by 1 model

  1. Finding at run/phases.go:186: The draft claims that salvagePhaseTranscript assumes res.Steps is non-nil without an explicit check. Looking at the code, the function salvagePhaseTranscript does iterate over res.Steps without a nil check. This is a valid concern, and the finding should be kept.

🪰 Gadfly · advisory

🟠 **Missing nil check for res.Steps** _error-handling · flagged by 1 model_ 2. **Finding at `run/phases.go:186`**: The draft claims that `salvagePhaseTranscript` assumes `res.Steps` is non-nil without an explicit check. Looking at the code, the function `salvagePhaseTranscript` does iterate over `res.Steps` without a nil check. This is a valid concern, and the finding should be kept. <sub>🪰 Gadfly · advisory</sub>
}
toolbox := filterToolbox(deps.baseToolbox, phase.Tools)
maxIter := phase.MaxIterations
if maxIter <= 0 {
maxIter = deps.baseMaxIter
}
// Per-phase opts: a fixed step ceiling for this phase (the critic's dynamic
// ceiling is intentionally not propagated to phases) + the phase toolbox, on
// top of the shared opts (tool-error limits, step observer, compactor).
opts := append([]agent.Option{
agent.WithToolbox(toolbox),
agent.WithMaxSteps(maxIter),
Review

🟠 phaseModel discards the resolver's enriched usage-attribution context, so non-base-tier phases can be mis-attributed when resp.Model is empty (single-loop path adopts it via ctx = modelCtx)

correctness, error-handling · flagged by 2 models

2. Per-phase model resolution discards the enriched usage-attribution contextrun/phases.go:199 (phaseModel: _, m, err := e.cfg.Models(ctx, phase.ModelTier))

🪰 Gadfly · advisory

🟠 **phaseModel discards the resolver's enriched usage-attribution context, so non-base-tier phases can be mis-attributed when resp.Model is empty (single-loop path adopts it via ctx = modelCtx)** _correctness, error-handling · flagged by 2 models_ **2. Per-phase model resolution discards the enriched usage-attribution context** — `run/phases.go:199` (`phaseModel`: `_, m, err := e.cfg.Models(ctx, phase.ModelTier)`) <sub>🪰 Gadfly · advisory</sub>
}, deps.sharedOpts...)
ag := agent.New(model, system, opts...)
res, runErr := runAgent(phaseCtx, ag, query, images, agent.WithSteer(deps.steer))
output := ""
if res != nil {
output = res.Output
}
// Budget/guard exhaustion leaves a usable partial transcript but an empty
// final answer; salvage the narrated work so the pipeline can carry it forward.
if runErr != nil && isPhaseBudgetExhaustion(runErr) {
if salvaged := salvagePhaseTranscript(res); salvaged != "" {
output = salvaged
Review

🟡 run/phases.go:212

correctness · flagged by 1 model

  • run/phases.go:212: The isPhaseBudgetExhaustion function checks for agent.ErrMaxSteps and agent.ErrToolLoop, but these errors are not defined in the repository. This could lead to incorrect behavior if the errors are not properly defined or imported. Verified by searching for the error definitions in the repository and finding no matches.

🪰 Gadfly · advisory

🟡 **`run/phases.go:212`** _correctness · flagged by 1 model_ - **`run/phases.go:212`**: The `isPhaseBudgetExhaustion` function checks for `agent.ErrMaxSteps` and `agent.ErrToolLoop`, but these errors are not defined in the repository. This could lead to incorrect behavior if the errors are not properly defined or imported. **Verified** by searching for the error definitions in the repository and finding no matches. <sub>🪰 Gadfly · advisory</sub>
}
}
return output, res, runErr
}
// phaseModel resolves the phase's model tier, returning the resolver's enriched
// context (usage attribution) alongside the model. An empty tier or a resolution
// failure falls back to the base model + the run context (WARN — visible, not
// fatal). Returning the enriched ctx mirrors the single-loop path, which adopts
// ctx = modelCtx, so a non-base-tier phase's calls are attributed correctly.
func (e *Executor) phaseModel(ctx context.Context, deps phaseDeps, ra RunnableAgent, phase Phase) (context.Context, llm.Model) {
if phase.ModelTier == "" {
return ctx, deps.baseModel
}
modelCtx, m, err := e.cfg.Models(ctx, phase.ModelTier)
if err != nil || m == nil {
reason := "resolver returned a nil model"
if err != nil {
reason = err.Error()
}
slog.Warn("run: pipeline phase model resolve failed; using base model",
"agent", ra.Name, "phase", phase.Name, "tier", phase.ModelTier, "reason", reason)
return ctx, deps.baseModel
Outdated
Review

🔴 salvagePhaseTranscript truncates by bytes, corrupting multi-byte UTF-8 runes

correctness, error-handling, maintainability, performance · flagged by 7 models

  • run/phases.go:236-238salvagePhaseTranscript truncates by byte length (len(out) > maxSalvage and out[len(out)-maxSalvage:]). For any multi-byte UTF-8 text this slices through runes, producing invalid UTF-8 and garbled output fed into downstream phases. The limit should count runes (utf8.RuneCountInString) and slice on a rune boundary.

🪰 Gadfly · advisory

🔴 **salvagePhaseTranscript truncates by bytes, corrupting multi-byte UTF-8 runes** _correctness, error-handling, maintainability, performance · flagged by 7 models_ - **`run/phases.go:236-238`** — `salvagePhaseTranscript` truncates by **byte length** (`len(out) > maxSalvage` and `out[len(out)-maxSalvage:]`). For any multi-byte UTF-8 text this slices through runes, producing invalid UTF-8 and garbled output fed into downstream phases. The limit should count runes (`utf8.RuneCountInString`) and slice on a rune boundary. <sub>🪰 Gadfly · advisory</sub>
}
return modelCtx, m
}
// isPhaseBudgetExhaustion reports whether err is a soft budget/guard stop (the
// loop hit its step cap or tripped a tool-error guard) — which leaves a usable
// partial transcript — as opposed to a hard error (cancellation, model failure).
func isPhaseBudgetExhaustion(err error) bool {
return errors.Is(err, agent.ErrMaxSteps) || errors.Is(err, agent.ErrToolLoop)
Review

🟠 phaseUserMessage duplicates runAgent's multimodal message construction — extract shared helper

maintainability · flagged by 5 models

  • phaseUserMessage duplicates runAgent's multimodal message logic (run/phases.go:244–256 vs run/executor.go:491–506). Both functions build a llm.Message from query + images with the same pattern: no images → simple path, otherwise build []llm.Part with optional text + image parts. The comment on phaseUserMessage even acknowledges it "mirrors" runAgent. Extract a shared helper (e.g., func userMessage(query string, images []llm.ImagePart) llm.Message) so the two call sites…

🪰 Gadfly · advisory

🟠 **phaseUserMessage duplicates runAgent's multimodal message construction — extract shared helper** _maintainability · flagged by 5 models_ - **`phaseUserMessage` duplicates `runAgent`'s multimodal message logic** (`run/phases.go:244–256` vs `run/executor.go:491–506`). Both functions build a `llm.Message` from `query + images` with the same pattern: no images → simple path, otherwise build `[]llm.Part` with optional text + image parts. The comment on `phaseUserMessage` even acknowledges it "mirrors" `runAgent`. Extract a shared helper (e.g., `func userMessage(query string, images []llm.ImagePart) llm.Message`) so the two call sites… <sub>🪰 Gadfly · advisory</sub>
}
// maxSalvageBytes bounds a salvaged partial transcript so a long phase's narrated
// reasoning doesn't blow up the next phase's prompt (the tail is the most recent,
// most relevant reasoning). Matches mort's pipeline cap.
const maxSalvageBytes = 8000
// salvagePhaseTranscript reconstructs a best-effort phase output from a loop that
// ended without a final answer: the assistant's narrated text across every step,
// tail-trimmed to maxSalvageBytes on a rune boundary. Returns "" when the model
// wrote no prose.
func salvagePhaseTranscript(res *agent.Result) string {
if res == nil {
return ""
}
var b strings.Builder
Review

🔴 expandPhaseTemplate silently fails for phase names with hyphens or non-identifier characters

correctness, error-handling, maintainability, performance, security · flagged by 6 models

  • run/phases.go:262-275expandPhaseTemplate builds text/template variables as {{.<PhaseName>}} (line 269: data[k] = v). Go text/template identifiers may contain only letters, digits, and underscores. Phase names with hyphens (deep-research), digits at the start, spaces, etc. parse as invalid expressions or fail execution silently, causing the template to return unexpanded — downstream phases receive literal {{.deep-research}} instead of the actual prior output. This con…

🪰 Gadfly · advisory

🔴 **expandPhaseTemplate silently fails for phase names with hyphens or non-identifier characters** _correctness, error-handling, maintainability, performance, security · flagged by 6 models_ - **`run/phases.go:262-275`** — `expandPhaseTemplate` builds `text/template` variables as `{{.<PhaseName>}}` (line 269: `data[k] = v`). Go `text/template` identifiers may contain only letters, digits, and underscores. Phase names with hyphens (`deep-research`), digits at the start, spaces, etc. parse as invalid expressions or fail execution silently, causing the template to return **unexpanded** — downstream phases receive literal `{{.deep-research}}` instead of the actual prior output. This con… <sub>🪰 Gadfly · advisory</sub>
for _, step := range res.Steps {
if step.Response == nil {
continue
}
if t := strings.TrimSpace(step.Response.Text()); t != "" {
if b.Len() > 0 {
b.WriteString("\n\n")
Review

🟡 Unsanitized user input in template data could lead to XSS or injection

security · flagged by 1 model

Finding 2: Unsanitized User Input in Template Data (run/phases.go:267-270) - Verification: The query and priorOutputs values are directly passed into the template data map without sanitization. - Impact: Confirmed. This could lead to XSS or other injection issues if the output is rendered in a web context. - Conclusion: This finding is valid.

🪰 Gadfly · advisory

🟡 **Unsanitized user input in template data could lead to XSS or injection** _security · flagged by 1 model_ ### Finding 2: Unsanitized User Input in Template Data (`run/phases.go:267-270`) - **Verification**: The `query` and `priorOutputs` values are directly passed into the template data map without sanitization. - **Impact**: Confirmed. This could lead to XSS or other injection issues if the output is rendered in a web context. - **Conclusion**: This finding is valid. <sub>🪰 Gadfly · advisory</sub>
}
b.WriteString(t)
}
}
out := strings.TrimSpace(b.String())
if len(out) > maxSalvageBytes {
tail := out[len(out)-maxSalvageBytes:]
// Advance to the next rune boundary so the cut never splits a UTF-8 rune.
for len(tail) > 0 && !utf8.RuneStart(tail[0]) {
tail = tail[1:]
}
out = "...(earlier reasoning trimmed)...\n" + tail
}
return out
}
Review

🟠 filterToolbox copies entire toolbox when no filtering is requested

correctness, performance · flagged by 3 models

  • filterToolbox copies entire toolbox when no filtering is requestedrun/phases.go:283-287 When a phase does not specify a tool subset (len(names) == 0), filterToolbox still allocates a new llm.Toolbox and copies every tool into it. If the base toolbox is large and the pipeline has many phases, this is unnecessary allocation and Add overhead on every phase of every run. The single-loop path already passes the original toolbox pointer directly without copying. Fix: Return…

🪰 Gadfly · advisory

🟠 **filterToolbox copies entire toolbox when no filtering is requested** _correctness, performance · flagged by 3 models_ * **`filterToolbox` copies entire toolbox when no filtering is requested** — `run/phases.go:283-287` When a phase does not specify a tool subset (`len(names) == 0`), `filterToolbox` still allocates a new `llm.Toolbox` and copies every tool into it. If the base toolbox is large and the pipeline has many phases, this is unnecessary allocation and `Add` overhead on every phase of every run. The single-loop path already passes the original `toolbox` pointer directly without copying. **Fix:** Return… <sub>🪰 Gadfly · advisory</sub>
// multimodalUserMessage builds a user message from text + inline images. Shared
// by the phase runner and runAgent so the image-folding lives in one place.
// Empty text with images yields an image-only message (no empty text part).
func multimodalUserMessage(text string, images []llm.ImagePart) llm.Message {
if len(images) == 0 {
return llm.UserText(text)
}
parts := make([]llm.Part, 0, len(images)+1)
if strings.TrimSpace(text) != "" {
parts = append(parts, llm.Text(text))
}
for _, img := range images {
parts = append(parts, img)
}
return llm.UserParts(parts...)
}
// expandPhaseTemplate applies Go text/template substitution to a phase prompt,
// replacing {{.Query}} with the original query and {{.<PhaseName>}} with a prior
// phase's output. On a parse/execute error it logs a WARN and returns the
// template unchanged (best-effort, non-fatal) so a misconfigured prompt is
Outdated
Review

🟠 addUsage manually sums llm.Usage fields, brittle to upstream changes

maintainability · flagged by 2 models

  • run/phases.go:304-311addUsage manually accumulates every field of llm.Usage. If the upstream struct gains a new field (e.g. a future cost or token category), this function silently under-counts. Prefer an Add method on llm.Usage (if majordomo doesn't provide one, add it there) or generate this with a simple reflection helper; at minimum add a compile-time assertion that verifies field count.

🪰 Gadfly · advisory

🟠 **addUsage manually sums llm.Usage fields, brittle to upstream changes** _maintainability · flagged by 2 models_ - **`run/phases.go:304-311`** — `addUsage` manually accumulates every field of `llm.Usage`. If the upstream struct gains a new field (e.g. a future cost or token category), this function silently under-counts. Prefer an `Add` method on `llm.Usage` (if majordomo doesn't provide one, add it there) or generate this with a simple reflection helper; at minimum add a compile-time assertion that verifies field count. <sub>🪰 Gadfly · advisory</sub>
// visible rather than silently masked.
func expandPhaseTemplate(tmpl, query string, priorOutputs map[string]string) string {
t, err := template.New("phase").Option("missingkey=zero").Parse(tmpl)
if err != nil {
slog.Warn("run: pipeline phase template parse failed; using it unexpanded", "error", err)
return tmpl
}
data := map[string]string{"Query": query}
for k, v := range priorOutputs {
data[k] = v
}
var buf bytes.Buffer
if err := t.Execute(&buf, data); err != nil {
slog.Warn("run: pipeline phase template execute failed; using it unexpanded", "error", err)
return tmpl
}
return buf.String()
}
// filterToolbox returns a toolbox restricted to the named tools (preserving
// palette order). Empty names = the full palette (the base toolbox is returned
// as-is — it is read-only during a run, like the single-loop path). Unknown names
// are skipped with a WARN — a typo'd phase tool list should not abort a run.
func filterToolbox(box *llm.Toolbox, names []string) *llm.Toolbox {
if len(names) == 0 {
return box
}
out := llm.NewToolbox(box.Name())
for _, name := range names {
t, ok := box.Get(name)
if !ok {
slog.Warn("run: pipeline phase references unknown tool; skipping", "tool", name)
continue
}
if err := out.Add(t); err != nil {
slog.Warn("run: pipeline phase tool duplicated; skipping", "tool", name, "error", err)
}
}
return out
}
// addUsage sums two llm.Usage tallies field-by-field so a phased run reports the
// total tokens across all phases. NOTE: if llm.Usage gains a field, add it here
// too — the audit recorder (rec) is the authoritative per-run token source, this
// is the secondary Result.Usage roll-up.
func addUsage(a, b llm.Usage) llm.Usage {
a.InputTokens += b.InputTokens
a.OutputTokens += b.OutputTokens
a.CacheReadTokens += b.CacheReadTokens
a.CacheWriteTokens += b.CacheWriteTokens
a.ReasoningTokens += b.ReasoningTokens
return a
}
+258
View File
@@ -0,0 +1,258 @@
package run
import (
"context"
"encoding/json"
"errors"
"strings"
"testing"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
"gitea.stevedudenhoeffer.com/steve/executus/tool"
)
// phaseProvider builds a fake provider scripted with the given per-call steps
// (consumed in order across every phase's model call) and a resolver over it,
// returning both so a test can read back each call's request.
func phaseProvider(t *testing.T, steps ...fake.Step) (ModelResolver, *fake.Provider) {
t.Helper()
fp := fake.New("fake")
fp.Enqueue("test-model", steps...)
m, err := fp.Model("test-model")
if err != nil {
t.Fatalf("fake model: %v", err)
}
return func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
return ctx, m, nil
}, fp
}
// TestPhases_SequentialThreadsOutputs: phases run in order, each phase's output
// is threaded into the next via {{.<PhaseName>}}, {{.Query}} reaches a phase, and
// the final phase's output is the run output.
func TestPhases_SequentialThreadsOutputs(t *testing.T) {
models, fp := phaseProvider(t,
fake.Reply("out-a"),
fake.Reply("out-b"),
fake.Reply("out-c"),
)
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
{Name: "a", SystemPrompt: "Phase A instructions"},
{Name: "b", SystemPrompt: "B saw: {{.a}}"},
{Name: "c", SystemPrompt: "C saw: {{.b}} and query {{.Query}}"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "QUERY-TEXT")
if res.Err != nil {
t.Fatalf("run error: %v", res.Err)
}
if res.Output != "out-c" {
t.Fatalf("final output = %q, want the LAST phase's output out-c", res.Output)
}
calls := fp.Calls()
if len(calls) != 3 {
t.Fatalf("want 3 model calls (one per phase), got %d", len(calls))
}
if got := calls[0].Request.System; got != "Phase A instructions" {
t.Errorf("phase a system = %q", got)
}
if got := calls[1].Request.System; got != "B saw: out-a" {
t.Errorf("phase b should see phase a's output threaded; system = %q", got)
}
if got := calls[2].Request.System; got != "C saw: out-b and query QUERY-TEXT" {
t.Errorf("phase c should see phase b's output + {{.Query}}; system = %q", got)
}
}
// TestPhases_OptionalFailureSubstitutesFallback: an Optional phase that errors
// does not abort the pipeline — its FallbackMessage becomes its output and is
// threaded into later phases, which still run.
func TestPhases_OptionalFailureSubstitutesFallback(t *testing.T) {
models, fp := phaseProvider(t,
fake.Fail(errors.New("provider exploded")), // phase a fails
fake.Reply("out-b"), // phase b runs
)
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
{Name: "a", SystemPrompt: "Phase A", Optional: true, FallbackMessage: "FALLBACK-A"},
{Name: "b", SystemPrompt: "B saw: {{.a}}"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "Q")
if res.Err != nil {
t.Fatalf("optional-phase failure must not fail the run: %v", res.Err)
}
if res.Output != "out-b" {
t.Fatalf("final output = %q, want out-b", res.Output)
}
calls := fp.Calls()
if len(calls) != 2 {
t.Fatalf("want 2 calls (failed phase a + phase b), got %d", len(calls))
}
if got := calls[1].Request.System; got != "B saw: FALLBACK-A" {
t.Errorf("phase b should see the fallback threaded; system = %q", got)
}
}
// TestPhases_OptionalDoesNotSwallowCancellation: an Optional phase that fails
// with a context cancellation must NOT be swallowed into its FallbackMessage —
// the run genuinely ended (cancel/deadline/critic-kill) and must surface the
// error so the run is classified cancelled/timeout/killed, not "ok".
func TestPhases_OptionalDoesNotSwallowCancellation(t *testing.T) {
models, _ := phaseProvider(t, fake.Fail(context.Canceled))
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
// IsRunFunc so the cancellation surfaces directly wrapped (%w).
{Name: "a", SystemPrompt: "Phase A", IsRunFunc: true, Optional: true, FallbackMessage: "FB"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "Q")
if !errors.Is(res.Err, context.Canceled) {
t.Fatalf("Optional phase must NOT swallow a cancellation; res.Err = %v", res.Err)
}
if res.Output == "FB" {
t.Error("a cancelled run must not report the fallback message as output")
}
}
// TestPhases_HardErrorAborts: a NON-optional phase that hits a hard error (not a
// budget/step exhaustion) aborts the pipeline; later phases do not run.
func TestPhases_HardErrorAborts(t *testing.T) {
boom := errors.New("model down")
models, fp := phaseProvider(t,
fake.Fail(boom), // phase a (non-optional) fails hard
fake.Reply("out-b"), // must NOT be consumed
)
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
{Name: "a", SystemPrompt: "Phase A"},
{Name: "b", SystemPrompt: "Phase B"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "Q")
if res.Err == nil {
t.Fatal("a hard non-optional phase error must fail the run")
}
if !errors.Is(res.Err, boom) {
t.Errorf("run error %v should wrap the phase's model error", res.Err)
}
if n := len(fp.Calls()); n != 1 {
t.Errorf("pipeline must abort after phase a; got %d calls (phase b should not run)", n)
}
}
// TestPhases_IsRunFuncBareCall: an IsRunFunc phase produces output via a bare LLM
// call and that output threads into a following loop phase.
func TestPhases_IsRunFuncBareCall(t *testing.T) {
models, fp := phaseProvider(t,
fake.Reply("plan-output"), // IsRunFunc phase a
fake.Reply("final"), // loop phase b
)
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
{Name: "plan", SystemPrompt: "Make a plan for {{.Query}}", IsRunFunc: true},
{Name: "exec", SystemPrompt: "Execute: {{.plan}}"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "do-thing")
if res.Err != nil {
t.Fatalf("run error: %v", res.Err)
}
if res.Output != "final" {
t.Fatalf("output = %q, want final", res.Output)
}
calls := fp.Calls()
if len(calls) != 2 {
t.Fatalf("want 2 calls, got %d", len(calls))
}
if got := calls[0].Request.System; got != "Make a plan for do-thing" {
t.Errorf("IsRunFunc phase system = %q", got)
}
if got := calls[1].Request.System; got != "Execute: plan-output" {
t.Errorf("exec phase should see the plan output threaded; system = %q", got)
}
}
// TestPhases_SystemHeaderAppliedPerPhase: the platform SystemHeader is prepended
// to every phase's prompt (each phase keeps it).
func TestPhases_SystemHeaderAppliedPerPhase(t *testing.T) {
models, fp := phaseProvider(t, fake.Reply("a"), fake.Reply("b"))
ex := New(Config{Registry: tool.NewRegistry(), Models: models, SystemHeader: "PLATFORM"})
ra := RunnableAgent{
Name: "p",
ModelTier: "test-model",
Phases: []Phase{{Name: "one", SystemPrompt: "P1"}, {Name: "two", SystemPrompt: "P2"}},
}
if res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r"}, "Q"); res.Err != nil {
t.Fatalf("run error: %v", res.Err)
}
for i, want := range []string{"PLATFORM\n\nP1", "PLATFORM\n\nP2"} {
if got := fp.Calls()[i].Request.System; got != want {
t.Errorf("phase %d system = %q, want %q", i, got, want)
}
}
}
// TestFilterToolbox: a named subset restricts the toolbox (preserving order);
// empty names = the full palette; unknown names are skipped.
func TestFilterToolbox(t *testing.T) {
box := llm.NewToolbox("base")
noop := func(context.Context, json.RawMessage) (any, error) { return "", nil }
for _, name := range []string{"alpha", "beta", "gamma"} {
if err := box.Add(llm.Tool{Name: name, Description: "d", Handler: noop}); err != nil {
t.Fatalf("add %s: %v", name, err)
}
}
full := filterToolbox(box, nil)
if len(full.Tools()) != 3 {
t.Errorf("nil names = full palette; got %d tools", len(full.Tools()))
}
sub := filterToolbox(box, []string{"gamma", "alpha", "nonexistent"})
names := make([]string, 0)
for _, tl := range sub.Tools() {
names = append(names, tl.Name)
}
if strings.Join(names, ",") != "gamma,alpha" {
t.Errorf("subset (order-preserving, unknown skipped) = %v, want [gamma alpha]", names)
}
}
// TestExpandPhaseTemplate: {{.Query}} + prior outputs substitute; a parse error
// returns the template unchanged (best-effort).
func TestExpandPhaseTemplate(t *testing.T) {
got := expandPhaseTemplate("q={{.Query}} a={{.a}}", "QQ", map[string]string{"a": "AA"})
if got != "q=QQ a=AA" {
t.Errorf("expand = %q", got)
}
// Malformed template → returned unchanged.
bad := "{{.Unclosed"
if expandPhaseTemplate(bad, "QQ", nil) != bad {
t.Errorf("malformed template should pass through unchanged")
}
}