feat(run): execute multi-phase pipelines (RunnableAgent.Phases) #19

Merged
steve merged 2 commits from feat/kernel-phases into main 2026-06-29 19:52:52 +00:00
3 changed files with 151 additions and 87 deletions
Showing only changes of commit 0dd2ced717 - Show all commits
+10 -17
View File
@@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
@@ -344,13 +343,13 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
// (Per-phase step caps are fixed — the critic's dynamic ceiling is not
// propagated to phases — but its steer + hard deadline still apply.)
runRes, runErr = e.runPhases(runCtx, ra, phaseDeps{
baseModel: model,
baseTier: tier,
baseToolbox: toolbox,
baseMaxIter: maxIter,
sharedOpts: sharedOpts,
steer: steer,
rec: rec,
baseModel: model,
baseToolbox: toolbox,
baseMaxIter: maxIter,
sharedOpts: sharedOpts,
stepObserver: stepObserver,
steer: steer,
rec: rec,
}, input, inv.Images)
}
@@ -492,15 +491,9 @@ func runAgent(ctx context.Context, ag *agent.Agent, input string, images []llm.I
if len(images) == 0 {
return ag.Run(ctx, input, opts...)
}
parts := make([]llm.Part, 0, len(images)+1)
if strings.TrimSpace(input) != "" {
parts = append(parts, llm.Text(input))
}
for _, img := range images {
parts = append(parts, img)
}
// Copy opts before appending so a caller-supplied backing array is never
// mutated/aliased (the variadic slice can have spare capacity).
opts = append(opts[:len(opts):len(opts)], agent.WithHistory([]llm.Message{llm.UserParts(parts...)}))
// mutated/aliased (the variadic slice can have spare capacity). The multimodal
// opening turn (text + image parts) is built by the shared helper.
opts = append(opts[:len(opts):len(opts)], agent.WithHistory([]llm.Message{multimodalUserMessage(input, images)}))
return ag.Run(ctx, "", opts...)
}
+116 -70
View File
@@ -8,6 +8,7 @@ import (
"log/slog"
"strings"
"text/template"
"unicode/utf8"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
@@ -24,17 +25,17 @@ import (
// carries RunnableAgent.Phases as a DTO — actually EXECUTES them (it previously
// ignored the slice and ran a single loop with the base prompt). It reuses the
// shared run machinery built once in Run: the same stepObserver (so audit/steps/
// critic accumulate across every phase), the same critic steer, and the same
// compaction option.
// critic-activity accumulate across every phase, including IsRunFunc bare calls),
// the same critic steer, and the same compaction option.
//
// Semantics preserved from mort's pipeline:
// - phases run sequentially; ctx cancellation between phases aborts the run.
// - phases run sequentially; ctx cancellation/deadline/critic-kill aborts the
// run (even mid-phase and even for an Optional phase).
// - IsRunFunc = one bare LLM call, no tools, no loop.
// - Optional phases swallow errors and substitute FallbackMessage.
// - Optional phases swallow NON-context errors and substitute FallbackMessage.
// - a non-optional phase that merely exhausts its step/tool budget is NOT fatal:
// its partial transcript is salvaged and the pipeline continues (so a long
// verify phase never discards every earlier phase's work). A hard error
// (cancellation, model failure) still aborts.
// its partial transcript is salvaged and the pipeline continues — EXCEPT a
// final phase that salvaged nothing, which is a genuine empty-result failure.
// - per-phase ModelTier resolve failures fall back to the base model with a WARN.
//
// Deliberately NOT carried over (kernel is leaner than mort's legacy pipeline):
@@ -42,35 +43,56 @@ import (
// no-tool-call-is-final-answer termination, like its single-loop path), and the
// critic's dynamic iteration ceiling (per-phase caps are fixed at phase start —
// the run-level critic's steer + hard deadline still apply across phases).
//
Outdated
Review

🟡 Redundant comments in phaseDeps

maintainability · flagged by 1 model

  1. Redundant Comments - Location: run/phases.go:46-58 - Issue: The phaseDeps struct has a comment that explains its purpose, but the comment is somewhat redundant with the struct's name and fields. The comment could be more concise. - Suggestion: Shorten the comment to focus on the unique aspects of phaseDeps that aren't immediately obvious from the struct definition.

🪰 Gadfly · advisory

🟡 **Redundant comments in phaseDeps** _maintainability · flagged by 1 model_ 3. **Redundant Comments** - **Location**: `run/phases.go:46-58` - **Issue**: The `phaseDeps` struct has a comment that explains its purpose, but the comment is somewhat redundant with the struct's name and fields. The comment could be more concise. - **Suggestion**: Shorten the comment to focus on the unique aspects of `phaseDeps` that aren't immediately obvious from the struct definition. <sub>🪰 Gadfly · advisory</sub>
// NOTE on phase names: {{.<PhaseName>}} resolves a map key, so a phase whose name
// is not a Go-template identifier (hyphens, spaces, leading digit) cannot be
// referenced as {{.my-phase}} — authors must use {{index . "my-phase"}}. A
// template that fails to parse/execute is logged (WARN) and passed through
// unchanged rather than silently dropped (see expandPhaseTemplate). Avoid naming
// a phase "Query" — it shadows the original-input variable.
Outdated
Review

🟠 Dead field baseTier in phaseDeps struct — populated but never read

maintainability · flagged by 4 models

  • phaseDeps.baseTier is dead code (run/phases.go:52). The field is populated in executor.go:348 (baseTier: tier) but never read anywhere in the package. grep confirms deps.baseTier has zero usages. This is a dead struct field that will confuse future readers. Remove it from the struct and the construction site.

🪰 Gadfly · advisory

🟠 **Dead field baseTier in phaseDeps struct — populated but never read** _maintainability · flagged by 4 models_ - **`phaseDeps.baseTier` is dead code** (`run/phases.go:52`). The field is populated in `executor.go:348` (`baseTier: tier`) but never read anywhere in the package. `grep` confirms `deps.baseTier` has zero usages. This is a dead struct field that will confuse future readers. Remove it from the struct and the construction site. <sub>🪰 Gadfly · advisory</sub>
// phaseDeps carries the per-run state the phase runner shares with Run: the base
// model + tier, the full decorated toolbox (filtered per phase), the base step
// cap, the shared agent options (tool-error limits + step observer + compactor),
// the critic/session steer, and the audit recorder (phase events).
// model, the full decorated toolbox (filtered per phase), the base step cap, the
// shared agent options (tool-error limits + step observer + compactor), the
// shared step observer (also fed by IsRunFunc bare calls), the critic/session
// steer, and the audit recorder (phase events).
type phaseDeps struct {
baseModel llm.Model
baseTier string
baseToolbox *llm.Toolbox
baseMaxIter int
sharedOpts []agent.Option
steer func() []llm.Message
rec RunRecorder
baseModel llm.Model
baseToolbox *llm.Toolbox
baseMaxIter int
sharedOpts []agent.Option
stepObserver func(agent.Step)
Outdated
Review

🟡 Long function runPhases

maintainability · flagged by 1 model

  1. Long Function in runPhases - Location: run/phases.go:64-140 - Issue: The runPhases function is quite long and handles multiple responsibilities, including iterating over phases, handling errors, and managing outputs. This makes it harder to read and maintain. - Suggestion: Break this function into smaller, focused functions. For example, separate the error handling logic into a dedicated function.

🪰 Gadfly · advisory

🟡 **Long function runPhases** _maintainability · flagged by 1 model_ 1. **Long Function in `runPhases`** - **Location**: `run/phases.go:64-140` - **Issue**: The `runPhases` function is quite long and handles multiple responsibilities, including iterating over phases, handling errors, and managing outputs. This makes it harder to read and maintain. - **Suggestion**: Break this function into smaller, focused functions. For example, separate the error handling logic into a dedicated function. <sub>🪰 Gadfly · advisory</sub>
steer func() []llm.Message
rec RunRecorder
}
// runPhases executes ra.Phases sequentially and returns a synthetic agent.Result
// whose Output is the final phase's output, with Usage aggregated across phases
// and Messages set to the last phase's transcript (for the PostRun hook). A hard
// (non-optional, non-budget) phase failure returns the error.
// (non-optional, non-budget) phase failure — and any context cancellation/
// deadline/critic-kill — returns the error.
func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phaseDeps, query string, images []llm.ImagePart) (*agent.Result, error) {
outputs := make(map[string]string, len(ra.Phases))
var lastResult *agent.Result
Review

🟠 Untrusted model outputs (prior phase results) are injected into later phases' system prompts via {{.}}, creating a prompt-injection escalation path where adversarial content gains system-prompt privilege.

security · flagged by 1 model

  • Untrusted model outputs are injected into later phases' system prompts (run/phases.go:76, run/phases.go:122, run/phases.go:144, run/phases.go:262-275)expandPhaseTemplate substitutes prior-phase outputs (model-generated, influenced by adversarial query/tool content) via {{.<PhaseName>}}; the expanded instructions become the next phase's System: prompt (systemPromptWithBodyagent.New(model, system, …) / model.Generate{System: system}). In the single-loop path model…

🪰 Gadfly · advisory

🟠 **Untrusted model outputs (prior phase results) are injected into later phases' system prompts via {{.<PhaseName>}}, creating a prompt-injection escalation path where adversarial content gains system-prompt privilege.** _security · flagged by 1 model_ - **Untrusted model outputs are injected into later phases' system prompts (`run/phases.go:76`, `run/phases.go:122`, `run/phases.go:144`, `run/phases.go:262-275`)** — `expandPhaseTemplate` substitutes prior-phase outputs (model-generated, influenced by adversarial query/tool content) via `{{.<PhaseName>}}`; the expanded `instructions` become the next phase's `System:` prompt (`systemPromptWithBody` → `agent.New(model, system, …)` / `model.Generate{System: system}`). In the single-loop path model… <sub>🪰 Gadfly · advisory</sub>
var lastOutput string
var totalUsage llm.Usage
// finish stamps the aggregated usage + final output onto the synthetic result.
finish := func(err error) (*agent.Result, error) {
if lastResult == nil {
lastResult = &agent.Result{}
}
lastResult.Usage = totalUsage
if err == nil {
lastResult.Output = lastOutput
}
return lastResult, err
Outdated
Review

🔴 Optional-phase branch swallows context cancellation/deadline/critic-kill; a killed/timed-out run with an Optional final phase reports status "ok" with a fallback message

correctness, error-handling · flagged by 3 models

1. Optional phase handling swallows context cancellation / deadline / critic-kill — a killed or timed-out run whose final phase is Optional is reported as a successful runrun/phases.go:89

🪰 Gadfly · advisory

🔴 **Optional-phase branch swallows context cancellation/deadline/critic-kill; a killed/timed-out run with an Optional final phase reports status "ok" with a fallback message** _correctness, error-handling · flagged by 3 models_ **1. `Optional` phase handling swallows context cancellation / deadline / critic-kill — a killed or timed-out run whose final phase is `Optional` is reported as a successful run** — `run/phases.go:89` <sub>🪰 Gadfly · advisory</sub>
}
for i, phase := range ra.Phases {
// A killed/timed-out/cancelled run must not start its next phase.
if err := runCtx.Err(); err != nil {
return lastResult, err
return finish(err)
}
instructions := expandPhaseTemplate(phase.SystemPrompt, query, outputs)
1
@@ -84,7 +106,14 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
totalUsage = addUsage(totalUsage, res.Usage)
}
if err != nil {
// A context cancellation / deadline / critic-kill is NEVER swallowed by
// the Optional or budget-salvage branches — the run genuinely ended and
// must surface as cancelled/timeout/killed (statusFor classifies it).
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return finish(err)
}
isLast := i == len(ra.Phases)-1
trimmed := strings.TrimSpace(output)
switch {
case phase.Optional:
Review

🟡 totalUsage not applied to lastResult on hard-error return; prior phases' token usage lost from run result

correctness · flagged by 1 model

  • run/phases.go:118totalUsage not applied to lastResult on hard-error return (verified by tracing lines 82-84, 118, and executor.go:358-360). When a non-optional phase fails hard and runOnePhase returns a non-nil partial res, line 83 overwrites lastResult = res and line 84 accumulates totalUsage correctly. But the error return on line 118 sends lastResult as-is — its .Usage field still holds only the failed phase's partial usage, not the accumulated totalUsage that i…

🪰 Gadfly · advisory

🟡 **totalUsage not applied to lastResult on hard-error return; prior phases' token usage lost from run result** _correctness · flagged by 1 model_ - **`run/phases.go:118` — `totalUsage` not applied to `lastResult` on hard-error return** (verified by tracing lines 82-84, 118, and `executor.go:358-360`). When a non-optional phase fails hard and `runOnePhase` returns a non-nil partial `res`, line 83 overwrites `lastResult = res` and line 84 accumulates `totalUsage` correctly. But the error return on line 118 sends `lastResult` as-is — its `.Usage` field still holds only the failed phase's partial usage, not the accumulated `totalUsage` that i… <sub>🪰 Gadfly · advisory</sub>
output = phase.FallbackMessage
@@ -97,13 +126,14 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
deps.rec.LogEvent("phase_failed_optional", map[string]any{"phase": phase.Name, "error": err.Error()})
}
Review

🟡 When the last phase fails as Optional, synthesized lastResult carries a prior phase's transcript for the PostRun hook

error-handling · flagged by 1 model

  • run/phases.go:128 runPhases — final synthesized lastResult may carry a prior phase's transcript when the last phase fails optionally. When the last phase is Optional and fails with a nil res, lastResult is not updated and retains the previous phase's agent.Result; the code then overwrites Output/Usage with the fallback/accumulated usage, but Messages remains the prior phase's transcript (passed to the PostRun hook). The doc says "last phase's transcript for the PostRu…

🪰 Gadfly · advisory

🟡 **When the last phase fails as Optional, synthesized lastResult carries a prior phase's transcript for the PostRun hook** _error-handling · flagged by 1 model_ - **`run/phases.go:128` `runPhases` — final synthesized `lastResult` may carry a *prior* phase's transcript when the last phase fails optionally.** When the last phase is `Optional` and fails with a nil `res`, `lastResult` is not updated and retains the previous phase's `agent.Result`; the code then overwrites `Output`/`Usage` with the fallback/accumulated usage, but `Messages` remains the prior phase's transcript (passed to the PostRun hook). The doc says "last phase's transcript for the PostRu… <sub>🪰 Gadfly · advisory</sub>
case isPhaseBudgetExhaustion(err) && (!isLast || strings.TrimSpace(output) != ""):
case isPhaseBudgetExhaustion(err) && (!isLast || trimmed != ""):
// Soft stop: the phase ran out of its step/tool budget before
// composing a final answer. Not fatal — it did real work
// (runOnePhase salvaged its partial transcript into output), and
// aborting would discard every completed phase before it. Degrade
// gracefully and continue.
if strings.TrimSpace(output) == "" {
// composing a final answer. Not fatal — it did real work (runOnePhase
// salvaged its partial transcript into output), and aborting would
// discard every completed phase before it. Degrade and continue.
// (A FINAL phase that salvaged nothing falls through to the hard error
// below: there is no result to return.)
if trimmed == "" {
output = fmt.Sprintf("(Phase %q reached its step budget before producing a consolidated result; continuing with its partial findings.)", phase.Name)
} else {
output += fmt.Sprintf("\n\n(Note: phase %q reached its step budget before fully completing; the above is its partial output.)", phase.Name)
1
@@ -115,7 +145,7 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
}
default:
return lastResult, fmt.Errorf("pipeline phase %q: %w", phase.Name, err)
return finish(fmt.Errorf("pipeline phase %q: %w", phase.Name, err))
}
}
@@ -123,14 +153,7 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
lastOutput = output
}
Review

🟠 IsRunFunc phases bypass shared stepObserver; critic clock not refreshed, steps missing from emitter

correctness · flagged by 2 models

🪰 Gadfly · advisory

🟠 **IsRunFunc phases bypass shared stepObserver; critic clock not refreshed, steps missing from emitter** _correctness · flagged by 2 models_ <sub>🪰 Gadfly · advisory</sub>
// Synthesize the run result: the final phase's output, usage aggregated over
// all phases, and the last phase's transcript for the PostRun hook.
if lastResult == nil {
lastResult = &agent.Result{}
}
lastResult.Output = lastOutput
lastResult.Usage = totalUsage
return lastResult, nil
return finish(nil)
}
// runOnePhase runs a single phase: a bare LLM call for IsRunFunc phases, a fresh
@@ -138,21 +161,23 @@ func (e *Executor) runPhases(runCtx context.Context, ra RunnableAgent, deps phas
// failed bare call), and any error. On a budget-exhaustion error the loop's
// partial transcript is salvaged into the returned output.
func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps phaseDeps, phase Phase, instructions, query string, images []llm.ImagePart) (string, *agent.Result, error) {
model := e.phaseModel(runCtx, deps, ra, phase)
phaseCtx, model := e.phaseModel(runCtx, deps, ra, phase)
// The phase's expanded instructions are the system prompt (with the platform
// header so tools keep their run ids); the original query is the user message.
system := e.systemPromptWithBody(instructions)
if phase.IsRunFunc {
// Bare LLM call: no tool loop, no tools array (some models 400 on an empty
// tools list). The response still lands in the audit token tally.
msgs := []llm.Message{phaseUserMessage(query, images)}
resp, err := model.Generate(runCtx, llm.Request{System: system, Messages: msgs})
// tools list). The response is fed through the SAME step observer as a loop
// step so the audit token tally, Result.Steps, AND the critic's activity
// clock all see it (a long synthesize phase must not look idle to the critic).
msgs := []llm.Message{multimodalUserMessage(query, images)}
resp, err := model.Generate(phaseCtx, llm.Request{System: system, Messages: msgs})
if err != nil {
return "", nil, fmt.Errorf("phase %q model call: %w", phase.Name, err)
}
if deps.rec != nil {
deps.rec.OnStep(1, resp)
if deps.stepObserver != nil {
deps.stepObserver(agent.Step{Index: 0, Response: resp})
}
return resp.Text(), &agent.Result{
Output: resp.Text(),
2
@@ -175,7 +200,7 @@ func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps ph
}, deps.sharedOpts...)
ag := agent.New(model, system, opts...)
res, runErr := runAgent(runCtx, ag, query, images, agent.WithSteer(deps.steer))
res, runErr := runAgent(phaseCtx, ag, query, images, agent.WithSteer(deps.steer))
output := ""
if res != nil {
output = res.Output
1
@@ -190,19 +215,26 @@ func (e *Executor) runOnePhase(runCtx context.Context, ra RunnableAgent, deps ph
return output, res, runErr
}
// phaseModel resolves the phase's model tier, falling back to the base model on
// an empty tier or a resolution failure (WARN — visible, non-fatal).
func (e *Executor) phaseModel(ctx context.Context, deps phaseDeps, ra RunnableAgent, phase Phase) llm.Model {
// phaseModel resolves the phase's model tier, returning the resolver's enriched
// context (usage attribution) alongside the model. An empty tier or a resolution
// failure falls back to the base model + the run context (WARN — visible, not
// fatal). Returning the enriched ctx mirrors the single-loop path, which adopts
// ctx = modelCtx, so a non-base-tier phase's calls are attributed correctly.
func (e *Executor) phaseModel(ctx context.Context, deps phaseDeps, ra RunnableAgent, phase Phase) (context.Context, llm.Model) {
if phase.ModelTier == "" {
return deps.baseModel
return ctx, deps.baseModel
}
_, m, err := e.cfg.Models(ctx, phase.ModelTier)
modelCtx, m, err := e.cfg.Models(ctx, phase.ModelTier)
if err != nil || m == nil {
reason := "resolver returned a nil model"
if err != nil {
reason = err.Error()
}
slog.Warn("run: pipeline phase model resolve failed; using base model",
"agent", ra.Name, "phase", phase.Name, "tier", phase.ModelTier, "error", err)
return deps.baseModel
"agent", ra.Name, "phase", phase.Name, "tier", phase.ModelTier, "reason", reason)
return ctx, deps.baseModel
Outdated
Review

🔴 salvagePhaseTranscript truncates by bytes, corrupting multi-byte UTF-8 runes

correctness, error-handling, maintainability, performance · flagged by 7 models

  • run/phases.go:236-238salvagePhaseTranscript truncates by byte length (len(out) > maxSalvage and out[len(out)-maxSalvage:]). For any multi-byte UTF-8 text this slices through runes, producing invalid UTF-8 and garbled output fed into downstream phases. The limit should count runes (utf8.RuneCountInString) and slice on a rune boundary.

🪰 Gadfly · advisory

🔴 **salvagePhaseTranscript truncates by bytes, corrupting multi-byte UTF-8 runes** _correctness, error-handling, maintainability, performance · flagged by 7 models_ - **`run/phases.go:236-238`** — `salvagePhaseTranscript` truncates by **byte length** (`len(out) > maxSalvage` and `out[len(out)-maxSalvage:]`). For any multi-byte UTF-8 text this slices through runes, producing invalid UTF-8 and garbled output fed into downstream phases. The limit should count runes (`utf8.RuneCountInString`) and slice on a rune boundary. <sub>🪰 Gadfly · advisory</sub>
}
return m
return modelCtx, m
}
// isPhaseBudgetExhaustion reports whether err is a soft budget/guard stop (the
@@ -212,9 +244,15 @@ func isPhaseBudgetExhaustion(err error) bool {
return errors.Is(err, agent.ErrMaxSteps) || errors.Is(err, agent.ErrToolLoop)
Review

🟠 phaseUserMessage duplicates runAgent's multimodal message construction — extract shared helper

maintainability · flagged by 5 models

  • phaseUserMessage duplicates runAgent's multimodal message logic (run/phases.go:244–256 vs run/executor.go:491–506). Both functions build a llm.Message from query + images with the same pattern: no images → simple path, otherwise build []llm.Part with optional text + image parts. The comment on phaseUserMessage even acknowledges it "mirrors" runAgent. Extract a shared helper (e.g., func userMessage(query string, images []llm.ImagePart) llm.Message) so the two call sites…

🪰 Gadfly · advisory

🟠 **phaseUserMessage duplicates runAgent's multimodal message construction — extract shared helper** _maintainability · flagged by 5 models_ - **`phaseUserMessage` duplicates `runAgent`'s multimodal message logic** (`run/phases.go:244–256` vs `run/executor.go:491–506`). Both functions build a `llm.Message` from `query + images` with the same pattern: no images → simple path, otherwise build `[]llm.Part` with optional text + image parts. The comment on `phaseUserMessage` even acknowledges it "mirrors" `runAgent`. Extract a shared helper (e.g., `func userMessage(query string, images []llm.ImagePart) llm.Message`) so the two call sites… <sub>🪰 Gadfly · advisory</sub>
}
// maxSalvageBytes bounds a salvaged partial transcript so a long phase's narrated
// reasoning doesn't blow up the next phase's prompt (the tail is the most recent,
// most relevant reasoning). Matches mort's pipeline cap.
const maxSalvageBytes = 8000
// salvagePhaseTranscript reconstructs a best-effort phase output from a loop that
// ended without a final answer: the assistant's narrated text across every step,
// tail-trimmed to a bound. Returns "" when the model wrote no prose.
// tail-trimmed to maxSalvageBytes on a rune boundary. Returns "" when the model
// wrote no prose.
func salvagePhaseTranscript(res *agent.Result) string {
if res == nil {
return ""
2
@@ -232,22 +270,27 @@ func salvagePhaseTranscript(res *agent.Result) string {
}
}
out := strings.TrimSpace(b.String())
const maxSalvage = 8000
if len(out) > maxSalvage {
out = "...(earlier reasoning trimmed)...\n" + out[len(out)-maxSalvage:]
if len(out) > maxSalvageBytes {
tail := out[len(out)-maxSalvageBytes:]
// Advance to the next rune boundary so the cut never splits a UTF-8 rune.
for len(tail) > 0 && !utf8.RuneStart(tail[0]) {
tail = tail[1:]
}
out = "...(earlier reasoning trimmed)...\n" + tail
}
return out
}
Review

🟠 filterToolbox copies entire toolbox when no filtering is requested

correctness, performance · flagged by 3 models

  • filterToolbox copies entire toolbox when no filtering is requestedrun/phases.go:283-287 When a phase does not specify a tool subset (len(names) == 0), filterToolbox still allocates a new llm.Toolbox and copies every tool into it. If the base toolbox is large and the pipeline has many phases, this is unnecessary allocation and Add overhead on every phase of every run. The single-loop path already passes the original toolbox pointer directly without copying. Fix: Return…

🪰 Gadfly · advisory

🟠 **filterToolbox copies entire toolbox when no filtering is requested** _correctness, performance · flagged by 3 models_ * **`filterToolbox` copies entire toolbox when no filtering is requested** — `run/phases.go:283-287` When a phase does not specify a tool subset (`len(names) == 0`), `filterToolbox` still allocates a new `llm.Toolbox` and copies every tool into it. If the base toolbox is large and the pipeline has many phases, this is unnecessary allocation and `Add` overhead on every phase of every run. The single-loop path already passes the original `toolbox` pointer directly without copying. **Fix:** Return… <sub>🪰 Gadfly · advisory</sub>
// phaseUserMessage builds a phase's user message: the original query text plus
// any inline images. Mirrors the single-loop multimodal seeding in runAgent.
func phaseUserMessage(query string, images []llm.ImagePart) llm.Message {
// multimodalUserMessage builds a user message from text + inline images. Shared
// by the phase runner and runAgent so the image-folding lives in one place.
// Empty text with images yields an image-only message (no empty text part).
func multimodalUserMessage(text string, images []llm.ImagePart) llm.Message {
if len(images) == 0 {
return llm.UserText(query)
return llm.UserText(text)
}
parts := make([]llm.Part, 0, len(images)+1)
if strings.TrimSpace(query) != "" {
parts = append(parts, llm.Text(query))
if strings.TrimSpace(text) != "" {
parts = append(parts, llm.Text(text))
}
for _, img := range images {
parts = append(parts, img)
@@ -257,11 +300,13 @@ func phaseUserMessage(query string, images []llm.ImagePart) llm.Message {
// expandPhaseTemplate applies Go text/template substitution to a phase prompt,
// replacing {{.Query}} with the original query and {{.<PhaseName>}} with a prior
// phase's output. Returns the original string unchanged if parsing or execution
// fails (best-effort, not fatal).
// phase's output. On a parse/execute error it logs a WARN and returns the
// template unchanged (best-effort, non-fatal) so a misconfigured prompt is
Outdated
Review

🟠 addUsage manually sums llm.Usage fields, brittle to upstream changes

maintainability · flagged by 2 models

  • run/phases.go:304-311addUsage manually accumulates every field of llm.Usage. If the upstream struct gains a new field (e.g. a future cost or token category), this function silently under-counts. Prefer an Add method on llm.Usage (if majordomo doesn't provide one, add it there) or generate this with a simple reflection helper; at minimum add a compile-time assertion that verifies field count.

🪰 Gadfly · advisory

🟠 **addUsage manually sums llm.Usage fields, brittle to upstream changes** _maintainability · flagged by 2 models_ - **`run/phases.go:304-311`** — `addUsage` manually accumulates every field of `llm.Usage`. If the upstream struct gains a new field (e.g. a future cost or token category), this function silently under-counts. Prefer an `Add` method on `llm.Usage` (if majordomo doesn't provide one, add it there) or generate this with a simple reflection helper; at minimum add a compile-time assertion that verifies field count. <sub>🪰 Gadfly · advisory</sub>
// visible rather than silently masked.
func expandPhaseTemplate(tmpl, query string, priorOutputs map[string]string) string {
t, err := template.New("phase").Option("missingkey=zero").Parse(tmpl)
if err != nil {
slog.Warn("run: pipeline phase template parse failed; using it unexpanded", "error", err)
return tmpl
}
data := map[string]string{"Query": query}
@@ -270,22 +315,21 @@ func expandPhaseTemplate(tmpl, query string, priorOutputs map[string]string) str
}
var buf bytes.Buffer
if err := t.Execute(&buf, data); err != nil {
slog.Warn("run: pipeline phase template execute failed; using it unexpanded", "error", err)
return tmpl
}
return buf.String()
}
// filterToolbox returns a new toolbox restricted to the named tools (preserving
// palette order). Empty names = the full palette. Unknown names are skipped with
// a WARN — a typo'd phase tool list should not abort a run mid-pipeline.
// filterToolbox returns a toolbox restricted to the named tools (preserving
// palette order). Empty names = the full palette (the base toolbox is returned
// as-is — it is read-only during a run, like the single-loop path). Unknown names
// are skipped with a WARN — a typo'd phase tool list should not abort a run.
func filterToolbox(box *llm.Toolbox, names []string) *llm.Toolbox {
out := llm.NewToolbox(box.Name())
if len(names) == 0 {
for _, t := range box.Tools() {
_ = out.Add(t)
}
return out
return box
}
out := llm.NewToolbox(box.Name())
for _, name := range names {
t, ok := box.Get(name)
if !ok {
@@ -300,7 +344,9 @@ func filterToolbox(box *llm.Toolbox, names []string) *llm.Toolbox {
}
// addUsage sums two llm.Usage tallies field-by-field so a phased run reports the
// total tokens across all phases.
// total tokens across all phases. NOTE: if llm.Usage gains a field, add it here
// too — the audit recorder (rec) is the authoritative per-run token source, this
// is the secondary Result.Usage roll-up.
func addUsage(a, b llm.Usage) llm.Usage {
a.InputTokens += b.InputTokens
a.OutputTokens += b.OutputTokens
+25
View File
@@ -105,6 +105,31 @@ func TestPhases_OptionalFailureSubstitutesFallback(t *testing.T) {
}
}
// TestPhases_OptionalDoesNotSwallowCancellation: an Optional phase that fails
// with a context cancellation must NOT be swallowed into its FallbackMessage —
// the run genuinely ended (cancel/deadline/critic-kill) and must surface the
// error so the run is classified cancelled/timeout/killed, not "ok".
func TestPhases_OptionalDoesNotSwallowCancellation(t *testing.T) {
models, _ := phaseProvider(t, fake.Fail(context.Canceled))
ex := New(Config{Registry: tool.NewRegistry(), Models: models})
ra := RunnableAgent{
Name: "pipeline",
ModelTier: "test-model",
Phases: []Phase{
// IsRunFunc so the cancellation surfaces directly wrapped (%w).
{Name: "a", SystemPrompt: "Phase A", IsRunFunc: true, Optional: true, FallbackMessage: "FB"},
},
}
res := ex.Run(context.Background(), ra, tool.Invocation{RunID: "r", CallerID: "c"}, "Q")
if !errors.Is(res.Err, context.Canceled) {
t.Fatalf("Optional phase must NOT swallow a cancellation; res.Err = %v", res.Err)
}
if res.Output == "FB" {
t.Error("a cancelled run must not report the fallback message as output")
}
}
// TestPhases_HardErrorAborts: a NON-optional phase that hits a hard error (not a
// budget/step exhaustion) aborts the pipeline; later phases do not run.
func TestPhases_HardErrorAborts(t *testing.T) {