a35c176b42
The executor passed only the text `input` to majordomo's agent.Run, silently dropping inv.Images — so a multimodal run (vision: chatbot @mention, chat API) lost its images on the executus path. majordomo's Run input arg is text-only, so fold the images into the first user message (text + image parts) via WithHistory and call Run with empty input, mirroring mort agentexec's multimodal seeding. The image-less path is unchanged (prompt passes straight through). Tests: a run with Images carries the image bytes + prompt into the first model request; the text-only path still reaches the model. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
94 lines
2.8 KiB
Go
94 lines
2.8 KiB
Go
package run_test
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"testing"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
|
"gitea.stevedudenhoeffer.com/steve/executus/tool"
|
|
)
|
|
|
|
// TestExecutorFoldsInitialImages: when the invocation carries Images, they're
|
|
// folded into the first user message (alongside the prompt text) instead of being
|
|
// dropped — majordomo's Run input arg is text-only, so the executor seeds the
|
|
// multimodal opening turn via history.
|
|
func TestExecutorFoldsInitialImages(t *testing.T) {
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("saw the image"))
|
|
m, _ := fp.Model("m")
|
|
|
|
img := llm.ImagePart{MIME: "image/png", Data: []byte("PNGDATA")}
|
|
inv := tool.Invocation{RunID: "r1", Images: []llm.ImagePart{img}}
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
})
|
|
res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, inv, "describe this")
|
|
if res.Err != nil {
|
|
t.Fatalf("run error: %v", res.Err)
|
|
}
|
|
|
|
calls := fp.Calls()
|
|
if len(calls) == 0 {
|
|
t.Fatal("no model calls recorded")
|
|
}
|
|
// The first request must carry a user message bearing the image bytes + prompt.
|
|
sawImage, sawText := false, false
|
|
for _, msg := range calls[0].Request.Messages {
|
|
for _, p := range msg.Parts {
|
|
switch pp := p.(type) {
|
|
case llm.ImagePart:
|
|
if string(pp.Data) == "PNGDATA" {
|
|
sawImage = true
|
|
}
|
|
case llm.TextPart:
|
|
if strings.Contains(pp.Text, "describe this") {
|
|
sawText = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !sawImage {
|
|
t.Error("initial image was not folded into the first model request (dropped)")
|
|
}
|
|
if !sawText {
|
|
t.Error("prompt text missing from the multimodal first message")
|
|
}
|
|
}
|
|
|
|
// TestExecutorTextOnlyUnchanged: with no Images, the prompt flows through as the
|
|
// text input (regression guard that the fold path didn't break the common case).
|
|
func TestExecutorTextOnlyUnchanged(t *testing.T) {
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("ok"))
|
|
m, _ := fp.Model("m")
|
|
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
})
|
|
res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, tool.Invocation{RunID: "r2"}, "plain prompt")
|
|
if res.Err != nil {
|
|
t.Fatalf("run error: %v", res.Err)
|
|
}
|
|
calls := fp.Calls()
|
|
if len(calls) == 0 {
|
|
t.Fatal("no model calls recorded")
|
|
}
|
|
sawText := false
|
|
for _, msg := range calls[0].Request.Messages {
|
|
for _, p := range msg.Parts {
|
|
if tp, ok := p.(llm.TextPart); ok && strings.Contains(tp.Text, "plain prompt") {
|
|
sawText = true
|
|
}
|
|
}
|
|
}
|
|
if !sawText {
|
|
t.Error("text-only prompt did not reach the model")
|
|
}
|
|
}
|