run: fold inv.Images into the initial user message (multimodal opening turn)

The executor passed only the text `input` to majordomo's agent.Run, silently dropping inv.Images — so a multimodal run (vision: chatbot @mention, chat API) lost its images on the executus path. majordomo's Run input arg is text-only, so fold the images into the first user message (text + image parts) via WithHistory and call Run with empty input, mirroring mort agentexec's multimodal seeding. The image-less path is unchanged (prompt passes straight through). Tests: a run with Images carries the image bytes + prompt into the first model request; the text-only path still reaches the model. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 00:37:53 -04:00
parent 1cf46c9954
commit a35c176b42
2 changed files with 112 additions and 1 deletions
@@ -0,0 +1,93 @@
+package run_test
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/run"
+	"gitea.stevedudenhoeffer.com/steve/executus/tool"
+)
+
+// TestExecutorFoldsInitialImages verifies that Images carried on the invocation
+// are not dropped: majordomo's Run input arg is text-only, so the executor seeds
+// the multimodal opening turn via history, and the first model request must
+// therefore contain both the image bytes and the prompt text.
+func TestExecutorFoldsInitialImages(t *testing.T) {
+	const prompt = "describe this"
+
+	provider := fake.New("fake")
+	provider.Enqueue("m", fake.Reply("saw the image"))
+	model, _ := provider.Model("m")
+	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
+		return ctx, model, nil
+	}
+
+	executor := run.New(run.Config{Registry: tool.NewRegistry(), Models: resolve})
+	invocation := tool.Invocation{
+		RunID:  "r1",
+		Images: []llm.ImagePart{{MIME: "image/png", Data: []byte("PNGDATA")}},
+	}
+	result := executor.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, invocation, prompt)
+	if result.Err != nil {
+		t.Fatalf("run error: %v", result.Err)
+	}
+
+	recorded := provider.Calls()
+	if len(recorded) == 0 {
+		t.Fatal("no model calls recorded")
+	}
+	// Look through every part of the first request for the image bytes and the prompt.
+	var imageSeen, textSeen bool
+	for _, message := range recorded[0].Request.Messages {
+		for _, part := range message.Parts {
+			if image, ok := part.(llm.ImagePart); ok {
+				imageSeen = imageSeen || string(image.Data) == "PNGDATA"
+			} else if text, ok := part.(llm.TextPart); ok {
+				textSeen = textSeen || strings.Contains(text.Text, prompt)
+			}
+		}
+	}
+	if !imageSeen {
+		t.Error("initial image was not folded into the first model request (dropped)")
+	}
+	if !textSeen {
+		t.Error("prompt text missing from the multimodal first message")
+	}
+}
+
+// TestExecutorTextOnlyUnchanged: with no Images, the prompt flows through as the
+// text input (regression guard that the fold path didn't break the common case).
+func TestExecutorTextOnlyUnchanged(t *testing.T) {
+	fp := fake.New("fake")
+	fp.Enqueue("m", fake.Reply("ok"))
+	m, _ := fp.Model("m")
+
+	ex := run.New(run.Config{
+		Registry: tool.NewRegistry(),
+		Models:   func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
+	})
+	res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, tool.Invocation{RunID: "r2"}, "plain prompt")
+	if res.Err != nil {
+		t.Fatalf("run error: %v", res.Err)
+	}
+	calls := fp.Calls()
+	if len(calls) == 0 {
+		t.Fatal("no model calls recorded")
+	}
+	sawText := false
+	for _, msg := range calls[0].Request.Messages {
+		for _, p := range msg.Parts {
+			if tp, ok := p.(llm.TextPart); ok && strings.Contains(tp.Text, "plain prompt") {
+				sawText = true
+			}
+		}
+	}
+	if !sawText {
+		t.Error("text-only prompt did not reach the model")
+	}
+}