// executus/run/images_test.go

package run_test

import (
	"context"
	"strings"
	"testing"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"

	"gitea.stevedudenhoeffer.com/steve/executus/run"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// TestExecutorFoldsInitialImages checks that Images carried on the invocation
// are folded into the opening model request (alongside the prompt text) instead
// of being dropped — majordomo's Run input arg is text-only, so the executor
// seeds the multimodal opening turn via history.
//
// The test enqueues a single canned reply on a fake provider, runs the executor
// with one PNG image attached to the invocation, and then inspects the first
// recorded model call for both the image bytes and the prompt text.
func TestExecutorFoldsInitialImages(t *testing.T) {
	fp := fake.New("fake")
	fp.Enqueue("m", fake.Reply("saw the image"))
	// Fail fast on a lookup error: continuing with an unusable model would only
	// surface later as a confusing executor failure or panic.
	m, err := fp.Model("m")
	if err != nil {
		t.Fatalf("fake model lookup: %v", err)
	}

	img := llm.ImagePart{MIME: "image/png", Data: []byte("PNGDATA")}
	inv := tool.Invocation{RunID: "r1", Images: []llm.ImagePart{img}}
	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		// Always resolve to the fake model regardless of the requested tier.
		Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
	})
	res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, inv, "describe this")
	if res.Err != nil {
		t.Fatalf("run error: %v", res.Err)
	}

	calls := fp.Calls()
	if len(calls) == 0 {
		t.Fatal("no model calls recorded")
	}
	// The first request must carry the image bytes + prompt.
	// NOTE(review): this scan accepts the image and the text in any message of
	// the first request; it does not require them to share one message or check
	// the message role — tighten if the "same user message" contract matters.
	sawImage, sawText := false, false
	for _, msg := range calls[0].Request.Messages {
		for _, p := range msg.Parts {
			switch pp := p.(type) {
			case llm.ImagePart:
				// Match on the payload so an unrelated image cannot satisfy the check.
				if string(pp.Data) == "PNGDATA" {
					sawImage = true
				}
			case llm.TextPart:
				// Substring match: the prompt may be embedded in a longer text part.
				if strings.Contains(pp.Text, "describe this") {
					sawText = true
				}
			}
		}
	}
	if !sawImage {
		t.Error("initial image was not folded into the first model request (dropped)")
	}
	if !sawText {
		t.Error("prompt text missing from the multimodal first message")
	}
}

// TestExecutorTextOnlyUnchanged: with no Images on the invocation, the prompt
// must still reach the model as text (regression guard that the image fold path
// didn't break the common case).
//
// The test enqueues a single canned reply on a fake provider, runs the executor
// with a plain prompt, and looks for that prompt in the text parts of the first
// recorded model call.
func TestExecutorTextOnlyUnchanged(t *testing.T) {
	fp := fake.New("fake")
	fp.Enqueue("m", fake.Reply("ok"))
	// Fail fast on a lookup error: continuing with an unusable model would only
	// surface later as a confusing executor failure or panic.
	m, err := fp.Model("m")
	if err != nil {
		t.Fatalf("fake model lookup: %v", err)
	}

	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		// Always resolve to the fake model regardless of the requested tier.
		Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
	})
	res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, tool.Invocation{RunID: "r2"}, "plain prompt")
	if res.Err != nil {
		t.Fatalf("run error: %v", res.Err)
	}
	calls := fp.Calls()
	if len(calls) == 0 {
		t.Fatal("no model calls recorded")
	}
	// Scan every part of every message in the first request for the prompt text;
	// a substring match tolerates the prompt being embedded in a longer part.
	sawText := false
	for _, msg := range calls[0].Request.Messages {
		for _, p := range msg.Parts {
			if tp, ok := p.(llm.TextPart); ok && strings.Contains(tp.Text, "plain prompt") {
				sawText = true
			}
		}
	}
	if !sawText {
		t.Error("text-only prompt did not reach the model")
	}
}