// executus/run/images_test.go

package run_test

import (
	"context"
	"strings"
	"testing"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"

	"gitea.stevedudenhoeffer.com/steve/executus/run"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// TestExecutorFoldsInitialImages verifies that Images carried on the invocation
// are not dropped: the first recorded model request must contain one message
// that holds BOTH the prompt text and the image. The premise (per the original
// author) is that majordomo's Run input argument is text-only, so the executor
// has to seed the multimodal opening turn through history.
func TestExecutorFoldsInitialImages(t *testing.T) {
	provider := fake.New("fake")
	provider.Enqueue("m", fake.Reply("saw the image"))
	model, _ := provider.Model("m")

	// Every tier resolves to the single fake model queued above.
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}

	invocation := tool.Invocation{
		RunID:  "r1",
		Images: []llm.ImagePart{{MIME: "image/png", Data: []byte("PNGDATA")}},
	}
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
	})

	result := executor.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, invocation, "describe this")
	if result.Err != nil {
		t.Fatalf("run error: %v", result.Err)
	}

	recorded := provider.Calls()
	if len(recorded) == 0 {
		t.Fatal("no model calls recorded")
	}

	// Look for one message whose parts include the image AND the prompt text.
	// Finding them in two separate messages does not count: the model has to
	// read them as a single multimodal turn.
	folded := false
	for _, message := range recorded[0].Request.Messages {
		var hasImage, hasText bool
		for _, part := range message.Parts {
			if image, ok := part.(llm.ImagePart); ok {
				hasImage = hasImage || string(image.Data) == "PNGDATA"
				continue
			}
			if text, ok := part.(llm.TextPart); ok {
				hasText = hasText || strings.Contains(text.Text, "describe this")
			}
		}
		if hasImage && hasText {
			folded = true
			break
		}
	}
	if !folded {
		t.Error("image + prompt text were not folded into the SAME user message")
	}
}

// TestExecutorImageOnlyNoBlankText: an image-only run (whitespace-only prompt)
// must NOT emit an empty TextPart — the message carries just the image, matching
// runSession.AttachImages's guard.
//
// The test fails (rather than panics) when the fake provider recorded no calls,
// and it also requires the image to actually appear in the first request so the
// blank-text check cannot pass vacuously on an empty message list.
func TestExecutorImageOnlyNoBlankText(t *testing.T) {
	fp := fake.New("fake")
	fp.Enqueue("m", fake.Reply("saw it"))
	m, _ := fp.Model("m")

	inv := tool.Invocation{RunID: "r3", Images: []llm.ImagePart{{MIME: "image/png", Data: []byte("IMG")}}}
	ex := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
	})
	// The prompt is whitespace only, so after trimming there is no text to send.
	res := ex.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, inv, "   ")
	if res.Err != nil {
		t.Fatalf("run error: %v", res.Err)
	}

	// Guard before indexing: an empty call log must be a test failure, not an
	// index-out-of-range panic.
	calls := fp.Calls()
	if len(calls) == 0 {
		t.Fatal("no model calls recorded")
	}

	sawImage := false
	for _, msg := range calls[0].Request.Messages {
		for _, p := range msg.Parts {
			switch pp := p.(type) {
			case llm.ImagePart:
				if string(pp.Data) == "IMG" {
					sawImage = true
				}
			case llm.TextPart:
				if strings.TrimSpace(pp.Text) == "" {
					t.Error("image-only run emitted a blank TextPart")
				}
			}
		}
	}
	// Without this, a request that carried no parts at all would pass the test.
	if !sawImage {
		t.Error("image-only run did not deliver the image to the model")
	}
}

// TestExecutorTextOnlyUnchanged is the regression guard for the common case:
// when the invocation has no Images, the prompt must still arrive at the model
// as a TextPart in the first recorded request.
func TestExecutorTextOnlyUnchanged(t *testing.T) {
	provider := fake.New("fake")
	provider.Enqueue("m", fake.Reply("ok"))
	model, _ := provider.Model("m")

	// Every tier resolves to the single fake model queued above.
	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
		return ctx, model, nil
	}
	executor := run.New(run.Config{
		Registry: tool.NewRegistry(),
		Models:   resolve,
	})

	result := executor.Run(context.Background(), run.RunnableAgent{ModelTier: "m"}, tool.Invocation{RunID: "r2"}, "plain prompt")
	if result.Err != nil {
		t.Fatalf("run error: %v", result.Err)
	}

	recorded := provider.Calls()
	if len(recorded) == 0 {
		t.Fatal("no model calls recorded")
	}

	// Scan the first request for any text part containing the prompt; stop at
	// the first hit since one occurrence is all the test needs.
	delivered := false
scan:
	for _, message := range recorded[0].Request.Messages {
		for _, part := range message.Parts {
			text, isText := part.(llm.TextPart)
			if !isText {
				continue
			}
			if strings.Contains(text.Text, "plain prompt") {
				delivered = true
				break scan
			}
		}
	}
	if !delivered {
		t.Error("text-only prompt did not reach the model")
	}
}