// Command multimodal demonstrates capability-aware image input: attach an
// image without knowing the target's limits — majordomo sniffs the real
// format, downscales, re-encodes, and enforces counts/sizes against the
// actual serving target before the request goes out.
package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"

	"gitea.stevedudenhoeffer.com/steve/majordomo"
)

// main reads the file named by -image, resolves the -model spec with
// majordomo.Parse, asks that model for a two-sentence description of the
// image, and prints the reply prefixed with the responding model's name.
// Any failure along the way is fatal.
func main() {
	var (
		modelSpec = flag.String("model", "ollama-cloud/kimi-k2.6:cloud", "vision-capable model spec")
		imagePath = flag.String("image", "", "path to a jpeg/png/gif image (required)")
	)
	flag.Parse()

	// -image has no usable default, so an empty value means it was omitted.
	if len(*imagePath) == 0 {
		log.Fatal("usage: multimodal -image photo.jpg [-model spec]")
	}

	// Load the image before touching the model spec so a bad path is
	// reported first.
	raw, readErr := os.ReadFile(*imagePath)
	if readErr != nil {
		log.Fatalf("read image: %v", readErr)
	}

	target, parseErr := majordomo.Parse(*modelSpec)
	if parseErr != nil {
		log.Fatalf("parse: %v", parseErr)
	}

	// The image is always labelled image/jpeg regardless of its real format:
	// the media pipeline sniffs the bytes and corrects a wrong declared MIME.
	prompt := majordomo.UserParts(
		majordomo.Text("Describe this image in two sentences."),
		majordomo.Image("image/jpeg", raw),
	)
	req := majordomo.Request{
		Messages: []majordomo.Message{prompt},
	}

	reply, genErr := target.Generate(context.Background(), req)
	if genErr != nil {
		log.Fatalf("generate: %v", genErr)
	}

	fmt.Printf("[%s] %s\n", reply.Model, reply.Text())
}