majordomo/imagegen/imagegen.go

// Package imagegen is majordomo's canonical text-to-image surface. It is a
// deliberately separate contract from the llm package: image generation does
// not fit the chat Request/Response shape (no messages, tools, streaming, or
// failover chains in v1), so it gets its own small Provider/Model interface
// rather than overloading llm.Model.
//
// Generated images are carried as llm.ImagePart (bytes + MIME), so a result
// drops straight back into a chat turn:
//
//	res, _ := im.Generate(ctx, imagegen.Request{Prompt: "a red bicycle"})
//	msg := llm.UserParts(llm.Text("describe this"), res.Images[0])
//
// The first implementation is provider/llamaswap, which targets the OpenAI
// /v1/images/generations endpoint routed to a stable-diffusion.cpp backend.
package imagegen

import (
	"context"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// Image is one generated image: raw bytes plus a MIME type. Aliased to
// llm.ImagePart so generated images are interchangeable with chat content and
// can be fed into llm.UserParts without conversion.
//
// This is a type alias (note the "="), not a new defined type: Image and
// llm.ImagePart denote the same type, so a value of one is usable wherever
// the other is expected.
type Image = llm.ImagePart

// Request is a text-to-image generation request. Pointer-free zero values mean
// "provider default": N == 0 yields the backend's default count (usually one),
// and an empty Size leaves the backend's default resolution.
//
// Every field is a plain value (string or int), so copying a Request — as the
// value-receiver Apply method does — produces a fully independent request.
type Request struct {
	// Prompt is the text description of the image to generate.
	Prompt string

	// N is the number of images to generate; 0 = provider default.
	// NOTE(review): negative values are not rejected in this file; presumably
	// each provider validates them — confirm.
	N int

	// Size is the requested resolution, e.g. "512x512" or "1024x1024";
	// "" = provider default.
	// NOTE(review): the string is not parsed or validated in this file.
	Size string
}

// Result is the canonical image-generation result, as returned by
// Model.Generate.
type Result struct {
	// Images are the generated images, in the order the backend returned them.
	// Each element is an Image, i.e. an llm.ImagePart.
	Images []Image

	// Raw is the provider-native response object, an escape hatch for
	// provider-specific fields. May be nil; never required for normal use.
	// It is typed as any, so callers must type-assert to whatever concrete
	// type the provider in use stores here.
	Raw any
}

// Option mutates a Request before it is sent. Options passed to Generate are
// applied to a copy of the request, so a Request value can be reused.
//
// An Option receives a pointer to the request being built and edits it in
// place. WithN and WithSize are the options defined in this file;
// Request.Apply is the helper that runs a list of them.
type Option func(*Request)

// WithN returns an Option that sets Request.N, the number of images to
// generate. The value is stored exactly as given; 0 means "provider default".
func WithN(n int) Option {
	return func(req *Request) {
		req.N = n
	}
}

// WithSize returns an Option that sets Request.Size, the requested resolution
// (e.g. "1024x1024"). The string is stored exactly as given; "" means
// "provider default".
func WithSize(size string) Option {
	return func(req *Request) {
		req.Size = size
	}
}

// Apply returns a copy of the request with all options applied, in the order
// given (so a later option overrides an earlier one that sets the same field).
// Providers call this once at the top of Generate.
//
// The receiver is a value, so r is already a private copy: options mutate that
// copy and the caller's Request is left untouched. Nil options are skipped
// rather than invoked, so a conditionally-unset Option cannot panic the call.
func (r Request) Apply(opts ...Option) Request {
	for _, opt := range opts {
		if opt == nil {
			// Calling a nil func value panics; treat it as "no option".
			continue
		}
		opt(&r)
	}
	return r
}

// Model generates images from a text prompt. It is intentionally narrower than
// llm.Model — no Stream, no Capabilities, no tool calls.
type Model interface {
	// Generate produces one or more images for the request's prompt.
	//
	// req is passed by value and opts are per-call overrides layered on top
	// of it; implementations are expected to fold them together with
	// req.Apply(opts...) at the top of the method, which leaves the caller's
	// Request unmodified.
	//
	// NOTE(review): whether a non-nil *Result may accompany a non-nil error,
	// and how ctx cancellation is honored, is not specified here — confirm
	// per provider.
	Generate(ctx context.Context, req Request, opts ...Option) (*Result, error)
}

// ModelOption configures a Model at construction time (Provider.ImageModel).
// Reserved for future per-model settings (e.g. a default size); present now so
// the interface is forward-compatible.
//
// A ModelOption receives a pointer to the ModelConfig being built and edits it
// in place; ApplyModelOptions is the helper that runs a list of them.
type ModelOption func(*ModelConfig)

// ModelConfig carries per-model construction settings. It currently declares
// no fields, so no ModelOption can record anything in it yet; it exists as the
// target type that ModelOption functions mutate once settings are added.
type ModelConfig struct{}

// ApplyModelOptions folds options into a config. It starts from the zero
// ModelConfig, runs each option against it in slice order, and returns the
// result by value.
//
// Nil options are skipped rather than invoked, so a conditionally-unset
// ModelOption cannot panic model construction. A nil or empty opts slice
// yields the zero ModelConfig.
func ApplyModelOptions(opts []ModelOption) ModelConfig {
	var cfg ModelConfig
	for _, opt := range opts {
		if opt == nil {
			// Calling a nil func value panics; treat it as "no option".
			continue
		}
		opt(&cfg)
	}
	return cfg
}

// Provider mints image Models bound to one backend. It mirrors llm.Provider
// but for image generation.
type Provider interface {
	// Name is the registry identifier for the provider.
	Name() string

	// ImageModel returns a Model bound to the given id (passed through to the
	// backend verbatim; no catalog validation).
	//
	// opts are construction-time settings; implementations can fold them into
	// a ModelConfig with ApplyModelOptions. ModelConfig currently has no
	// fields, so opts carry no settings yet.
	//
	// NOTE(review): since id is not validated against a catalog, an unknown
	// id presumably surfaces as an error from Generate rather than from this
	// call — confirm per provider.
	ImageModel(id string, opts ...ModelOption) (Model, error)
}