majordomo/provider/llamaswap/image.go

package llamaswap

import (
	"context"
	"encoding/base64"
	"fmt"
	"net/http"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/majordomo/imagegen"
	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// ImageModel satisfies imagegen.Provider. It returns a handle for the
// image-generation model named id, served through llama-swap (which routes to
// a stable-diffusion.cpp upstream). The id is not interpreted here: it is
// forwarded as-is on every request and determines which upstream llama-swap
// loads.
//
// An error is returned when the provider has no base URL configured.
func (p *Provider) ImageModel(id string, opts ...imagegen.ModelOption) (imagegen.Model, error) {
	if len(p.baseURL) == 0 {
		err := fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name)
		return nil, err
	}

	// The options are applied but the outcome is discarded; presumably no
	// model-level option currently affects this provider (confirm against
	// imagegen.ApplyModelOptions).
	_ = imagegen.ApplyModelOptions(opts)

	bound := &imageModel{id: id, p: p}
	return bound, nil
}

// imageModel is the value returned by Provider.ImageModel: a model id paired
// with the provider that created it. p supplies the name used in error
// reports and the doJSON helper used to issue requests; id is sent as the
// request's model field.
type imageModel struct {
	p  *Provider
	id string
}

// imageRequest is the OpenAI /v1/images/generations request shape. We always
// request b64_json so the bytes come back inline (no second fetch).
//
// Model and Prompt are always encoded. N and Size carry omitempty, so a zero
// count or an empty size string is left out of the JSON body entirely.
// ResponseFormat has no omitempty and is therefore always sent; Generate sets
// it to "b64_json".
type imageRequest struct {
	Model          string `json:"model"`
	Prompt         string `json:"prompt"`
	N              int    `json:"n,omitempty"`
	Size           string `json:"size,omitempty"`
	ResponseFormat string `json:"response_format"`
}

// imageResponse is the decoded body of a /v1/images/generations reply. Each
// Data entry describes one generated image: B64JSON holds the image bytes as
// base64 text, while URL is decoded but never fetched by Generate, which
// treats an entry without B64JSON as an error. Created is decoded from the
// "created" field and is not read by the code in this file.
type imageResponse struct {
	Created int64 `json:"created"`
	Data    []struct {
		B64JSON string `json:"b64_json"`
		URL     string `json:"url"`
	} `json:"data"`
}

// Generate implements imagegen.Model by POSTing an OpenAI-style request to
// {base}/v1/images/generations and decoding the inline base64 images from the
// reply.
//
// Call options are folded into req first. A blank (whitespace-only) prompt or
// a negative N is rejected with an error wrapping llm.ErrUnsupported before
// any request is issued. Errors from doJSON are returned unchanged. A reply
// with no entries, or with any entry lacking b64_json, produces an
// *llm.APIError; a b64_json value that fails standard base64 decoding
// produces a wrapped decode error. On success Result.Images holds one part
// per reply entry, in reply order, and Result.Raw points at the decoded reply.
func (m *imageModel) Generate(ctx context.Context, req imagegen.Request, opts ...imagegen.Option) (*imagegen.Result, error) {
	req = req.Apply(opts...)
	switch {
	case strings.TrimSpace(req.Prompt) == "":
		return nil, fmt.Errorf("%w: image generation requires a prompt", llm.ErrUnsupported)
	case req.N < 0:
		return nil, fmt.Errorf("%w: image count N must be >= 0, got %d", llm.ErrUnsupported, req.N)
	}

	// apiFailure tags a malformed-reply error with this provider and model.
	apiFailure := func(msg string) error {
		return &llm.APIError{Provider: m.p.name, Model: m.id, Message: msg}
	}

	payload := imageRequest{
		Model:          m.id,
		Prompt:         req.Prompt,
		N:              req.N,
		Size:           req.Size,
		ResponseFormat: "b64_json",
	}
	var decoded imageResponse
	err := m.p.doJSON(ctx, http.MethodPost, "/v1/images/generations", m.id, &payload, &decoded)
	if err != nil {
		return nil, err
	}

	// Every entry below either becomes an image or aborts the call, so an
	// empty result can only come from an empty reply; reject it up front.
	if len(decoded.Data) == 0 {
		return nil, apiFailure("image response contained no images")
	}

	result := &imagegen.Result{Raw: &decoded}
	for idx := range decoded.Data {
		encoded := decoded.Data[idx].B64JSON
		if encoded == "" {
			// A url-only entry means the backend ignored response_format.
			// Remote content is never fetched here (same bytes-only contract
			// as llm.ImagePart), so report it instead of skipping the entry.
			return nil, apiFailure(fmt.Sprintf("image %d returned no inline b64_json data", idx))
		}

		imgBytes, decodeErr := base64.StdEncoding.DecodeString(encoded)
		if decodeErr != nil {
			return nil, fmt.Errorf("llama-swap: decode image %d: %w", idx, decodeErr)
		}

		part := llm.ImagePart{MIME: sniffImageMIME(imgBytes), Data: imgBytes}
		result.Images = append(result.Images, part)
	}
	return result, nil
}

// sniffImageMIME reports the MIME type of data as detected from its leading
// bytes. Anything that is not recognised as an image type (including empty
// input) is reported as image/png, the format stable-diffusion.cpp emits.
func sniffImageMIME(data []byte) string {
	if detected := http.DetectContentType(data); strings.HasPrefix(detected, "image/") {
		return detected
	}
	return "image/png"
}