package llamaswap import ( "context" "encoding/base64" "fmt" "net/http" "strings" "gitea.stevedudenhoeffer.com/steve/majordomo/imagegen" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) // ImageModel implements imagegen.Provider, binding an image-generation model // served by llama-swap (routed to a stable-diffusion.cpp upstream). The id is // passed through verbatim and selects which upstream llama-swap loads. func (p *Provider) ImageModel(id string, opts ...imagegen.ModelOption) (imagegen.Model, error) { if p.baseURL == "" { return nil, fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name) } _ = imagegen.ApplyModelOptions(opts) return &imageModel{p: p, id: id}, nil } type imageModel struct { p *Provider id string } // imageRequest is the OpenAI /v1/images/generations request shape, plus the // stable-diffusion.cpp extras llama-swap forwards to sd-server. We always // request b64_json so the bytes come back inline (no second fetch). The // optional fields are pointers/omitempty so an unset value is omitted entirely // and sd-server falls back to the model's own default (a field name a given // sd-server build doesn't recognize is simply ignored — harmless). type imageRequest struct { Model string `json:"model"` Prompt string `json:"prompt"` N int `json:"n,omitempty"` Size string `json:"size,omitempty"` ResponseFormat string `json:"response_format"` Steps *int `json:"steps,omitempty"` CFGScale *float64 `json:"cfg_scale,omitempty"` NegativePrompt string `json:"negative_prompt,omitempty"` SampleMethod string `json:"sample_method,omitempty"` Seed *int64 `json:"seed,omitempty"` } type imageResponse struct { Created int64 `json:"created"` Data []struct { B64JSON string `json:"b64_json"` URL string `json:"url"` } `json:"data"` } // Generate implements imagegen.Model via POST {base}/v1/images/generations. func (m *imageModel) Generate(ctx context.Context, req imagegen.Request, opts ...imagegen.Option) (*imagegen.Result, error) { req = req.Apply(opts...) if strings.TrimSpace(req.Prompt) == "" { return nil, fmt.Errorf("%w: image generation requires a prompt", llm.ErrUnsupported) } if req.N < 0 { return nil, fmt.Errorf("%w: image count N must be >= 0, got %d", llm.ErrUnsupported, req.N) } wire := imageRequest{ Model: m.id, Prompt: req.Prompt, N: req.N, Size: req.Size, ResponseFormat: "b64_json", Steps: req.Steps, CFGScale: req.CFGScale, NegativePrompt: req.NegativePrompt, SampleMethod: req.Sampler, Seed: req.Seed, } var resp imageResponse if err := m.p.doJSON(ctx, http.MethodPost, "/v1/images/generations", m.id, &wire, &resp); err != nil { return nil, err } out := &imagegen.Result{Raw: &resp} for i, d := range resp.Data { if d.B64JSON == "" { // Why error rather than skip: a url-only entry means the backend // ignored response_format; we don't fetch remote content (mirrors // llm.ImagePart's bytes-only contract), so surface it. return nil, &llm.APIError{ Provider: m.p.name, Model: m.id, Message: fmt.Sprintf("image %d returned no inline b64_json data", i), } } raw, err := base64.StdEncoding.DecodeString(d.B64JSON) if err != nil { return nil, fmt.Errorf("llama-swap: decode image %d: %w", i, err) } out.Images = append(out.Images, llm.ImagePart{MIME: sniffImageMIME(raw), Data: raw}) } if len(out.Images) == 0 { return nil, &llm.APIError{ Provider: m.p.name, Model: m.id, Message: "image response contained no images", } } return out, nil } // sniffImageMIME identifies the image format from its leading bytes, defaulting // to image/png (stable-diffusion.cpp emits PNG) when detection is inconclusive. func sniffImageMIME(data []byte) string { mime := http.DetectContentType(data) if !strings.HasPrefix(mime, "image/") { return "image/png" } return mime }