a213c18263
The OpenAI /v1/images/generations endpoint ignores `seed` on our stable-diffusion.cpp build — every render of a given prompt comes back byte-identical, so a drawbot batch of N collapsed to one image. Switch the image provider to sd-server's A1111 /sdapi/v1/txt2img endpoint, which honors `seed` (verified live: distinct seeds -> distinct images on SDXL and Qwen-Image). Size is split into width/height; llama-swap still routes by the `model` field. Tests + ADR-0016 updated.
138 lines
4.5 KiB
Go
138 lines
4.5 KiB
Go
package llamaswap
|
|
|
|
import (
|
|
"context"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"net/http"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/imagegen"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
)
|
|
|
|
// ImageModel implements imagegen.Provider, binding an image-generation model
|
|
// served by llama-swap (routed to a stable-diffusion.cpp upstream). The id is
|
|
// passed through verbatim and selects which upstream llama-swap loads.
|
|
func (p *Provider) ImageModel(id string, opts ...imagegen.ModelOption) (imagegen.Model, error) {
|
|
if p.baseURL == "" {
|
|
return nil, fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name)
|
|
}
|
|
_ = imagegen.ApplyModelOptions(opts)
|
|
return &imageModel{p: p, id: id}, nil
|
|
}
|
|
|
|
type imageModel struct {
|
|
p *Provider
|
|
id string
|
|
}
|
|
|
|
// txt2imgRequest is the JSON body for stable-diffusion.cpp sd-server's
// A1111-style endpoint, POST /sdapi/v1/txt2img.
//
// This endpoint is used instead of the OpenAI /v1/images/generations one
// because the latter IGNORES `seed` on this sd-server build: every render of a
// given prompt comes back byte-identical, so a batch of N collapses to a single
// image. /sdapi/v1/txt2img honours `seed` and therefore yields real variety.
// llama-swap still routes on the `model` field of the body.
//
// Optional settings are pointers and/or omitempty, so anything left unset is
// omitted from the JSON and the model's baked default applies (the per-model
// --steps/--cfg-scale/etc. flags). A non-nil pointer to a zero value is still
// encoded, which is how an explicit seed of 0 can be sent.
//
// NOTE(review): A1111's own API spells the sampler and batch fields
// sampler_name and n_iter/batch_size; confirm that this sd-server build reads
// sample_method and batch_count, otherwise those two values are ignored.
type txt2imgRequest struct {
	Model          string   `json:"model"`                     // routing key for llama-swap
	Prompt         string   `json:"prompt"`                    // always sent
	NegativePrompt string   `json:"negative_prompt,omitempty"` // dropped when empty
	Seed           *int64   `json:"seed,omitempty"`            // nil = omitted
	Steps          *int     `json:"steps,omitempty"`           // nil = omitted
	CFGScale       *float64 `json:"cfg_scale,omitempty"`       // nil = omitted
	Width          *int     `json:"width,omitempty"`           // nil = omitted
	Height         *int     `json:"height,omitempty"`          // nil = omitted
	SampleMethod   string   `json:"sample_method,omitempty"`   // dropped when empty
	BatchCount     int      `json:"batch_count,omitempty"`     // dropped when 0
}
|
|
|
|
// txt2imgResponse is the part of the /sdapi/v1/txt2img reply this package
// reads: the rendered images, each a base64-encoded string.
type txt2imgResponse struct {
	// Images holds one base64 string per rendered image.
	Images []string `json:"images"`
}
|
|
|
|
// Generate implements imagegen.Model via POST {base}/sdapi/v1/txt2img.
|
|
func (m *imageModel) Generate(ctx context.Context, req imagegen.Request, opts ...imagegen.Option) (*imagegen.Result, error) {
|
|
req = req.Apply(opts...)
|
|
if strings.TrimSpace(req.Prompt) == "" {
|
|
return nil, fmt.Errorf("%w: image generation requires a prompt", llm.ErrUnsupported)
|
|
}
|
|
if req.N < 0 {
|
|
return nil, fmt.Errorf("%w: image count N must be >= 0, got %d", llm.ErrUnsupported, req.N)
|
|
}
|
|
|
|
width, height, err := parseSize(req.Size)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("%w: %v", llm.ErrUnsupported, err)
|
|
}
|
|
|
|
wire := txt2imgRequest{
|
|
Model: m.id,
|
|
Prompt: req.Prompt,
|
|
NegativePrompt: req.NegativePrompt,
|
|
Seed: req.Seed,
|
|
Steps: req.Steps,
|
|
CFGScale: req.CFGScale,
|
|
Width: width,
|
|
Height: height,
|
|
SampleMethod: req.Sampler,
|
|
BatchCount: req.N,
|
|
}
|
|
|
|
var resp txt2imgResponse
|
|
if err := m.p.doJSON(ctx, http.MethodPost, "/sdapi/v1/txt2img", m.id, &wire, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
out := &imagegen.Result{Raw: &resp}
|
|
for i, b64 := range resp.Images {
|
|
if b64 == "" {
|
|
continue
|
|
}
|
|
raw, err := base64.StdEncoding.DecodeString(b64)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("llama-swap: decode image %d: %w", i, err)
|
|
}
|
|
out.Images = append(out.Images, llm.ImagePart{MIME: sniffImageMIME(raw), Data: raw})
|
|
}
|
|
if len(out.Images) == 0 {
|
|
return nil, &llm.APIError{
|
|
Provider: m.p.name,
|
|
Model: m.id,
|
|
Message: "image response contained no images",
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// parseSize turns a "WxH" string into width and height pointers.
//
// Surrounding whitespace is ignored, the separator may be "x" or "X", and
// each side may itself be padded with spaces. A blank size returns
// (nil, nil, nil) so the model's own default resolution applies. Anything
// else that is not two positive integers around a separator is an error that
// quotes the trimmed input.
func parseSize(size string) (*int, *int, error) {
	trimmed := strings.TrimSpace(size)
	if trimmed == "" {
		return nil, nil, nil
	}

	// Every rejection shares one message, built from the trimmed input.
	reject := func() (*int, *int, error) {
		return nil, nil, fmt.Errorf("invalid size %q (want WxH)", trimmed)
	}

	// Split on the first separator only; a second "x" ends up in the height
	// text and fails integer parsing below.
	left, right, found := strings.Cut(strings.ToLower(trimmed), "x")
	if !found {
		return reject()
	}

	width, widthErr := strconv.Atoi(strings.TrimSpace(left))
	height, heightErr := strconv.Atoi(strings.TrimSpace(right))
	switch {
	case widthErr != nil, heightErr != nil:
		return reject()
	case width <= 0, height <= 0:
		return reject()
	}
	return &width, &height, nil
}
|
|
|
|
// sniffImageMIME reports the image MIME type suggested by the leading bytes
// of data. When content sniffing does not come back with an image/* type, it
// falls back to image/png, the format stable-diffusion.cpp emits.
func sniffImageMIME(data []byte) string {
	const fallback = "image/png"

	if detected := http.DetectContentType(data); strings.HasPrefix(detected, "image/") {
		return detected
	}
	return fallback
}
|