package llamaswap import ( "context" "encoding/base64" "fmt" "net/http" "strconv" "strings" "gitea.stevedudenhoeffer.com/steve/majordomo/imagegen" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) // ImageModel implements imagegen.Provider, binding an image-generation model // served by llama-swap (routed to a stable-diffusion.cpp upstream). The id is // passed through verbatim and selects which upstream llama-swap loads. func (p *Provider) ImageModel(id string, opts ...imagegen.ModelOption) (imagegen.Model, error) { if p.baseURL == "" { return nil, fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name) } _ = imagegen.ApplyModelOptions(opts) return &imageModel{p: p, id: id}, nil } type imageModel struct { p *Provider id string } // txt2imgRequest is the stable-diffusion.cpp sd-server A1111 request shape // (POST /sdapi/v1/txt2img). We use this endpoint rather than the OpenAI // /v1/images/generations one because that endpoint IGNORES `seed` on this // sd-server build — every render of a given prompt comes back byte-identical, // so a batch of N collapses to one image. /sdapi/v1/txt2img honours `seed`, // giving real variety. llama-swap still routes by the `model` field in the // body. Optional fields are pointers/omitempty so an unset value falls back to // the model's baked default (the per-model --steps/--cfg-scale/etc. flags). type txt2imgRequest struct { Model string `json:"model"` Prompt string `json:"prompt"` NegativePrompt string `json:"negative_prompt,omitempty"` Seed *int64 `json:"seed,omitempty"` Steps *int `json:"steps,omitempty"` CFGScale *float64 `json:"cfg_scale,omitempty"` Width *int `json:"width,omitempty"` Height *int `json:"height,omitempty"` SampleMethod string `json:"sample_method,omitempty"` BatchCount int `json:"batch_count,omitempty"` } type txt2imgResponse struct { Images []string `json:"images"` } // Generate implements imagegen.Model via POST {base}/sdapi/v1/txt2img. 
func (m *imageModel) Generate(ctx context.Context, req imagegen.Request, opts ...imagegen.Option) (*imagegen.Result, error) { req = req.Apply(opts...) if strings.TrimSpace(req.Prompt) == "" { return nil, fmt.Errorf("%w: image generation requires a prompt", llm.ErrUnsupported) } if req.N < 0 { return nil, fmt.Errorf("%w: image count N must be >= 0, got %d", llm.ErrUnsupported, req.N) } width, height, err := parseSize(req.Size) if err != nil { return nil, fmt.Errorf("%w: %v", llm.ErrUnsupported, err) } wire := txt2imgRequest{ Model: m.id, Prompt: req.Prompt, NegativePrompt: req.NegativePrompt, Seed: req.Seed, Steps: req.Steps, CFGScale: req.CFGScale, Width: width, Height: height, SampleMethod: req.Sampler, BatchCount: req.N, } var resp txt2imgResponse if err := m.p.doJSON(ctx, http.MethodPost, "/sdapi/v1/txt2img", m.id, &wire, &resp); err != nil { return nil, err } out := &imagegen.Result{Raw: &resp} for i, b64 := range resp.Images { if b64 == "" { continue } raw, err := base64.StdEncoding.DecodeString(b64) if err != nil { return nil, fmt.Errorf("llama-swap: decode image %d: %w", i, err) } out.Images = append(out.Images, llm.ImagePart{MIME: sniffImageMIME(raw), Data: raw}) } if len(out.Images) == 0 { return nil, &llm.APIError{ Provider: m.p.name, Model: m.id, Message: "image response contained no images", } } return out, nil } // parseSize splits a "WxH" string into width/height pointers. "" yields // (nil, nil) so the model's own default resolution applies. 
//
// The "x" separator is matched case-insensitively, and whitespace around the
// whole string and around each number is ignored. Both dimensions must parse
// as positive base-10 integers; any other input is reported as an error and
// both pointers are nil.
func parseSize(size string) (*int, *int, error) {
	trimmed := strings.TrimSpace(size)
	if trimmed == "" {
		return nil, nil, nil
	}

	// Every rejection path reports the same message, quoting the trimmed input.
	invalid := func() (*int, *int, error) {
		return nil, nil, fmt.Errorf("invalid size %q (want WxH)", trimmed)
	}

	// Split on the first "x" only; anything after it must be the height.
	lowered := strings.ToLower(trimmed)
	sep := strings.Index(lowered, "x")
	if sep < 0 {
		return invalid()
	}

	width, err := strconv.Atoi(strings.TrimSpace(lowered[:sep]))
	if err != nil || width <= 0 {
		return invalid()
	}
	height, err := strconv.Atoi(strings.TrimSpace(lowered[sep+1:]))
	if err != nil || height <= 0 {
		return invalid()
	}
	return &width, &height, nil
}

// sniffImageMIME reports the MIME type detected from the leading bytes of
// data. When detection does not yield an image/* type it falls back to
// image/png, the format stable-diffusion.cpp emits.
func sniffImageMIME(data []byte) string {
	if detected := http.DetectContentType(data); strings.HasPrefix(detected, "image/") {
		return detected
	}
	return "image/png"
}