feat: OpenAI, Anthropic, and native-Ollama providers + media pipeline
Phase 3:
- provider/openai: Chat Completions for OpenAI + compat endpoints (SSE streaming with by-index tool-call assembly, response_format json_schema, legacy max_tokens option, reasoning_effort)
- provider/anthropic: Messages API (tool_use/tool_result, GA structured output via output_config.format, full SSE event parser, 529 transient)
- provider/ollama: one native /api/chat client behind the ollama, ollama-cloud, and foreman built-ins (presets; NDJSON streaming tolerant of foreman's buffered single-object responses; object tool arguments; format-schema structured output; think mapping)
- media/: capability normalization (sniff, downscale, transcode, byte ladder, ErrUnsupported), wired into the chain executor per target with penalty-free advance past incapable elements
- registry: real provider + scheme wiring, WithHTTPClient option, required env-foreman TLS chat round-trip test
- ADR-0009 multimodal strategy, ADR-0010 tools/structured mapping; README matrix + CLAUDE.md synced

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+293
@@ -0,0 +1,293 @@
|
||||
// Package media fits request images to a target's declared capabilities.
|
||||
//
|
||||
// Normalize sniffs each image's real format from magic bytes (declared MIME
|
||||
// types lie), corrects the part's MIME, and passes through anything that
|
||||
// already satisfies the target's llm.Capabilities. Images that do not fit
|
||||
// are decoded, downscaled (never upscaled), and re-encoded into an allowed
|
||||
// format and byte budget. Anything that cannot honestly be made to fit —
|
||||
// undecodable formats, impossible byte budgets, too many images, images for
|
||||
// a text-only target — fails with an error wrapping llm.ErrUnsupported so a
|
||||
// failover chain can advance to a more capable target without a health
|
||||
// penalty.
|
||||
//
|
||||
// Why a separate package: every provider would otherwise duplicate the same
|
||||
// decode/scale/encode pipeline. Providers keep only a cheap capability
|
||||
// enforcement backstop; this package performs the actual transformation,
|
||||
// once, against whichever target a chain is currently attempting.
|
||||
package media
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/gif"
|
||||
"image/jpeg"
|
||||
"image/png"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
)
|
||||
|
||||
// Normalize returns a copy of req whose images fit caps, transforming
|
||||
// (downscale, re-encode) where needed. The input request is never mutated.
|
||||
//
|
||||
// Fast paths: a request with no image parts, or whose images already satisfy
|
||||
// caps, is returned unchanged with all underlying slices shared. When any
|
||||
// image transforms, the Messages slice and the Parts slices of affected
|
||||
// messages are copied (copy-on-write); untouched parts stay shared.
|
||||
//
|
||||
// Images that cannot be made to fit return an error wrapping
|
||||
// llm.ErrUnsupported.
|
||||
func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) {
|
||||
total := 0
|
||||
for i := range req.Messages {
|
||||
for _, p := range req.Messages[i].Parts {
|
||||
if _, ok := p.(llm.ImagePart); ok {
|
||||
total++
|
||||
}
|
||||
}
|
||||
}
|
||||
if total == 0 {
|
||||
return req, nil
|
||||
}
|
||||
if !caps.SupportsImages() {
|
||||
return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, total)
|
||||
}
|
||||
// Why error instead of dropping the overflow: silently removing an image
|
||||
// changes the question the caller asked; the honest move is to refuse and
|
||||
// let a chain try a roomier target.
|
||||
if total > caps.MaxImagesPerReq {
|
||||
return llm.Request{}, fmt.Errorf("media: %w: request carries %d images, target allows at most %d per request", llm.ErrUnsupported, total, caps.MaxImagesPerReq)
|
||||
}
|
||||
|
||||
out := req
|
||||
copiedMessages := false
|
||||
for mi := range req.Messages {
|
||||
copiedParts := false
|
||||
for pi, part := range req.Messages[mi].Parts {
|
||||
ip, ok := part.(llm.ImagePart)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
norm, changed, err := normalizeImage(ip, caps)
|
||||
if err != nil {
|
||||
return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err)
|
||||
}
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
if !copiedMessages {
|
||||
out.Messages = make([]llm.Message, len(req.Messages))
|
||||
copy(out.Messages, req.Messages)
|
||||
copiedMessages = true
|
||||
}
|
||||
if !copiedParts {
|
||||
parts := make([]llm.Part, len(req.Messages[mi].Parts))
|
||||
copy(parts, req.Messages[mi].Parts)
|
||||
out.Messages[mi].Parts = parts
|
||||
copiedParts = true
|
||||
}
|
||||
out.Messages[mi].Parts[pi] = norm
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// Info reports an image part's sniffed format ("jpeg", "png", "gif", or
|
||||
// "webp") and pixel dimensions. It is a cheap metadata read — the pixels are
|
||||
// never decoded. webp is recognized by signature but not decodable with the
|
||||
// standard library, so it reports format "webp" with zero dimensions and a
|
||||
// nil error.
|
||||
func Info(p llm.ImagePart) (format string, width, height int, err error) {
|
||||
format = sniff(p.Data)
|
||||
switch format {
|
||||
case "":
|
||||
return "", 0, 0, fmt.Errorf("media: image bytes match no known format (jpeg, png, gif, webp)")
|
||||
case "webp":
|
||||
return "webp", 0, 0, nil
|
||||
}
|
||||
cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data))
|
||||
if err != nil {
|
||||
return format, 0, 0, fmt.Errorf("media: decode %s config: %w", format, err)
|
||||
}
|
||||
return format, cfg.Width, cfg.Height, nil
|
||||
}
|
||||
|
||||
// normalizeImage fits one image part to caps. It returns the (possibly
|
||||
// transformed) part and whether it differs from the input. A corrected MIME
|
||||
// with untouched bytes still counts as changed so Normalize copy-on-writes
|
||||
// the containing slices.
|
||||
func normalizeImage(p llm.ImagePart, caps llm.Capabilities) (llm.ImagePart, bool, error) {
|
||||
// Why sniff instead of trusting p.MIME: callers routinely mislabel image
|
||||
// bytes, and providers reject mismatches; the bytes are the truth.
|
||||
format := sniff(p.Data)
|
||||
if format == "" {
|
||||
return p, false, fmt.Errorf("%w: image bytes (declared %q) match no known format (jpeg, png, gif, webp)", llm.ErrUnsupported, p.MIME)
|
||||
}
|
||||
realMIME := "image/" + format
|
||||
changed := false
|
||||
if p.MIME != realMIME {
|
||||
p.MIME = realMIME
|
||||
changed = true
|
||||
}
|
||||
|
||||
mimeOK := caps.MIMEAllowed(realMIME)
|
||||
fitsBytes := caps.MaxImageBytes == 0 || len(p.Data) <= caps.MaxImageBytes
|
||||
fitsDims := true
|
||||
if caps.MaxImageDimension > 0 && format != "webp" {
|
||||
// Cheap header-only dimension read; a failure forces the transform
|
||||
// path, which surfaces the real decode error.
|
||||
cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data))
|
||||
if err != nil {
|
||||
fitsDims = false
|
||||
} else {
|
||||
fitsDims = cfg.Width <= caps.MaxImageDimension && cfg.Height <= caps.MaxImageDimension
|
||||
}
|
||||
}
|
||||
// Why webp skips the dimension check: the stdlib cannot read webp
|
||||
// headers, so dimensions are unverifiable; if MIME and bytes fit we pass
|
||||
// it through rather than reject a possibly-fine image.
|
||||
if mimeOK && fitsBytes && fitsDims {
|
||||
return p, changed, nil
|
||||
}
|
||||
|
||||
// Transformation required from here on, which needs a real decode.
|
||||
if format == "webp" {
|
||||
return p, false, fmt.Errorf("%w: image is webp (%d bytes), which the Go standard library cannot decode; provide jpeg, png, or gif instead", llm.ErrUnsupported, len(p.Data))
|
||||
}
|
||||
img, _, err := image.Decode(bytes.NewReader(p.Data))
|
||||
if err != nil {
|
||||
return p, false, fmt.Errorf("%w: cannot decode %s image for transformation: %v", llm.ErrUnsupported, format, err)
|
||||
}
|
||||
|
||||
if caps.MaxImageDimension > 0 {
|
||||
b := img.Bounds()
|
||||
if b.Dx() > caps.MaxImageDimension || b.Dy() > caps.MaxImageDimension {
|
||||
nw, nh := fitDims(b.Dx(), b.Dy(), caps.MaxImageDimension)
|
||||
img = downscale(img, nw, nh)
|
||||
}
|
||||
}
|
||||
|
||||
target, err := targetMIME(realMIME, caps)
|
||||
if err != nil {
|
||||
return p, false, err
|
||||
}
|
||||
data, err := encodeFit(img, target, caps.MaxImageBytes)
|
||||
if err != nil {
|
||||
return p, false, err
|
||||
}
|
||||
return llm.ImagePart{MIME: target, Data: data}, true, nil
|
||||
}
|
||||
|
||||
// sniff identifies an image format from its magic bytes, returning "jpeg",
// "png", "gif", "webp", or "" when nothing matches. Only the leading
// signature is examined; the rest of the payload is not validated.
func sniff(data []byte) string {
	switch {
	case len(data) >= 3 && data[0] == 0xFF && data[1] == 0xD8 && data[2] == 0xFF:
		return "jpeg"
	case len(data) >= 4 && data[0] == 0x89 && data[1] == 'P' && data[2] == 'N' && data[3] == 'G':
		return "png"
	case len(data) >= 4 && string(data[:4]) == "GIF8":
		return "gif"
	case len(data) >= 12 && string(data[:4]) == "RIFF" && string(data[8:12]) == "WEBP":
		// RIFF container: bytes 4-7 hold the chunk size, 8-11 the form type.
		return "webp"
	default:
		return ""
	}
}
|
||||
|
||||
// encodableMIME reports whether the stdlib can encode the given image type
// (jpeg, png, or gif). The comparison is exact: no case folding and no
// parameter stripping.
func encodableMIME(mime string) bool {
	switch mime {
	case "image/jpeg", "image/png", "image/gif":
		return true
	}
	return false
}
|
||||
|
||||
// targetMIME picks the re-encode format: the original when allowed, else
|
||||
// jpeg, else png, else the first allowed encodable type (gif). When nothing
|
||||
// allowed is encodable (e.g. only webp), it errors with llm.ErrUnsupported.
|
||||
func targetMIME(original string, caps llm.Capabilities) (string, error) {
|
||||
if encodableMIME(original) && caps.MIMEAllowed(original) {
|
||||
return original, nil
|
||||
}
|
||||
// Why jpeg before png: vision inputs are photographs more often than
|
||||
// screenshots, and jpeg's quality knob is the only size lever we have
|
||||
// for the byte-budget loop.
|
||||
for _, m := range []string{"image/jpeg", "image/png"} {
|
||||
if caps.MIMEAllowed(m) {
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
// An empty allow-list permits everything and was caught above, so the
|
||||
// list is non-empty here: take its first encodable entry.
|
||||
for _, m := range caps.AllowedImageMIME {
|
||||
if encodableMIME(m) {
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("%w: none of the allowed image types %v can be encoded with the Go standard library", llm.ErrUnsupported, caps.AllowedImageMIME)
|
||||
}
|
||||
|
||||
// encodeFit encodes img as mime within maxBytes (0 = no limit), trading
|
||||
// jpeg quality first and then resolution for size. The ladder is fixed
|
||||
// (jpeg: q85, q65, q45, q30, then half and quarter dimensions at q65;
|
||||
// png/gif: full, half, quarter dimensions) — at most six attempts, since an
|
||||
// image that survives a 16x pixel reduction over budget will not be saved
|
||||
// by further fiddling.
|
||||
func encodeFit(img image.Image, mime string, maxBytes int) ([]byte, error) {
|
||||
type attempt struct {
|
||||
div int // divide both dimensions by this
|
||||
quality int // jpeg quality; ignored for png/gif
|
||||
}
|
||||
var ladder []attempt
|
||||
if mime == "image/jpeg" {
|
||||
ladder = []attempt{{1, 85}, {1, 65}, {1, 45}, {1, 30}, {2, 65}, {4, 65}}
|
||||
} else {
|
||||
ladder = []attempt{{1, 0}, {2, 0}, {4, 0}}
|
||||
}
|
||||
|
||||
scaled := map[int]image.Image{1: img}
|
||||
smallest := -1
|
||||
for _, a := range ladder {
|
||||
cur, ok := scaled[a.div]
|
||||
if !ok {
|
||||
b := img.Bounds()
|
||||
nw, nh := max(b.Dx()/a.div, 1), max(b.Dy()/a.div, 1)
|
||||
cur = downscale(img, nw, nh)
|
||||
scaled[a.div] = cur
|
||||
}
|
||||
data, err := encodeImage(cur, mime, a.quality)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("encode %s: %w", mime, err)
|
||||
}
|
||||
if maxBytes == 0 || len(data) <= maxBytes {
|
||||
return data, nil
|
||||
}
|
||||
if smallest == -1 || len(data) < smallest {
|
||||
smallest = len(data)
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("%w: image cannot be reduced to the %d-byte limit; smallest achievable %s encoding is %d bytes", llm.ErrUnsupported, maxBytes, mime, smallest)
|
||||
}
|
||||
|
||||
// encodeImage encodes img into the given MIME type. quality applies to jpeg
// only. gif is encoded with the encoder's default options. Any MIME other
// than image/jpeg, image/png, or image/gif returns an error.
func encodeImage(img image.Image, mime string, quality int) ([]byte, error) {
	var buf bytes.Buffer
	var err error
	switch mime {
	case "image/jpeg":
		err = jpeg.Encode(&buf, img, &jpeg.Options{Quality: quality})
	case "image/png":
		err = png.Encode(&buf, img)
	case "image/gif":
		err = gif.Encode(&buf, img, nil)
	default:
		return nil, fmt.Errorf("no stdlib encoder for %q", mime)
	}
	if err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
|
||||
@@ -0,0 +1,513 @@
|
||||
package media
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"image"
|
||||
"image/color"
|
||||
"image/gif"
|
||||
"image/jpeg"
|
||||
"image/png"
|
||||
"math/rand/v2"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
)
|
||||
|
||||
// --- test image builders -------------------------------------------------
|
||||
|
||||
// gradient builds a smooth w x h RGBA image (compresses well). Red ramps
// 0..255 left to right, green ramps 0..255 top to bottom, blue is fixed at
// 128, and every pixel is opaque. A 1-pixel axis uses a divisor of 1 to
// avoid dividing by zero.
func gradient(w, h int) *image.RGBA {
	img := image.NewRGBA(image.Rect(0, 0, w, h))
	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			img.SetRGBA(x, y, color.RGBA{
				R: uint8(x * 255 / max(w-1, 1)),
				G: uint8(y * 255 / max(h-1, 1)),
				B: 128,
				A: 255,
			})
		}
	}
	return img
}
|
||||
|
||||
// noisy builds a w x h image of deterministic random pixels (compresses
|
||||
// terribly — ideal for exercising the byte-budget ladder).
|
||||
func noisy(w, h int) *image.RGBA {
|
||||
rng := rand.New(rand.NewPCG(1, 2))
|
||||
img := image.NewRGBA(image.Rect(0, 0, w, h))
|
||||
for i := range img.Pix {
|
||||
img.Pix[i] = uint8(rng.UintN(256))
|
||||
}
|
||||
return img
|
||||
}
|
||||
|
||||
// encPNG encodes img as PNG and returns the bytes, failing the test
// immediately if encoding fails.
func encPNG(t *testing.T, img image.Image) []byte {
	t.Helper()
	var buf bytes.Buffer
	if err := png.Encode(&buf, img); err != nil {
		t.Fatalf("png encode: %v", err)
	}
	return buf.Bytes()
}
|
||||
|
||||
// encJPEG encodes img as JPEG at quality 90 and returns the bytes, failing
// the test immediately if encoding fails.
func encJPEG(t *testing.T, img image.Image) []byte {
	t.Helper()
	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, img, &jpeg.Options{Quality: 90}); err != nil {
		t.Fatalf("jpeg encode: %v", err)
	}
	return buf.Bytes()
}
|
||||
|
||||
// encGIF encodes img as a single-frame GIF with default options and returns
// the bytes, failing the test immediately if encoding fails.
func encGIF(t *testing.T, img image.Image) []byte {
	t.Helper()
	var buf bytes.Buffer
	if err := gif.Encode(&buf, img, nil); err != nil {
		t.Fatalf("gif encode: %v", err)
	}
	return buf.Bytes()
}
|
||||
|
||||
// webpBlob is a minimal byte sequence carrying the RIFF/WEBP signature.
// The stdlib cannot decode webp, so sniffing is all that ever reads it.
// Layout: "RIFF", a little-endian size of 26 (the bytes that follow the size
// field), "WEBP", a "VP8 " chunk tag, then 18 zero bytes — 34 bytes total.
func webpBlob() []byte {
	b := []byte("RIFF")
	b = append(b, 0x1a, 0x00, 0x00, 0x00)
	b = append(b, "WEBPVP8 "...)
	b = append(b, make([]byte, 18)...)
	return b
}
|
||||
|
||||
func imgReq(parts ...llm.Part) llm.Request {
|
||||
return llm.Request{Messages: []llm.Message{llm.UserParts(parts...)}}
|
||||
}
|
||||
|
||||
// firstImage returns the first image part in the request.
|
||||
func firstImage(t *testing.T, req llm.Request) llm.ImagePart {
|
||||
t.Helper()
|
||||
for _, m := range req.Messages {
|
||||
for _, p := range m.Parts {
|
||||
if ip, ok := p.(llm.ImagePart); ok {
|
||||
return ip
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Fatal("no image part in request")
|
||||
return llm.ImagePart{}
|
||||
}
|
||||
|
||||
// --- fast paths -----------------------------------------------------------
|
||||
|
||||
func TestNormalizeFastPathNoImages(t *testing.T) {
|
||||
req := llm.Request{Messages: []llm.Message{llm.UserText("hello")}}
|
||||
got, err := Normalize(req, llm.Capabilities{}) // even a no-image target
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
if &got.Messages[0] != &req.Messages[0] {
|
||||
t.Error("messages slice was copied on the no-image fast path")
|
||||
}
|
||||
if &got.Messages[0].Parts[0] != &req.Messages[0].Parts[0] {
|
||||
t.Error("parts slice was copied on the no-image fast path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFastPathFittingImages(t *testing.T) {
|
||||
data := encPNG(t, gradient(20, 10))
|
||||
req := imgReq(llm.Text("look:"), llm.Image("image/png", data))
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 4,
|
||||
MaxImageBytes: len(data) + 100,
|
||||
MaxImageDimension: 64,
|
||||
AllowedImageMIME: []string{"image/png"},
|
||||
}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
if &got.Messages[0] != &req.Messages[0] {
|
||||
t.Error("messages slice was copied although every image already fits")
|
||||
}
|
||||
if &got.Messages[0].Parts[1] != &req.Messages[0].Parts[1] {
|
||||
t.Error("parts slice was copied although every image already fits")
|
||||
}
|
||||
}
|
||||
|
||||
// --- rejection paths ------------------------------------------------------
|
||||
|
||||
func TestNormalizeImagesUnsupported(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, gradient(4, 4))))
|
||||
_, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 0})
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "does not accept image input") {
|
||||
t.Errorf("err message %q lacks explanation", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeTooManyImages(t *testing.T) {
|
||||
img := llm.Image("image/png", encPNG(t, gradient(4, 4)))
|
||||
req := llm.Request{Messages: []llm.Message{
|
||||
llm.UserParts(img, img),
|
||||
llm.UserParts(img),
|
||||
}}
|
||||
_, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 2})
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "3 images") || !strings.Contains(err.Error(), "at most 2") {
|
||||
t.Errorf("err message %q lacks the counts", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeGarbageBytes(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/png", []byte("certainly not an image")))
|
||||
_, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 1})
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no known format") {
|
||||
t.Errorf("err message %q lacks a clear explanation", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- MIME sniffing & correction --------------------------------------------
|
||||
|
||||
func TestNormalizeMIMECorrection(t *testing.T) {
|
||||
data := encPNG(t, gradient(8, 8))
|
||||
req := imgReq(llm.Image("image/jpeg", data)) // caller lies: bytes are png
|
||||
got, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 1})
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
ip := firstImage(t, got)
|
||||
if ip.MIME != "image/png" {
|
||||
t.Errorf("MIME = %q, want sniff-corrected %q", ip.MIME, "image/png")
|
||||
}
|
||||
if !bytes.Equal(ip.Data, data) {
|
||||
t.Error("image bytes changed although only the MIME needed correcting")
|
||||
}
|
||||
if orig := firstImage(t, req); orig.MIME != "image/jpeg" {
|
||||
t.Errorf("input request mutated: MIME now %q", orig.MIME)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeCopyOnWrite(t *testing.T) {
|
||||
data := encPNG(t, gradient(8, 8))
|
||||
req := llm.Request{Messages: []llm.Message{
|
||||
llm.UserText("untouched message"),
|
||||
llm.UserParts(llm.Text("untouched part"), llm.Image("image/jpeg", data)),
|
||||
}}
|
||||
got, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 1})
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
if &got.Messages[0] == &req.Messages[0] {
|
||||
t.Error("messages slice shared although a part changed (mutation hazard)")
|
||||
}
|
||||
if &got.Messages[0].Parts[0] != &req.Messages[0].Parts[0] {
|
||||
t.Error("parts slice of the untouched message was copied")
|
||||
}
|
||||
if &got.Messages[1].Parts[0] == &req.Messages[1].Parts[0] {
|
||||
t.Error("parts slice of the changed message is still shared (mutation hazard)")
|
||||
}
|
||||
}
|
||||
|
||||
// --- dimension capping ------------------------------------------------------
|
||||
|
||||
func TestNormalizeDownscale(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, gradient(200, 100))))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, MaxImageDimension: 50}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
format, w, h, err := Info(firstImage(t, got))
|
||||
if err != nil {
|
||||
t.Fatalf("Info: %v", err)
|
||||
}
|
||||
if format != "png" {
|
||||
t.Errorf("format = %q, want original format %q preserved", format, "png")
|
||||
}
|
||||
if w != 50 || h != 25 {
|
||||
t.Errorf("dimensions = %dx%d, want 50x25 (aspect preserved)", w, h)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeDownscalePortrait(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, gradient(100, 200))))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, MaxImageDimension: 50}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
_, w, h, err := Info(firstImage(t, got))
|
||||
if err != nil {
|
||||
t.Fatalf("Info: %v", err)
|
||||
}
|
||||
if w != 25 || h != 50 {
|
||||
t.Errorf("dimensions = %dx%d, want 25x50 (aspect preserved)", w, h)
|
||||
}
|
||||
}
|
||||
|
||||
// --- transcoding -------------------------------------------------------------
|
||||
|
||||
func TestNormalizeTranscode(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
mime string
|
||||
allowed []string
|
||||
want string
|
||||
}{
|
||||
{"png to jpeg", encPNG(t, gradient(16, 16)), "image/png", []string{"image/jpeg"}, "jpeg"},
|
||||
{"jpeg to png", encJPEG(t, gradient(16, 16)), "image/jpeg", []string{"image/png"}, "png"},
|
||||
{"gif to png", encGIF(t, gradient(16, 16)), "image/gif", []string{"image/png"}, "png"},
|
||||
{"png to gif fallback", encPNG(t, gradient(16, 16)), "image/png", []string{"image/gif"}, "gif"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
req := imgReq(llm.Image(tt.mime, tt.data))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, AllowedImageMIME: tt.allowed}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
ip := firstImage(t, got)
|
||||
if ip.MIME != "image/"+tt.want {
|
||||
t.Errorf("MIME = %q, want %q", ip.MIME, "image/"+tt.want)
|
||||
}
|
||||
format, w, h, err := Info(ip)
|
||||
if err != nil {
|
||||
t.Fatalf("Info: %v", err)
|
||||
}
|
||||
if format != tt.want {
|
||||
t.Errorf("sniffed format = %q, want %q", format, tt.want)
|
||||
}
|
||||
if w != 16 || h != 16 {
|
||||
t.Errorf("dimensions = %dx%d, want 16x16 (no resize needed)", w, h)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNoEncodableAllowedType(t *testing.T) {
|
||||
// png needs transcoding but the only allowed type is webp, which the
|
||||
// stdlib cannot encode.
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, gradient(8, 8))))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, AllowedImageMIME: []string{"image/webp"}}
|
||||
_, err := Normalize(req, caps)
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "image/webp") {
|
||||
t.Errorf("err message %q does not name the unencodable allowed types", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- byte budget ---------------------------------------------------------------
|
||||
|
||||
func TestNormalizeByteBudgetFits(t *testing.T) {
|
||||
// Random noise defeats q85 jpeg at full size; the ladder must walk down
|
||||
// quality and then resolution until the encoding fits.
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, noisy(256, 256))))
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 1,
|
||||
AllowedImageMIME: []string{"image/jpeg"},
|
||||
MaxImageBytes: 8 * 1024,
|
||||
}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
ip := firstImage(t, got)
|
||||
if len(ip.Data) > caps.MaxImageBytes {
|
||||
t.Errorf("len(Data) = %d, exceeds budget %d", len(ip.Data), caps.MaxImageBytes)
|
||||
}
|
||||
if format, _, _, err := Info(ip); err != nil || format != "jpeg" {
|
||||
t.Errorf("Info = %q, %v; want jpeg, nil", format, err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeByteBudgetImpossible(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, noisy(256, 256))))
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 1,
|
||||
AllowedImageMIME: []string{"image/jpeg"},
|
||||
MaxImageBytes: 10, // no image fits in 10 bytes
|
||||
}
|
||||
_, err := Normalize(req, caps)
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "10-byte limit") {
|
||||
t.Errorf("err message %q lacks the budget", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "smallest achievable") {
|
||||
t.Errorf("err message %q lacks the achieved size", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- webp ---------------------------------------------------------------------
|
||||
|
||||
func TestNormalizeWebPPassThrough(t *testing.T) {
|
||||
data := webpBlob()
|
||||
req := imgReq(llm.Image("image/webp", data))
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 1,
|
||||
MaxImageBytes: 1024,
|
||||
MaxImageDimension: 50, // unverifiable for webp; must not force a transform
|
||||
AllowedImageMIME: []string{"image/webp"},
|
||||
}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
if &got.Messages[0] != &req.Messages[0] {
|
||||
t.Error("request copied although the webp image passes through")
|
||||
}
|
||||
if ip := firstImage(t, got); !bytes.Equal(ip.Data, data) {
|
||||
t.Error("webp bytes changed on pass-through")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeWebPNeedsTransform(t *testing.T) {
|
||||
req := imgReq(llm.Image("image/webp", webpBlob()))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, AllowedImageMIME: []string{"image/jpeg"}}
|
||||
_, err := Normalize(req, caps)
|
||||
if !errors.Is(err, llm.ErrUnsupported) {
|
||||
t.Fatalf("err = %v, want ErrUnsupported", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "webp") {
|
||||
t.Errorf("err message %q does not name the format", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "jpeg, png, or gif") {
|
||||
t.Errorf("err message %q does not say what to provide instead", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- input immutability ----------------------------------------------------------
|
||||
|
||||
func TestNormalizeInputNotMutated(t *testing.T) {
|
||||
data := encPNG(t, gradient(200, 100))
|
||||
snapshot := bytes.Clone(data)
|
||||
req := llm.Request{
|
||||
System: "sys",
|
||||
Messages: []llm.Message{
|
||||
llm.UserParts(llm.Text("scale me"), llm.Image("image/jpeg", data)),
|
||||
},
|
||||
}
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 1,
|
||||
MaxImageDimension: 50,
|
||||
AllowedImageMIME: []string{"image/jpeg"},
|
||||
}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
orig := firstImage(t, req)
|
||||
if orig.MIME != "image/jpeg" {
|
||||
t.Errorf("input MIME mutated to %q", orig.MIME)
|
||||
}
|
||||
if !bytes.Equal(orig.Data, snapshot) {
|
||||
t.Error("input image bytes mutated")
|
||||
}
|
||||
if txt := req.Messages[0].Parts[0].(llm.TextPart); txt.Text != "scale me" {
|
||||
t.Errorf("input text part mutated: %q", txt.Text)
|
||||
}
|
||||
if ip := firstImage(t, got); bytes.Equal(ip.Data, snapshot) {
|
||||
t.Error("output image was expected to transform but is byte-identical")
|
||||
}
|
||||
}
|
||||
|
||||
// --- alpha handling ----------------------------------------------------------------
|
||||
|
||||
func TestNormalizeAlphaPNGToJPEG(t *testing.T) {
|
||||
img := image.NewRGBA(image.Rect(0, 0, 32, 32))
|
||||
for y := 0; y < 32; y++ {
|
||||
for x := 0; x < 32; x++ {
|
||||
img.SetRGBA(x, y, color.RGBA{R: 200, G: 60, B: 30, A: uint8(x * 8)})
|
||||
}
|
||||
}
|
||||
req := imgReq(llm.Image("image/png", encPNG(t, img)))
|
||||
caps := llm.Capabilities{MaxImagesPerReq: 1, AllowedImageMIME: []string{"image/jpeg"}}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
ip := firstImage(t, got)
|
||||
decoded, err := jpeg.Decode(bytes.NewReader(ip.Data))
|
||||
if err != nil {
|
||||
t.Fatalf("decoding transcoded jpeg: %v", err)
|
||||
}
|
||||
if b := decoded.Bounds(); b.Dx() != 32 || b.Dy() != 32 {
|
||||
t.Errorf("decoded dimensions = %dx%d, want 32x32", b.Dx(), b.Dy())
|
||||
}
|
||||
}
|
||||
|
||||
// --- Info ----------------------------------------------------------------------------
|
||||
|
||||
func TestInfo(t *testing.T) {
|
||||
pngData := encPNG(t, gradient(10, 7))
|
||||
jpegData := encJPEG(t, gradient(5, 9))
|
||||
gifData := encGIF(t, gradient(6, 4))
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
part llm.ImagePart
|
||||
format string
|
||||
w, h int
|
||||
wantErr bool
|
||||
}{
|
||||
{"png", llm.ImagePart{MIME: "image/png", Data: pngData}, "png", 10, 7, false},
|
||||
{"jpeg", llm.ImagePart{MIME: "image/jpeg", Data: jpegData}, "jpeg", 5, 9, false},
|
||||
{"gif", llm.ImagePart{MIME: "image/gif", Data: gifData}, "gif", 6, 4, false},
|
||||
{"mislabeled png", llm.ImagePart{MIME: "image/jpeg", Data: pngData}, "png", 10, 7, false},
|
||||
{"webp", llm.ImagePart{MIME: "image/webp", Data: webpBlob()}, "webp", 0, 0, false},
|
||||
{"garbage", llm.ImagePart{MIME: "image/png", Data: []byte("nope")}, "", 0, 0, true},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
format, w, h, err := Info(tt.part)
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Fatal("Info: expected error, got nil")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("Info: %v", err)
|
||||
}
|
||||
if format != tt.format || w != tt.w || h != tt.h {
|
||||
t.Errorf("Info = %q, %d, %d; want %q, %d, %d", format, w, h, tt.format, tt.w, tt.h)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- byte-cap pass-through interplay ----------------------------------------------------
|
||||
|
||||
func TestNormalizeOversizeBytesTriggersTransform(t *testing.T) {
|
||||
// A fitting MIME and dimension but an over-budget payload must re-encode,
|
||||
// not pass through.
|
||||
data := encPNG(t, noisy(64, 64))
|
||||
req := imgReq(llm.Image("image/png", data))
|
||||
caps := llm.Capabilities{
|
||||
MaxImagesPerReq: 1,
|
||||
MaxImageBytes: len(data) / 2,
|
||||
AllowedImageMIME: []string{"image/png", "image/jpeg"},
|
||||
}
|
||||
got, err := Normalize(req, caps)
|
||||
if err != nil {
|
||||
t.Fatalf("Normalize: %v", err)
|
||||
}
|
||||
ip := firstImage(t, got)
|
||||
if len(ip.Data) > caps.MaxImageBytes {
|
||||
t.Errorf("len(Data) = %d, exceeds budget %d", len(ip.Data), caps.MaxImageBytes)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
package media
|
||||
|
||||
import "image"
|
||||
|
||||
// fitDims scales (w, h) so the longer side equals limit, preserving aspect
// ratio with round-half-up on the shorter side, floored at 1 pixel.
//
// It returns the new (width, height). When w == h both come back as limit.
// w and h must be positive; the longer side is used as a divisor.
func fitDims(w, h, limit int) (int, int) {
	if w >= h {
		// Adding w/2 before dividing rounds the quotient to nearest.
		return limit, max((h*limit+w/2)/w, 1)
	}
	return max((w*limit+h/2)/h, 1), limit
}
|
||||
|
||||
// downscale resizes src to dw x dh using area averaging (a box filter): each
// destination pixel is the mean of its corresponding source region.
//
// Why hand-rolled: the stdlib has no scaler and ADR-0007 bars
// golang.org/x/image without a new ADR. Area averaging is dependency-free,
// alias-resistant when shrinking, and entirely adequate quality for
// vision-model input. It is only ever called to shrink — Normalize never
// upscales.
//
// The result is a new *image.RGBA with its origin at (0, 0), whatever src's
// bounds were. src is only read.
func downscale(src image.Image, dw, dh int) *image.RGBA {
	b := src.Bounds()
	sw, sh := b.Dx(), b.Dy()
	dst := image.NewRGBA(image.Rect(0, 0, dw, dh))
	for dy := 0; dy < dh; dy++ {
		// Integer box edges: destination pixel dy covers source rows
		// [dy*sh/dh, (dy+1)*sh/dh), widened to at least one row.
		sy0 := dy * sh / dh
		sy1 := max((dy+1)*sh/dh, sy0+1)
		for dx := 0; dx < dw; dx++ {
			sx0 := dx * sw / dw
			sx1 := max((dx+1)*sw/dw, sx0+1)
			var r, g, bl, a uint64
			for sy := sy0; sy < sy1; sy++ {
				for sx := sx0; sx < sx1; sx++ {
					// Offset by b.Min so sub-images with a non-zero origin
					// are sampled correctly.
					pr, pg, pb, pa := src.At(b.Min.X+sx, b.Min.Y+sy).RGBA()
					r += uint64(pr)
					g += uint64(pg)
					bl += uint64(pb)
					a += uint64(pa)
				}
			}
			// The box is at least 1x1, so n is never zero.
			n := uint64((sy1 - sy0) * (sx1 - sx0))
			i := dst.PixOffset(dx, dy)
			// RGBA() returns 16-bit channels; average, then drop to 8 bits.
			dst.Pix[i+0] = uint8((r / n) >> 8)
			dst.Pix[i+1] = uint8((g / n) >> 8)
			dst.Pix[i+2] = uint8((bl / n) >> 8)
			dst.Pix[i+3] = uint8((a / n) >> 8)
		}
	}
	return dst
}
|
||||
Reference in New Issue
Block a user