Files
majordomo/media/media.go
T
steve d71aca4c3a
Adversarial Review (Gadfly) / review (pull_request) Has been cancelled
CI / Tidy (pull_request) Successful in 9m27s
CI / Build & Test (pull_request) Successful in 9m44s
fix(media): drop oldest images on over-count instead of refusing the request
media.Normalize refused (ErrUnsupported) when a request carried more images than
the target's MaxImagesPerReq, on the theory that a failover chain would try a
roomier target. In practice the chain's targets share the same cap — an agent loop
that accumulates a preview image per iteration (e.g. scaddy's write_scad) blows
past the cap, EVERY target rejects ("9 images, target allows at most 8"), and the
run dies. Observed live on ollama-cloud (cap 8).

Now: over-count keeps the most-recent MaxImagesPerReq images and replaces each
older one with a short text placeholder ("[earlier image omitted to fit this
model's per-request image limit]"), preserving each message's turn structure and
telling the model an image was elided. The most-recent images are the relevant
ones in an iterative run. Copy-on-write; the input request is never mutated. The
per-model threshold stays configurable via Capabilities.MaxImagesPerReq (0 still
means no image support); SupportsImages / MIME / byte-budget / dimension behavior
is unchanged, and the provider-side count backstop remains.

Test: TestNormalizeTooManyImages_DropsOldest — 3 images, cap 2 → 2 kept (the most
recent), 1 placeholder, no error, oldest dropped, input unmutated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 17:38:21 -04:00

353 lines
12 KiB
Go

// Package media fits request images to a target's declared capabilities.
//
// Normalize sniffs each image's real format from magic bytes (declared MIME
// types lie), corrects the part's MIME, and passes through anything that
// already satisfies the target's llm.Capabilities. Images that do not fit
// are decoded, downscaled (never upscaled), and re-encoded into an allowed
// format and byte budget. Anything that cannot honestly be made to fit —
// undecodable formats, impossible byte budgets, images for a text-only
// target — fails with an error wrapping llm.ErrUnsupported so a failover
// chain can advance to a more capable target without a health penalty.
//
// Over-count is the exception: a request carrying more images than
// MaxImagesPerReq does NOT fail — the oldest images are replaced with a short
// text placeholder and the most-recent MaxImagesPerReq are kept, because a hard
// refuse exhausts a chain whose targets share the same cap (e.g. an agent loop
// accumulating a preview image per iteration). MaxImagesPerReq remains the
// per-model knob (0 = no image support).
//
// Why a separate package: every provider would otherwise duplicate the same
// decode/scale/encode pipeline. Providers keep only a cheap capability
// enforcement backstop; this package performs the actual transformation,
// once, against whichever target a chain is currently attempting.
package media
import (
"bytes"
"fmt"
"image"
"image/gif"
"image/jpeg"
"image/png"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// Normalize produces a request whose image parts fit caps, downscaling and
// re-encoding individual images where that is required. The caller's request
// is never written to.
//
// A request without image parts is returned as-is. When no image needs to
// change, the result shares every slice with the request being processed.
// As soon as one image changes, the Messages slice is cloned once and the
// Parts slice of each affected message is cloned once; all other parts remain
// shared with the input.
//
// A request carrying more images than caps.MaxImagesPerReq is first trimmed
// by dropOldestImages instead of being rejected. Images that cannot be made to
// fit, or any image sent to a target that does not accept images, yield an
// error wrapping llm.ErrUnsupported.
func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) {
	imageCount := 0
	for _, msg := range req.Messages {
		for _, part := range msg.Parts {
			if _, isImage := part.(llm.ImagePart); isImage {
				imageCount++
			}
		}
	}

	switch {
	case imageCount == 0:
		// Nothing to fit: hand the request back untouched.
		return req, nil
	case !caps.SupportsImages():
		return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, imageCount)
	}

	// Over-count is trimmed, not refused: a hard refusal would exhaust a
	// failover chain whose targets all share the same cap (for instance an
	// agent loop that adds one preview image per iteration). The newest
	// images are kept and every older one becomes a text placeholder, so the
	// turn structure survives and the model is told an image was elided.
	// Trimming happens before normalization so dropped images are never
	// decoded.
	if imageCount > caps.MaxImagesPerReq {
		req = dropOldestImages(req, imageCount-caps.MaxImagesPerReq)
	}

	result := req
	messagesOwned := false
	for mi := range req.Messages {
		src := req.Messages[mi].Parts
		// owned stays nil until this message's Parts slice has been cloned.
		var owned []llm.Part
		for pi := range src {
			img, isImage := src[pi].(llm.ImagePart)
			if !isImage {
				continue
			}
			fitted, changed, err := normalizeImage(img, caps)
			if err != nil {
				return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err)
			}
			if !changed {
				continue
			}
			// First change anywhere: detach the Messages slice.
			if !messagesOwned {
				result.Messages = make([]llm.Message, len(req.Messages))
				copy(result.Messages, req.Messages)
				messagesOwned = true
			}
			// First change in this message: detach its Parts slice.
			if owned == nil {
				owned = make([]llm.Part, len(src))
				copy(owned, src)
				result.Messages[mi].Parts = owned
			}
			owned[pi] = fitted
		}
	}
	return result, nil
}
// imageOverflowPlaceholder is the text that dropOldestImages substitutes for
// each image part it removes to fit a target's per-request image cap. Putting
// text in the image's slot keeps the message's parts (and therefore the turn)
// in place and tells the model that an earlier image was elided, instead of
// silently changing the conversation.
const imageOverflowPlaceholder = "[earlier image omitted to fit this model's per-request image limit]"
// dropOldestImages swaps the first n image parts, counted front-to-back over
// the message history, for imageOverflowPlaceholder text. Later images are
// left alone and no message gains or loses a part, so every turn keeps its
// shape. The Messages slice and the Parts slice of each message visited
// before the quota is filled (and containing an image) are fresh copies; the
// caller's request is not modified. A non-positive n returns req unchanged.
func dropOldestImages(req llm.Request, n int) llm.Request {
	if n <= 0 {
		return req
	}

	trimmed := req
	trimmed.Messages = make([]llm.Message, len(req.Messages))
	copy(trimmed.Messages, req.Messages)

	remaining := n
	for mi := 0; mi < len(trimmed.Messages) && remaining > 0; mi++ {
		src := trimmed.Messages[mi].Parts
		if !hasImagePart(src) {
			// Image-free messages keep sharing their Parts slice.
			continue
		}
		rebuilt := make([]llm.Part, len(src))
		copy(rebuilt, src)
		for pi := range rebuilt {
			if _, isImage := rebuilt[pi].(llm.ImagePart); isImage && remaining > 0 {
				rebuilt[pi] = llm.Text(imageOverflowPlaceholder)
				remaining--
			}
		}
		trimmed.Messages[mi].Parts = rebuilt
	}
	return trimmed
}
// hasImagePart reports whether parts contains at least one llm.ImagePart.
func hasImagePart(parts []llm.Part) bool {
	for i := range parts {
		switch parts[i].(type) {
		case llm.ImagePart:
			return true
		}
	}
	return false
}
// Info returns the format sniffed from an image part's bytes ("jpeg", "png",
// "gif", or "webp") together with its pixel width and height. Only the image
// configuration is read via image.DecodeConfig; the pixel data is not
// decoded. webp is identified by signature alone and is reported with zero
// dimensions and a nil error. Bytes that match no known signature produce an
// error, as does a config that fails to decode.
func Info(p llm.ImagePart) (format string, width, height int, err error) {
	format = sniff(p.Data)
	if format == "" {
		return "", 0, 0, fmt.Errorf("media: image bytes match no known format (jpeg, png, gif, webp)")
	}
	if format == "webp" {
		// Recognized, but there is no decoder here to read its dimensions.
		return "webp", 0, 0, nil
	}

	cfg, _, decodeErr := image.DecodeConfig(bytes.NewReader(p.Data))
	if decodeErr != nil {
		return format, 0, 0, fmt.Errorf("media: decode %s config: %w", format, decodeErr)
	}
	return format, cfg.Width, cfg.Height, nil
}
// normalizeImage fits a single image part to caps. It returns the resulting
// part, whether that part differs from the input, and an error wrapping
// llm.ErrUnsupported when the image cannot be made to fit. A part whose bytes
// are untouched but whose MIME label was corrected is reported as changed, so
// that Normalize clones the slices that contain it.
func normalizeImage(p llm.ImagePart, caps llm.Capabilities) (llm.ImagePart, bool, error) {
	// The declared MIME is not trusted: callers routinely mislabel image
	// bytes and providers reject mismatches, so the magic bytes decide.
	kind := sniff(p.Data)
	if kind == "" {
		return p, false, fmt.Errorf("%w: image bytes (declared %q) match no known format (jpeg, png, gif, webp)", llm.ErrUnsupported, p.MIME)
	}
	isWebP := kind == "webp"
	sniffedMIME := "image/" + kind
	relabeled := p.MIME != sniffedMIME
	p.MIME = sniffedMIME

	allowed := caps.MIMEAllowed(sniffedMIME)
	withinBytes := caps.MaxImageBytes == 0 || len(p.Data) <= caps.MaxImageBytes

	// webp dimensions cannot be read here, so they are treated as fitting:
	// when MIME and byte size are acceptable the image passes through rather
	// than being rejected on an unverifiable property.
	withinDims := true
	if caps.MaxImageDimension > 0 && !isWebP {
		// Header-only read. A failure routes to the transform path, where
		// the full decode reports the real error.
		if cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data)); err != nil {
			withinDims = false
		} else {
			withinDims = cfg.Width <= caps.MaxImageDimension && cfg.Height <= caps.MaxImageDimension
		}
	}

	if allowed && withinBytes && withinDims {
		return p, relabeled, nil
	}

	// Everything below rewrites the image and therefore needs a full decode.
	if isWebP {
		return p, false, fmt.Errorf("%w: image is webp (%d bytes), which the Go standard library cannot decode; provide jpeg, png, or gif instead", llm.ErrUnsupported, len(p.Data))
	}
	decoded, _, err := image.Decode(bytes.NewReader(p.Data))
	if err != nil {
		return p, false, fmt.Errorf("%w: cannot decode %s image for transformation: %v", llm.ErrUnsupported, kind, err)
	}

	if limit := caps.MaxImageDimension; limit > 0 {
		if bounds := decoded.Bounds(); bounds.Dx() > limit || bounds.Dy() > limit {
			w, h := fitDims(bounds.Dx(), bounds.Dy(), limit)
			decoded = downscale(decoded, w, h)
		}
	}

	outMIME, err := targetMIME(sniffedMIME, caps)
	if err != nil {
		return p, false, err
	}
	encoded, err := encodeFit(decoded, outMIME, caps.MaxImageBytes)
	if err != nil {
		return p, false, err
	}
	return llm.ImagePart{MIME: outMIME, Data: encoded}, true, nil
}
// sniff maps the leading magic bytes of data to an image format name: "jpeg",
// "png", "gif", or "webp". It returns "" when no signature matches or when
// data is too short to hold the signature being tested.
func sniff(data []byte) string {
	// startsWith reports whether data begins with the raw bytes of sig.
	startsWith := func(sig string) bool {
		return len(data) >= len(sig) && string(data[:len(sig)]) == sig
	}

	if startsWith("\xFF\xD8\xFF") {
		return "jpeg"
	}
	if startsWith("\x89PNG") {
		return "png"
	}
	if startsWith("GIF8") {
		return "gif"
	}
	// RIFF container: the form type sits at bytes 8..11, after the 4-byte
	// chunk size.
	if startsWith("RIFF") && len(data) >= 12 && string(data[8:12]) == "WEBP" {
		return "webp"
	}
	return ""
}
// encodableMIME reports whether mime is one of the image types this package
// can encode: image/jpeg, image/png, or image/gif.
func encodableMIME(mime string) bool {
	return mime == "image/jpeg" || mime == "image/png" || mime == "image/gif"
}
// targetMIME chooses the format an image is re-encoded into. Preference
// order: the original type when it is both encodable and allowed, then jpeg,
// then png, then the first encodable entry of caps.AllowedImageMIME. If no
// allowed type can be encoded (for example an allow-list holding only webp),
// the error wraps llm.ErrUnsupported.
func targetMIME(original string, caps llm.Capabilities) (string, error) {
	switch {
	case encodableMIME(original) && caps.MIMEAllowed(original):
		return original, nil
	// jpeg is tried before png because vision inputs are photographs more
	// often than screenshots, and jpeg's quality setting is the only size
	// lever available to the byte-budget loop.
	case caps.MIMEAllowed("image/jpeg"):
		return "image/jpeg", nil
	case caps.MIMEAllowed("image/png"):
		return "image/png", nil
	}

	// An empty allow-list permits everything and would have matched jpeg
	// above, so the list is non-empty here; take its first encodable entry.
	for i := range caps.AllowedImageMIME {
		if candidate := caps.AllowedImageMIME[i]; encodableMIME(candidate) {
			return candidate, nil
		}
	}
	return "", fmt.Errorf("%w: none of the allowed image types %v can be encoded with the Go standard library", llm.ErrUnsupported, caps.AllowedImageMIME)
}
// encodeFit encodes img as mime so that the result is at most maxBytes long;
// a maxBytes of 0 disables the limit and the first encoding is returned. It
// walks a fixed ladder, giving up jpeg quality first and resolution second:
//
//	jpeg:    q85, q65, q45, q30 at full size, then 1/2 and 1/4 size at q65
//	png/gif: full size, 1/2 size, 1/4 size
//
// That is at most six attempts; an image still over budget after a 16x pixel
// reduction will not be rescued by further steps. When every rung is too
// large the error wraps llm.ErrUnsupported and reports the smallest size
// reached.
func encodeFit(img image.Image, mime string, maxBytes int) ([]byte, error) {
	type rung struct {
		div     int // both dimensions are divided by this
		quality int // jpeg quality; unused for png and gif
	}

	ladder := []rung{{1, 0}, {2, 0}, {4, 0}}
	if mime == "image/jpeg" {
		ladder = []rung{{1, 85}, {1, 65}, {1, 45}, {1, 30}, {2, 65}, {4, 65}}
	}

	// Each divisor is downscaled at most once and then reused across rungs.
	byDiv := map[int]image.Image{1: img}
	best := -1 // smallest over-budget encoding seen so far, -1 if none
	for _, r := range ladder {
		candidate, cached := byDiv[r.div]
		if !cached {
			bounds := img.Bounds()
			// Clamp to 1 so a tiny image never scales to a zero dimension.
			candidate = downscale(img, max(bounds.Dx()/r.div, 1), max(bounds.Dy()/r.div, 1))
			byDiv[r.div] = candidate
		}

		encoded, err := encodeImage(candidate, mime, r.quality)
		if err != nil {
			return nil, fmt.Errorf("encode %s: %w", mime, err)
		}
		if maxBytes == 0 || len(encoded) <= maxBytes {
			return encoded, nil
		}
		if best == -1 || len(encoded) < best {
			best = len(encoded)
		}
	}
	return nil, fmt.Errorf("%w: image cannot be reduced to the %d-byte limit; smallest achievable %s encoding is %d bytes", llm.ErrUnsupported, maxBytes, mime, best)
}
// encodeImage serializes img in the format named by mime and returns the
// encoded bytes. quality is passed to the jpeg encoder and ignored for png
// and gif. A mime other than image/jpeg, image/png, or image/gif is an
// error; encoder failures are returned unwrapped.
func encodeImage(img image.Image, mime string, quality int) ([]byte, error) {
	// Select the encoder first so an unsupported type fails before any
	// buffer is allocated.
	var encode func(dst *bytes.Buffer) error
	switch mime {
	case "image/jpeg":
		encode = func(dst *bytes.Buffer) error {
			return jpeg.Encode(dst, img, &jpeg.Options{Quality: quality})
		}
	case "image/png":
		encode = func(dst *bytes.Buffer) error {
			return png.Encode(dst, img)
		}
	case "image/gif":
		encode = func(dst *bytes.Buffer) error {
			return gif.Encode(dst, img, nil)
		}
	default:
		return nil, fmt.Errorf("no stdlib encoder for %q", mime)
	}

	out := new(bytes.Buffer)
	if err := encode(out); err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}