// Package media fits request images to a target's declared capabilities. // // Normalize sniffs each image's real format from magic bytes (declared MIME // types lie), corrects the part's MIME, and passes through anything that // already satisfies the target's llm.Capabilities. Images that do not fit // are decoded, downscaled (never upscaled), and re-encoded into an allowed // format and byte budget. Anything that cannot honestly be made to fit — // undecodable formats, impossible byte budgets, images for a text-only // target — fails with an error wrapping llm.ErrUnsupported so a failover // chain can advance to a more capable target without a health penalty. // // Over-count is the exception: a request carrying more images than // MaxImagesPerReq does NOT fail — the oldest images are replaced with a short // text placeholder and the most-recent MaxImagesPerReq are kept, because a hard // refuse exhausts a chain whose targets share the same cap (e.g. an agent loop // accumulating a preview image per iteration). MaxImagesPerReq remains the // per-model knob (0 = no image support). // // Why a separate package: every provider would otherwise duplicate the same // decode/scale/encode pipeline. Providers keep only a cheap capability // enforcement backstop; this package performs the actual transformation, // once, against whichever target a chain is currently attempting. package media import ( "bytes" "fmt" "image" "image/gif" "image/jpeg" "image/png" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) // Normalize returns a copy of req whose images fit caps, transforming // (downscale, re-encode) where needed. The input request is never mutated. // // Fast paths: a request with no image parts, or whose images already satisfy // caps, is returned unchanged with all underlying slices shared. When any // image transforms, the Messages slice and the Parts slices of affected // messages are copied (copy-on-write); untouched parts stay shared. // // Images that cannot be made to fit return an error wrapping // llm.ErrUnsupported. func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { total := 0 for i := range req.Messages { for _, p := range req.Messages[i].Parts { if _, ok := p.(llm.ImagePart); ok { total++ } } } if total == 0 { return req, nil } if !caps.SupportsImages() { return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, total) } // Over-cap images are elided in the same copy-on-write pass below: the // OLDEST excess are replaced with a placeholder and the most-recent // MaxImagesPerReq kept (see the package doc for why we elide rather than // refuse). toElide is how many of the first images, front-to-back, to drop. toElide := 0 if total > caps.MaxImagesPerReq { toElide = total - caps.MaxImagesPerReq } // Single copy-on-write pass: for each image, the first toElide become a text // placeholder; the rest are size-normalized against caps. The Messages slice // and an affected message's Parts slice are copied at most once. out := req copiedMessages := false seen := 0 for mi := range req.Messages { copiedParts := false for pi, part := range req.Messages[mi].Parts { ip, ok := part.(llm.ImagePart) if !ok { continue } seen++ var replacement llm.Part if seen <= toElide { replacement = llm.Text(imageOverflowPlaceholder) } else { norm, changed, err := normalizeImage(ip, caps) if err != nil { return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err) } if !changed { continue } replacement = norm } if !copiedMessages { out.Messages = make([]llm.Message, len(req.Messages)) copy(out.Messages, req.Messages) copiedMessages = true } if !copiedParts { parts := make([]llm.Part, len(req.Messages[mi].Parts)) copy(parts, req.Messages[mi].Parts) out.Messages[mi].Parts = parts copiedParts = true } out.Messages[mi].Parts[pi] = replacement } } return out, nil } // imageOverflowPlaceholder replaces an image elided to fit a target's // per-request image cap. It keeps the message turn intact and tells the model // an earlier image was omitted rather than silently changing the conversation. const imageOverflowPlaceholder = "[earlier image omitted to fit this model's per-request image limit]" // Info reports an image part's sniffed format ("jpeg", "png", "gif", or // "webp") and pixel dimensions. It is a cheap metadata read — the pixels are // never decoded. webp is recognized by signature but not decodable with the // standard library, so it reports format "webp" with zero dimensions and a // nil error. func Info(p llm.ImagePart) (format string, width, height int, err error) { format = sniff(p.Data) switch format { case "": return "", 0, 0, fmt.Errorf("media: image bytes match no known format (jpeg, png, gif, webp)") case "webp": return "webp", 0, 0, nil } cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data)) if err != nil { return format, 0, 0, fmt.Errorf("media: decode %s config: %w", format, err) } return format, cfg.Width, cfg.Height, nil } // normalizeImage fits one image part to caps. It returns the (possibly // transformed) part and whether it differs from the input. A corrected MIME // with untouched bytes still counts as changed so Normalize copy-on-writes // the containing slices. func normalizeImage(p llm.ImagePart, caps llm.Capabilities) (llm.ImagePart, bool, error) { // Why sniff instead of trusting p.MIME: callers routinely mislabel image // bytes, and providers reject mismatches; the bytes are the truth. format := sniff(p.Data) if format == "" { return p, false, fmt.Errorf("%w: image bytes (declared %q) match no known format (jpeg, png, gif, webp)", llm.ErrUnsupported, p.MIME) } realMIME := "image/" + format changed := false if p.MIME != realMIME { p.MIME = realMIME changed = true } mimeOK := caps.MIMEAllowed(realMIME) fitsBytes := caps.MaxImageBytes == 0 || len(p.Data) <= caps.MaxImageBytes fitsDims := true if caps.MaxImageDimension > 0 && format != "webp" { // Cheap header-only dimension read; a failure forces the transform // path, which surfaces the real decode error. cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data)) if err != nil { fitsDims = false } else { fitsDims = cfg.Width <= caps.MaxImageDimension && cfg.Height <= caps.MaxImageDimension } } // Why webp skips the dimension check: the stdlib cannot read webp // headers, so dimensions are unverifiable; if MIME and bytes fit we pass // it through rather than reject a possibly-fine image. if mimeOK && fitsBytes && fitsDims { return p, changed, nil } // Transformation required from here on, which needs a real decode. if format == "webp" { return p, false, fmt.Errorf("%w: image is webp (%d bytes), which the Go standard library cannot decode; provide jpeg, png, or gif instead", llm.ErrUnsupported, len(p.Data)) } img, _, err := image.Decode(bytes.NewReader(p.Data)) if err != nil { return p, false, fmt.Errorf("%w: cannot decode %s image for transformation: %v", llm.ErrUnsupported, format, err) } if caps.MaxImageDimension > 0 { b := img.Bounds() if b.Dx() > caps.MaxImageDimension || b.Dy() > caps.MaxImageDimension { nw, nh := fitDims(b.Dx(), b.Dy(), caps.MaxImageDimension) img = downscale(img, nw, nh) } } target, err := targetMIME(realMIME, caps) if err != nil { return p, false, err } data, err := encodeFit(img, target, caps.MaxImageBytes) if err != nil { return p, false, err } return llm.ImagePart{MIME: target, Data: data}, true, nil } // sniff identifies an image format from its magic bytes, returning "jpeg", // "png", "gif", "webp", or "" when nothing matches. func sniff(data []byte) string { switch { case len(data) >= 3 && data[0] == 0xFF && data[1] == 0xD8 && data[2] == 0xFF: return "jpeg" case len(data) >= 4 && data[0] == 0x89 && data[1] == 'P' && data[2] == 'N' && data[3] == 'G': return "png" case len(data) >= 4 && string(data[:4]) == "GIF8": return "gif" case len(data) >= 12 && string(data[:4]) == "RIFF" && string(data[8:12]) == "WEBP": return "webp" default: return "" } } // encodableMIME reports whether the stdlib can encode the given image type. func encodableMIME(mime string) bool { switch mime { case "image/jpeg", "image/png", "image/gif": return true } return false } // targetMIME picks the re-encode format: the original when allowed, else // jpeg, else png, else the first allowed encodable type (gif). When nothing // allowed is encodable (e.g. only webp), it errors with llm.ErrUnsupported. func targetMIME(original string, caps llm.Capabilities) (string, error) { if encodableMIME(original) && caps.MIMEAllowed(original) { return original, nil } // Why jpeg before png: vision inputs are photographs more often than // screenshots, and jpeg's quality knob is the only size lever we have // for the byte-budget loop. for _, m := range []string{"image/jpeg", "image/png"} { if caps.MIMEAllowed(m) { return m, nil } } // An empty allow-list permits everything and was caught above, so the // list is non-empty here: take its first encodable entry. for _, m := range caps.AllowedImageMIME { if encodableMIME(m) { return m, nil } } return "", fmt.Errorf("%w: none of the allowed image types %v can be encoded with the Go standard library", llm.ErrUnsupported, caps.AllowedImageMIME) } // encodeFit encodes img as mime within maxBytes (0 = no limit), trading // jpeg quality first and then resolution for size. The ladder is fixed // (jpeg: q85, q65, q45, q30, then half and quarter dimensions at q65; // png/gif: full, half, quarter dimensions) — at most six attempts, since an // image that survives a 16x pixel reduction over budget will not be saved // by further fiddling. func encodeFit(img image.Image, mime string, maxBytes int) ([]byte, error) { type attempt struct { div int // divide both dimensions by this quality int // jpeg quality; ignored for png/gif } var ladder []attempt if mime == "image/jpeg" { ladder = []attempt{{1, 85}, {1, 65}, {1, 45}, {1, 30}, {2, 65}, {4, 65}} } else { ladder = []attempt{{1, 0}, {2, 0}, {4, 0}} } scaled := map[int]image.Image{1: img} smallest := -1 for _, a := range ladder { cur, ok := scaled[a.div] if !ok { b := img.Bounds() nw, nh := max(b.Dx()/a.div, 1), max(b.Dy()/a.div, 1) cur = downscale(img, nw, nh) scaled[a.div] = cur } data, err := encodeImage(cur, mime, a.quality) if err != nil { return nil, fmt.Errorf("encode %s: %w", mime, err) } if maxBytes == 0 || len(data) <= maxBytes { return data, nil } if smallest == -1 || len(data) < smallest { smallest = len(data) } } return nil, fmt.Errorf("%w: image cannot be reduced to the %d-byte limit; smallest achievable %s encoding is %d bytes", llm.ErrUnsupported, maxBytes, mime, smallest) } // encodeImage encodes img into the given MIME type. quality applies to jpeg // only. func encodeImage(img image.Image, mime string, quality int) ([]byte, error) { var buf bytes.Buffer var err error switch mime { case "image/jpeg": err = jpeg.Encode(&buf, img, &jpeg.Options{Quality: quality}) case "image/png": err = png.Encode(&buf, img) case "image/gif": err = gif.Encode(&buf, img, nil) default: return nil, fmt.Errorf("no stdlib encoder for %q", mime) } if err != nil { return nil, err } return buf.Bytes(), nil }