043249e0e1
Phase 3: - provider/openai: Chat Completions for OpenAI + compat endpoints (SSE streaming with by-index tool-call assembly, response_format json_schema, legacy max_tokens option, reasoning_effort) - provider/anthropic: Messages API (tool_use/tool_result, GA structured output via output_config.format, full SSE event parser, 529 transient) - provider/ollama: one native /api/chat client behind the ollama, ollama-cloud, and foreman built-ins (presets; NDJSON streaming tolerant of foreman's buffered single-object responses; object tool arguments; format-schema structured output; think mapping) - media/: capability normalization (sniff, downscale, transcode, byte ladder, ErrUnsupported), wired into the chain executor per target with penalty-free advance past incapable elements - registry: real provider + scheme wiring, WithHTTPClient option, required env-foreman TLS chat round-trip test - ADR-0009 multimodal strategy, ADR-0010 tools/structured mapping; README matrix + CLAUDE.md synced Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
294 lines
10 KiB
Go
294 lines
10 KiB
Go
// Package media fits request images to a target's declared capabilities.
//
// Normalize sniffs each image's real format from magic bytes (declared MIME
// types lie), corrects the part's MIME, and passes through anything that
// already satisfies the target's llm.Capabilities. Images that do not fit
// are decoded, downscaled (never upscaled), and re-encoded into an allowed
// format and byte budget. Anything that cannot honestly be made to fit —
// undecodable formats, impossible byte budgets, too many images, images for
// a text-only target — fails with an error wrapping llm.ErrUnsupported so a
// failover chain can advance to a more capable target without a health
// penalty.
//
// Why a separate package: every provider would otherwise duplicate the same
// decode/scale/encode pipeline. Providers keep only a cheap capability
// enforcement backstop; this package performs the actual transformation,
// once, against whichever target a chain is currently attempting.
|
|
package media
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"image"
|
|
"image/gif"
|
|
"image/jpeg"
|
|
"image/png"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
)
|
|
|
|
// Normalize returns a copy of req whose images fit caps, transforming
|
|
// (downscale, re-encode) where needed. The input request is never mutated.
|
|
//
|
|
// Fast paths: a request with no image parts, or whose images already satisfy
|
|
// caps, is returned unchanged with all underlying slices shared. When any
|
|
// image transforms, the Messages slice and the Parts slices of affected
|
|
// messages are copied (copy-on-write); untouched parts stay shared.
|
|
//
|
|
// Images that cannot be made to fit return an error wrapping
|
|
// llm.ErrUnsupported.
|
|
func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) {
|
|
total := 0
|
|
for i := range req.Messages {
|
|
for _, p := range req.Messages[i].Parts {
|
|
if _, ok := p.(llm.ImagePart); ok {
|
|
total++
|
|
}
|
|
}
|
|
}
|
|
if total == 0 {
|
|
return req, nil
|
|
}
|
|
if !caps.SupportsImages() {
|
|
return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, total)
|
|
}
|
|
// Why error instead of dropping the overflow: silently removing an image
|
|
// changes the question the caller asked; the honest move is to refuse and
|
|
// let a chain try a roomier target.
|
|
if total > caps.MaxImagesPerReq {
|
|
return llm.Request{}, fmt.Errorf("media: %w: request carries %d images, target allows at most %d per request", llm.ErrUnsupported, total, caps.MaxImagesPerReq)
|
|
}
|
|
|
|
out := req
|
|
copiedMessages := false
|
|
for mi := range req.Messages {
|
|
copiedParts := false
|
|
for pi, part := range req.Messages[mi].Parts {
|
|
ip, ok := part.(llm.ImagePart)
|
|
if !ok {
|
|
continue
|
|
}
|
|
norm, changed, err := normalizeImage(ip, caps)
|
|
if err != nil {
|
|
return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err)
|
|
}
|
|
if !changed {
|
|
continue
|
|
}
|
|
if !copiedMessages {
|
|
out.Messages = make([]llm.Message, len(req.Messages))
|
|
copy(out.Messages, req.Messages)
|
|
copiedMessages = true
|
|
}
|
|
if !copiedParts {
|
|
parts := make([]llm.Part, len(req.Messages[mi].Parts))
|
|
copy(parts, req.Messages[mi].Parts)
|
|
out.Messages[mi].Parts = parts
|
|
copiedParts = true
|
|
}
|
|
out.Messages[mi].Parts[pi] = norm
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// Info reports an image part's sniffed format ("jpeg", "png", "gif", or
|
|
// "webp") and pixel dimensions. It is a cheap metadata read — the pixels are
|
|
// never decoded. webp is recognized by signature but not decodable with the
|
|
// standard library, so it reports format "webp" with zero dimensions and a
|
|
// nil error.
|
|
func Info(p llm.ImagePart) (format string, width, height int, err error) {
|
|
format = sniff(p.Data)
|
|
switch format {
|
|
case "":
|
|
return "", 0, 0, fmt.Errorf("media: image bytes match no known format (jpeg, png, gif, webp)")
|
|
case "webp":
|
|
return "webp", 0, 0, nil
|
|
}
|
|
cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data))
|
|
if err != nil {
|
|
return format, 0, 0, fmt.Errorf("media: decode %s config: %w", format, err)
|
|
}
|
|
return format, cfg.Width, cfg.Height, nil
|
|
}
|
|
|
|
// normalizeImage fits one image part to caps. It returns the (possibly
|
|
// transformed) part and whether it differs from the input. A corrected MIME
|
|
// with untouched bytes still counts as changed so Normalize copy-on-writes
|
|
// the containing slices.
|
|
func normalizeImage(p llm.ImagePart, caps llm.Capabilities) (llm.ImagePart, bool, error) {
|
|
// Why sniff instead of trusting p.MIME: callers routinely mislabel image
|
|
// bytes, and providers reject mismatches; the bytes are the truth.
|
|
format := sniff(p.Data)
|
|
if format == "" {
|
|
return p, false, fmt.Errorf("%w: image bytes (declared %q) match no known format (jpeg, png, gif, webp)", llm.ErrUnsupported, p.MIME)
|
|
}
|
|
realMIME := "image/" + format
|
|
changed := false
|
|
if p.MIME != realMIME {
|
|
p.MIME = realMIME
|
|
changed = true
|
|
}
|
|
|
|
mimeOK := caps.MIMEAllowed(realMIME)
|
|
fitsBytes := caps.MaxImageBytes == 0 || len(p.Data) <= caps.MaxImageBytes
|
|
fitsDims := true
|
|
if caps.MaxImageDimension > 0 && format != "webp" {
|
|
// Cheap header-only dimension read; a failure forces the transform
|
|
// path, which surfaces the real decode error.
|
|
cfg, _, err := image.DecodeConfig(bytes.NewReader(p.Data))
|
|
if err != nil {
|
|
fitsDims = false
|
|
} else {
|
|
fitsDims = cfg.Width <= caps.MaxImageDimension && cfg.Height <= caps.MaxImageDimension
|
|
}
|
|
}
|
|
// Why webp skips the dimension check: the stdlib cannot read webp
|
|
// headers, so dimensions are unverifiable; if MIME and bytes fit we pass
|
|
// it through rather than reject a possibly-fine image.
|
|
if mimeOK && fitsBytes && fitsDims {
|
|
return p, changed, nil
|
|
}
|
|
|
|
// Transformation required from here on, which needs a real decode.
|
|
if format == "webp" {
|
|
return p, false, fmt.Errorf("%w: image is webp (%d bytes), which the Go standard library cannot decode; provide jpeg, png, or gif instead", llm.ErrUnsupported, len(p.Data))
|
|
}
|
|
img, _, err := image.Decode(bytes.NewReader(p.Data))
|
|
if err != nil {
|
|
return p, false, fmt.Errorf("%w: cannot decode %s image for transformation: %v", llm.ErrUnsupported, format, err)
|
|
}
|
|
|
|
if caps.MaxImageDimension > 0 {
|
|
b := img.Bounds()
|
|
if b.Dx() > caps.MaxImageDimension || b.Dy() > caps.MaxImageDimension {
|
|
nw, nh := fitDims(b.Dx(), b.Dy(), caps.MaxImageDimension)
|
|
img = downscale(img, nw, nh)
|
|
}
|
|
}
|
|
|
|
target, err := targetMIME(realMIME, caps)
|
|
if err != nil {
|
|
return p, false, err
|
|
}
|
|
data, err := encodeFit(img, target, caps.MaxImageBytes)
|
|
if err != nil {
|
|
return p, false, err
|
|
}
|
|
return llm.ImagePart{MIME: target, Data: data}, true, nil
|
|
}
|
|
|
|
// sniff names the image format whose signature opens data: "jpeg" for the
// bytes FF D8 FF, "png" for 0x89 "PNG", "gif" for "GIF8", and "webp" for a
// "RIFF" container whose bytes 8..11 read "WEBP". It returns "" when no
// signature matches, including when data is too short to hold one.
func sniff(data []byte) string {
	n := len(data)
	if n >= 3 && string(data[:3]) == "\xFF\xD8\xFF" {
		return "jpeg"
	}
	if n < 4 {
		return ""
	}
	switch string(data[:4]) {
	case "\x89PNG":
		return "png"
	case "GIF8":
		return "gif"
	case "RIFF":
		if n >= 12 && string(data[8:12]) == "WEBP" {
			return "webp"
		}
	}
	return ""
}
|
|
|
|
// encodableMIME reports whether mime is exactly one of "image/jpeg",
// "image/png", or "image/gif" — the types this package has an encoder for.
// The comparison is case-sensitive and accepts no parameters or aliases.
func encodableMIME(mime string) bool {
	return mime == "image/jpeg" || mime == "image/png" || mime == "image/gif"
}
|
|
|
|
// targetMIME picks the re-encode format: the original when allowed, else
|
|
// jpeg, else png, else the first allowed encodable type (gif). When nothing
|
|
// allowed is encodable (e.g. only webp), it errors with llm.ErrUnsupported.
|
|
func targetMIME(original string, caps llm.Capabilities) (string, error) {
|
|
if encodableMIME(original) && caps.MIMEAllowed(original) {
|
|
return original, nil
|
|
}
|
|
// Why jpeg before png: vision inputs are photographs more often than
|
|
// screenshots, and jpeg's quality knob is the only size lever we have
|
|
// for the byte-budget loop.
|
|
for _, m := range []string{"image/jpeg", "image/png"} {
|
|
if caps.MIMEAllowed(m) {
|
|
return m, nil
|
|
}
|
|
}
|
|
// An empty allow-list permits everything and was caught above, so the
|
|
// list is non-empty here: take its first encodable entry.
|
|
for _, m := range caps.AllowedImageMIME {
|
|
if encodableMIME(m) {
|
|
return m, nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("%w: none of the allowed image types %v can be encoded with the Go standard library", llm.ErrUnsupported, caps.AllowedImageMIME)
|
|
}
|
|
|
|
// encodeFit encodes img as mime within maxBytes (0 = no limit), trading
|
|
// jpeg quality first and then resolution for size. The ladder is fixed
|
|
// (jpeg: q85, q65, q45, q30, then half and quarter dimensions at q65;
|
|
// png/gif: full, half, quarter dimensions) — at most six attempts, since an
|
|
// image that survives a 16x pixel reduction over budget will not be saved
|
|
// by further fiddling.
|
|
func encodeFit(img image.Image, mime string, maxBytes int) ([]byte, error) {
|
|
type attempt struct {
|
|
div int // divide both dimensions by this
|
|
quality int // jpeg quality; ignored for png/gif
|
|
}
|
|
var ladder []attempt
|
|
if mime == "image/jpeg" {
|
|
ladder = []attempt{{1, 85}, {1, 65}, {1, 45}, {1, 30}, {2, 65}, {4, 65}}
|
|
} else {
|
|
ladder = []attempt{{1, 0}, {2, 0}, {4, 0}}
|
|
}
|
|
|
|
scaled := map[int]image.Image{1: img}
|
|
smallest := -1
|
|
for _, a := range ladder {
|
|
cur, ok := scaled[a.div]
|
|
if !ok {
|
|
b := img.Bounds()
|
|
nw, nh := max(b.Dx()/a.div, 1), max(b.Dy()/a.div, 1)
|
|
cur = downscale(img, nw, nh)
|
|
scaled[a.div] = cur
|
|
}
|
|
data, err := encodeImage(cur, mime, a.quality)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("encode %s: %w", mime, err)
|
|
}
|
|
if maxBytes == 0 || len(data) <= maxBytes {
|
|
return data, nil
|
|
}
|
|
if smallest == -1 || len(data) < smallest {
|
|
smallest = len(data)
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("%w: image cannot be reduced to the %d-byte limit; smallest achievable %s encoding is %d bytes", llm.ErrUnsupported, maxBytes, mime, smallest)
|
|
}
|
|
|
|
// encodeImage encodes img as the given MIME type and returns the encoded
// bytes.
//
// Supported types are "image/jpeg", "image/png", and "image/gif"; any other
// value returns an error without encoding anything. Encoder failures are
// returned unchanged.
//
// quality applies to jpeg only and is ignored for png and gif. A quality of
// zero or less is treated as "unset" and replaced with jpeg.DefaultQuality.
// Why: image/jpeg clamps values below 1 up to 1, so passing the int zero
// value would otherwise silently produce the worst possible output. Values
// above 100 are still clamped to 100 by the encoder.
func encodeImage(img image.Image, mime string, quality int) ([]byte, error) {
	var buf bytes.Buffer
	var err error
	switch mime {
	case "image/jpeg":
		if quality <= 0 {
			quality = jpeg.DefaultQuality
		}
		err = jpeg.Encode(&buf, img, &jpeg.Options{Quality: quality})
	case "image/png":
		err = png.Encode(&buf, img)
	case "image/gif":
		// nil options selects the gif encoder's defaults.
		err = gif.Encode(&buf, img, nil)
	default:
		return nil, fmt.Errorf("no stdlib encoder for %q", mime)
	}
	if err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
|