diff --git a/media/media.go b/media/media.go index e6e0bc8..467affa 100644 --- a/media/media.go +++ b/media/media.go @@ -5,10 +5,16 @@ // already satisfies the target's llm.Capabilities. Images that do not fit // are decoded, downscaled (never upscaled), and re-encoded into an allowed // format and byte budget. Anything that cannot honestly be made to fit — -// undecodable formats, impossible byte budgets, too many images, images for -// a text-only target — fails with an error wrapping llm.ErrUnsupported so a -// failover chain can advance to a more capable target without a health -// penalty. +// undecodable formats, impossible byte budgets, images for a text-only +// target — fails with an error wrapping llm.ErrUnsupported so a failover +// chain can advance to a more capable target without a health penalty. +// +// Over-count is the exception: a request carrying more images than +// MaxImagesPerReq does NOT fail — the oldest images are replaced with a short +// text placeholder and the most-recent MaxImagesPerReq are kept, because a hard +// refusal exhausts a chain whose targets share the same cap (e.g. an agent loop +// accumulating a preview image per iteration). MaxImagesPerReq remains the +// per-model knob (0 = no image support). // // Why a separate package: every provider would otherwise duplicate the same // decode/scale/encode pipeline. Providers keep only a cheap capability @@ -52,15 +58,21 @@ func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { if !caps.SupportsImages() { return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, total) } - // Why error instead of dropping the overflow: silently removing an image - // changes the question the caller asked; the honest move is to refuse and - // let a chain try a roomier target. 
+ // Over-cap images are elided in the same copy-on-write pass below: the + // OLDEST excess are replaced with a placeholder and the most-recent + // MaxImagesPerReq kept (see the package doc for why we elide rather than + // refuse). toElide is how many of the first images, front-to-back, to drop. + toElide := 0 if total > caps.MaxImagesPerReq { - return llm.Request{}, fmt.Errorf("media: %w: request carries %d images, target allows at most %d per request", llm.ErrUnsupported, total, caps.MaxImagesPerReq) + toElide = total - caps.MaxImagesPerReq } + // Single copy-on-write pass: for each image, the first toElide become a text + // placeholder; the rest are size-normalized against caps. The Messages slice + // and an affected message's Parts slice are copied at most once. out := req copiedMessages := false + seen := 0 for mi := range req.Messages { copiedParts := false for pi, part := range req.Messages[mi].Parts { @@ -68,13 +80,22 @@ func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { if !ok { continue } - norm, changed, err := normalizeImage(ip, caps) - if err != nil { - return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err) - } - if !changed { - continue + seen++ + + var replacement llm.Part + if seen <= toElide { + replacement = llm.Text(imageOverflowPlaceholder) + } else { + norm, changed, err := normalizeImage(ip, caps) + if err != nil { + return llm.Request{}, fmt.Errorf("media: message %d, part %d: %w", mi, pi, err) + } + if !changed { + continue + } + replacement = norm } + if !copiedMessages { out.Messages = make([]llm.Message, len(req.Messages)) copy(out.Messages, req.Messages) @@ -86,12 +107,17 @@ func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { out.Messages[mi].Parts = parts copiedParts = true } - out.Messages[mi].Parts[pi] = norm + out.Messages[mi].Parts[pi] = replacement } } return out, nil } +// imageOverflowPlaceholder replaces an image elided to fit a target's +// 
per-request image cap. It keeps the message turn intact and tells the model +// an earlier image was omitted rather than silently changing the conversation. +const imageOverflowPlaceholder = "[earlier image omitted to fit this model's per-request image limit]" + // Info reports an image part's sniffed format ("jpeg", "png", "gif", or // "webp") and pixel dimensions. It is a cheap metadata read — the pixels are // never decoded. webp is recognized by signature but not decodable with the diff --git a/media/media_test.go b/media/media_test.go index 2e3e92c..0a310f2 100644 --- a/media/media_test.go +++ b/media/media_test.go @@ -149,18 +149,48 @@ func TestNormalizeImagesUnsupported(t *testing.T) { } } -func TestNormalizeTooManyImages(t *testing.T) { - img := llm.Image("image/png", encPNG(t, gradient(4, 4))) +func TestNormalizeOverCount(t *testing.T) { + // 3 distinguishable images across 2 messages; cap = 2. Over-count no longer + // errors — the OLDEST image is replaced with a placeholder and the most-recent + // two (the relevant ones in an iterative run) are kept, in order. 
+ a := llm.Image("image/png", encPNG(t, gradient(2, 2))).(llm.ImagePart) + b := llm.Image("image/png", encPNG(t, gradient(4, 4))).(llm.ImagePart) + c := llm.Image("image/png", encPNG(t, gradient(8, 8))).(llm.ImagePart) req := llm.Request{Messages: []llm.Message{ - llm.UserParts(img, img), - llm.UserParts(img), + llm.UserParts(a, b), + llm.UserParts(c), }} - _, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 2}) - if !errors.Is(err, llm.ErrUnsupported) { - t.Fatalf("err = %v, want ErrUnsupported", err) + caps := llm.Capabilities{MaxImagesPerReq: 2, MaxImageDimension: 64, MaxImageBytes: 1 << 20, AllowedImageMIME: []string{"image/png"}} + out, err := Normalize(req, caps) + if err != nil { + t.Fatalf("over-count should not error: %v", err) } - if !strings.Contains(err.Error(), "3 images") || !strings.Contains(err.Error(), "at most 2") { - t.Errorf("err message %q lacks the counts", err) + var imgs []llm.ImagePart + placeholders := 0 + for _, m := range out.Messages { + for _, p := range m.Parts { + switch v := p.(type) { + case llm.ImagePart: + imgs = append(imgs, v) + case llm.TextPart: + if v.Text == imageOverflowPlaceholder { + placeholders++ + } + } + } + } + // The exact survivors are the most-recent two, in order: b then c (a elided). + if len(imgs) != 2 || !bytes.Equal(imgs[0].Data, b.Data) || !bytes.Equal(imgs[1].Data, c.Data) { + t.Fatalf("kept %d images; want exactly [b, c] (the most-recent two)", len(imgs)) + } + if placeholders != 1 { + t.Errorf("placeholders = %d, want 1 for the elided oldest image", placeholders) + } + // Input request untouched (copy-on-write): the first part is still image a, + // not a placeholder — a len check alone wouldn't catch in-place substitution. + first, ok := req.Messages[0].Parts[0].(llm.ImagePart) + if !ok || !bytes.Equal(first.Data, a.Data) { + t.Errorf("input request was mutated; first part = %+v", req.Messages[0].Parts[0]) } }