diff --git a/media/media.go b/media/media.go index e6e0bc8..8cce4c4 100644 --- a/media/media.go +++ b/media/media.go @@ -5,10 +5,16 @@ // already satisfies the target's llm.Capabilities. Images that do not fit // are decoded, downscaled (never upscaled), and re-encoded into an allowed // format and byte budget. Anything that cannot honestly be made to fit — -// undecodable formats, impossible byte budgets, too many images, images for -// a text-only target — fails with an error wrapping llm.ErrUnsupported so a -// failover chain can advance to a more capable target without a health -// penalty. +// undecodable formats, impossible byte budgets, images for a text-only +// target — fails with an error wrapping llm.ErrUnsupported so a failover +// chain can advance to a more capable target without a health penalty. +// +// Over-count is the exception: a request carrying more images than +// MaxImagesPerReq does NOT fail — the oldest images are replaced with a short +// text placeholder and the most-recent MaxImagesPerReq are kept, because a hard +// refusal exhausts a chain whose targets share the same cap (e.g. an agent loop +// accumulating a preview image per iteration). MaxImagesPerReq remains the +// per-model knob (0 = no image support). // // Why a separate package: every provider would otherwise duplicate the same // decode/scale/encode pipeline. Providers keep only a cheap capability @@ -52,11 +58,17 @@ func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { if !caps.SupportsImages() { return llm.Request{}, fmt.Errorf("media: %w: target does not accept image input (request carries %d image(s))", llm.ErrUnsupported, total) } - // Why error instead of dropping the overflow: silently removing an image - // changes the question the caller asked; the honest move is to refuse and - // let a chain try a roomier target. 
+ // Overflow: keep the most-recent MaxImagesPerReq images and replace each + // older one with a short text placeholder, rather than refusing the whole + // request. A hard refusal exhausts a failover chain whose targets share the + // same cap — e.g. an agent loop that accumulates a preview image per + // iteration past the cap makes EVERY target reject and the run dies. The + // placeholder preserves each message's turn structure and tells the model an + // earlier image was elided; the most recent images (the relevant ones in an + // iterative run) are retained. The per-model threshold stays configurable via + // Capabilities.MaxImagesPerReq (0 still means "no image support"). if total > caps.MaxImagesPerReq { - return llm.Request{}, fmt.Errorf("media: %w: request carries %d images, target allows at most %d per request", llm.ErrUnsupported, total, caps.MaxImagesPerReq) + req = dropOldestImages(req, total-caps.MaxImagesPerReq) } out := req @@ -92,6 +104,53 @@ func Normalize(req llm.Request, caps llm.Capabilities) (llm.Request, error) { return out, nil } +// imageOverflowPlaceholder replaces an image dropped to fit a target's +// per-request image cap. It keeps the message turn intact and tells the model +// an earlier image was elided rather than silently changing the conversation. +const imageOverflowPlaceholder = "[earlier image omitted to fit this model's per-request image limit]" + +// dropOldestImages replaces the n oldest image parts (front-to-back across the +// message history) with imageOverflowPlaceholder text, keeping the most-recent +// images and preserving every message's turn structure. Copy-on-write: the +// input request is never mutated. n <= 0 returns req unchanged. 
+func dropOldestImages(req llm.Request, n int) llm.Request {
+	if n <= 0 {
+		return req
+	}
+	out := req
+	out.Messages = make([]llm.Message, len(req.Messages))
+	copy(out.Messages, req.Messages) // shallow: each Parts slice is still shared with req until replaced below
+	dropped := 0
+	for mi := range out.Messages {
+		if dropped >= n { // quota met: every later message keeps its images untouched
+			break
+		}
+		if !hasImagePart(out.Messages[mi].Parts) {
+			continue
+		}
+		parts := make([]llm.Part, 0, len(out.Messages[mi].Parts)) // fresh slice so req's Parts are never written
+		for _, p := range out.Messages[mi].Parts {
+			if _, ok := p.(llm.ImagePart); ok && dropped < n {
+				p = llm.Text(imageOverflowPlaceholder)
+				dropped++
+			}
+			parts = append(parts, p)
+		}
+		out.Messages[mi].Parts = parts
+	}
+	return out
+}
+
+// hasImagePart reports whether parts contains at least one llm.ImagePart.
+func hasImagePart(parts []llm.Part) bool {
+	for _, p := range parts {
+		if _, ok := p.(llm.ImagePart); ok {
+			return true
+		}
+	}
+	return false
+}
+
 // Info reports an image part's sniffed format ("jpeg", "png", "gif", or
 // "webp") and pixel dimensions. It is a cheap metadata read — the pixels are
 // never decoded. webp is recognized by signature but not decodable with the
diff --git a/media/media_test.go b/media/media_test.go
index 2e3e92c..e0129dd 100644
--- a/media/media_test.go
+++ b/media/media_test.go
@@ -149,18 +149,55 @@ func TestNormalizeImagesUnsupported(t *testing.T) {
 	}
 }
 
-func TestNormalizeTooManyImages(t *testing.T) {
-	img := llm.Image("image/png", encPNG(t, gradient(4, 4)))
+func TestNormalizeTooManyImages_DropsOldest(t *testing.T) {
+	// 3 distinguishable images across 2 messages; cap = 2. Overflow no longer
+	// errors — the OLDEST image is replaced with a placeholder and the most-recent
+	// two (the relevant ones in an iterative run) are kept.
+	a := llm.Image("image/png", encPNG(t, gradient(2, 2))).(llm.ImagePart)
+	b := llm.Image("image/png", encPNG(t, gradient(4, 4))).(llm.ImagePart)
+	c := llm.Image("image/png", encPNG(t, gradient(8, 8))).(llm.ImagePart)
 	req := llm.Request{Messages: []llm.Message{
-		llm.UserParts(img, img),
-		llm.UserParts(img),
+		llm.UserParts(a, b),
+		llm.UserParts(c),
 	}}
-	_, err := Normalize(req, llm.Capabilities{MaxImagesPerReq: 2})
-	if !errors.Is(err, llm.ErrUnsupported) {
-		t.Fatalf("err = %v, want ErrUnsupported", err)
+	caps := llm.Capabilities{MaxImagesPerReq: 2, MaxImageDimension: 64, MaxImageBytes: 1 << 20, AllowedImageMIME: []string{"image/png"}}
+	out, err := Normalize(req, caps)
+	if err != nil {
+		t.Fatalf("drop-oldest overflow should not error: %v", err)
 	}
-	if !strings.Contains(err.Error(), "3 images") || !strings.Contains(err.Error(), "at most 2") {
-		t.Errorf("err message %q lacks the counts", err)
+	var imgs []llm.ImagePart
+	placeholders := 0
+	for _, m := range out.Messages {
+		for _, p := range m.Parts {
+			switch v := p.(type) {
+			case llm.ImagePart:
+				imgs = append(imgs, v)
+			case llm.TextPart:
+				if strings.Contains(v.Text, "omitted") {
+					placeholders++
+				}
+			}
+		}
+	}
+	if len(imgs) != 2 {
+		t.Fatalf("kept %d images, want 2 (the cap)", len(imgs))
+	}
+	if placeholders != 1 {
+		t.Errorf("placeholders = %d, want 1 for the dropped oldest image", placeholders)
+	}
+	for _, im := range imgs {
+		if bytes.Equal(im.Data, a.Data) {
+			t.Errorf("oldest image was kept; the most-recent two should survive")
+		}
+	}
+	// The input request must be untouched (copy-on-write). A length check alone
+	// cannot detect in-place mutation, because the placeholder replaces the image
+	// one-for-one; the oldest part itself must still be image a.
+	if len(req.Messages[0].Parts) != 2 {
+		t.Fatalf("input request was mutated: %+v", req.Messages[0].Parts)
+	}
+	if first, ok := req.Messages[0].Parts[0].(llm.ImagePart); !ok || !bytes.Equal(first.Data, a.Data) {
+		t.Errorf("input request was mutated: first part is %T, want the original image", req.Messages[0].Parts[0])
 	}
 }
 