From df4033f42e566997090b67b13a2a2235410b5c9f Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sun, 28 Jun 2026 14:08:57 -0400 Subject: [PATCH] fix(run): harden input-file staging per gadfly #18 validation pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second-pass findings on the security fix: - Mime sanitized ONCE and passed to BOTH StageInputFile and the descriptor (was passing raw f.MimeType to the host store while only the descriptor sanitized) — 3 models. - sanitizeField now also strips Unicode format chars (category Cf, incl. the bidi overrides U+202A–U+202E that can reorder how the descriptor renders); IsControl already covers \n\r\t so the explicit checks are dropped. - fileID is sanitized before inlining + an empty file_id drops the file (defense vs a misbehaving stager). - humanizeBytes clamps the prefix index so an absurd size (≥1024^6) can't index past "KMGTPE" and panic — a no-panic guarantee independent of the per-file cap. - Docs sync: README Ports list gains InputFiles; tool.InputFile.Name doc now says the executor reduces an untrusted name to a safe base name (was claiming the field is already safe). Tests: bidi/control stripping; mime sanitized in staged value + descriptor; empty file_id drop; humanizeBytes no-panic across sizes up to 1<<62. Suite green (-race). Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 2 +- run/input_files.go | 31 +++++++++++++++++------ run/input_files_test.go | 55 +++++++++++++++++++++++++++++++++++++++++ tool/registry.go | 7 +++--- 4 files changed, 83 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3cf04d5..aec93db 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ bot) — mort and gadfly are the first two consumers (heavy and light). See tool registry, majordomo's agent loop, context compaction, run-bounding, and step/audit instrumentation into one `Run(ctx, RunnableAgent, inv) Result`, with every host concern behind a nil-safe `run.Ports` (Audit/Budget/Critic/ - Checkpointer/PaletteSource/Delivery). See `examples/minimal`. + Checkpointer/PaletteSource/Delivery/InputFiles). See `examples/minimal`. - `model/` — config-driven tier resolution + failover over majordomo, with pluggable `UsageSink`/`TraceSink` and `GenerateWith[T]` structured output. - `tool/` — the tool registry + 3-stage permission model + SSRF guard. diff --git a/run/input_files.go b/run/input_files.go index bad3815..932edfd 100644 --- a/run/input_files.go +++ b/run/input_files.go @@ -68,13 +68,23 @@ func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, f // Then disambiguate colliding base names so two attachments don't both map // to /workspace/ (the second would clobber the first). name := uniqueName(sanitizeName(f.Name), seenNames) - fileID, err := e.cfg.Ports.InputFiles.StageInputFile(ctx, runID, agentID, name, f.MimeType, f.Data) + // Sanitize the mime ONCE and pass the clean value to both the host store + // and the descriptor (don't hand the raw value to StageInputFile). + mime := sanitizeField(f.MimeType) + fileID, err := e.cfg.Ports.InputFiles.StageInputFile(ctx, runID, agentID, name, mime, f.Data) if err != nil { slog.Warn("run: failed to stage input file", "agent", agentID, "run_id", runID, "name", name, "error", err) continue } - staged = append(staged, stagedFile{name: name, mime: sanitizeField(f.MimeType), fileID: fileID, size: len(f.Data)}) + if fileID == "" { + slog.Warn("run: stager returned empty file_id, skipping", + "agent", agentID, "run_id", runID, "name", name) + continue + } + // fileID is host-generated, but sanitize it too before inlining — the + // descriptor must never carry control chars no matter the stager impl. + staged = append(staged, stagedFile{name: name, mime: mime, fileID: sanitizeField(fileID), size: len(f.Data)}) } if len(staged) == 0 { return prompt @@ -113,12 +123,14 @@ func sanitizeName(name string) string { return base } -// sanitizeField strips control characters (incl. newlines/tabs) from a value -// that gets inlined verbatim into the prompt descriptor, so it can't break out -// of its line or inject instructions. +// sanitizeField strips characters that could let a value inlined verbatim into +// the prompt descriptor break out of its line or visually mislead: control +// characters (IsControl covers newlines/tabs) AND Unicode format characters +// (category Cf — e.g. the bidi overrides U+202A–U+202E, which can reorder how +// the descriptor renders). func sanitizeField(s string) string { return strings.Map(func(r rune) rune { - if r == '\n' || r == '\r' || r == '\t' || unicode.IsControl(r) { + if unicode.IsControl(r) || unicode.Is(unicode.Cf, r) { return -1 } return r @@ -155,10 +167,13 @@ func humanizeBytes(n int) string { if n < unit { return fmt.Sprintf("%d B", n) } + const prefixes = "KMGTPE" div, exp := int64(unit), 0 - for v := int64(n) / unit; v >= unit; v /= unit { + // Clamp exp to the last prefix so an absurd size (≥1024^7) can't index past + // "KMGTPE" and panic — a no-panic guarantee independent of the per-file cap. + for v := int64(n) / unit; v >= unit && exp < len(prefixes)-1; v /= unit { div *= unit exp++ } - return fmt.Sprintf("%.1f %cB", float64(n)/float64(div), "KMGTPE"[exp]) + return fmt.Sprintf("%.1f %cB", float64(n)/float64(div), prefixes[exp]) } diff --git a/run/input_files_test.go b/run/input_files_test.go index 77777e4..681dcf7 100644 --- a/run/input_files_test.go +++ b/run/input_files_test.go @@ -186,3 +186,58 @@ func TestStageInputFilesSanitizesTraversal(t *testing.T) { t.Errorf("descriptor leaked the traversal path:\n%s", out) } } + +// TestSanitizeFieldStripsBidiAndControl: control chars AND Unicode format/bidi +// overrides are removed from inlined values. +func TestSanitizeFieldStripsBidiAndControl(t *testing.T) { + in := "audio/‮mpg\n; rm -rf" // bidi override + newline + got := sanitizeField(in) + if strings.ContainsAny(got, "\n\r\t") || strings.ContainsRune(got, '‮') { + t.Errorf("sanitizeField left control/bidi chars: %q", got) + } +} + +// TestStageInputFilesSanitizesMime: a mime with a control char is cleaned in BOTH +// the staged value and the descriptor. +func TestStageInputFilesSanitizesMime(t *testing.T) { + st := &stagerFunc{} + ex := newStagerExecutor(st) + out := ex.stageInputFiles(context.Background(), "r", "a", + []tool.InputFile{{Name: "c.wav", MimeType: "audio/wav\ninjected", Data: []byte("x")}}, "go") + if len(st.staged) != 1 || strings.ContainsAny(st.staged[0].mime, "\n\r") { + t.Errorf("mime not sanitized before staging: %+v", st.staged) + } + if strings.Contains(out, "\ninjected") { + t.Errorf("descriptor carried an unsanitized mime newline:\n%s", out) + } +} + +// TestStageInputFilesEmptyFileID: a stager returning an empty file_id drops the +// file (no blank file_id in the descriptor). +func TestStageInputFilesEmptyFileID(t *testing.T) { + ex := newStagerExecutor(emptyIDStager{}) + out := ex.stageInputFiles(context.Background(), "r", "a", + []tool.InputFile{{Name: "x.bin", Data: []byte("z")}}, "p") + if out != "p" { + t.Errorf("empty file_id should drop the file, got %q", out) + } +} + +type emptyIDStager struct{} + +func (emptyIDStager) StageInputFile(context.Context, string, string, string, string, []byte) (string, error) { + return "", nil +} + +// TestHumanizeBytesNoPanic: an absurd size clamps to the last prefix instead of +// indexing past "KMGTPE". +func TestHumanizeBytesNoPanic(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("humanizeBytes panicked: %v", r) + } + }() + for _, n := range []int{0, 512, 2048, 5_000_000, 1 << 62} { + _ = humanizeBytes(n) + } +} diff --git a/tool/registry.go b/tool/registry.go index 7528f3f..008d20d 100644 --- a/tool/registry.go +++ b/tool/registry.go @@ -154,9 +154,10 @@ type ContinuationContext struct { // InputFile is a non-image file the user supplied with a run (audio, // etc.). The executor stages it into the file store under run scope and -// surfaces its file_id to the agent. Name is a safe base name (no path -// separators) suitable for /workspace/; MimeType is the resolved -// content type; Data is the raw bytes. +// surfaces its file_id to the agent. Name may be an untrusted attachment +// filename — the executor reduces it to a safe base name (stripping path +// separators + control chars) before staging or exposing it as +// /workspace/; MimeType is the resolved content type; Data is the raw bytes. type InputFile struct { Name string MimeType string