feat(run): InputFileStager seam — stage non-image attachments into the prompt #18
+53
-9
@@ -6,6 +6,7 @@ import (
|
||||
"log/slog"
|
||||
"path"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/executus/tool"
|
||||
)
|
||||
@@ -16,6 +17,10 @@ import (
|
||||
// directly must not write an unbounded blob to the host file store.
|
||||
const maxInputFileBytes = 50_000_000
|
||||
|
||||
// maxInputFiles bounds how many attachments a single run stages, independent of
|
||||
// the per-file byte cap — defense-in-depth against a flood of tiny files.
|
||||
const maxInputFiles = 32
|
||||
|
||||
// stageInputFiles persists each non-image input attachment into the host file
|
||||
// store (Ports.InputFiles) under run scope and appends a descriptor block to the
|
||||
// prompt so the agent knows the file_ids it can pass to a worker tool. The bytes
|
||||
@@ -30,12 +35,19 @@ func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, f
|
||||
if e.cfg.Ports.InputFiles == nil || len(files) == 0 {
|
||||
return prompt
|
||||
}
|
||||
// Count cap: bound how many attachments one run can stage, independent of the
|
||||
// per-file byte cap (defense-in-depth against a flood of tiny files).
|
||||
if len(files) > maxInputFiles {
|
||||
slog.Warn("run: too many input files, truncating",
|
||||
"agent", agentID, "run_id", runID, "count", len(files), "cap", maxInputFiles)
|
||||
files = files[:maxInputFiles]
|
||||
}
|
||||
|
||||
type staged struct {
|
||||
type stagedFile struct {
|
||||
name, mime, fileID string
|
||||
size int
|
||||
}
|
||||
var ok []staged
|
||||
var staged []stagedFile
|
||||
seenNames := make(map[string]int, len(files))
|
||||
for _, f := range files {
|
||||
if len(f.Data) == 0 {
|
||||
@@ -49,19 +61,22 @@ func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, f
|
||||
"size", len(f.Data), "cap", maxInputFileBytes)
|
||||
continue
|
||||
}
|
||||
// Disambiguate colliding base names so two attachments with the same
|
||||
// name don't both map to /workspace/<name> (the second would clobber the
|
||||
// first when the agent writes them via code_exec).
|
||||
name := uniqueName(f.Name, seenNames)
|
||||
// Reduce the untrusted filename to a safe base name BEFORE staging or
|
||||
// inlining: strips ../ and absolute-path components (so it can't escape
|
||||
// the host store or /workspace/<name>) and drops control chars/newlines
|
||||
// (so a crafted name can't inject text into the descriptor block below).
|
||||
// Then disambiguate colliding base names so two attachments don't both map
|
||||
// to /workspace/<name> (the second would clobber the first).
|
||||
name := uniqueName(sanitizeName(f.Name), seenNames)
|
||||
fileID, err := e.cfg.Ports.InputFiles.StageInputFile(ctx, runID, agentID, name, f.MimeType, f.Data)
|
||||
if err != nil {
|
||||
slog.Warn("run: failed to stage input file",
|
||||
"agent", agentID, "run_id", runID, "name", name, "error", err)
|
||||
continue
|
||||
}
|
||||
ok = append(ok, staged{name: name, mime: f.MimeType, fileID: fileID, size: len(f.Data)})
|
||||
staged = append(staged, stagedFile{name: name, mime: sanitizeField(f.MimeType), fileID: fileID, size: len(f.Data)})
|
||||
}
|
||||
if len(ok) == 0 {
|
||||
if len(staged) == 0 {
|
||||
return prompt
|
||||
}
|
||||
|
||||
@@ -71,7 +86,7 @@ func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, f
|
||||
b.WriteString("To work with one, call the code_exec tool with a files_in entry — e.g. ")
|
||||
b.WriteString(`files_in: [{"name": "<name>", "file_id": "<file_id>"}]`)
|
||||
b.WriteString(" — which writes it to /workspace/<name> inside the Python sandbox. You may also pass a file_id to any other tool that accepts one.\n")
|
||||
for _, s := range ok {
|
||||
for _, s := range staged {
|
||||
fmt.Fprintf(&b, "- %s (%s, %s) → file_id: %s\n", s.name, s.mime, humanizeBytes(s.size), s.fileID)
|
||||
}
|
||||
|
||||
@@ -81,6 +96,35 @@ func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, f
|
||||
return prompt + "\n\n" + b.String()
|
||||
}
|
||||
|
||||
// sanitizeName reduces an untrusted attachment filename to a safe base name. It
|
||||
// drops control characters / newlines (which would otherwise let a crafted name
|
||||
// inject text into the [ATTACHED FILES] descriptor) and strips every directory
|
||||
// component — defeating ../ traversal, nested dirs, and absolute / drive paths
|
||||
// both in the host file store and at /workspace/<name>. Returns "attachment"
|
||||
// when nothing usable remains (empty, ".", "..").
|
||||
func sanitizeName(name string) string {
|
||||
name = sanitizeField(name)
|
||||
// Normalize backslashes so a Windows-style path also reduces to its base.
|
||||
base := path.Base(strings.ReplaceAll(name, `\`, "/"))
|
||||
base = strings.TrimSpace(base)
|
||||
if base == "" || base == "." || base == ".." {
|
||||
return "attachment"
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// sanitizeField strips control characters (incl. newlines/tabs) from a value
|
||||
// that gets inlined verbatim into the prompt descriptor, so it can't break out
|
||||
// of its line or inject instructions.
|
||||
func sanitizeField(s string) string {
|
||||
return strings.Map(func(r rune) rune {
|
||||
if r == '\n' || r == '\r' || r == '\t' || unicode.IsControl(r) {
|
||||
return -1
|
||||
}
|
||||
return r
|
||||
}, s)
|
||||
}
|
||||
|
||||
// uniqueName returns name unchanged the first time it's seen, then name-2,
|
||||
// name-3, … (suffix inserted before the extension) on repeats, recording each
|
||||
// result in seen so later collisions keep counting up.
|
||||
|
||||
@@ -117,3 +117,72 @@ func TestStageInputFilesSkipsBad(t *testing.T) {
|
||||
t.Errorf("save error should drop the file and leave the prompt, got %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStageInputFilesOversize: a file past the byte cap is skipped (prompt
|
||||
// unchanged), exercising the size guard directly.
|
||||
func TestStageInputFilesOversize(t *testing.T) {
|
||||
st := &stagerFunc{}
|
||||
ex := newStagerExecutor(st)
|
||||
big := make([]byte, maxInputFileBytes+1)
|
||||
out := ex.stageInputFiles(context.Background(), "r", "a",
|
||||
[]tool.InputFile{{Name: "huge.bin", Data: big}}, "p")
|
||||
if out != "p" || len(st.staged) != 0 {
|
||||
t.Errorf("oversized file should be skipped: out=%q staged=%d", out, len(st.staged))
|
||||
}
|
||||
}
|
||||
|
||||
// TestStageInputFilesCountCap: more than maxInputFiles attachments are truncated
|
||||
// to the cap.
|
||||
func TestStageInputFilesCountCap(t *testing.T) {
|
||||
st := &stagerFunc{}
|
||||
ex := newStagerExecutor(st)
|
||||
files := make([]tool.InputFile, maxInputFiles+5)
|
||||
for i := range files {
|
||||
files[i] = tool.InputFile{Name: "f.bin", Data: []byte("x")}
|
||||
}
|
||||
ex.stageInputFiles(context.Background(), "r", "a", files, "p")
|
||||
if len(st.staged) != maxInputFiles {
|
||||
t.Errorf("count cap: staged %d, want %d", len(st.staged), maxInputFiles)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSanitizeName: traversal + absolute + control-char filenames are reduced to
|
||||
// a safe base name (no path separators, no newlines), with a fallback.
|
||||
func TestSanitizeName(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"../../etc/passwd": "passwd",
|
||||
"/etc/cron.d/x": "x",
|
||||
`..\..\windows\sys`: "sys",
|
||||
"clip.mp3": "clip.mp3",
|
||||
"": "attachment",
|
||||
"..": "attachment",
|
||||
".": "attachment",
|
||||
"evil\n- injected": "evil- injected",
|
||||
"a/b/c.wav": "c.wav",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := sanitizeName(in); got != want {
|
||||
t.Errorf("sanitizeName(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
// A sanitized name must never carry a path separator or newline.
|
||||
got := sanitizeName(in)
|
||||
if strings.ContainsAny(got, "/\\\n\r") {
|
||||
t.Errorf("sanitizeName(%q) = %q still contains a separator/newline", in, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestStageInputFilesSanitizesTraversal: a traversal filename is staged AND
|
||||
// described under its safe base name only.
|
||||
func TestStageInputFilesSanitizesTraversal(t *testing.T) {
|
||||
st := &stagerFunc{}
|
||||
ex := newStagerExecutor(st)
|
||||
out := ex.stageInputFiles(context.Background(), "r", "a",
|
||||
[]tool.InputFile{{Name: "../../../etc/passwd", MimeType: "text/plain", Data: []byte("x")}}, "go")
|
||||
if len(st.staged) != 1 || st.staged[0].name != "passwd" {
|
||||
t.Fatalf("staged name = %+v, want passwd", st.staged)
|
||||
}
|
||||
if strings.Contains(out, "..") || strings.Contains(out, "/etc/") {
|
||||
t.Errorf("descriptor leaked the traversal path:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user