feat(run): InputFileStager seam — stage non-image attachments into the prompt #18

Merged
steve merged 3 commits from feat/input-file-stager into main 2026-06-28 18:19:29 +00:00
4 changed files with 259 additions and 0 deletions
Showing only changes of commit 2ef88f2a73 - Show all commits
+6
View File
@@ -318,6 +318,12 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
}
ag := agent.New(model, e.systemPrompt(ra), opts...)
// Stage non-image input attachments (audio/PDF/binary) into the host file
// store and fold an [ATTACHED FILES] descriptor into the prompt so the agent
// can reach them by file_id. No-op when Ports.InputFiles is nil or there are
// no files. Done after the model/toolbox build but before the loop, so the
// descriptor rides the very first user turn.
input = e.stageInputFiles(runCtx, inv.RunID, ra.ID, inv.InputFiles, input)
// One WithSteer drains BOTH the session mailbox (a tool's AttachImages) and
// the critic's nudges before each step.
steer := func() []llm.Message { return append(mailbox.drain(), critic.drainSteer()...) }
+120
View File
@@ -0,0 +1,120 @@
package run
import (
"context"
"fmt"
"log/slog"
"path"
"strings"
"gitea.stevedudenhoeffer.com/steve/executus/tool"
)
// maxInputFileBytes is a defense-in-depth cap at the staging boundary. A host's
// extraction path may already cap downloads, but stageInputFiles is the trust
// boundary for the InputFiles seam: a call site or bug that populates InputFiles
// directly must not write an unbounded blob to the host file store.
const maxInputFileBytes = 50_000_000
// stageInputFiles persists each non-image input attachment into the host file
// store (Ports.InputFiles) under run scope and appends a descriptor block to the
// prompt so the agent knows the file_ids it can pass to a worker tool. The bytes
// are NOT inlined into the model context — the LLM can't read raw audio/binary —
// so the agent reaches them via a file_id-aware tool (e.g. code_exec files_in,
// which writes the file to /workspace/<name>).
//
// Best-effort: a nil stager, no files, or a per-file save error degrades to
// "skip that file" — the run still proceeds. Returns the (possibly augmented)
// prompt.
func (e *Executor) stageInputFiles(ctx context.Context, runID, agentID string, files []tool.InputFile, prompt string) string {
if e.cfg.Ports.InputFiles == nil || len(files) == 0 {
return prompt
}
type staged struct {
name, mime, fileID string
size int
}
var ok []staged
seenNames := make(map[string]int, len(files))
for _, f := range files {
if len(f.Data) == 0 {
slog.Warn("run: skipping empty input file",
"agent", agentID, "run_id", runID, "name", f.Name)
continue
}
if len(f.Data) > maxInputFileBytes {
slog.Warn("run: skipping oversized input file",
"agent", agentID, "run_id", runID, "name", f.Name,
"size", len(f.Data), "cap", maxInputFileBytes)
continue
}
// Disambiguate colliding base names so two attachments with the same
// name don't both map to /workspace/<name> (the second would clobber the
// first when the agent writes them via code_exec).
name := uniqueName(f.Name, seenNames)
fileID, err := e.cfg.Ports.InputFiles.StageInputFile(ctx, runID, agentID, name, f.MimeType, f.Data)
if err != nil {
slog.Warn("run: failed to stage input file",
"agent", agentID, "run_id", runID, "name", name, "error", err)
continue
}
ok = append(ok, staged{name: name, mime: f.MimeType, fileID: fileID, size: len(f.Data)})
}
if len(ok) == 0 {
return prompt
}
var b strings.Builder
b.WriteString("[ATTACHED FILES]\n")
b.WriteString("The user attached the following file(s). Their contents are NOT included in this prompt and you cannot read them directly. ")
b.WriteString("To work with one, call the code_exec tool with a files_in entry — e.g. ")
b.WriteString(`files_in: [{"name": "<name>", "file_id": "<file_id>"}]`)
b.WriteString(" — which writes it to /workspace/<name> inside the Python sandbox. You may also pass a file_id to any other tool that accepts one.\n")
for _, s := range ok {
fmt.Fprintf(&b, "- %s (%s, %s) → file_id: %s\n", s.name, s.mime, humanizeBytes(s.size), s.fileID)
}
if strings.TrimSpace(prompt) == "" {
return b.String()
}
return prompt + "\n\n" + b.String()
}
// uniqueName returns name unchanged the first time it's seen, then name-2,
// name-3, … (suffix inserted before the extension) on repeats, recording each
// result in seen so later collisions keep counting up.
func uniqueName(name string, seen map[string]int) string {
if seen[name] == 0 {
seen[name]++
return name
}
ext := path.Ext(name)
base := strings.TrimSuffix(name, ext)
for {
seen[name]++
candidate := fmt.Sprintf("%s-%d%s", base, seen[name], ext)
if seen[candidate] == 0 {
seen[candidate]++
return candidate
}
}
}
// humanizeBytes renders a byte count as a short human-readable string (e.g.
// "2.1 MB") for the attached-files descriptor block.
func humanizeBytes(n int) string {
if n < 0 {
n = 0
}
const unit = 1024
if n < unit {
return fmt.Sprintf("%d B", n)
}
div, exp := int64(unit), 0
for v := int64(n) / unit; v >= unit; v /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %cB", float64(n)/float64(div), "KMGTPE"[exp])
}
+119
View File
@@ -0,0 +1,119 @@
package run
import (
"context"
"errors"
"strings"
"testing"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/executus/tool"
)
// stagerFunc is a test InputFileStager: it records each staged file and returns
// a deterministic file_id ("file_<name>"), or an error if err is set.
type stagerFunc struct {
staged []stagedRec
err error
}
type stagedRec struct {
runID, agentID, name, mime string
size int
}
func (s *stagerFunc) StageInputFile(_ context.Context, runID, agentID, name, mime string, content []byte) (string, error) {
if s.err != nil {
return "", s.err
}
s.staged = append(s.staged, stagedRec{runID, agentID, name, mime, len(content)})
return "file_" + name, nil
}
func newStagerExecutor(s InputFileStager) *Executor {
return New(Config{
Registry: tool.NewRegistry(),
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, nil, nil },
Ports: Ports{InputFiles: s},
})
}
// TestStageInputFiles: files are staged via the port and an [ATTACHED FILES]
// descriptor (with each file_id) is appended to the prompt.
func TestStageInputFiles(t *testing.T) {
st := &stagerFunc{}
ex := newStagerExecutor(st)
out := ex.stageInputFiles(context.Background(), "run-1", "agent-1",
[]tool.InputFile{{Name: "clip.mp3", MimeType: "audio/mpeg", Data: []byte("abcd")}},
"transcribe this")
if len(st.staged) != 1 || st.staged[0].name != "clip.mp3" {
t.Fatalf("staged = %+v, want one clip.mp3", st.staged)
}
if st.staged[0].runID != "run-1" || st.staged[0].agentID != "agent-1" {
t.Errorf("stager got runID/agentID = %q/%q, want run-1/agent-1", st.staged[0].runID, st.staged[0].agentID)
}
for _, want := range []string{"transcribe this", "[ATTACHED FILES]", "clip.mp3", "file_clip.mp3", "audio/mpeg"} {
if !strings.Contains(out, want) {
t.Errorf("output missing %q:\n%s", want, out)
}
}
}
// TestStageInputFilesNoStager: a nil port leaves the prompt untouched and never
// drops the run.
func TestStageInputFilesNoStager(t *testing.T) {
ex := newStagerExecutor(nil) // Ports.InputFiles == nil
out := ex.stageInputFiles(context.Background(), "r", "a",
[]tool.InputFile{{Name: "x.bin", Data: []byte("z")}}, "prompt")
if out != "prompt" {
t.Errorf("nil stager changed the prompt: %q", out)
}
}
// TestStageInputFilesNoFiles: no attachments leaves the prompt untouched.
func TestStageInputFilesNoFiles(t *testing.T) {
ex := newStagerExecutor(&stagerFunc{})
out := ex.stageInputFiles(context.Background(), "r", "a", nil, "prompt")
if out != "prompt" {
t.Errorf("no files changed the prompt: %q", out)
}
}
// TestStageInputFilesDedup: colliding base names are disambiguated so they don't
// clobber each other at /workspace/<name>.
func TestStageInputFilesDedup(t *testing.T) {
st := &stagerFunc{}
ex := newStagerExecutor(st)
out := ex.stageInputFiles(context.Background(), "r", "a", []tool.InputFile{
{Name: "a.wav", MimeType: "audio/wav", Data: []byte("1")},
{Name: "a.wav", MimeType: "audio/wav", Data: []byte("2")},
}, "go")
if len(st.staged) != 2 {
t.Fatalf("staged %d files, want 2", len(st.staged))
}
if st.staged[0].name != "a.wav" || st.staged[1].name != "a-2.wav" {
t.Errorf("dedup names = %q, %q; want a.wav, a-2.wav", st.staged[0].name, st.staged[1].name)
}
if !strings.Contains(out, "a-2.wav") {
t.Errorf("output missing disambiguated name:\n%s", out)
}
}
// TestStageInputFilesSkipsBad: empty + oversized files are skipped; a save error
// drops only that file. With nothing staged, the prompt is unchanged.
func TestStageInputFilesSkipsBad(t *testing.T) {
// Empty data → skipped; with no good files the prompt is returned as-is.
ex := newStagerExecutor(&stagerFunc{})
if out := ex.stageInputFiles(context.Background(), "r", "a",
[]tool.InputFile{{Name: "empty.bin", Data: nil}}, "p"); out != "p" {
t.Errorf("empty file should be skipped, got %q", out)
}
// A stager error → that file is dropped; nothing staged → prompt unchanged.
exErr := newStagerExecutor(&stagerFunc{err: errors.New("disk full")})
if out := exErr.stageInputFiles(context.Background(), "r", "a",
[]tool.InputFile{{Name: "x.bin", Data: []byte("z")}}, "p"); out != "p" {
t.Errorf("save error should drop the file and leave the prompt, got %q", out)
}
}
+14
View File
@@ -42,6 +42,20 @@ type Ports struct {
// Delivery is where the run's output + artifacts go. nil = the caller
// reads the Result in-process (the light-host default).
Delivery deliver.Delivery
// InputFiles persists non-image input attachments (audio, PDF, binary)
// carried on Invocation.InputFiles into a host file store under run scope,
// returning file_ids the agent can hand to a worker tool. nil = input files
// are silently ignored (the run still proceeds, text-only). The bytes are
// never inlined into the model context — the LLM can't read raw audio/binary.
InputFiles InputFileStager
}
// InputFileStager persists a single non-image input attachment into a host file
// store under run scope and returns a file_id the run can reference. It is the
// seam mort's skill FileStorage (and any host blob store) implements so the
// kernel can stage Invocation.InputFiles without importing a storage layer.
type InputFileStager interface {
StageInputFile(ctx context.Context, runID, agentID, name, mime string, content []byte) (fileID string, err error)
}
// RunInfo describes a run at start time — the attribution a recorder/critic