From df95425bb5e9a50c0cb4028943e6a01510de3804 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Fri, 26 Jun 2026 20:54:28 -0400 Subject: [PATCH] P3 (kickoff): generic tools/ library + end-to-end tool-using-agent test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand up executus/tools — the generic, host-agnostic tool library — and prove the full pattern end to end: - tools/tools.go: Register(reg) adds the always-available zero-dependency tools (currently `think`). A light host calls it and is immediately useful; backed tools (web/store/meta groups) will register via grouped registrars with nil-safe Deps as they land. - tools/think.go: the `think` tool moved from mort (imports only executus/tool). - tools/integration_test.go: end-to-end proof that the executor runs an agent which CALLS a registered tool — the fake model emits a `think` tool call, the executor dispatches it through the registry, the model finalises, and the step instrumentation captures the `think` step. Exercises the full tool-dispatch loop through run.Executor. Stacked on phase-2-run-kernel (P3 needs run.Executor). Remaining P3: the meta/web/net/store/compose groups + their Deps + default backends (splitting mort's default.go grab-bag). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 6 +++- tools/integration_test.go | 73 +++++++++++++++++++++++++++++++++++++++ tools/think.go | 72 ++++++++++++++++++++++++++++++++++++++ tools/tools.go | 30 ++++++++++++++++ 4 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 tools/integration_test.go create mode 100644 tools/think.go create mode 100644 tools/tools.go diff --git a/CLAUDE.md b/CLAUDE.md index 86a7be9..5a7be76 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -58,7 +58,11 @@ CORE (majordomo + stdlib): structured output — no separate structured/ pkg) llmmeta/ shared meta-LLM helper over model/ [P1 ✓] compact/ context compactor (WithCompactor hook) [P2 ✓] - tools/{web,net,store,compose,meta,comms} generic tools [P3] + tools/ generic tool library + Register entrypoint; [P3 wip] + think moved; end-to-end "agent calls a tool" + test green. Remaining: meta/web/net/store/ + compose groups + their nil-safe Deps + default + backends (the default.go grab-bag split) [P3] BATTERIES (opt-in siblings, each nil-safe + a default): persona/ Agent noun + AgentStore seam + yml loader [P4] diff --git a/tools/integration_test.go b/tools/integration_test.go new file mode 100644 index 0000000..d9cf39e --- /dev/null +++ b/tools/integration_test.go @@ -0,0 +1,73 @@ +package tools_test + +import ( + "context" + "encoding/json" + "testing" + + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake" + + "gitea.stevedudenhoeffer.com/steve/executus/run" + "gitea.stevedudenhoeffer.com/steve/executus/tool" + "gitea.stevedudenhoeffer.com/steve/executus/tools" +) + +// TestExecutorRunsToolUsingAgent is the end-to-end proof that a host can +// register a generic tool and the executor runs an agent that CALLS it: the +// fake model emits a `think` tool call, the executor dispatches it through the +// registered tool, then the model finalises. Exercises the full tool-dispatch +// loop + step instrumentation. 
+func TestExecutorRunsToolUsingAgent(t *testing.T) {
+	registry := tool.NewRegistry()
+	if regErr := tools.Register(registry); regErr != nil {
+		t.Fatalf("register tools: %v", regErr)
+	}
+
+	// The single tool call the scripted model will request on its first turn.
+	thinkCall := llm.ToolCall{
+		ID:        "call-1",
+		Name:      "think",
+		Arguments: json.RawMessage(`{"thought":"plan: answer briefly"}`),
+	}
+
+	provider := fake.New("fake")
+	provider.Enqueue("test-model",
+		// Turn 1: the model asks for the `think` tool.
+		fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{thinkCall}}),
+		// Turn 2: the model replies with its final text.
+		fake.Reply("all done"),
+	)
+	model, modelErr := provider.Model("test-model")
+	if modelErr != nil {
+		t.Fatalf("fake model: %v", modelErr)
+	}
+
+	// Whatever name is requested, hand back the scripted model and the same ctx.
+	resolve := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
+		return ctx, model, nil
+	}
+	executor := run.New(run.Config{Registry: registry, Models: resolve})
+
+	agent := run.RunnableAgent{Name: "thinker", ModelTier: "test-model", LowLevelTools: []string{"think"}}
+	invocation := tool.Invocation{RunID: "run-tool-1", CallerID: "c"}
+	result := executor.Run(context.Background(), agent, invocation, "do the thing")
+
+	if result.Err != nil {
+		t.Fatalf("run error: %v", result.Err)
+	}
+	if result.Output != "all done" {
+		t.Fatalf("output = %q, want %q", result.Output, "all done")
+	}
+
+	// Result.Steps must contain at least one step titled "think".
+	thinkSteps := 0
+	for _, step := range result.Steps {
+		if step.Title == "think" {
+			thinkSteps++
+		}
+	}
+	if thinkSteps == 0 {
+		t.Errorf("expected a `think` step in Result.Steps, got %d steps: %+v", len(result.Steps), result.Steps)
+	}
+}
diff --git a/tools/think.go b/tools/think.go
new file mode 100644
index 0000000..d0dd2c0
--- /dev/null
+++ b/tools/think.go
@@ -0,0 +1,72 @@
+// Package tools — v11 think.
+//
+// Pure prompt-engineering tool: the agent's "thought" is recorded
+// to skill_run_logs (via the audit hook the gated wrapper applies
+// transparently) but produces no side effect.
The literature on
+// agent design notes that giving an agent an explicit `think` tool
+// keeps it on plan better than giving it nothing — without one,
+// agents tend to either skip planning OR babble into the final
+// output. With one, planning lands in tool calls and the final
+// output stays clean.
+//
+// V11 deliberately rejects empty thoughts. An agent that learns
+// "calling think with empty args is free" will spam it; a
+// rejection forces the call to actually carry reasoning.
+package tools
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/tool"
+)
+
+// thinkParams is the argument payload of the think tool: a single free-text
+// thought.
+type thinkParams struct {
+	Thought string `json:"thought" description:"Your reasoning. May be a plan, a working hypothesis, an analysis of a tool result, or anything else you'd note in a private scratchpad. Empty input is rejected — make this load-bearing."`
+}
+
+// thinkResponse documents the shape of the two JSON results the tool returns.
+// Nothing in this file marshals it; the handler returns equivalent literals.
+type thinkResponse struct {
+	OK    bool   `json:"ok"`
+	Error string `json:"error,omitempty"`
+}
+
+// The two possible think results, written out as JSON literals.
+const (
+	thinkAccepted = `{"ok":true}`
+	thinkRejected = `{"ok":false,"error":"empty_thought"}`
+)
+
+// NewThink builds the `think` tool. It takes no dependencies.
+func NewThink() tool.Tool {
+	const summary = "Record a thought / plan / working hypothesis. The thought is logged to the run trace but does NOT affect any external state. Use to slow down before a tricky tool call, sketch a multi-step plan, or summarise findings before continuing. Empty thoughts are rejected."
+	access := tool.Permission{
+		AuthoringRequirement: tool.RequirementAnyone,
+		OperatesOn:           tool.ScopeGlobal,
+		SafeForShare:         true,
+		Categories:           []string{"utility"},
+	}
+	return tool.NewGatedTool[thinkParams]("think", summary, access, recordThought)
+}
+
+// recordThought is the think handler. A thought that is blank after trimming
+// whitespace is rejected inside the payload (ok:false) rather than with a Go
+// error, so the caller receives a recoverable signal; anything else is ok:true.
+func recordThought(_ context.Context, _ tool.Invocation, p thinkParams) (string, error) {
+	if strings.TrimSpace(p.Thought) != "" {
+		return thinkAccepted, nil
+	}
+	return thinkRejected, nil
+}
+
+// Blank-identifier references: thinkResponse and the fmt import have no other
+// use in this file. If thinkResponse ever gains a field, build the results
+// with json.Marshal instead of the literals above.
+var (
+	_ = thinkResponse{}
+	_ = fmt.Errorf
+)
diff --git a/tools/tools.go b/tools/tools.go
new file mode 100644
index 0000000..3c0045a
--- /dev/null
+++ b/tools/tools.go
@@ -0,0 +1,30 @@
+// Package tools is executus's library of generic, host-agnostic agent tools.
+//
+// A host registers the tools it wants against a tool.Registry, then runs an
+// agent whose RunnableAgent.LowLevelTools name them. Tools split two ways:
+//
+//   - Always-available, zero-dependency tools (think, ...) need no host backend
+//     and register via Register. A light host (gadfly) can call Register and be
+//     immediately useful.
+//   - Backed tools (web search, file/kv storage, summarize, ...) take a nil-safe
+//     Deps describing their host backend; they register via grouped registrars
+//     (RegisterWeb, RegisterStore, ...) as those land.
+//
+// Every tool ships with the same three-stage permission model as mort's, and a
+// host adds its own domain tools against the SAME registry.
+package tools
+
+import "gitea.stevedudenhoeffer.com/steve/executus/tool"
+
+// Register installs the always-available, zero-dependency generic tools into
+// reg (currently only think). It stops at the first error reg.Register reports
+// and returns it without attempting any rollback; nil means all were accepted.
+func Register(reg tool.Registry) error {
+	builtins := []tool.Tool{NewThink()}
+	for _, candidate := range builtins {
+		if err := reg.Register(candidate); err != nil {
+			return err
+		}
+	}
+	return nil
+}