Files
executus/tool/gated_tool_test.go
T
steve dc28b63ad8
executus CI / test (push) Successful in 36s
P1 (part 1): move skilltools core -> tool/ (clean, verbatim)
The tool registry core (registry, permission model, Invocation, gated-tool
wrapper, ssrf guard, hmac, encryption, argcoerce, helpers, rootrun,
session_tools, webhook_rate_limit) had zero mort coupling — it imports only
majordomo/llm + x/crypto/hkdf — so it moves verbatim with a package rename
(skilltools -> tool). All same-package tests came along and pass; the SSRF,
gated-wrapper, encryption and output-pattern invariants are re-anchored here.

majordomo re-enters the module graph (now pinned to the latest, incl. the
front-loaded-output fix). model/ + llmmeta + structured follow next.

Docs: CLAUDE.md now requires README/examples to stay in sync with changes in
the same commit; CI skips docs/example-only pushes via paths-ignore.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:31:47 -04:00

402 lines
13 KiB
Go

package tool
import (
"context"
"encoding/json"
"errors"
"strings"
"sync"
"testing"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// gatedTestParams is the typed argument payload shared by the gated-tool
// tests in this file. It stands in for a production tool's parameters:
// a pair of plain strings decoded from the JSON arguments the LLM sends.
type gatedTestParams struct {
	// Question maps to the "question" JSON key.
	Question string `json:"question" description:"The question to answer."`
	// Detail maps to the "detail" JSON key and is omitted from encoded
	// output when empty.
	Detail string `json:"detail,omitempty" description:"Optional detail level."`
}
// recordingAudit is a test double that accumulates every AuditCall handed
// to the hook it vends, so a test can assert on exactly what the wrapper
// logged. Access to the slice is serialised by mu so the recorder stays
// usable if a later test drives it from several goroutines.
type recordingAudit struct {
	// mu guards calls.
	mu sync.Mutex
	// calls holds the recorded audit rows in arrival order.
	calls []AuditCall
}
// hook returns an AuditHook bound to this recorder. Each invocation
// appends the received AuditCall to r.calls while holding r.mu.
func (r *recordingAudit) hook() AuditHook {
	record := func(entry AuditCall) {
		r.mu.Lock()
		defer r.mu.Unlock()

		r.calls = append(r.calls, entry)
	}
	return record
}
// snapshot returns a copy of the audit rows recorded so far. The copy is
// taken under r.mu and never aliases the recorder's own slice, so callers
// may inspect it without holding the lock. The result is always non-nil.
func (r *recordingAudit) snapshot() []AuditCall {
	r.mu.Lock()
	defer r.mu.Unlock()

	rows := make([]AuditCall, 0, len(r.calls))
	rows = append(rows, r.calls...)
	return rows
}
// buildAndExecute is a test helper that takes a constructed Tool all the
// way to an executed result: it registers the tool in a fresh registry,
// builds a one-tool box for the given invocation, visibility and audit
// hook, and runs a single call with the raw JSON args through execBox.
// Registration or build failures abort the calling test. The original
// note on this helper says it mirrors how the production registry's
// Build call wires inv.gate / inv.audit.
func buildAndExecute(t *testing.T, tool Tool, inv Invocation, vis Visibility, audit AuditHook, args string) (string, error) {
	t.Helper()

	reg := NewRegistry()
	if regErr := reg.Register(tool); regErr != nil {
		t.Fatalf("register: %v", regErr)
	}

	names := []string{tool.Name()}
	built, buildErr := reg.Build(names, inv, vis, audit)
	if buildErr != nil {
		t.Fatalf("build: %v", buildErr)
	}

	call := toolCall{Name: tool.Name(), Arguments: args}
	return execBox(built, call)
}
// TestNewGatedTool_GateRejection verifies that the wrapper auto-injects
// CheckGate: if the invocation's SkillName doesn't match the tool's
// SkillNameGate, fn never runs and the audit row is emitted with the
// gate error. This is the core contract that v1 hotfix #4 had to
// retrofit by hand.
func TestNewGatedTool_GateRejection(t *testing.T) {
called := false
tool := NewGatedTool[gatedTestParams](
"gated_test_tool",
"A test tool gated to my-skill.",
Permission{
AuthoringRequirement: RequirementAnyone,
OperatesOn: ScopeGlobal,
SafeForShare: true,
SkillNameGate: "my-skill",
},
func(ctx context.Context, inv Invocation, args gatedTestParams) (string, error) {
called = true
return "should not be reached", nil
},
)
rec := &recordingAudit{}
out, err := buildAndExecute(t, tool,
Invocation{SkillName: "other-skill"},
VisibilityPrivate, rec.hook(),
`{"question":"hi"}`)
if err == nil {
t.Fatalf("expected gate-rejection error, got out=%q err=nil", out)
}
if !strings.Contains(err.Error(), "restricted to") {
t.Fatalf("expected error containing 'restricted to', got %v", err)
}
if called {
t.Errorf("fn was called despite gate rejection — wrapper failed to inject CheckGate")
}
calls := rec.snapshot()
if len(calls) != 1 {
t.Fatalf("expected exactly 1 audit call, got %d: %+v", len(calls), calls)
}
if calls[0].Err == nil {
t.Errorf("audit call.Err was nil; expected the gate error")
}
if calls[0].Args != "{}" {
t.Errorf("audit call.Args=%q, want \"{}\" (no args parsed pre-gate)", calls[0].Args)
}
}
// TestNewGatedTool_HappyPath covers the ungated success path: the handler
// receives the decoded args and the caller's Invocation, its return value
// is what the caller gets back, and a single error-free audit row is
// emitted whose Args field is valid JSON that round-trips to the same
// typed params.
func TestNewGatedTool_HappyPath(t *testing.T) {
	const (
		wantQuestion = "what is the time?"
		wantDetail   = "verbose"
		wantOut      = "answered: what is the time?"
	)

	var (
		gotArgs gatedTestParams
		gotInv  Invocation
	)
	handler := func(ctx context.Context, inv Invocation, args gatedTestParams) (string, error) {
		gotArgs, gotInv = args, inv
		return "answered: " + args.Question, nil
	}
	happy := NewGatedTool[gatedTestParams](
		"gated_happy_tool",
		"A test tool with no gate.",
		Permission{
			AuthoringRequirement: RequirementAnyone,
			OperatesOn:           ScopeGlobal,
			SafeForShare:         true,
		},
		handler,
	)

	audit := &recordingAudit{}
	out, err := buildAndExecute(t, happy,
		Invocation{SkillName: "any-skill", CallerID: "user-7"},
		VisibilityPrivate, audit.hook(),
		`{"question":"what is the time?","detail":"verbose"}`)
	if err != nil {
		t.Fatalf("execute: %v", err)
	}
	if out != wantOut {
		t.Errorf("unexpected output: %q", out)
	}
	if !(gotArgs.Question == wantQuestion && gotArgs.Detail == wantDetail) {
		t.Errorf("fn received %+v, want question/detail populated", gotArgs)
	}
	if id := gotInv.CallerID; id != "user-7" {
		t.Errorf("fn saw CallerID=%q, want user-7", id)
	}

	rows := audit.snapshot()
	if len(rows) != 1 {
		t.Fatalf("expected exactly 1 audit call, got %d", len(rows))
	}
	row := rows[0]
	if row.Err != nil {
		t.Errorf("audit call.Err=%v, want nil", row.Err)
	}
	if row.Result != wantOut {
		t.Errorf("audit call.Result=%q, want match output", row.Result)
	}

	// The audit row's args are re-marshaled by the wrapper; decode them
	// back into the typed struct to confirm they are well-formed JSON
	// carrying the same field values.
	var decoded gatedTestParams
	if decodeErr := json.Unmarshal([]byte(row.Args), &decoded); decodeErr != nil {
		t.Fatalf("audit args not valid JSON: %q (%v)", row.Args, decodeErr)
	}
	if !(decoded.Question == wantQuestion && decoded.Detail == wantDetail) {
		t.Errorf("audit args round-trip mismatch: %+v", decoded)
	}
}
// TestNewGatedTool_FnError verifies the wrapper surfaces fn's error
// AND captures the partial result + error in the audit row.
func TestNewGatedTool_FnError(t *testing.T) {
tool := NewGatedTool[gatedTestParams](
"gated_fn_err_tool",
"A test tool whose handler always errors.",
Permission{
AuthoringRequirement: RequirementAnyone,
OperatesOn: ScopeGlobal,
SafeForShare: true,
},
func(ctx context.Context, inv Invocation, args gatedTestParams) (string, error) {
return "partial output", errors.New("boom")
},
)
rec := &recordingAudit{}
out, err := buildAndExecute(t, tool,
Invocation{SkillName: "any-skill"},
VisibilityPrivate, rec.hook(),
`{"question":"x"}`)
// llm.Define's Execute returns ("", err) when the handler returns a
// non-nil error — out is dropped on the LLM side. But the wrapper's
// audit row should still capture both partial result + error.
if err == nil || !strings.Contains(err.Error(), "boom") {
t.Fatalf("expected boom error, got out=%q err=%v", out, err)
}
calls := rec.snapshot()
if len(calls) != 1 {
t.Fatalf("expected exactly 1 audit call, got %d", len(calls))
}
if calls[0].Err == nil || !strings.Contains(calls[0].Err.Error(), "boom") {
t.Errorf("audit call.Err=%v, want boom", calls[0].Err)
}
if calls[0].Result != "partial output" {
t.Errorf("audit call.Result=%q, want 'partial output' (partial captured)", calls[0].Result)
}
}
// TestNewGatedTool_ArgsParseHandledByLLM_NoAuditEmitted documents the
// behaviour at the wrapper boundary: when the LLM sends malformed JSON
// args, llm.Define's Execute fails BEFORE the wrapper's inner closure
// runs. The wrapper does NOT emit an audit row in that case — it never
// got the chance. This is intentional: arg-parse failure is a
// tool-call wiring problem, not a tool-handler problem; the audit log
// reflects what the handler did, and on parse failure no handler ran.
//
// The test exists so future readers see this invariant documented in
// code and don't re-introduce a "log everything" path that breaks the
// wrapper's contract with the audit storage layer.
func TestNewGatedTool_ArgsParseHandledByLLM_NoAuditEmitted(t *testing.T) {
	tool := NewGatedTool[gatedTestParams](
		"gated_parse_err_tool",
		"A test tool that should never receive bad JSON.",
		Permission{
			AuthoringRequirement: RequirementAnyone,
			OperatesOn:           ScopeGlobal,
			SafeForShare:         true,
		},
		func(ctx context.Context, inv Invocation, args gatedTestParams) (string, error) {
			// Errorf rather than Fatalf: FailNow-style methods are only
			// valid on the goroutine running the test function, and this
			// file does not show which goroutine invokes the handler.
			// Errorf may be called from any goroutine and still fails
			// the test.
			t.Errorf("fn ran despite malformed JSON — should never happen")
			return "", nil
		},
	)

	rec := &recordingAudit{}
	_, err := buildAndExecute(t, tool,
		Invocation{SkillName: "any-skill"},
		VisibilityPrivate, rec.hook(),
		`{"question":not-quoted}`) // intentionally malformed
	if err == nil {
		t.Fatalf("expected JSON parse error, got nil")
	}

	// No handler ran, so no audit row may exist.
	if calls := rec.snapshot(); len(calls) != 0 {
		t.Errorf("audit emitted %d calls on parse error; expected 0 (parse-fail is pre-handler)", len(calls))
	}
}
// TestIsGatedTool_DetectsWrapped confirms that NewGatedTool's return
// value satisfies the gatedToolMarker interface so the meta-test can
// distinguish wrapped from unwrapped tools.
func TestIsGatedTool_DetectsWrapped(t *testing.T) {
tool := NewGatedTool[gatedTestParams](
"gated_marker_tool", "marker test",
Permission{AuthoringRequirement: RequirementAnyone},
func(ctx context.Context, inv Invocation, args gatedTestParams) (string, error) {
return "", nil
},
)
if !IsGatedTool(tool) {
t.Fatalf("IsGatedTool returned false for a NewGatedTool result")
}
}
// TestIsGatedTool_DetectsNonWrapped is the negative half of the
// detection test: a hand-rolled Tool that does NOT go through
// NewGatedTool must fail IsGatedTool. This guards the meta-test
// against trivially passing for everything.
func TestIsGatedTool_DetectsNonWrapped(t *testing.T) {
stub := manualToolStub{}
if IsGatedTool(stub) {
t.Fatalf("IsGatedTool returned true for a non-wrapped Tool — detection broken")
}
}
// manualToolStub is a hand-written Tool implementation that deliberately
// bypasses NewGatedTool. It exists only so the tests can prove that
// IsGatedTool rejects implementations that were not wrapped.
type manualToolStub struct{}

// Name returns the stub's fixed tool name.
func (manualToolStub) Name() string {
	return "manual_stub"
}

// Description returns the stub's fixed description.
func (manualToolStub) Description() string {
	return "manual stub"
}

// Permission returns the zero-value Permission.
func (manualToolStub) Permission() Permission {
	return Permission{}
}

// BuildLLM returns an llm.Tool defined with the stub's name and
// description whose handler takes an empty params struct and always
// returns an empty string with no error. The Invocation is ignored.
func (s manualToolStub) BuildLLM(Invocation) llm.Tool {
	type p struct{}
	handler := func(ctx context.Context, _ p) (any, error) {
		return "", nil
	}
	return llm.DefineTool(s.Name(), s.Description(), handler)
}
// TestNewGatedToolWithAudit_RedactsAuditResult covers the variant used
// by paste_create: the LLM receives a sensitive string (e.g. URL with
// fragment-encoded key) but the audit row records only a redacted
// summary. Confirms LLMResult ↔ AuditResult separation works.
func TestNewGatedToolWithAudit_RedactsAuditResult(t *testing.T) {
tool := NewGatedToolWithAudit[gatedTestParams](
"audited_tool",
"A tool whose audit result is redacted from its LLM result.",
Permission{AuthoringRequirement: RequirementAnyone, SafeForShare: true},
func(ctx context.Context, inv Invocation, args gatedTestParams) (AuditedResult, error) {
return AuditedResult{
LLMResult: "secret-fragment-12345",
AuditArgs: "redacted",
AuditResult: "[redacted]",
}, nil
},
)
if !IsGatedTool(tool) {
t.Fatalf("audited variant must satisfy IsGatedTool")
}
rec := &recordingAudit{}
out, err := buildAndExecute(t, tool,
Invocation{SkillName: "any"},
VisibilityPrivate, rec.hook(),
`{"question":"x"}`)
if err != nil {
t.Fatalf("execute: %v", err)
}
if out != "secret-fragment-12345" {
t.Errorf("LLM saw %q, want secret-fragment-12345", out)
}
calls := rec.snapshot()
if len(calls) != 1 {
t.Fatalf("expected 1 audit call, got %d", len(calls))
}
if calls[0].Args != "redacted" {
t.Errorf("audit args=%q, want redacted", calls[0].Args)
}
if calls[0].Result != "[redacted]" {
t.Errorf("audit result=%q, want [redacted]", calls[0].Result)
}
if strings.Contains(calls[0].Result, "secret-fragment-12345") {
t.Fatalf("audit leaked LLM result into Result field: %q", calls[0].Result)
}
}
// TestNewGatedToolWithAudit_GateRejection mirrors the gate-rejection
// test for the default wrapper to anchor the same contract for the
// audited variant: a SkillName that does not match the tool's
// SkillNameGate must produce a "restricted to" error, must not run the
// handler, and must still emit exactly one audit row carrying an error.
func TestNewGatedToolWithAudit_GateRejection(t *testing.T) {
	tool := NewGatedToolWithAudit[gatedTestParams](
		"audited_gated_tool", "gated tool",
		Permission{
			AuthoringRequirement: RequirementAnyone,
			SkillNameGate:        "my-skill",
		},
		func(ctx context.Context, inv Invocation, args gatedTestParams) (AuditedResult, error) {
			// Errorf rather than Fatalf: FailNow-style methods are only
			// valid on the goroutine running the test function, and this
			// file does not show which goroutine invokes the handler.
			// Errorf may be called from any goroutine and still fails
			// the test.
			t.Errorf("fn should not run on gate rejection")
			return AuditedResult{}, nil
		},
	)

	rec := &recordingAudit{}
	_, err := buildAndExecute(t, tool,
		Invocation{SkillName: "other"},
		VisibilityPrivate, rec.hook(),
		`{}`)
	if err == nil || !strings.Contains(err.Error(), "restricted to") {
		t.Fatalf("expected gate rejection, got %v", err)
	}

	calls := rec.snapshot()
	if len(calls) != 1 || calls[0].Err == nil {
		t.Fatalf("expected gate-rejection audit row, got %+v", calls)
	}
}
// TestNewGatedToolWithAudit_FallbackArgs verifies that an empty
// AuditArgs falls back to the JSON-marshaled typed args (matching the
// default wrapper's behaviour).
func TestNewGatedToolWithAudit_FallbackArgs(t *testing.T) {
tool := NewGatedToolWithAudit[gatedTestParams](
"audited_fallback_tool", "fallback args test",
Permission{AuthoringRequirement: RequirementAnyone},
func(ctx context.Context, inv Invocation, args gatedTestParams) (AuditedResult, error) {
return AuditedResult{
LLMResult: "ok",
AuditResult: "ok",
// AuditArgs intentionally empty
}, nil
},
)
rec := &recordingAudit{}
_, err := buildAndExecute(t, tool,
Invocation{SkillName: "x"},
VisibilityPrivate, rec.hook(),
`{"question":"hi"}`)
if err != nil {
t.Fatalf("execute: %v", err)
}
calls := rec.snapshot()
if len(calls) != 1 {
t.Fatalf("expected 1 audit call, got %d", len(calls))
}
if !strings.Contains(calls[0].Args, "hi") {
t.Errorf("expected fallback to JSON args containing 'hi', got %q", calls[0].Args)
}
}