feat: foundations — canonical types, Parse grammar, env DSNs, health, chains

Phase 1 of the majordomo build: - llm/ canonical contract (messages, parts, tools, capabilities, streaming, Model/Provider, error classification) - health/ clock-injected tracker (threshold bench, exponential capped cooldown, reset-on-success) - root Registry + Parse (verbatim model ids, inline recursive alias expansion with cycle detection, chain dedup), LLM_* env-DSN providers (go-llm parity: lazy fallback + eager LoadEnv), health-aware chain executor behind the Model interface - provider/fake scriptable test provider; hermetic test suite incl. the trailing-thinking chain and foreman:// env loading - ADRs 0001-0008, CLAUDE.md, README (honest matrix), CI workflow, docs/phase-1-design.md Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 12:35:23 +02:00
parent 3025044817
commit dcd004289f
42 changed files with 3863 additions and 0 deletions
@@ -0,0 +1,45 @@
+package llm
+
+import "slices"
+
+// Capabilities declares what a model (or provider) supports and the limits
+// it imposes. Providers declare defaults; individual models may override.
+// The media pipeline normalizes image inputs against these values before a
+// request is serialized.
+//
+// Zero-value semantics:
+//   - MaxImagesPerReq == 0 means image input is NOT supported.
+//   - MaxImageBytes / MaxImageDimension / ContextWindow == 0 mean
+//     "no declared limit", not zero.
+//   - AllowedImageMIME empty means any MIME type is acceptable
+//     (only meaningful when images are supported at all).
+type Capabilities struct {
+	// MaxImageBytes is the largest single image payload, in bytes.
+	MaxImageBytes int
+	// MaxImageDimension is the largest allowed width or height, in pixels.
+	MaxImageDimension int
+	// AllowedImageMIME lists acceptable image content types
+	// (e.g. "image/jpeg", "image/png"). MIMEAllowed compares entries by
+	// exact string equality.
+	AllowedImageMIME []string
+	// MaxImagesPerReq is the most images one request may carry; 0 = images
+	// unsupported.
+	MaxImagesPerReq int
+
+	// Feature flags: whether the target supports tool calling, structured
+	// (schema-constrained) output, and streamed delivery respectively.
+	// The zero value declares all three unsupported.
+	SupportsTools      bool
+	SupportsStructured bool
+	SupportsStreaming  bool
+
+	// ContextWindow is the model's context size in tokens, when known.
+	ContextWindow int
+}
+
+// SupportsImages reports whether image input is accepted at all, i.e.
+// whether at least one image per request is permitted.
+func (c Capabilities) SupportsImages() bool {
+	return c.MaxImagesPerReq >= 1
+}
+
+// MIMEAllowed reports whether mime is an acceptable image content type.
+// An empty allow-list places no restriction, so every type is accepted.
+func (c Capabilities) MIMEAllowed(mime string) bool {
+	allowList := c.AllowedImageMIME
+	if len(allowList) > 0 {
+		return slices.Contains(allowList, mime)
+	}
+	return true
+}
@@ -0,0 +1,39 @@
+package llm
+
+// Part is one piece of message content: text, an image, or future media
+// kinds. The set of implementations is closed (sealed by the unexported
+// method) so providers can switch exhaustively over content kinds.
+//
+// Why: providers need a finite, known content vocabulary to serialize into
+// their wire formats; an open interface would silently drop unknown content.
+type Part interface {
+	// isPart is an unexported marker method. Because it cannot be
+	// implemented outside this package, the set of Part types stays closed.
+	isPart()
+}
+
+// TextPart is plain text content.
+type TextPart struct {
+	// Text is the text payload.
+	Text string
+}
+
+// isPart marks TextPart as a member of the sealed Part set.
+func (TextPart) isPart() {}
+
+// ImagePart is image content carried as raw bytes plus a MIME type.
+//
+// Why bytes-only (no URL form): the media pipeline must be able to inspect,
+// downscale, and re-encode every image to fit the target's capabilities, and
+// that requires the bytes. Callers with a URL fetch it themselves; majordomo
+// does not download remote content on a caller's behalf.
+type ImagePart struct {
+	// MIME is the image content type, e.g. "image/png" or "image/jpeg".
+	MIME string
+	// Data is the raw, unencoded image bytes (providers base64 as needed).
+	Data []byte
+}
+
+// isPart marks ImagePart as a member of the sealed Part set.
+func (ImagePart) isPart() {}
+
+// Text wraps s in a TextPart and returns it as a Part.
+func Text(s string) Part {
+	part := TextPart{Text: s}
+	return part
+}
+
+// Image builds an ImagePart from a MIME type and raw image bytes and
+// returns it as a Part. The data slice is stored as given, not copied.
+func Image(mime string, data []byte) Part {
+	var part ImagePart
+	part.MIME = mime
+	part.Data = data
+	return part
+}
@@ -0,0 +1,119 @@
+package llm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"net/http"
+	"strings"
+	"syscall"
+)
+
+// ErrorClass buckets errors for retry/failover decisions.
+//
+// The zero value is ClassTransient, matching Classify's "when in doubt,
+// transient" default.
+type ErrorClass int
+
+const (
+	// ClassTransient errors may succeed on retry or on another target:
+	// rate limits, server errors, timeouts, connection failures.
+	ClassTransient ErrorClass = iota
+	// ClassPermanent errors will not improve on retry of the same request:
+	// malformed requests, auth failures, model-not-found.
+	ClassPermanent
+)
+
+// ErrModelNotFound marks a permanent "this target does not know this model"
+// condition. Chains advance past it without penalizing the target's health.
+//
+// Match it with errors.Is rather than ==: an APIError with HTTP status 404
+// and no wrapped cause unwraps to this sentinel.
+var ErrModelNotFound = errors.New("model not found")
+
+// APIError is a structured provider error carrying enough context to
+// classify it and to debug it.
+//
+// It is used as a pointer (*APIError implements error). Error renders only
+// the fields that are set; Unwrap exposes Err to errors.Is / errors.As.
+type APIError struct {
+	// Provider and Model identify the target that failed.
+	Provider string
+	Model    string
+
+	// Status is the HTTP status code, or 0 when the failure was not an HTTP
+	// response (connection error, decode error, ...).
+	Status int
+
+	// Code is the provider-specific error code, when one was supplied.
+	Code string
+
+	// Message is the provider's human-readable error message.
+	Message string
+
+	// Err is the wrapped underlying cause, if any.
+	Err error
+}
+
+// Error renders the error as "provider/model" followed by whichever of the
+// HTTP status, provider code, message, and wrapped cause are present, in
+// that order.
+func (e *APIError) Error() string {
+	var out strings.Builder
+	out.WriteString(e.Provider + "/" + e.Model)
+	if status := e.Status; status != 0 {
+		fmt.Fprintf(&out, ": HTTP %d", status)
+	}
+	if code := e.Code; code != "" {
+		out.WriteString(" [" + code + "]")
+	}
+	if text := e.Message; text != "" {
+		out.WriteString(": " + text)
+	}
+	if cause := e.Err; cause != nil {
+		fmt.Fprintf(&out, ": %v", cause)
+	}
+	return out.String()
+}
+
+// Unwrap exposes the error's cause to errors.Is / errors.As.
+//
+// It returns Err when one is set. Otherwise an HTTP 404 unwraps to
+// ErrModelNotFound, and any other error has no cause (nil).
+func (e *APIError) Unwrap() error {
+	if e.Err != nil {
+		return e.Err
+	}
+	if e.Status == http.StatusNotFound {
+		return ErrModelNotFound
+	}
+	return nil
+}
+
+// Is reports whether e should be treated as target by errors.Is.
+//
+// Why: Unwrap can expose only one error, so a 404 that also carries a
+// wrapped cause would otherwise stop matching ErrModelNotFound. Answering
+// here keeps the 404 => ErrModelNotFound mapping independent of Err, so
+// Classify and chains see such errors as model-not-found too. All other
+// targets are left to errors.Is's normal unwrapping.
+func (e *APIError) Is(target error) bool {
+	return target == ErrModelNotFound && e.Status == http.StatusNotFound
+}
+
+// Classify buckets an error as transient or permanent.
+//
+// The default policy (overridable via health configuration), checked in
+// this order:
+//   - a nil error is reported as transient (the zero class).
+//   - context.Canceled is permanent — the caller gave up; retrying defies
+//     their intent. context.DeadlineExceeded is transient.
+//   - ErrModelNotFound is permanent.
+//   - Refused/reset connections and anything implementing net.Error
+//     (timeouts, DNS failures, ...) are transient ("high demand"
+//     conditions).
+//   - For an APIError with a non-zero Status: 408, 429 and all 5xx are
+//     transient; any other status >= 400 (400/401/403/404/405/422, ...) is
+//     permanent.
+//   - Anything unrecognized is transient: when in doubt, failing over to the
+//     next target in a chain can only help availability.
+func Classify(err error) ErrorClass {
+	switch {
+	case err == nil:
+		return ClassTransient
+	case errors.Is(err, context.Canceled):
+		return ClassPermanent
+	case errors.Is(err, context.DeadlineExceeded):
+		return ClassTransient
+	case errors.Is(err, ErrModelNotFound):
+		return ClassPermanent
+	case errors.Is(err, syscall.ECONNREFUSED), errors.Is(err, syscall.ECONNRESET):
+		return ClassTransient
+	}
+
+	// Network-level errors are checked before HTTP statuses, so a net.Error
+	// anywhere in the chain wins over a wrapped APIError status.
+	if _, isNet := errors.AsType[net.Error](err); isNet {
+		return ClassTransient
+	}
+
+	apiErr, isAPI := errors.AsType[*APIError](err)
+	if !isAPI || apiErr.Status == 0 {
+		return ClassTransient
+	}
+	status := apiErr.Status
+	if status == http.StatusRequestTimeout || // 408
+		status == http.StatusTooManyRequests || // 429
+		status >= 500 {
+		return ClassTransient
+	}
+	if status >= 400 {
+		return ClassPermanent
+	}
+	return ClassTransient
+}
@@ -0,0 +1,84 @@
+package llm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"strings"
+	"syscall"
+	"testing"
+)
+
+// fakeNetErr is a minimal net.Error implementation for exercising the
+// net.Error branch of Classify without real network I/O.
+type fakeNetErr struct{ timeout bool }
+
+func (e fakeNetErr) Error() string   { return "fake net error" }
+func (e fakeNetErr) Timeout() bool   { return e.timeout }
+func (e fakeNetErr) Temporary() bool { return true }
+
+// Compile-time check that fakeNetErr satisfies net.Error.
+var _ net.Error = fakeNetErr{}
+
+// TestClassify checks the default classification policy across context
+// errors, connection-level failures, HTTP statuses, and unknown errors.
+func TestClassify(t *testing.T) {
+	type scenario struct {
+		label    string
+		input    error
+		expected ErrorClass
+	}
+	scenarios := []scenario{
+		{"canceled is permanent", context.Canceled, ClassPermanent},
+		{"deadline is transient", context.DeadlineExceeded, ClassTransient},
+		{"wrapped canceled", fmt.Errorf("call: %w", context.Canceled), ClassPermanent},
+		{"model not found", fmt.Errorf("x: %w", ErrModelNotFound), ClassPermanent},
+		{"conn refused", syscall.ECONNREFUSED, ClassTransient},
+		{"conn reset", fmt.Errorf("write: %w", syscall.ECONNRESET), ClassTransient},
+		{"net timeout", fakeNetErr{timeout: true}, ClassTransient},
+		{"http 429", &APIError{Status: 429}, ClassTransient},
+		{"http 408", &APIError{Status: 408}, ClassTransient},
+		{"http 500", &APIError{Status: 500}, ClassTransient},
+		{"http 503", &APIError{Status: 503}, ClassTransient},
+		{"http 529", &APIError{Status: 529}, ClassTransient},
+		{"http 400", &APIError{Status: 400}, ClassPermanent},
+		{"http 401", &APIError{Status: 401}, ClassPermanent},
+		{"http 403", &APIError{Status: 403}, ClassPermanent},
+		{"http 404", &APIError{Status: 404}, ClassPermanent},
+		{"http 422", &APIError{Status: 422}, ClassPermanent},
+		{"wrapped api error", fmt.Errorf("call: %w", &APIError{Status: 503}), ClassTransient},
+		{"unknown defaults transient", errors.New("mystery"), ClassTransient},
+		{"non-http api error defaults transient", &APIError{Message: "decode failed"}, ClassTransient},
+	}
+	for i := range scenarios {
+		sc := scenarios[i]
+		got := Classify(sc.input)
+		if got == sc.expected {
+			continue
+		}
+		t.Errorf("%s: Classify = %v, want %v", sc.label, got, sc.expected)
+	}
+}
+
+// TestAPIError404UnwrapsToModelNotFound verifies that only a 404 APIError
+// matches the ErrModelNotFound sentinel.
+func TestAPIError404UnwrapsToModelNotFound(t *testing.T) {
+	notFound := &APIError{Provider: "openai", Model: "nope", Status: 404}
+	if matched := errors.Is(notFound, ErrModelNotFound); !matched {
+		t.Error("404 APIError should match ErrModelNotFound")
+	}
+
+	serverErr := &APIError{Status: 500}
+	if matched := errors.Is(serverErr, ErrModelNotFound); matched {
+		t.Error("500 APIError must not match ErrModelNotFound")
+	}
+}
+
+// TestAPIErrorMessage verifies that the rendered error string mentions the
+// target, the status, the provider code, and the message.
+func TestAPIErrorMessage(t *testing.T) {
+	apiErr := &APIError{
+		Provider: "anthropic",
+		Model:    "opus-4.8",
+		Status:   429,
+		Code:     "rate_limit_error",
+		Message:  "slow down",
+	}
+	rendered := apiErr.Error()
+	wanted := []string{"anthropic/opus-4.8", "429", "rate_limit_error", "slow down"}
+	for i := range wanted {
+		if strings.Contains(rendered, wanted[i]) {
+			continue
+		}
+		t.Errorf("error string %q missing %q", rendered, wanted[i])
+	}
+}
+
+// TestAPIErrorUnwrapsCause verifies that a wrapped cause is reachable
+// through errors.Is.
+func TestAPIErrorUnwrapsCause(t *testing.T) {
+	root := errors.New("boom")
+	wrapped := &APIError{Provider: "p", Model: "m", Err: root}
+	if found := errors.Is(wrapped, root); !found {
+		t.Error("APIError should unwrap to its cause")
+	}
+}
@@ -0,0 +1,12 @@
+// Package llm defines majordomo's canonical, provider-agnostic contract:
+// messages and content parts, requests and responses, tools, capabilities,
+// streaming, and the Model/Provider interfaces every backend implements.
+//
+// Why: provider implementations (openai, anthropic, google, ollama, foreman,
+// and any client-defined backend) must share one vocabulary without importing
+// each other or the root package. This package is the dependency leaf — it
+// imports nothing else in the module, and everything else imports it.
+//
+// Most consumers never import this package directly: the root majordomo
+// package re-exports every type here via type aliases.
+package llm
@@ -0,0 +1,71 @@
+package llm
+
+import "strings"
+
+// Role identifies the author of a message.
+type Role string
+
+// The canonical roles. Which Message fields are meaningful depends on the
+// role (see Message).
+const (
+	RoleSystem    Role = "system"
+	RoleUser      Role = "user"
+	RoleAssistant Role = "assistant"
+	RoleTool      Role = "tool"
+)
+
+// Message is one turn in a conversation.
+//
+// Exactly which fields are populated depends on the role: user and system
+// messages carry Parts; assistant messages carry Parts and/or ToolCalls;
+// tool messages carry ToolResults. Providers translate this canonical shape
+// to and from their wire formats.
+//
+// The struct itself does not enforce these role/field pairings.
+type Message struct {
+	// Role identifies who authored the message.
+	Role Role
+
+	// Parts is the message content (text, images, ...).
+	Parts []Part
+
+	// ToolCalls are tool invocations requested by the assistant
+	// (meaningful only when Role == RoleAssistant).
+	ToolCalls []ToolCall
+
+	// ToolResults carry the outcomes of earlier ToolCalls
+	// (meaningful only when Role == RoleTool).
+	ToolResults []ToolResult
+}
+
+// Text joins the text of every TextPart in the message, in order, with no
+// separator. Non-text parts (such as images) are skipped.
+func (m Message) Text() string {
+	var text strings.Builder
+	for i := range m.Parts {
+		textPart, isText := m.Parts[i].(TextPart)
+		if !isText {
+			continue
+		}
+		text.WriteString(textPart.Text)
+	}
+	return text.String()
+}
+
+// SystemText builds a RoleSystem message whose only content is the text s.
+func SystemText(s string) Message {
+	msg := Message{Role: RoleSystem}
+	msg.Parts = []Part{TextPart{Text: s}}
+	return msg
+}
+
+// UserText builds a RoleUser message whose only content is the text s.
+func UserText(s string) Message {
+	msg := Message{Role: RoleUser}
+	msg.Parts = []Part{TextPart{Text: s}}
+	return msg
+}
+
+// UserParts builds a RoleUser message carrying the given content parts
+// as-is (for example text mixed with images).
+func UserParts(parts ...Part) Message {
+	var msg Message
+	msg.Role = RoleUser
+	msg.Parts = parts
+	return msg
+}
+
+// AssistantText builds a RoleAssistant message whose only content is the
+// text s.
+func AssistantText(s string) Message {
+	msg := Message{Role: RoleAssistant}
+	msg.Parts = []Part{TextPart{Text: s}}
+	return msg
+}
+
+// ToolResultsMessage builds a RoleTool message that carries the given tool
+// results and no other content.
+func ToolResultsMessage(results ...ToolResult) Message {
+	var msg Message
+	msg.Role = RoleTool
+	msg.ToolResults = results
+	return msg
+}
@@ -0,0 +1,62 @@
+package llm
+
+import "testing"
+
+// TestMessageText verifies that Text concatenates text parts and skips
+// non-text parts.
+func TestMessageText(t *testing.T) {
+	const want = "a b"
+	msg := UserParts(Text("a "), Image("image/png", []byte{1}), Text("b"))
+	got := msg.Text()
+	if got != want {
+		t.Errorf("Text = %q, want %q", got, want)
+	}
+}
+
+// TestConstructors verifies the role and content produced by each message
+// constructor.
+func TestConstructors(t *testing.T) {
+	sys := SystemText("s")
+	if !(sys.Role == RoleSystem && sys.Text() == "s") {
+		t.Errorf("SystemText = %+v", sys)
+	}
+
+	usr := UserText("u")
+	if !(usr.Role == RoleUser && usr.Text() == "u") {
+		t.Errorf("UserText = %+v", usr)
+	}
+
+	asst := AssistantText("a")
+	if !(asst.Role == RoleAssistant && asst.Text() == "a") {
+		t.Errorf("AssistantText = %+v", asst)
+	}
+
+	toolMsg := ToolResultsMessage(ToolResult{ID: "1", Content: "ok"})
+	if !(toolMsg.Role == RoleTool && len(toolMsg.ToolResults) == 1) {
+		t.Errorf("ToolResultsMessage = %+v", toolMsg)
+	}
+}
+
+// TestResponseTextAndMessage verifies Response.Text and the conversion of
+// a response into an assistant message.
+func TestResponseTextAndMessage(t *testing.T) {
+	resp := &Response{
+		Parts:     []Part{Text("hello "), Text("world")},
+		ToolCalls: []ToolCall{{ID: "1", Name: "t"}},
+	}
+
+	text := resp.Text()
+	if text != "hello world" {
+		t.Errorf("Text = %q", text)
+	}
+
+	msg := resp.Message()
+	asExpected := msg.Role == RoleAssistant &&
+		msg.Text() == "hello world" &&
+		len(msg.ToolCalls) == 1
+	if !asExpected {
+		t.Errorf("Message = %+v", msg)
+	}
+}
+
+// TestUsageAccumulation verifies that Add sums both counters and that
+// Total reflects the sum.
+func TestUsageAccumulation(t *testing.T) {
+	running := Usage{InputTokens: 10, OutputTokens: 5}
+	running.Add(Usage{InputTokens: 1, OutputTokens: 2})
+
+	asExpected := running.InputTokens == 11 &&
+		running.OutputTokens == 7 &&
+		running.Total() == 18
+	if !asExpected {
+		t.Errorf("usage = %+v", running)
+	}
+}
+
+// TestCapabilitiesHelpers verifies the zero-value semantics and the
+// restricted-allow-list behavior of the Capabilities helpers.
+func TestCapabilitiesHelpers(t *testing.T) {
+	var zero Capabilities
+	if zero.SupportsImages() {
+		t.Error("zero MaxImagesPerReq must mean images unsupported")
+	}
+	if allowed := zero.MIMEAllowed("image/png"); !allowed {
+		t.Error("empty AllowedImageMIME must allow any type")
+	}
+
+	jpegOnly := Capabilities{
+		MaxImagesPerReq:  2,
+		AllowedImageMIME: []string{"image/jpeg"},
+	}
+	asExpected := jpegOnly.SupportsImages() &&
+		!jpegOnly.MIMEAllowed("image/png") &&
+		jpegOnly.MIMEAllowed("image/jpeg")
+	if !asExpected {
+		t.Errorf("capabilities helpers misbehave: %+v", jpegOnly)
+	}
+}
@@ -0,0 +1,58 @@
+package llm
+
+import "context"
+
+// Model is the canonical generation interface. A Model may be a single
+// provider-bound target or a failover chain — the two are interchangeable
+// and callers never branch on which they got.
+//
+// The opts passed to Generate and Stream are per-call Options that mutate a
+// copy of req (see Option and Request.Apply).
+type Model interface {
+	// Generate performs one request/response round trip.
+	Generate(ctx context.Context, req Request, opts ...Option) (*Response, error)
+
+	// Stream performs one request with incremental delivery.
+	Stream(ctx context.Context, req Request, opts ...Option) (Stream, error)
+
+	// Capabilities reports what this model supports. For chains this is the
+	// head element's capabilities (the preferred target); per-attempt media
+	// normalization always uses the actual target's capabilities.
+	Capabilities() Capabilities
+}
+
+// ModelOption configures a Model at construction time (Provider.Model).
+// Options are folded into a ModelConfig by ApplyModelOptions.
+type ModelOption func(*ModelConfig)
+
+// ModelConfig carries per-model construction settings shared by all
+// providers. The zero value means "no overrides".
+type ModelConfig struct {
+	// Capabilities, when non-nil, overrides the provider's default
+	// capabilities for this model.
+	Capabilities *Capabilities
+}
+
+// ApplyModelOptions folds options into a config.
+//
+// Options are applied in order to a zero ModelConfig, so later options win
+// over earlier ones. Nil options are skipped rather than called: callers
+// that build an option list conditionally may leave a slot unset, and a nil
+// func call would otherwise panic.
+func ApplyModelOptions(opts []ModelOption) ModelConfig {
+	var cfg ModelConfig
+	for _, opt := range opts {
+		if opt == nil {
+			continue
+		}
+		opt(&cfg)
+	}
+	return cfg
+}
+
+// WithCapabilities returns a ModelOption that replaces the provider's
+// default capabilities for one model (e.g. a vision-capable tag on an
+// otherwise text-only provider).
+func WithCapabilities(caps Capabilities) ModelOption {
+	override := &caps
+	return func(cfg *ModelConfig) {
+		cfg.Capabilities = override
+	}
+}
+
+// Provider mints Models bound to one backend. Implementations translate the
+// canonical Request/Response to and from their wire format and enforce their
+// declared Capabilities.
+type Provider interface {
+	// Name is the registry identifier used in "provider/model" specs.
+	Name() string
+
+	// Model returns a Model bound to the given id. The id is whatever the
+	// backend accepts — majordomo passes it through verbatim and never
+	// validates it against a catalog. opts carry construction-time settings
+	// such as WithCapabilities.
+	Model(id string, opts ...ModelOption) (Model, error)
+}
@@ -0,0 +1,98 @@
+package llm
+
+import "encoding/json"
+
+// Request is the canonical generation request. Providers translate it to
+// their wire format and enforce their declared Capabilities against it.
+//
+// Per-call Options are folded into a copy of the request by Apply.
+type Request struct {
+	// System is the system prompt. Providers map it to their native system
+	// mechanism (top-level system field, system message, SystemInstruction).
+	// Any RoleSystem messages in Messages are folded in after this field.
+	System string
+
+	// Messages is the conversation so far, oldest first.
+	Messages []Message
+
+	// Tools the model may call.
+	Tools []Tool
+
+	// ToolChoice constrains tool use: "" or "auto" lets the model decide,
+	// "none" forbids tool calls, "required" forces some tool call, and any
+	// other value names the one tool the model must call.
+	ToolChoice string
+
+	// Schema, when non-nil, is a JSON Schema object the response must
+	// conform to (structured output). Providers map it to their native
+	// mechanism. SchemaName names the schema for providers that require one.
+	Schema     json.RawMessage
+	SchemaName string
+
+	// Sampling and limit knobs. Pointer fields distinguish "unset" (provider
+	// default) from an explicit zero.
+	Temperature *float64
+	TopP        *float64
+
+	// MaxTokens caps the response length; 0 means provider default.
+	MaxTokens int
+
+	// StopSequences halt generation when emitted.
+	StopSequences []string
+}
+
+// Option mutates a Request before it is sent. Options passed to Generate or
+// Stream are applied to a copy of the request, so a Request value can be
+// safely reused across calls.
+//
+// The copy made by Request.Apply is shallow: slice fields still share their
+// backing arrays with the original, so an Option should replace or re-slice
+// such fields rather than write through them.
+type Option func(*Request)
+
+// WithSystem returns an Option that replaces the request's system prompt
+// with s.
+func WithSystem(s string) Option {
+	return func(req *Request) {
+		req.System = s
+	}
+}
+
+// WithTools appends tools to the request.
+//
+// The existing Tools slice is capped at its length before appending, so the
+// append always allocates fresh storage instead of writing into spare
+// capacity. Why: options run on a shallow copy of the caller's Request; a
+// plain append could scribble on the caller's backing array, letting
+// concurrent or successive calls that reuse one Request overwrite each
+// other's tools.
+func WithTools(tools ...Tool) Option {
+	return func(r *Request) {
+		r.Tools = append(r.Tools[:len(r.Tools):len(r.Tools)], tools...)
+	}
+}
+
+// WithToolbox appends every tool in the toolbox to the request, in the
+// toolbox's insertion order. The toolbox is read when the option is
+// applied, not when it is constructed. A nil toolbox contributes nothing.
+//
+// As with WithTools, the existing Tools slice is capped at its length
+// before appending, so the caller's backing array is never written to
+// through the shallow request copy.
+func WithToolbox(b *Toolbox) Option {
+	return func(r *Request) {
+		if b == nil {
+			return
+		}
+		r.Tools = append(r.Tools[:len(r.Tools):len(r.Tools)], b.Tools()...)
+	}
+}
+
+// WithToolChoice returns an Option that sets the tool-choice policy:
+// "auto", "none", "required", or the name of a specific tool.
+func WithToolChoice(choice string) Option {
+	return func(req *Request) {
+		req.ToolChoice = choice
+	}
+}
+
+// WithSchema returns an Option that requests structured output conforming
+// to the given JSON Schema. name is optional; providers that require a
+// schema name fall back to "response" when it is empty.
+func WithSchema(schema json.RawMessage, name string) Option {
+	return func(req *Request) {
+		req.Schema, req.SchemaName = schema, name
+	}
+}
+
+// WithTemperature returns an Option that sets an explicit sampling
+// temperature (a non-nil pointer, so 0 is distinguishable from "unset").
+func WithTemperature(t float64) Option {
+	value := &t
+	return func(req *Request) {
+		req.Temperature = value
+	}
+}
+
+// WithTopP returns an Option that sets an explicit nucleus-sampling top-p
+// (a non-nil pointer, so 0 is distinguishable from "unset").
+func WithTopP(p float64) Option {
+	value := &p
+	return func(req *Request) {
+		req.TopP = value
+	}
+}
+
+// WithMaxTokens returns an Option that caps the response length at n
+// tokens.
+func WithMaxTokens(n int) Option {
+	return func(req *Request) {
+		req.MaxTokens = n
+	}
+}
+
+// WithStopSequences returns an Option that replaces the request's stop
+// sequences with stops (the slice is stored as given, not copied).
+func WithStopSequences(stops ...string) Option {
+	return func(req *Request) {
+		req.StopSequences = stops
+	}
+}
+
+// Apply returns a copy of the request with all options applied. Providers
+// and wrappers call this once at the top of Generate/Stream.
+//
+// The receiver is a value, so the caller's Request is not modified; the
+// copy is shallow (slice fields share backing arrays with the original).
+// Options run in order, and nil options are skipped rather than called, so
+// a conditionally-built option list with an unset slot does not panic.
+func (r Request) Apply(opts ...Option) Request {
+	for _, opt := range opts {
+		if opt == nil {
+			continue
+		}
+		opt(&r)
+	}
+	return r
+}
@@ -0,0 +1,73 @@
+package llm
+
+import "strings"
+
+// FinishReason explains why generation stopped. Values are canonical
+// strings; provider-specific reasons with no mapping use FinishOther.
+type FinishReason string
+
+const (
+	// FinishStop: the model completed its answer (or hit a stop sequence).
+	FinishStop FinishReason = "stop"
+	// FinishLength: the MaxTokens (or context) limit was hit.
+	FinishLength FinishReason = "length"
+	// FinishToolCalls: the model stopped to request tool invocations.
+	FinishToolCalls FinishReason = "tool_calls"
+	// FinishContentFilter: the provider suppressed content.
+	FinishContentFilter FinishReason = "content_filter"
+	// FinishOther: any provider-specific reason not mapped above.
+	FinishOther FinishReason = "other"
+)
+
+// Usage reports token accounting for one request. The zero value means no
+// tokens counted and is a valid accumulator for Add.
+type Usage struct {
+	// InputTokens and OutputTokens are the token counts for the request
+	// input and the generated output respectively.
+	InputTokens  int
+	OutputTokens int
+}
+
+// Total reports the combined token count: input tokens plus output tokens.
+func (u Usage) Total() int {
+	sum := u.InputTokens
+	sum += u.OutputTokens
+	return sum
+}
+
+// Add folds o into u in place, summing each counter (used by agents
+// summing steps).
+func (u *Usage) Add(o Usage) {
+	u.InputTokens, u.OutputTokens = u.InputTokens+o.InputTokens, u.OutputTokens+o.OutputTokens
+}
+
+// Response is the canonical generation result.
+type Response struct {
+	// Parts is the response content (text, and for multimodal-output models,
+	// other media).
+	Parts []Part
+
+	// ToolCalls are the tool invocations the model requested, if any.
+	ToolCalls []ToolCall
+
+	// FinishReason explains why generation stopped; Usage is the token
+	// accounting for this request.
+	FinishReason FinishReason
+	Usage        Usage
+
+	// Model identifies the resolved target that produced this response as
+	// "provider/model-id". With failover chains this names the element that
+	// actually served the request.
+	Model string
+
+	// Raw is the provider-native response object, an escape hatch for
+	// provider-specific fields. May be nil; never required for normal use.
+	Raw any
+}
+
+// Text joins the text of every TextPart in the response, in order, with no
+// separator. Non-text parts are skipped.
+func (r *Response) Text() string {
+	var text strings.Builder
+	for i := range r.Parts {
+		textPart, isText := r.Parts[i].(TextPart)
+		if !isText {
+			continue
+		}
+		text.WriteString(textPart.Text)
+	}
+	return text.String()
+}
+
+// Message repackages the response as a RoleAssistant message suitable for
+// appending to a conversation history. Parts and ToolCalls are carried over
+// as-is (the slices are shared with the response, not copied).
+func (r *Response) Message() Message {
+	var msg Message
+	msg.Role = RoleAssistant
+	msg.Parts = r.Parts
+	msg.ToolCalls = r.ToolCalls
+	return msg
+}
@@ -0,0 +1,28 @@
+package llm
+
+// StreamEvent is one increment of a streaming response.
+//
+// Exactly one field group is meaningful per event: a text delta, a completed
+// tool call, or the final response. Tool-call arguments are buffered by the
+// provider until complete — consumers never see partial JSON.
+//
+// Consumers can tell the kinds apart by checking Response and ToolCall for
+// nil; an event with both nil carries TextDelta.
+type StreamEvent struct {
+	// TextDelta is a fragment of assistant text.
+	TextDelta string
+
+	// ToolCall, when non-nil, is a fully-assembled tool call.
+	ToolCall *ToolCall
+
+	// Response, when non-nil, is the final accumulated response (content,
+	// tool calls, finish reason, usage). It is always the last event.
+	Response *Response
+}
+
+// Stream delivers a response incrementally.
+//
+// Next returns io.EOF after the final event (the one carrying Response).
+// Close releases the underlying connection and is safe to call at any time,
+// including after io.EOF or concurrently with Next returning.
+type Stream interface {
+	// Next returns the next event, or io.EOF once the final event has
+	// already been delivered.
+	Next() (StreamEvent, error)
+	// Close releases the underlying connection.
+	Close() error
+}
@@ -0,0 +1,165 @@
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+)
+
+// Tool is a callable capability exposed to a model: a name, a description,
+// JSON-Schema parameters, and a Go handler. Providers map this one canonical
+// shape onto their native function-calling formats.
+type Tool struct {
+	// Name identifies the tool. Toolbox.Add rejects empty names and names
+	// already present in the toolbox.
+	Name        string
+	Description string
+
+	// Parameters is a JSON Schema object describing the tool's arguments.
+	// nil means the tool takes no arguments.
+	Parameters json.RawMessage
+
+	// Handler executes the tool. args is the raw JSON arguments object the
+	// model supplied ("{}" when the call carried none). ExecuteTool turns
+	// the returned value into ToolResult.Content: string and
+	// json.RawMessage values pass through verbatim, nil becomes "null", and
+	// anything else is JSON-encoded. A nil Handler, a returned error, or a
+	// panic yields an IsError result.
+	Handler func(ctx context.Context, args json.RawMessage) (any, error)
+}
+
+// ToolCall is a model's request to invoke a tool.
+type ToolCall struct {
+	// ID is the provider-assigned call id; majordomo synthesizes one for
+	// providers that do not supply ids. ToolResult.ID must echo it.
+	// Name is the name of the tool to invoke.
+	ID   string
+	Name string
+
+	// Arguments is the raw JSON arguments object. When empty, ExecuteTool
+	// passes "{}" to the handler instead.
+	Arguments json.RawMessage
+}
+
+// ToolResult is the outcome of executing a ToolCall, sent back to the model.
+type ToolResult struct {
+	// ID matches the originating ToolCall.ID. Name is the tool name;
+	// ExecuteTool copies both from the call.
+	ID   string
+	Name string
+
+	// Content is the result serialized as text (JSON for structured values).
+	Content string
+
+	// IsError marks the result as a failure; the content then describes the
+	// error so the model can react (retry, apologize, try another tool).
+	IsError bool
+}
+
+// Toolbox is a named, ordered set of tools.
+//
+// Why: agents compose their available tools from several sources (multiple
+// toolboxes plus skills); a small named container with duplicate detection
+// keeps that merge explicit and debuggable.
+type Toolbox struct {
+	// name labels the toolbox in error messages.
+	name string
+	// order records tool names in insertion order; tools holds the tools
+	// keyed by name. The two are kept in step by Add.
+	order []string
+	tools map[string]Tool
+}
+
+// NewToolbox builds a toolbox with the given name and registers the
+// initial tools in argument order. An invalid tool (empty or duplicate
+// name) panics with Add's error: toolboxes are assembled at startup, and a
+// silently shadowed tool is a programming error worth failing loudly on.
+func NewToolbox(name string, tools ...Tool) *Toolbox {
+	box := &Toolbox{
+		name:  name,
+		tools: make(map[string]Tool, len(tools)),
+	}
+	for i := range tools {
+		err := box.Add(tools[i])
+		if err != nil {
+			panic(err)
+		}
+	}
+	return box
+}
+
+// Name reports the name the toolbox was created with.
+func (b *Toolbox) Name() string {
+	return b.name
+}
+
+// Add registers a tool, rejecting empty or duplicate names.
+//
+// On success the tool is appended to the insertion order. On error the
+// toolbox is left unchanged. Add also works on a zero-value Toolbox: the
+// tool map is created on first use, since writing to a nil map would panic.
+func (b *Toolbox) Add(t Tool) error {
+	if t.Name == "" {
+		return fmt.Errorf("toolbox %q: tool with empty name", b.name)
+	}
+	// Reading a nil map is safe, so the duplicate check needs no guard.
+	if _, exists := b.tools[t.Name]; exists {
+		return fmt.Errorf("toolbox %q: duplicate tool %q", b.name, t.Name)
+	}
+	if b.tools == nil {
+		b.tools = make(map[string]Tool)
+	}
+	b.tools[t.Name] = t
+	b.order = append(b.order, t.Name)
+	return nil
+}
+
+// Tools returns a freshly allocated slice of the registered tools, in the
+// order they were added.
+func (b *Toolbox) Tools() []Tool {
+	list := make([]Tool, len(b.order))
+	for i, toolName := range b.order {
+		list[i] = b.tools[toolName]
+	}
+	return list
+}
+
+// Get looks up a tool by name. The boolean reports whether it was found;
+// when it was not, the returned Tool is the zero value.
+func (b *Toolbox) Get(name string) (Tool, bool) {
+	if tool, found := b.tools[name]; found {
+		return tool, true
+	}
+	return Tool{}, false
+}
+
+// Execute looks up the tool named by the call, runs it via ExecuteTool, and
+// packages the outcome as a ToolResult. It never panics and never returns
+// an error: an unknown tool, a handler error, or a handler panic all become
+// IsError results so an agent loop can always continue.
+func (b *Toolbox) Execute(ctx context.Context, call ToolCall) ToolResult {
+	if tool, found := b.tools[call.Name]; found {
+		return ExecuteTool(ctx, tool, call)
+	}
+	return ToolResult{
+		ID:      call.ID,
+		Name:    call.Name,
+		Content: fmt.Sprintf("unknown tool %q", call.Name),
+		IsError: true,
+	}
+}
+
+// ExecuteTool runs one tool for the given call and always returns a
+// ToolResult carrying the call's ID and Name.
+//
+// Failures never escape: a missing handler, a handler error, a value that
+// cannot be JSON-encoded, and a panic are each reported as an IsError
+// result whose Content describes the problem. On success Content is the
+// handler's value: strings and json.RawMessage verbatim, nil as "null",
+// anything else JSON-encoded. Empty call arguments reach the handler as
+// "{}".
+func ExecuteTool(ctx context.Context, t Tool, call ToolCall) (res ToolResult) {
+	res.ID, res.Name = call.ID, call.Name
+
+	// fail marks the named result as an error with the given description.
+	fail := func(content string) ToolResult {
+		res.Content, res.IsError = content, true
+		return res
+	}
+
+	// The named result lets the deferred recover rewrite what is returned.
+	defer func() {
+		if p := recover(); p != nil {
+			fail(fmt.Sprintf("tool %q panicked: %v", call.Name, p))
+		}
+	}()
+
+	if t.Handler == nil {
+		return fail(fmt.Sprintf("tool %q has no handler", call.Name))
+	}
+
+	rawArgs := call.Arguments
+	if len(rawArgs) == 0 {
+		rawArgs = json.RawMessage("{}")
+	}
+
+	value, err := t.Handler(ctx, rawArgs)
+	if err != nil {
+		return fail(err.Error())
+	}
+
+	if value == nil {
+		res.Content = "null"
+		return res
+	}
+	if text, isString := value.(string); isString {
+		res.Content = text
+		return res
+	}
+	if raw, isRaw := value.(json.RawMessage); isRaw {
+		res.Content = string(raw)
+		return res
+	}
+
+	encoded, err := json.Marshal(value)
+	if err != nil {
+		return fail(fmt.Sprintf("tool %q returned unencodable value: %v", call.Name, err))
+	}
+	res.Content = string(encoded)
+	return res
+}
@@ -0,0 +1,98 @@
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"strings"
+	"testing"
+)
+
+// TestToolboxAddRejectsDuplicatesAndEmptyNames verifies that Add accepts a
+// fresh name and rejects a repeated or empty one.
+func TestToolboxAddRejectsDuplicatesAndEmptyNames(t *testing.T) {
+	box := NewToolbox("box")
+
+	first := box.Add(Tool{Name: "a"})
+	if first != nil {
+		t.Fatalf("Add: %v", first)
+	}
+
+	duplicate := box.Add(Tool{Name: "a"})
+	if duplicate == nil {
+		t.Error("duplicate name should error")
+	}
+
+	unnamed := box.Add(Tool{})
+	if unnamed == nil {
+		t.Error("empty name should error")
+	}
+}
+
+// TestToolboxOrderPreserved verifies that Tools returns tools in insertion
+// order rather than sorted or map order.
+func TestToolboxOrderPreserved(t *testing.T) {
+	box := NewToolbox("box", Tool{Name: "z"}, Tool{Name: "a"}, Tool{Name: "m"})
+
+	var names []string
+	listed := box.Tools()
+	for i := range listed {
+		names = append(names, listed[i].Name)
+	}
+
+	const want = "z,a,m"
+	got := strings.Join(names, ",")
+	if got != want {
+		t.Errorf("order = %s, want %s", got, want)
+	}
+}
+
+// TestExecuteUnknownTool verifies that executing an unregistered tool
+// yields an error result that names the tool.
+func TestExecuteUnknownTool(t *testing.T) {
+	box := NewToolbox("box")
+	call := ToolCall{ID: "1", Name: "missing"}
+	got := box.Execute(context.Background(), call)
+	if !(got.IsError && strings.Contains(got.Content, "missing")) {
+		t.Errorf("result = %+v, want unknown-tool error", got)
+	}
+}
+
+// TestExecuteHandlerOutcomes verifies how ExecuteTool converts each kind of
+// handler outcome into ToolResult content and the IsError flag.
+func TestExecuteHandlerOutcomes(t *testing.T) {
+	// fixed builds a tool whose handler ignores its input and returns the
+	// given value/error pair.
+	fixed := func(value any, failure error) Tool {
+		handler := func(context.Context, json.RawMessage) (any, error) {
+			return value, failure
+		}
+		return Tool{Name: "t", Handler: handler}
+	}
+
+	type outcome struct {
+		label       string
+		tool        Tool
+		wantContent string
+		wantErr     bool
+	}
+	payload := struct {
+		N int `json:"n"`
+	}{4}
+	outcomes := []outcome{
+		{"string passthrough", fixed("plain", nil), "plain", false},
+		{"struct json-encoded", fixed(payload, nil), `{"n":4}`, false},
+		{"raw message passthrough", fixed(json.RawMessage(`{"k":1}`), nil), `{"k":1}`, false},
+		{"nil becomes null", fixed(nil, nil), "null", false},
+		{"handler error", fixed(nil, errors.New("boom")), "boom", true},
+		{"unencodable value", fixed(func() {}, nil), "unencodable", true},
+		{"no handler", Tool{Name: "t"}, "no handler", true},
+	}
+
+	for _, oc := range outcomes {
+		got := ExecuteTool(context.Background(), oc.tool, ToolCall{ID: "c1", Name: "t"})
+		if got.IsError != oc.wantErr {
+			t.Errorf("%s: IsError = %v, want %v (%+v)", oc.label, got.IsError, oc.wantErr, got)
+		}
+		if !strings.Contains(got.Content, oc.wantContent) {
+			t.Errorf("%s: content = %q, want it to contain %q", oc.label, got.Content, oc.wantContent)
+		}
+		if got.ID != "c1" {
+			t.Errorf("%s: result ID = %q, want c1", oc.label, got.ID)
+		}
+	}
+}
+
+// TestExecuteRecoversPanic verifies that a panicking handler is reported
+// as an error result carrying the panic value.
+func TestExecuteRecoversPanic(t *testing.T) {
+	exploding := func(context.Context, json.RawMessage) (any, error) {
+		panic("kaboom")
+	}
+	tool := Tool{Name: "t", Handler: exploding}
+
+	got := ExecuteTool(context.Background(), tool, ToolCall{ID: "1", Name: "t"})
+	if !(got.IsError && strings.Contains(got.Content, "kaboom")) {
+		t.Errorf("result = %+v, want recovered panic error", got)
+	}
+}
+
+// TestExecuteEmptyArgsBecomeEmptyObject verifies that a call without
+// arguments reaches the handler as the empty JSON object.
+func TestExecuteEmptyArgsBecomeEmptyObject(t *testing.T) {
+	var seen json.RawMessage
+	capture := func(_ context.Context, args json.RawMessage) (any, error) {
+		seen = args
+		return "ok", nil
+	}
+	tool := Tool{Name: "t", Handler: capture}
+
+	ExecuteTool(context.Background(), tool, ToolCall{ID: "1", Name: "t"})
+
+	if string(seen) != "{}" {
+		t.Errorf("args = %q, want {}", seen)
+	}
+}