// Package llmmeta is the shared meta-LLM helper used by the v12 // authoring tools (summarize, translate, extract_entities, classify). // // Why a dedicated package: each of those four tools makes "one fast-tier // LLM call → typed result", with shared concerns (tier allowlist, // ledger row, JSON-retry on malformed output). Centralising the pattern // stops every tool from re-implementing the surrounding bookkeeping and // keeps the audit trail uniform. // // The helper itself does NOT know about the four tools — it just exposes // a Call(ctx, CallSpec) → CallResult shape. Each tool builds its own // prompt + parses the typed result. The helper records the meta-call // ledger row on every call, success or failure. // // Concurrency / lanes: the helper resolves the tier to an llm.Model via // model.ParseModelForContext and uses model.Generate. Lane routing is // already baked in at the LLM transport layer (see // pkg/logic/llms/lane_transport.go) so each Generate call automatically // goes through the right lane without further plumbing. Usage recording // is automatic too: parsed models are instrumented by pkg/logic/llms, // so the helper does NOT call model.RecordUsage itself. // // Tier allowlist: convar `skills.llm_meta.allowed_tiers` (default // `["fast"]`) controls which tiers a meta-tool may use. A request for // a disallowed tier returns error_kind="tier_not_allowed" WITHOUT // making the call AND WITHOUT recording a ledger row (the call did // not happen). // // Test: helper_test.go covers tier allowed, tier rejected, JSON // retry path, malformed-twice path, and ledger-row emission semantics. package llmmeta import ( "context" "encoding/json" "fmt" "strings" "time" llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" "github.com/google/uuid" "gitea.stevedudenhoeffer.com/steve/executus/model" ) // MetaCall is the domain row written to skill_llm_meta_calls on every // helper call. // // Why a dedicated table (not skill_run_logs): per-skill token // aggregation is cleaner with typed columns. Folding meta-calls into // the generic event log would force a SUM-from-JSON path on every // dashboard query. // // Why the field set is tight (no payload columns): the request bodies // can be 32KB+. The agent's main run already captures system_prompt // + user_message in the trace; storing them again here would double // the audit footprint with no diagnostic value (the meta-call's // inputs are derivable from the parent run's tool-call args). type MetaCall struct { ID string RunID string SkillID string ToolName string TierUsed string // "fast" / "standard" ModelUsed string // resolved provider/model InputTokens int OutputTokens int DurationMs int Success bool ErrorKind string // empty on success; one of the sentinel kinds otherwise CreatedAt time.Time } // Storage is the narrow surface the helper uses to persist meta-call // ledger rows. Production wires a thin adapter around the skills GORM // storage; tests substitute a fake. // // Why an interface (vs depending on pkg/logic/skills.Storage): the // skills package imports skilltools (tool registry); having // skilltools/llmmeta depend back on skills would form an import // cycle. A narrow interface mirrored across the boundary is the // project's standard cycle-break pattern (see KVStorage / FileStorage // in pkg/skilltools/tools/). type Storage interface { RecordMetaCall(ctx context.Context, call MetaCall) error } // ConvarReader is the narrow surface the helper uses to read // `skills.llm_meta.allowed_tiers`. The convar package is database- // backed; tests pass a static fake. // // Why an interface (vs reading convars directly): unit tests want to // fake the allowlist without spinning up a convar manager. type ConvarReader interface { // AllowedTiers returns the list of tier names a meta-tool may use. // Default ["fast"]. AllowedTiers(ctx context.Context) []string } // ConvarReaderFunc adapts a closure into a ConvarReader. Useful in // production wiring (mort.go) where the underlying access is a // single line of logic. type ConvarReaderFunc func(ctx context.Context) []string // AllowedTiers satisfies ConvarReader. func (f ConvarReaderFunc) AllowedTiers(ctx context.Context) []string { if f == nil { return []string{"fast"} } return f(ctx) } // Helper makes one fast-tier LLM call with surrounding bookkeeping // (tier allowlist, JSON retry, ledger row). // // Construct once at boot; all four meta-tools share the same Helper. type Helper struct { storage Storage convars ConvarReader } // New constructs a Helper. storage MUST be non-nil; passing nil makes // every Call write a no-op ledger row (callers that need a fully no-op // helper should instead avoid registering the tool). // // convars may be nil — the helper falls back to the default allowlist // `["fast"]`. // // Why a constructor with explicit deps (vs Helper{...} struct // initialiser): forces the deployment-time decision about which // dependencies are wired vs nil-safe at the construction call site, // not at the call site of each tool. func New(storage Storage, convars ConvarReader) *Helper { return &Helper{ storage: storage, convars: convars, } } // CallSpec is the per-call input. // // Why every field is explicit (vs builder pattern): the four meta-tools // each populate the spec in one place; a struct literal at the call // site is more readable than chained setters. type CallSpec struct { // Tier is the tier alias to use ("fast" / "standard"). Empty falls // back to "fast". Disallowed tiers (per the convar allowlist) cause // Call to return CallResult{Success: false, ErrorKind: // "tier_not_allowed"} WITHOUT making the LLM call AND without // writing a ledger row (the call did not happen). Tier string // SystemPrompt is the system message. May be empty. SystemPrompt string // UserPrompt is the user message. Required. UserPrompt string // MaxOutputTokens caps the response. 0 disables the cap (provider // default). The helper uses this both to bound the cost estimate // AND to set llm.WithMaxTokens on the request. MaxOutputTokens int // ResponseFormat is "text" or "json". When "json", the helper // attempts to parse the response into JSON. Other values fall // through as "text". ResponseFormat string // RetryOnMalformedJSON, when true and ResponseFormat=="json", // retries the call ONCE with a stricter JSON-only prompt prefix // when the first response fails to parse. Second-failure returns // CallResult{Success: true, Parsed: nil, ErrorKind: // "malformed_json"} so callers can fall back to result.Text. RetryOnMalformedJSON bool // ToolName is the meta-tool name recorded in the ledger row // ("summarize", "translate", "extract_entities", "classify"). The // helper does not branch on this value. ToolName string // RunID is the calling skill run ID. Recorded in the ledger row; // also used by the cost-cap callback to find the running 7-day // total. RunID string // SkillID is the calling skill ID. Recorded in the ledger row; // passed to the cost-cap callback. SkillID string // CallerID is the Discord member ID that triggered the parent // skill run. Passed to the cost-cap callback so the per-user // 7-day cap can be evaluated. CallerID string } // CallResult is the per-call output. // // Why text + parsed (vs only one): JSON-format calls expose both the // raw response (in .Text) and the parsed map (in .Parsed). Text-format // calls leave .Parsed nil. Callers requesting JSON that fails to parse // twice get .Text populated and ErrorKind="malformed_json" so they // can fall back to text-mode without an error path. type CallResult struct { // Text is the raw response text from the LLM. Populated on every // successful call (success=true) AND when JSON parsing failed // twice (success=true, parsed=nil, error_kind="malformed_json"). // Empty on tier_not_allowed rejections (no LLM call happened). Text string // Parsed is the JSON-decoded response. nil for text-format calls, // nil for failed JSON parses, populated for successful JSON // responses. The interior shape is whatever the LLM returned; the // caller is responsible for asserting a typed view. Parsed any // InputTokens is the tokens billed against the input. 0 when the // provider didn't surface usage. InputTokens int // OutputTokens is the tokens billed against the output. 0 when the // provider didn't surface usage. OutputTokens int // DurationMs is wall-clock duration of the LLM call (or call+retry // in the JSON-retry case). DurationMs int // ModelUsed is the resolved provider/model string ("anthropic/ // claude-haiku-4-5-20251001"). Populated on every actual LLM call; // empty on tier_not_allowed rejections. ModelUsed string // Success reports whether the LLM call returned a usable response. // True on happy-path AND on malformed-json second-failure (the // caller can fall back to .Text). False on transport errors, // tier_not_allowed, llm_unavailable. Success bool // ErrorKind, when non-empty, is one of: // - "tier_not_allowed" → no call, no ledger row // - "llm_unavailable" → call attempted, ledger row written // - "malformed_json" → call succeeded but JSON parse failed ErrorKind string } // Sentinel error_kind values for CallResult.ErrorKind. const ( ErrorKindTierNotAllowed = "tier_not_allowed" ErrorKindLLMUnavailable = "llm_unavailable" ErrorKindMalformedJSON = "malformed_json" ) // Call performs the meta-LLM call and returns a typed CallResult. // // Why no error return (vs an error second value): every meaningful // failure is captured as a CallResult.ErrorKind so the caller's branch // logic stays single-pathed. Internal transport errors are surfaced // as ErrorKind=llm_unavailable. The function only returns a non-nil // error for argument-validation failures (empty UserPrompt) — a // programmer error the caller would have to fix anyway. // // Test: helper_test.go covers all outcomes (tier_not_allowed, happy // text, happy json, malformed_json retry-pass, malformed_json // retry-fail, llm_unavailable). func (h *Helper) Call(ctx context.Context, spec CallSpec) (CallResult, error) { if strings.TrimSpace(spec.UserPrompt) == "" { return CallResult{}, fmt.Errorf("llmmeta: user_prompt required") } tier := strings.TrimSpace(spec.Tier) if tier == "" { tier = "fast" } // Tier allowlist: rejected tiers do NOT make the call AND do NOT // record a ledger row. if !h.tierAllowed(ctx, tier) { return CallResult{ Success: false, ErrorKind: ErrorKindTierNotAllowed, }, nil } resolvedModel := model.ResolveModelName(tier) // Resolve model. ParseModelForContext attaches the resolved model // name to ctx (for usage attribution) AND returns the llm.Model // whose Generate already routes through the lane wrapper. ctx, model, err := model.ParseModelForContext(ctx, tier) if err != nil { // Tier convar mis-set: surface as tier_not_allowed to the // caller (the agent's recovery path is the same as for an // admin-disabled tier) but DO record the failure for the // admin who needs to fix the convar. h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, Success: false, ErrorKind: ErrorKindTierNotAllowed, CreatedAt: time.Now(), }) return CallResult{ Success: false, ErrorKind: ErrorKindTierNotAllowed, }, nil } // First call. start := time.Now() systemPrompt := spec.SystemPrompt userMessage := spec.UserPrompt opts := []llm.Option{} if spec.MaxOutputTokens > 0 { opts = append(opts, llm.WithMaxTokens(spec.MaxOutputTokens)) } text, usage, llmErr := h.complete(ctx, model, systemPrompt, userMessage, opts) if llmErr != nil { duration := int(time.Since(start) / time.Millisecond) h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, DurationMs: duration, Success: false, ErrorKind: ErrorKindLLMUnavailable, CreatedAt: time.Now(), }) return CallResult{ Success: false, ErrorKind: ErrorKindLLMUnavailable, ModelUsed: resolvedModel, DurationMs: duration, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, }, nil } // Determine outcome based on response format. parsed, parsedOK := tryParseJSON(text, spec.ResponseFormat) wantJSON := strings.EqualFold(spec.ResponseFormat, "json") if !wantJSON || parsedOK { // Happy path (text mode OR JSON mode that parsed first try). duration := int(time.Since(start) / time.Millisecond) h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, DurationMs: duration, Success: true, CreatedAt: time.Now(), }) return CallResult{ Text: text, Parsed: parsed, Success: true, ModelUsed: resolvedModel, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, DurationMs: duration, }, nil } // JSON requested but first response failed to parse. if !spec.RetryOnMalformedJSON { duration := int(time.Since(start) / time.Millisecond) h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, DurationMs: duration, Success: true, ErrorKind: ErrorKindMalformedJSON, CreatedAt: time.Now(), }) return CallResult{ Text: text, Success: true, ErrorKind: ErrorKindMalformedJSON, ModelUsed: resolvedModel, InputTokens: usage.InputTokens, OutputTokens: usage.OutputTokens, DurationMs: duration, }, nil } // Retry once with stricter JSON-only prompt prefix. stricterPrompt := "Return ONLY valid JSON. No prose, no markdown fencing.\n\n" + userMessage text2, usage2, llmErr2 := h.complete(ctx, model, systemPrompt, stricterPrompt, opts) combinedUsage := Tokens{ InputTokens: usage.InputTokens + usage2.InputTokens, OutputTokens: usage.OutputTokens + usage2.OutputTokens, } duration := int(time.Since(start) / time.Millisecond) if llmErr2 != nil { // Retry call itself failed transport-wise. Record the round- // trip tokens and surface llm_unavailable. h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, Success: false, ErrorKind: ErrorKindLLMUnavailable, CreatedAt: time.Now(), }) return CallResult{ Text: text, Success: false, ErrorKind: ErrorKindLLMUnavailable, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, }, nil } parsed2, parsedOK2 := tryParseJSON(text2, "json") if parsedOK2 { h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, Success: true, CreatedAt: time.Now(), }) return CallResult{ Text: text2, Parsed: parsed2, Success: true, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, }, nil } // Second-failure path. Caller can fall back to result.Text. h.recordLedger(ctx, MetaCall{ ID: uuid.NewString(), RunID: spec.RunID, SkillID: spec.SkillID, ToolName: spec.ToolName, TierUsed: tier, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, Success: true, ErrorKind: ErrorKindMalformedJSON, CreatedAt: time.Now(), }) return CallResult{ Text: text2, Success: true, ErrorKind: ErrorKindMalformedJSON, ModelUsed: resolvedModel, InputTokens: combinedUsage.InputTokens, OutputTokens: combinedUsage.OutputTokens, DurationMs: duration, }, nil } // Tokens is the input/output token count returned by the LLM round- // trip. Mirrors llm.Usage's two cost-bearing fields. Exported so // downstream test code (the four meta-tools' tests, integration // tests) can use SetCompleteForTest. type Tokens struct { InputTokens int OutputTokens int } // CompleteFn is the seam used by tests to fake the LLM round-trip // without spinning up a real provider. Exported for tests in other // packages (the four meta-tools live in pkg/skilltools/tools/). type CompleteFn func(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) // completeOverride is set in tests via SetCompleteForTest. nil falls // back to the real model.Generate path. var completeOverride CompleteFn // complete is the actual LLM round-trip. Calls model.Generate (which // already routes through the lane transport wrapper) and returns the // text + usage + error. // // Why not call model.SimpleCall: SimpleCall doesn't surface Usage; we // need the input/output token counts for the ledger row. // // Usage attribution to the per-user / per-skill dashboards is handled // by the instrumented model that model.ParseModelForContext returns — // a manual model.RecordUsage here would double-count. func (h *Helper) complete(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) { if completeOverride != nil { return completeOverride(ctx, model, systemPrompt, userMessage, opts) } req := llm.Request{ System: systemPrompt, Messages: []llm.Message{llm.UserText(userMessage)}, } resp, err := model.Generate(ctx, req, opts...) if err != nil { return "", Tokens{}, err } usage := Tokens{ InputTokens: resp.Usage.InputTokens, OutputTokens: resp.Usage.OutputTokens, } return resp.Text(), usage, nil } // SetCompleteForTest installs a fake completer used by Call. Returns a // restore function that the test deferes to revert the override. // // Why exported (vs in a _test.go file): the four meta-tools' tests live // in pkg/skilltools/tools/, in a different package than the helper. // They need a way to fake the LLM without depending on a real model. func SetCompleteForTest(fn CompleteFn) func() { prev := completeOverride completeOverride = fn return func() { completeOverride = prev } } // tierAllowed reports whether the given tier appears in the configured // allowlist. Empty allowlist defaults to ["fast"]. func (h *Helper) tierAllowed(ctx context.Context, tier string) bool { var allowed []string if h.convars != nil { allowed = h.convars.AllowedTiers(ctx) } if len(allowed) == 0 { allowed = []string{"fast"} } for _, t := range allowed { if strings.EqualFold(strings.TrimSpace(t), tier) { return true } } return false } // recordLedger writes one meta-call row. Storage failures are logged // at the storage layer; the helper does not propagate them — meta-call // accounting MUST NOT break user-visible execution. func (h *Helper) recordLedger(ctx context.Context, call MetaCall) { if h.storage == nil { return } _ = h.storage.RecordMetaCall(ctx, call) } // tryParseJSON attempts to decode text as JSON. Returns the parsed // value (any) and ok=true on success. ok=false on failure or when // format is not "json". // // Why we accept arbitrary JSON shapes (vs requiring an object): the // extract_entities tool returns objects, but classify returns objects // with arrays inside. Accepting `any` keeps the helper agnostic to the // caller's downstream typing. // // Tolerance: strips a leading "```json" code fence + matching closing // fence so the agent can include surrounding markdown without // breaking parse. The stricter retry prompt explicitly asks for no // fence; this tolerance is for the first-attempt path. func tryParseJSON(text, format string) (any, bool) { if !strings.EqualFold(format, "json") { return nil, false } trimmed := strings.TrimSpace(text) // Strip optional ```json ... ``` fence. if strings.HasPrefix(trimmed, "```") { // Drop opening fence (with or without language tag). if idx := strings.Index(trimmed, "\n"); idx >= 0 { trimmed = trimmed[idx+1:] } // Drop trailing fence. if idx := strings.LastIndex(trimmed, "```"); idx >= 0 { trimmed = trimmed[:idx] } trimmed = strings.TrimSpace(trimmed) } var parsed any if err := json.Unmarshal([]byte(trimmed), &parsed); err != nil { return nil, false } return parsed, true }