// executus/llmmeta/helper.go

// Package llmmeta is the shared meta-LLM helper used by the v12
// authoring tools (summarize, translate, extract_entities, classify).
//
// Why a dedicated package: each of those four tools makes "one fast-tier
// LLM call → typed result", with shared concerns (tier allowlist,
// ledger row, JSON-retry on malformed output). Centralising the pattern
// stops every tool from re-implementing the surrounding bookkeeping and
// keeps the audit trail uniform.
//
// The helper itself does NOT know about the four tools — it just exposes
// a Call(ctx, CallSpec) → CallResult shape. Each tool builds its own
// prompt + parses the typed result. The helper records one meta-call
// ledger row for every call that passes argument validation and the
// tier allowlist, whether the LLM round-trip succeeds or fails.
//
// Concurrency / lanes: the helper resolves the tier to an llm.Model via
// model.ParseModelForContext and uses model.Generate. Lane routing is
// already baked in at the LLM transport layer (see
// pkg/logic/llms/lane_transport.go) so each Generate call automatically
// goes through the right lane without further plumbing. Usage recording
// is automatic too: parsed models are instrumented by pkg/logic/llms,
// so the helper does NOT call model.RecordUsage itself.
//
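// Tier allowlist: convar `skills.llm_meta.allowed_tiers` (default
// `["fast"]`) controls which tiers a meta-tool may use. A request for
// a disallowed tier returns error_kind="tier_not_allowed" WITHOUT
// making the call AND WITHOUT recording a ledger row (the call did
// not happen). An allowed tier whose model cannot be resolved also
// returns error_kind="tier_not_allowed", but that case DOES record a
// failure row so an admin can spot the misconfiguration.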
//
// Test: helper_test.go covers tier allowed, tier rejected, JSON
// retry path, malformed-twice path, and ledger-row emission semantics.
package llmmeta

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"github.com/google/uuid"

	"gitea.stevedudenhoeffer.com/steve/executus/model"
)

// MetaCall is the domain row written to skill_llm_meta_calls on every
// helper call.
//
// Why a dedicated table (not skill_run_logs): per-skill token
// aggregation is cleaner with typed columns. Folding meta-calls into
// the generic event log would force a SUM-from-JSON path on every
// dashboard query.
//
// Why the field set is tight (no payload columns): the request bodies
// can be 32KB+. The agent's main run already captures system_prompt
// + user_message in the trace; storing them again here would double
// the audit footprint with no diagnostic value (the meta-call's
// inputs are derivable from the parent run's tool-call args).
//
// How Call populates the fields:
//   - ID is a fresh uuid.NewString() per row.
//   - RunID, SkillID and ToolName are copied verbatim from the CallSpec.
//   - TierUsed is the trimmed tier, with "" replaced by "fast".
//   - ModelUsed is model.ResolveModelName(tier).
//   - InputTokens / OutputTokens are the usage of the round-trip; when
//     the JSON retry runs they are the sum over both attempts.
//   - DurationMs is wall-clock milliseconds measured from just before
//     the first round-trip. It is left at zero on the row written when
//     model resolution fails (no round-trip was attempted).
//   - Success is true for usable responses, including rows whose
//     ErrorKind is "malformed_json".
//   - CreatedAt is time.Now() at the moment the row is built.
type MetaCall struct {
	ID           string
	RunID        string
	SkillID      string
	ToolName     string
	TierUsed     string // "fast" / "standard"
	ModelUsed    string // resolved provider/model
	InputTokens  int
	OutputTokens int
	DurationMs   int
	Success      bool
	ErrorKind    string // empty on a clean success; otherwise one of the ErrorKind* sentinels (malformed_json rows still have Success=true)
	CreatedAt    time.Time
}

// Storage is the narrow surface the helper uses to persist meta-call
// ledger rows. Production wires a thin adapter around the skills GORM
// storage; tests substitute a fake.
//
// Why an interface (vs depending on pkg/logic/skills.Storage): the
// skills package imports skilltools (tool registry); having
// skilltools/llmmeta depend back on skills would form an import
// cycle. A narrow interface mirrored across the boundary is the
// project's standard cycle-break pattern (see KVStorage / FileStorage
// in pkg/skilltools/tools/).
type Storage interface {
	// RecordMetaCall persists one ledger row. The helper calls it once
	// per recorded outcome and discards the returned error, so an
	// implementation that needs failures to be visible must log them
	// itself.
	RecordMetaCall(ctx context.Context, call MetaCall) error
}

// ConvarReader is the narrow surface the helper uses to read
// `skills.llm_meta.allowed_tiers`. The convar package is database-
// backed; tests pass a static fake.
//
// Why an interface (vs reading convars directly): unit tests want to
// fake the allowlist without spinning up a convar manager.
type ConvarReader interface {
	// AllowedTiers returns the list of tier names a meta-tool may use.
	// Default ["fast"]. The helper treats an empty or nil result as
	// ["fast"], and compares entries to the requested tier
	// case-insensitively after trimming whitespace from each entry.
	AllowedTiers(ctx context.Context) []string
}

// ConvarReaderFunc lets a plain function serve as a ConvarReader. It
// suits production wiring (mort.go), where reading the allowlist is a
// one-liner that does not merit its own named type.
type ConvarReaderFunc func(ctx context.Context) []string

// AllowedTiers implements ConvarReader by invoking the wrapped
// function. A nil ConvarReaderFunc is safe to call and yields the
// default allowlist ["fast"].
func (f ConvarReaderFunc) AllowedTiers(ctx context.Context) []string {
	if f != nil {
		return f(ctx)
	}
	return []string{"fast"}
}

// Helper makes one fast-tier LLM call with surrounding bookkeeping
// (tier allowlist, JSON retry, ledger row). "fast" is the default
// tier; other tiers are usable only when the allowlist permits them.
//
// Construct once at boot; all four meta-tools share the same Helper.
type Helper struct {
	// storage receives the ledger rows. When nil, recordLedger skips
	// the write entirely.
	storage Storage
	// convars supplies the tier allowlist. When nil, tierAllowed falls
	// back to ["fast"].
	convars ConvarReader
}

// New builds a Helper from its two dependencies.
//
// storage is expected to be non-nil in production. A nil storage is
// tolerated: the helper then skips every ledger write, so no meta-call
// rows are persisted (callers that want a fully no-op helper should
// avoid registering the tool instead).
//
// convars may be nil — the helper then uses the default allowlist
// `["fast"]`.
//
// Why a constructor with explicit deps (vs a Helper{...} struct
// literal): it forces the decision about which dependencies are wired
// and which are left nil to be made once, at construction, rather than
// at the call site of each tool.
func New(storage Storage, convars ConvarReader) *Helper {
	h := new(Helper)
	h.storage = storage
	h.convars = convars
	return h
}

// CallSpec is the per-call input.
//
// Why every field is explicit (vs builder pattern): the four meta-tools
// each populate the spec in one place; a struct literal at the call
// site is more readable than chained setters.
type CallSpec struct {
	// Tier is the tier alias to use ("fast" / "standard"). Surrounding
	// whitespace is trimmed; empty falls back to "fast". The allowlist
	// comparison is case-insensitive. Disallowed tiers (per the convar
	// allowlist) cause Call to return CallResult{Success: false,
	// ErrorKind: "tier_not_allowed"} WITHOUT making the LLM call AND
	// without writing a ledger row (the call did not happen).
	Tier string

	// SystemPrompt is the system message. May be empty.
	SystemPrompt string

	// UserPrompt is the user message. Required: Call returns a non-nil
	// error when it is empty or whitespace-only.
	UserPrompt string

	// MaxOutputTokens caps the response. 0 (or any non-positive value)
	// disables the cap (provider default). When positive, the helper
	// passes it to the request via llm.WithMaxTokens.
	// NOTE(review): earlier docs also mention bounding a cost estimate;
	// no cost estimate is computed in this file — confirm where (or
	// whether) that happens.
	MaxOutputTokens int

	// ResponseFormat is "text" or "json" (matched case-insensitively).
	// When "json", the helper attempts to parse the response into
	// JSON. Other values fall through as "text".
	ResponseFormat string

	// RetryOnMalformedJSON, when true and ResponseFormat=="json",
	// retries the call ONCE with a stricter JSON-only prompt prefix
	// when the first response fails to parse. Second-failure returns
	// CallResult{Success: true, Parsed: nil, ErrorKind:
	// "malformed_json"} so callers can fall back to result.Text.
	RetryOnMalformedJSON bool

	// ToolName is the meta-tool name recorded in the ledger row
	// ("summarize", "translate", "extract_entities", "classify"). The
	// helper does not branch on this value.
	ToolName string

	// RunID is the calling skill run ID. Recorded in the ledger row.
	// NOTE(review): described as also feeding a cost-cap callback (the
	// running 7-day total); no such callback is visible in this file —
	// confirm against the caller.
	RunID string

	// SkillID is the calling skill ID. Recorded in the ledger row.
	// NOTE(review): described as also being passed to a cost-cap
	// callback that is not visible in this file — confirm.
	SkillID string

	// CallerID is the Discord member ID that triggered the parent
	// skill run, intended for a per-user 7-day cost cap.
	// NOTE(review): Call in this file never reads CallerID and it is
	// not part of the ledger row — confirm where it is consumed.
	CallerID string
}

// CallResult is the per-call output.
//
// Why text + parsed (vs only one): JSON-format calls expose both the
// raw response (in .Text) and the parsed map (in .Parsed). Text-format
// calls leave .Parsed nil. Callers requesting JSON that fails to parse
// twice get .Text populated and ErrorKind="malformed_json" so they
// can fall back to text-mode without an error path.
type CallResult struct {
	// Text is the raw response text from the LLM. Populated on every
	// successful call (success=true), including the malformed_json
	// outcomes (success=true, parsed=nil, error_kind="malformed_json").
	// When the JSON retry itself fails at the transport level, Text
	// carries the first (unparseable) response. Empty on
	// tier_not_allowed rejections (no LLM call happened) and when the
	// first call fails.
	Text string

	// Parsed is the JSON-decoded response. nil for text-format calls,
	// nil for failed JSON parses, populated for successful JSON
	// responses. The interior shape is whatever the LLM returned; the
	// caller is responsible for asserting a typed view.
	Parsed any

	// InputTokens is the tokens billed against the input. 0 when the
	// provider didn't surface usage. Summed over both attempts when
	// the JSON retry ran.
	InputTokens int

	// OutputTokens is the tokens billed against the output. 0 when the
	// provider didn't surface usage. Summed over both attempts when
	// the JSON retry ran.
	OutputTokens int

	// DurationMs is wall-clock duration of the LLM call (or call+retry
	// in the JSON-retry case), in milliseconds.
	DurationMs int

	// ModelUsed is the resolved provider/model string ("anthropic/
	// claude-haiku-4-5-20251001"). Populated on every actual LLM call;
	// empty on tier_not_allowed rejections.
	ModelUsed string

	// Success reports whether the LLM call returned a usable response.
	// True on happy-path AND on malformed_json outcomes (the caller
	// can fall back to .Text). False on transport errors,
	// tier_not_allowed, llm_unavailable.
	Success bool

	// ErrorKind, when non-empty, is one of:
	//   - "tier_not_allowed" → no LLM call. No ledger row when the
	//     allowlist rejected the tier; a failure row IS written when
	//     the tier was allowed but its model could not be resolved.
	//   - "llm_unavailable"  → call attempted, ledger row written
	//   - "malformed_json"   → call succeeded but JSON parse failed
	ErrorKind string
}

// Sentinel error_kind values for CallResult.ErrorKind (the same
// strings are stored in MetaCall.ErrorKind).
//
//   - ErrorKindTierNotAllowed: the tier was rejected by the allowlist,
//     or it was allowed but its model could not be resolved.
//   - ErrorKindLLMUnavailable: the LLM round-trip (first call or JSON
//     retry) returned an error.
//   - ErrorKindMalformedJSON: JSON was requested but the response did
//     not parse; the result still has Success=true and Text set.
const (
	ErrorKindTierNotAllowed = "tier_not_allowed"
	ErrorKindLLMUnavailable = "llm_unavailable"
	ErrorKindMalformedJSON  = "malformed_json"
)

// Call performs the meta-LLM call and returns a typed CallResult.
//
// Flow:
//  1. A blank (empty or whitespace-only) UserPrompt is rejected with a
//     non-nil error — the only case in which the error return is set.
//  2. The tier is trimmed and defaults to "fast". A tier the allowlist
//     rejects returns tier_not_allowed with no LLM call and no ledger
//     row.
//  3. The tier is resolved to a model. A resolution failure is also
//     reported as tier_not_allowed, but a failure row IS recorded.
//  4. The first round-trip runs. A transport error yields
//     llm_unavailable.
//  5. Text mode, or JSON mode that parses, is the happy path.
//  6. JSON mode that fails to parse returns malformed_json
//     (Success=true) unless RetryOnMalformedJSON is set, in which case
//     one retry with a stricter prompt runs and its outcome (parsed,
//     malformed_json, or llm_unavailable) is returned with token
//     counts summed over both attempts.
//
// Every outcome from step 4 onward writes exactly one ledger row whose
// fields match the returned CallResult.
//
// Why no error return for LLM failures (vs an error second value):
// every meaningful failure is captured as a CallResult.ErrorKind so
// the caller's branch logic stays single-pathed. Internal transport
// errors are surfaced as ErrorKind=llm_unavailable. The function only
// returns a non-nil error for argument-validation failures (empty
// UserPrompt) — a programmer error the caller would have to fix
// anyway.
//
// Test: helper_test.go covers all outcomes (tier_not_allowed, happy
// text, happy json, malformed_json retry-pass, malformed_json
// retry-fail, llm_unavailable).
func (h *Helper) Call(ctx context.Context, spec CallSpec) (CallResult, error) {
	if strings.TrimSpace(spec.UserPrompt) == "" {
		return CallResult{}, fmt.Errorf("llmmeta: user_prompt required")
	}
	tier := strings.TrimSpace(spec.Tier)
	if tier == "" {
		tier = "fast"
	}

	// Tier allowlist: rejected tiers do NOT make the call AND do NOT
	// record a ledger row.
	if !h.tierAllowed(ctx, tier) {
		return CallResult{
			Success:   false,
			ErrorKind: ErrorKindTierNotAllowed,
		}, nil
	}

	resolvedModel := model.ResolveModelName(tier)

	// Resolve model. ParseModelForContext attaches the resolved model
	// name to the returned context (for usage attribution) AND returns
	// the llm.Model whose Generate already routes through the lane
	// wrapper. The results get their own names so the imported `model`
	// package is not shadowed for the rest of the function, and so the
	// returned context is only adopted once resolution has succeeded.
	modelCtx, llmModel, err := model.ParseModelForContext(ctx, tier)
	if err != nil {
		// Tier convar mis-set: surface as tier_not_allowed to the
		// caller (the agent's recovery path is the same as for an
		// admin-disabled tier) but DO record the failure for the
		// admin who needs to fix the convar. The row is written with
		// the caller's ctx, which is known to be valid, rather than
		// whatever context accompanies the error.
		h.recordLedger(ctx, MetaCall{
			ID:        uuid.NewString(),
			RunID:     spec.RunID,
			SkillID:   spec.SkillID,
			ToolName:  spec.ToolName,
			TierUsed:  tier,
			ModelUsed: resolvedModel,
			Success:   false,
			ErrorKind: ErrorKindTierNotAllowed,
			CreatedAt: time.Now(),
		})
		return CallResult{
			Success:   false,
			ErrorKind: ErrorKindTierNotAllowed,
		}, nil
	}
	ctx = modelCtx

	start := time.Now()

	// finish records the ledger row for an attempted call and builds
	// the matching CallResult from the same values, so the audit row
	// and the value handed to the caller cannot drift apart. Duration
	// is measured from just before the first round-trip up to the
	// moment the outcome is finalized.
	finish := func(text string, parsed any, usage Tokens, success bool, errorKind string) (CallResult, error) {
		durationMs := int(time.Since(start) / time.Millisecond)
		h.recordLedger(ctx, MetaCall{
			ID:           uuid.NewString(),
			RunID:        spec.RunID,
			SkillID:      spec.SkillID,
			ToolName:     spec.ToolName,
			TierUsed:     tier,
			ModelUsed:    resolvedModel,
			InputTokens:  usage.InputTokens,
			OutputTokens: usage.OutputTokens,
			DurationMs:   durationMs,
			Success:      success,
			ErrorKind:    errorKind,
			CreatedAt:    time.Now(),
		})
		return CallResult{
			Text:         text,
			Parsed:       parsed,
			InputTokens:  usage.InputTokens,
			OutputTokens: usage.OutputTokens,
			DurationMs:   durationMs,
			ModelUsed:    resolvedModel,
			Success:      success,
			ErrorKind:    errorKind,
		}, nil
	}

	var opts []llm.Option
	if spec.MaxOutputTokens > 0 {
		opts = append(opts, llm.WithMaxTokens(spec.MaxOutputTokens))
	}

	// First call.
	text, usage, llmErr := h.complete(ctx, llmModel, spec.SystemPrompt, spec.UserPrompt, opts)
	if llmErr != nil {
		// No text is surfaced for a failed first call.
		return finish("", nil, usage, false, ErrorKindLLMUnavailable)
	}

	// Determine outcome based on response format. tryParseJSON returns
	// (nil, false) for non-JSON formats, so text mode leaves Parsed nil.
	wantJSON := strings.EqualFold(spec.ResponseFormat, "json")
	parsed, parsedOK := tryParseJSON(text, spec.ResponseFormat)
	if !wantJSON || parsedOK {
		// Happy path (text mode OR JSON mode that parsed first try).
		return finish(text, parsed, usage, true, "")
	}

	// JSON requested but first response failed to parse.
	if !spec.RetryOnMalformedJSON {
		return finish(text, nil, usage, true, ErrorKindMalformedJSON)
	}

	// Retry once with stricter JSON-only prompt prefix.
	stricterPrompt := "Return ONLY valid JSON. No prose, no markdown fencing.\n\n" + spec.UserPrompt
	retryText, retryUsage, retryErr := h.complete(ctx, llmModel, spec.SystemPrompt, stricterPrompt, opts)
	combinedUsage := Tokens{
		InputTokens:  usage.InputTokens + retryUsage.InputTokens,
		OutputTokens: usage.OutputTokens + retryUsage.OutputTokens,
	}
	if retryErr != nil {
		// Retry call itself failed transport-wise. Record the round-
		// trip tokens and surface llm_unavailable; Text carries the
		// first (unparseable) response.
		return finish(text, nil, combinedUsage, false, ErrorKindLLMUnavailable)
	}

	if retryParsed, ok := tryParseJSON(retryText, "json"); ok {
		return finish(retryText, retryParsed, combinedUsage, true, "")
	}

	// Second-failure path. Caller can fall back to result.Text.
	return finish(retryText, nil, combinedUsage, true, ErrorKindMalformedJSON)
}

// Tokens is the input/output token count returned by the LLM round-
// trip. Mirrors llm.Usage's two cost-bearing fields (complete fills it
// from resp.Usage.InputTokens and resp.Usage.OutputTokens). Exported
// so downstream test code (the four meta-tools' tests, integration
// tests) can use SetCompleteForTest.
type Tokens struct {
	InputTokens  int
	OutputTokens int
}

// CompleteFn is the seam used by tests to fake the LLM round-trip
// without spinning up a real provider. Exported for tests in other
// packages (the four meta-tools live in pkg/skilltools/tools/).
//
// It receives exactly the arguments Helper.complete was called with
// and returns the response text, the token usage, and an error. A
// non-nil error makes Call report llm_unavailable.
type CompleteFn func(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error)

// completeOverride is set in tests via SetCompleteForTest. nil falls
// back to the real model.Generate path.
//
// NOTE(review): this is a plain package-level variable with no
// synchronization visible in this file — tests that install an
// override presumably must not run in parallel with other callers of
// Call; confirm.
var completeOverride CompleteFn

// complete runs a single LLM round-trip and reports the response
// text, the token usage, and any error.
//
// When a test override is installed (completeOverride non-nil) the
// call is delegated to it with the arguments unchanged. Otherwise a
// one-message request (system prompt plus a single user message) is
// sent through model.Generate, which already routes through the lane
// transport wrapper. On a Generate error the text is empty and the
// usage is zero.
//
// Why not call model.SimpleCall: SimpleCall doesn't surface Usage; we
// need the input/output token counts for the ledger row.
//
// Usage attribution to the per-user / per-skill dashboards is handled
// by the instrumented model that model.ParseModelForContext returns —
// a manual model.RecordUsage here would double-count.
func (h *Helper) complete(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) {
	if override := completeOverride; override != nil {
		return override(ctx, model, systemPrompt, userMessage, opts)
	}

	resp, err := model.Generate(ctx, llm.Request{
		System:   systemPrompt,
		Messages: []llm.Message{llm.UserText(userMessage)},
	}, opts...)
	if err != nil {
		return "", Tokens{}, err
	}

	var spent Tokens
	spent.InputTokens = resp.Usage.InputTokens
	spent.OutputTokens = resp.Usage.OutputTokens
	return resp.Text(), spent, nil
}

// SetCompleteForTest swaps in a fake completer for Call to use and
// hands back a restore function; tests typically defer that function
// to put the previous override (possibly nil) back.
//
// Why exported (vs in a _test.go file): the four meta-tools' tests live
// in pkg/skilltools/tools/, in a different package than the helper.
// They need a way to fake the LLM without depending on a real model.
func SetCompleteForTest(fn CompleteFn) func() {
	saved := completeOverride
	completeOverride = fn
	restore := func() {
		completeOverride = saved
	}
	return restore
}

// tierAllowed answers whether tier is on the configured allowlist.
//
// The allowlist comes from h.convars when one is wired; a nil reader
// or an empty list both fall back to ["fast"]. Each entry is trimmed
// of surrounding whitespace and compared to tier case-insensitively.
func (h *Helper) tierAllowed(ctx context.Context, tier string) bool {
	allowlist := []string{"fast"}
	if h.convars != nil {
		if configured := h.convars.AllowedTiers(ctx); len(configured) != 0 {
			allowlist = configured
		}
	}

	for i := range allowlist {
		candidate := strings.TrimSpace(allowlist[i])
		if strings.EqualFold(candidate, tier) {
			return true
		}
	}
	return false
}

// recordLedger persists a single meta-call row when a storage backend
// is wired, and does nothing when h.storage is nil.
//
// The write is best-effort: any error from the storage layer is
// dropped here (logging is left to that layer) because meta-call
// accounting MUST NOT break user-visible execution.
func (h *Helper) recordLedger(ctx context.Context, call MetaCall) {
	if store := h.storage; store != nil {
		// Deliberately ignored — see the doc comment above.
		_ = store.RecordMetaCall(ctx, call)
	}
}

// tryParseJSON attempts to decode text as JSON. Returns the parsed
// value (any) and ok=true on success. ok=false on failure or when
// format is not "json" (compared case-insensitively).
//
// Why we accept arbitrary JSON shapes (vs requiring an object): the
// extract_entities tool returns objects, but classify returns objects
// with arrays inside. Accepting `any` keeps the helper agnostic to the
// caller's downstream typing.
//
// Tolerance: when the trimmed text starts with a ``` code fence, the
// fence is stripped before parsing so the agent can wrap its answer in
// markdown without breaking the parse:
//   - multi-line form: the whole first line (fence plus any language
//     tag) is dropped;
//   - single-line form (no newline anywhere): the leading ``` and an
//     optional "json" tag are dropped, so "```json {...} ```" parses
//     instead of forcing a retry;
//   - in both forms everything from the last ``` onward is dropped.
//
// The stricter retry prompt explicitly asks for no fence; this
// tolerance mainly serves the first-attempt path.
func tryParseJSON(text, format string) (any, bool) {
	if !strings.EqualFold(format, "json") {
		return nil, false
	}

	const fence = "```"
	payload := strings.TrimSpace(text)
	if strings.HasPrefix(payload, fence) {
		if _, rest, multiline := strings.Cut(payload, "\n"); multiline {
			// Drop the opening fence line (with or without language tag).
			payload = rest
		} else {
			// Single-line fence: there is no line to drop, so peel the
			// backticks and an optional "json" tag off the front. Valid
			// JSON never starts with the letters "json", so this cannot
			// eat real payload.
			payload = payload[len(fence):]
			const tag = "json"
			if len(payload) >= len(tag) && strings.EqualFold(payload[:len(tag)], tag) {
				payload = payload[len(tag):]
			}
		}
		// Drop the closing fence and anything after it.
		if end := strings.LastIndex(payload, fence); end >= 0 {
			payload = payload[:end]
		}
		payload = strings.TrimSpace(payload)
	}

	var parsed any
	if err := json.Unmarshal([]byte(payload), &parsed); err != nil {
		return nil, false
	}
	return parsed, true
}