P3: meta + primitive tool group (think/now/cite + classify/extract/summarize)

Grow executus/tools into a real generic tool library:

- Register(reg): the always-available, zero-config tools — think, now (UTC unless a CurrentTimeProvider is wired), cite (inert unless a CitationStorage is wired). All nil-safe; a light host calls Register and is useful.
- RegisterMeta(reg, MetaDeps): the LLM-backed meta tools — classify, extract_entities, summarize — over the llmmeta helper. Budget defaults to the shipped in-memory per-run cap; Files optional; caps default.
- Seams moved (interface/type-only, no host coupling): research_providers.go (CurrentTimeProvider/CitationStorage/SearchBudget/PageExtractor/PDFFetcher/…) and file_storage.go (FileStorage + FileDomainMeta). Plus the in-memory budget default (research_defaults.go) and scope_validate.go.

calculate deferred (drags github.com/Krognol/go-wolfram + a module-path replace — not worth it in the lean core for one tool). Core go.sum still free of gorm/redis/discordgo/sqlite/wolfram.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 21:00:45 -04:00
parent df95425bb5
commit 1e201550b3
11 changed files with 1802 additions and 17 deletions
@@ -0,0 +1,319 @@
+// Package tools — v12 classify.
+//
+// Classification primitive: text + categories → labels + per-category
+// scores. Single-label mode (default) returns the top-1 category;
+// multi-label mode returns every category whose score crosses the
+// threshold.
+//
+// Why a dedicated tool (vs reusing extract_entities for one-of-N
+// classification): classification has a typed result (labels[] +
+// scores{}) that downstream agents consume programmatically. Folding
+// it into extract_entities would force every author to re-spec the
+// scoring schema.
+//
+// Score normalisation: each score in the LLM's reply is clamped into
+// [0, 1]; categories missing from the reply score 0. Scores for ALL
+// categories are returned in both modes so the author can read the
+// distribution; multi-label mode returns labels[] of every category
+// scoring at or above 0.5.
+//
+// Test: classify_test.go covers single-label, multi-label, score
+// normalisation, > 20 categories rejected, unknown category in the
+// reply silently dropped.
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"sync"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/llmmeta"
+	"gitea.stevedudenhoeffer.com/steve/executus/tool"
+)
+
+// Limits and defaults for the classify tool.
+const (
+	// classifyMaxInputBytes caps the input text at 16 KiB; longer
+	// text is truncated before it is sent to the model.
+	classifyMaxInputBytes = 16 * 1024
+
+	// classifyMaxCategories is the largest number of categories a
+	// single call may supply.
+	classifyMaxCategories = 20
+
+	// classifyMultiLabelThreshold is the score a category must reach
+	// (at or above) to appear in labels[] in multi-label mode.
+	classifyMultiLabelThreshold = 0.5
+
+	// classifyFallbackMaxPerRun is the per-run call cap used when no
+	// ClassifyConfig is supplied.
+	classifyFallbackMaxPerRun = 20
+)
+
+// ClassifyConfig is the narrow per-deployment config surface for the
+// classify tool. NewClassify consults it only when no SearchBudget is
+// supplied and it has to build its own in-memory budget; a nil
+// ClassifyConfig falls back to classifyFallbackMaxPerRun.
+type ClassifyConfig interface {
+	// MaxPerRun returns the cap used for the "classify" entry of that
+	// in-memory budget.
+	MaxPerRun(ctx context.Context) int
+}
+
+// classifyArgs is the LLM-facing param struct.
+//
+//   - Text must contain something other than whitespace; text longer
+//     than classifyMaxInputBytes is truncated rather than rejected.
+//   - Categories must be non-empty and hold at most
+//     classifyMaxCategories entries (counted before de-duplication).
+//     Entries are trimmed, and blank or repeated entries are dropped
+//     before the prompt is built.
+//   - MultiLabel switches from the default top-1 result to "every
+//     category at or above classifyMultiLabelThreshold".
+//
+// NOTE(review): the description tags are presumably surfaced to the
+// model by the tool package — confirm there before rewording them.
+type classifyArgs struct {
+	Text       string   `json:"text" description:"The text to classify. Required. Capped at 16KB."`
+	Categories []string `json:"categories" description:"List of categories to score the text against. Required. Max 20."`
+	MultiLabel bool     `json:"multi_label,omitempty" description:"When true, returns every category scoring above 0.5. Default false → single-label (top-1) result."`
+}
+
+// classifyResult is the JSON payload the classify tool returns, for
+// successes and for soft failures alike. Empty fields are omitted.
+//
+//   - Labels: the selected categories (one in single-label mode, zero
+//     or more in multi-label mode).
+//   - Scores: clamped score for every requested category.
+//   - ModelUsed: the model name reported by the llmmeta helper.
+//   - RawReply: the model's raw text, set only when the reply could
+//     not be used as a JSON object.
+//   - Error: short machine-readable failure kind; empty on success.
+//   - BudgetMsg: human-readable explanation set when the per-run
+//     budget is exceeded.
+type classifyResult struct {
+	Labels    []string           `json:"labels,omitempty"`
+	Scores    map[string]float64 `json:"scores,omitempty"`
+	ModelUsed string             `json:"model_used,omitempty"`
+	RawReply  string             `json:"raw_reply,omitempty"`
+	Error     string             `json:"error,omitempty"`
+	BudgetMsg string             `json:"budget_message,omitempty"`
+}
+
+// NewClassify constructs the classify tool.
+//
+// helper performs the LLM call; when it is nil the tool is still
+// constructed but every invocation returns a "not configured" error.
+// budget enforces the per-run call cap. When budget is nil an
+// in-memory budget is built on first use, capped at cfg.MaxPerRun (or
+// classifyFallbackMaxPerRun when cfg is also nil), and reused for the
+// lifetime of the returned tool.
+//
+// Validation failures, budget exhaustion and unusable model replies
+// are reported inside the JSON result ("error" field) with a nil Go
+// error; only a missing helper or a helper.Call error is returned as
+// a Go error.
+func NewClassify(helper *llmmeta.Helper, cfg ClassifyConfig, budget SearchBudget) tool.Tool {
+	// Lazily-built fallback budget. sync.Once keeps the first-use
+	// initialisation safe when invocations run concurrently; writing
+	// to the captured budget parameter from inside the handler would
+	// be a data race.
+	var (
+		fallbackOnce   sync.Once
+		fallbackBudget SearchBudget
+	)
+	return tool.NewGatedTool[classifyArgs](
+		"classify",
+		"Classify text into one of N categories (or multiple via multi_label=true). Returns labels[] (top-1 by default) + scores{category: 0..1}. Counts against per-run and 7-day cost budgets.",
+		tool.Permission{
+			AuthoringRequirement: tool.RequirementAnyone,
+			OperatesOn:           tool.ScopeCaller,
+			SafeForShare:         true,
+			Categories:           []string{"llm-meta", "cost-bearing"},
+		},
+		func(ctx context.Context, inv tool.Invocation, args classifyArgs) (string, error) {
+			if helper == nil {
+				return "", fmt.Errorf("classify: not configured")
+			}
+			text := args.Text
+			if strings.TrimSpace(text) == "" {
+				return marshalClassifyResult(classifyResult{Error: "text is empty"}), nil
+			}
+			if len(args.Categories) == 0 {
+				return marshalClassifyResult(classifyResult{Error: "categories is empty"}), nil
+			}
+			if len(args.Categories) > classifyMaxCategories {
+				return marshalClassifyResult(classifyResult{
+					Error: fmt.Sprintf("too many categories (%d > %d)", len(args.Categories), classifyMaxCategories),
+				}), nil
+			}
+			// Trim + dedupe categories so the LLM sees a clean
+			// schema. Order is preserved for the prompt; the result
+			// map is order-agnostic.
+			categories := make([]string, 0, len(args.Categories))
+			seen := make(map[string]bool, len(args.Categories))
+			for _, c := range args.Categories {
+				c = strings.TrimSpace(c)
+				if c == "" || seen[c] {
+					continue
+				}
+				seen[c] = true
+				categories = append(categories, c)
+			}
+			if len(categories) == 0 {
+				return marshalClassifyResult(classifyResult{Error: "categories has no non-empty entries"}), nil
+			}
+
+			// Cap the input without splitting a multi-byte character.
+			text = truncateClassifyText(text, classifyMaxInputBytes)
+
+			// Per-run budget gate. The caller-supplied budget wins;
+			// otherwise fall back to the shared in-memory one.
+			runBudget := budget
+			if runBudget == nil {
+				fallbackOnce.Do(func() {
+					perRun := classifyFallbackMaxPerRun
+					if cfg != nil {
+						perRun = cfg.MaxPerRun(ctx)
+					}
+					fallbackBudget = NewInMemorySearchBudget(map[string]int{
+						"classify": perRun,
+					})
+				})
+				runBudget = fallbackBudget
+			}
+			used, limit, exceeded := runBudget.CheckAndIncrement(ctx, inv.RunID, "classify")
+			if exceeded {
+				return marshalClassifyResult(classifyResult{
+					Error:     "classify_budget_exceeded",
+					BudgetMsg: fmt.Sprintf("per-run classify budget exceeded (%d/%d). Ask an admin to raise skills.classify.max_per_run.", used, limit),
+				}), nil
+			}
+
+			systemPrompt := "You classify text into a fixed set of categories. Return ONLY JSON. Score each category in [0,1] (1 = perfect fit). Sum of all scores does NOT need to be 1 — high overlap across categories is allowed."
+			userPrompt := buildClassifyPrompt(text, categories, args.MultiLabel)
+
+			res, callErr := helper.Call(ctx, llmmeta.CallSpec{
+				Tier:                 "fast",
+				SystemPrompt:         systemPrompt,
+				UserPrompt:           userPrompt,
+				MaxOutputTokens:      2048,
+				ResponseFormat:       "json",
+				RetryOnMalformedJSON: true,
+				ToolName:             "classify",
+				RunID:                inv.RunID,
+				SkillID:              inv.SkillID,
+				CallerID:             inv.CallerID,
+			})
+			if callErr != nil {
+				return "", callErr
+			}
+			if !res.Success {
+				kind := res.ErrorKind
+				if kind == "" {
+					kind = "llm_unavailable"
+				}
+				return marshalClassifyResult(classifyResult{Error: kind}), nil
+			}
+			if res.ErrorKind == llmmeta.ErrorKindMalformedJSON || res.Parsed == nil {
+				return marshalClassifyResult(classifyResult{
+					Error:     "classification_failed",
+					RawReply:  res.Text,
+					ModelUsed: res.ModelUsed,
+				}), nil
+			}
+
+			parsedMap, ok := res.Parsed.(map[string]any)
+			if !ok {
+				return marshalClassifyResult(classifyResult{
+					Error:     "classification_failed_not_object",
+					RawReply:  res.Text,
+					ModelUsed: res.ModelUsed,
+				}), nil
+			}
+
+			scores := normaliseClassifyScores(parsedMap, categories)
+			labels := selectClassifyLabels(scores, categories, args.MultiLabel)
+
+			return marshalClassifyResult(classifyResult{
+				Labels:    labels,
+				Scores:    scores,
+				ModelUsed: res.ModelUsed,
+			}), nil
+		},
+	)
+}
+
+// truncateClassifyText returns text cut to at most limit bytes. The
+// cut point is moved back (by at most three bytes) off UTF-8
+// continuation bytes so a multi-byte character is dropped whole
+// instead of being split, which would hand the model invalid UTF-8.
+func truncateClassifyText(text string, limit int) string {
+	if limit < 0 {
+		limit = 0
+	}
+	if len(text) <= limit {
+		return text
+	}
+	cut := limit
+	// 0b10xxxxxx marks a continuation byte; a character has at most
+	// three of them, so the walk-back is bounded even on junk input.
+	for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
+		cut--
+	}
+	return text[:cut]
+}
+
+// buildClassifyPrompt renders the user message: the category list
+// (one "- name" line each, in the given order), the text to classify,
+// the required JSON reply shape, and a closing hint that depends on
+// whether multi-label mode is on.
+func buildClassifyPrompt(text string, categories []string, multiLabel bool) string {
+	// Mode-specific hint appended after the format instruction.
+	hint := " Score each category; the highest-scoring one will be the chosen label."
+	if multiLabel {
+		hint = " The same text may score high in MULTIPLE categories — score each independently."
+	}
+
+	var prompt strings.Builder
+	prompt.WriteString("Classify the text below.\n\nCategories:\n")
+	for i := range categories {
+		prompt.WriteString("- " + categories[i] + "\n")
+	}
+	prompt.WriteString("\nText:\n" + text)
+	prompt.WriteString("\n\nReturn ONLY a JSON object: {\"scores\": {\"<category>\": <0..1 float>, ...}}.")
+	prompt.WriteString(hint)
+	return prompt.String()
+}
+
+// normaliseClassifyScores builds the score map for the requested
+// categories from the model's parsed reply. Every requested category
+// gets an entry: the reply's value clamped into [0, 1], or 0 when the
+// category is missing or its value cannot be read as a number. Keys
+// in the reply that were not requested are ignored.
+//
+// Both {"scores": {...}} and the bare {...} shape are accepted,
+// because some models drop the wrapping key.
+func normaliseClassifyScores(parsed map[string]any, categories []string) map[string]float64 {
+	source := parsed
+	if wrapped, isMap := parsed["scores"].(map[string]any); isMap {
+		source = wrapped
+	}
+
+	result := make(map[string]float64, len(categories))
+	for _, name := range categories {
+		// Start at 0 so a missing or unreadable score needs no
+		// further handling below.
+		result[name] = 0
+		raw, present := source[name]
+		if !present {
+			continue
+		}
+		score, readable := coerceClassifyScore(raw)
+		if !readable {
+			continue
+		}
+		switch {
+		case score < 0:
+			score = 0
+		case score > 1:
+			score = 1
+		}
+		result[name] = score
+	}
+	return result
+}
+
+// coerceClassifyScore reads a decoded JSON value as a float. It
+// accepts float64, int, int64, and numeric strings; a string ending
+// in "%" is treated as a percentage ("85%" → 0.85). The result is NOT
+// clamped — the caller does that.
+//
+// It returns (0, false) for any other type, for strings that do not
+// start with a number, and for NaN. NaN has to be rejected here: it
+// passes both clamp comparisons, and a NaN left in the scores map
+// makes json.Marshal fail for the whole result.
+func coerceClassifyScore(raw any) (float64, bool) {
+	var f float64
+	switch v := raw.(type) {
+	case float64:
+		f = v
+	case int:
+		f = float64(v)
+	case int64:
+		f = float64(v)
+	case string:
+		trimmed := strings.TrimSpace(v)
+		number := strings.TrimSuffix(trimmed, "%")
+		if _, err := fmt.Sscanf(number, "%f", &f); err != nil {
+			return 0, false
+		}
+		// TrimSuffix changed the string only if a "%" was present.
+		if number != trimmed {
+			f /= 100.0
+		}
+	default:
+		return 0, false
+	}
+	// NaN is the only value that is not equal to itself; Sscanf's %f
+	// verb accepts the literal "NaN".
+	if f != f {
+		return 0, false
+	}
+	return f, true
+}
+
+// selectClassifyLabels decides which categories are surfaced as
+// labels.
+//
+// Single-label mode returns the one category with the highest score;
+// on a tie the category listed first wins. It returns nil only when
+// no category beats the -1 starting score (for example, when
+// categories is empty).
+//
+// Multi-label mode returns every category whose score is at or above
+// classifyMultiLabelThreshold, ordered by score descending with ties
+// kept in category-list order. The result is nil when none qualify.
+func selectClassifyLabels(scores map[string]float64, categories []string, multiLabel bool) []string {
+	if !multiLabel {
+		winner, top := "", -1.0
+		for _, name := range categories {
+			if s := scores[name]; s > top {
+				winner, top = name, s
+			}
+		}
+		if winner == "" {
+			return nil
+		}
+		return []string{winner}
+	}
+
+	var picked []string
+	for _, name := range categories {
+		if scores[name] >= classifyMultiLabelThreshold {
+			picked = append(picked, name)
+		}
+	}
+	sortClassifyLabelsByScore(picked, scores)
+	return picked
+}
+
+// sortClassifyLabelsByScore reorders labels in place so that scores
+// run from highest to lowest. It is an insertion sort that moves an
+// element only past strictly lower-scoring neighbours, so labels with
+// equal scores keep their incoming relative order.
+func sortClassifyLabelsByScore(labels []string, scores map[string]float64) {
+	for next := 1; next < len(labels); next++ {
+		moving := labels[next]
+		key := scores[moving]
+		slot := next
+		// Shift lower-scoring labels one place right to open a gap.
+		for ; slot > 0 && key > scores[labels[slot-1]]; slot-- {
+			labels[slot] = labels[slot-1]
+		}
+		labels[slot] = moving
+	}
+}
+
+// marshalClassifyResult encodes r as the JSON string the tool
+// returns. If encoding fails, it returns a small hand-built JSON
+// object whose "error" field carries the marshal error, so the caller
+// always gets a string back.
+func marshalClassifyResult(r classifyResult) string {
+	encoded, err := json.Marshal(r)
+	if err == nil {
+		return string(encoded)
+	}
+	return fmt.Sprintf(`{"error":"marshal_failed: %v"}`, err)
+}