// Package tools — v12 classify. // // Classification primitive: text + categories → labels + per-category // scores. Single-label mode (default) returns the top-1 category; // multi-label mode returns every category whose score crosses the // threshold. // // Why a dedicated tool (vs reusing extract_entities for one-of-N // classification): classification has a typed result (labels[] + // scores{}) that downstream agents consume programmatically. Folding // it into extract_entities would force every author to re-spec the // scoring schema. // // Score normalisation: the LLM's reply is normalised so each score // lands in [0, 1]. The single-label result returns scores for ALL // categories so the author can read the distribution; multi-label // returns labels[] of categories above 0.5. // // Test: classify_test.go covers single-label, multi-label, score // normalisation, > 20 categories rejected, unknown category in the // reply silently dropped. package tools import ( "context" "encoding/json" "fmt" "strconv" "strings" "gitea.stevedudenhoeffer.com/steve/executus/llmmeta" "gitea.stevedudenhoeffer.com/steve/executus/tool" ) // classifyMaxInputBytes is the input cap. const classifyMaxInputBytes = 16 * 1024 // classifyMaxCategories is the hard cap on category count. const classifyMaxCategories = 20 // classifyMultiLabelThreshold is the score threshold above which a // category appears in the labels[] array in multi-label mode. const classifyMultiLabelThreshold = 0.5 // classifyFallbackMaxPerRun is the per-run cap when ClassifyConfig is // nil. const classifyFallbackMaxPerRun = 20 // ClassifyConfig is the narrow per-deployment config surface. type ClassifyConfig interface { MaxPerRun(ctx context.Context) int } // classifyArgs is the LLM-facing param struct. type classifyArgs struct { Text string `json:"text" description:"The text to classify. Required. Capped at 16KB."` Categories []string `json:"categories" description:"List of categories to score the text against. Required. Max 20."` MultiLabel bool `json:"multi_label,omitempty" description:"When true, returns every category scoring above 0.5. Default false → single-label (top-1) result."` } type classifyResult struct { Labels []string `json:"labels,omitempty"` Scores map[string]float64 `json:"scores,omitempty"` ModelUsed string `json:"model_used,omitempty"` RawReply string `json:"raw_reply,omitempty"` Error string `json:"error,omitempty"` BudgetMsg string `json:"budget_message,omitempty"` } // NewClassify constructs the classify tool. func NewClassify(helper *llmmeta.Helper, cfg ClassifyConfig, budget SearchBudget) tool.Tool { return tool.NewGatedTool[classifyArgs]( "classify", "Classify text into one of N categories (or multiple via multi_label=true). Returns labels[] (top-1 by default) + scores{category: 0..1}. Counts against per-run and 7-day cost budgets.", tool.Permission{ AuthoringRequirement: tool.RequirementAnyone, OperatesOn: tool.ScopeCaller, SafeForShare: true, Categories: []string{"llm-meta", "cost-bearing"}, }, func(ctx context.Context, inv tool.Invocation, args classifyArgs) (string, error) { if helper == nil { return "", fmt.Errorf("classify: not configured") } text := args.Text if strings.TrimSpace(text) == "" { return marshalClassifyResult(classifyResult{Error: "text is empty"}), nil } if len(args.Categories) == 0 { return marshalClassifyResult(classifyResult{Error: "categories is empty"}), nil } if len(args.Categories) > classifyMaxCategories { return marshalClassifyResult(classifyResult{ Error: fmt.Sprintf("too many categories (%d > %d)", len(args.Categories), classifyMaxCategories), }), nil } // Trim + dedupe categories so the LLM sees a clean // schema. Order is preserved for the prompt; the result // map is order-agnostic. categories := make([]string, 0, len(args.Categories)) seen := make(map[string]bool, len(args.Categories)) for _, c := range args.Categories { c = strings.TrimSpace(c) if c == "" || seen[c] { continue } seen[c] = true categories = append(categories, c) } if len(categories) == 0 { return marshalClassifyResult(classifyResult{Error: "categories has no non-empty entries"}), nil } if len(text) > classifyMaxInputBytes { text = truncateUTF8(text, classifyMaxInputBytes) } // Per-run budget gate. if budget == nil { maxPerRun := classifyFallbackMaxPerRun if cfg != nil { maxPerRun = cfg.MaxPerRun(ctx) } budget = NewInMemorySearchBudget(map[string]int{ "classify": maxPerRun, }) } count, max, exceeded := budget.CheckAndIncrement(ctx, inv.RunID, "classify") if exceeded { return marshalClassifyResult(classifyResult{ Error: "classify_budget_exceeded", BudgetMsg: fmt.Sprintf("per-run classify budget exceeded (%d/%d). Ask an admin to raise skills.classify.max_per_run.", count, max), }), nil } systemPrompt := "You classify text into a fixed set of categories. Return ONLY JSON. Score each category in [0,1] (1 = perfect fit). Sum of all scores does NOT need to be 1 — high overlap across categories is allowed." userPrompt := buildClassifyPrompt(text, categories, args.MultiLabel) res, callErr := helper.Call(ctx, llmmeta.CallSpec{ Tier: "fast", SystemPrompt: systemPrompt, UserPrompt: userPrompt, MaxOutputTokens: 2048, ResponseFormat: "json", RetryOnMalformedJSON: true, ToolName: "classify", RunID: inv.RunID, SkillID: inv.SkillID, CallerID: inv.CallerID, }) if callErr != nil { return "", callErr } if !res.Success { kind := res.ErrorKind if kind == "" { kind = "llm_unavailable" } return marshalClassifyResult(classifyResult{Error: kind}), nil } if res.ErrorKind == llmmeta.ErrorKindMalformedJSON || res.Parsed == nil { return marshalClassifyResult(classifyResult{ Error: "classification_failed", RawReply: res.Text, ModelUsed: res.ModelUsed, }), nil } parsedMap, ok := res.Parsed.(map[string]any) if !ok { return marshalClassifyResult(classifyResult{ Error: "classification_failed_not_object", RawReply: res.Text, ModelUsed: res.ModelUsed, }), nil } scores := normaliseClassifyScores(parsedMap, categories) labels := selectClassifyLabels(scores, categories, args.MultiLabel) return marshalClassifyResult(classifyResult{ Labels: labels, Scores: scores, ModelUsed: res.ModelUsed, }), nil }, ) } // buildClassifyPrompt composes the user message. func buildClassifyPrompt(text string, categories []string, multiLabel bool) string { var sb strings.Builder sb.WriteString("Classify the text below.\n\nCategories:\n") for _, c := range categories { sb.WriteString("- ") sb.WriteString(c) sb.WriteString("\n") } sb.WriteString("\nText:\n") sb.WriteString(text) sb.WriteString("\n\nReturn ONLY a JSON object: {\"scores\": {\"\": <0..1 float>, ...}}.") if multiLabel { sb.WriteString(" The same text may score high in MULTIPLE categories — score each independently.") } else { sb.WriteString(" Score each category; the highest-scoring one will be the chosen label.") } return sb.String() } // normaliseClassifyScores extracts the scores map from the LLM's // reply and clamps each value into [0, 1]. Categories absent from the // reply default to 0. // // Why we accept either {"scores": {...}} or {...}: some models reply // with the inner object directly, dropping the wrapping key. Both // shapes are valid as long as the keys match the requested category // names. func normaliseClassifyScores(parsed map[string]any, categories []string) map[string]float64 { scoresIn, ok := parsed["scores"].(map[string]any) if !ok { // Accept the bare-map shape too. scoresIn = parsed } out := make(map[string]float64, len(categories)) for _, c := range categories { v, has := scoresIn[c] if !has { out[c] = 0 continue } f, ok := coerceClassifyScore(v) if !ok { out[c] = 0 continue } // Clamp into [0, 1]. if f < 0 { f = 0 } if f > 1 { f = 1 } out[c] = f } return out } // coerceClassifyScore reads a JSON value as a float in [0, 1]. Accepts // floats, ints, and percent-strings ("85%" → 0.85). func coerceClassifyScore(raw any) (float64, bool) { switch v := raw.(type) { case float64: return v, true case int: return float64(v), true case int64: return float64(v), true case string: trimmed := strings.TrimSpace(v) hasPct := strings.HasSuffix(trimmed, "%") s := strings.TrimSuffix(trimmed, "%") // strconv.ParseFloat (unlike fmt.Sscanf %f) rejects trailing garbage, // so "50extra" / "0.5x" are refused instead of silently parsed as 50/0.5. f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err == nil { if hasPct { f = f / 100.0 } return f, true } } return 0, false } // selectClassifyLabels picks the labels to surface. Single-label mode // returns the highest-scoring category. Multi-label returns every // category above the threshold (sorted by score desc for stable // rendering). func selectClassifyLabels(scores map[string]float64, categories []string, multiLabel bool) []string { if multiLabel { var labels []string for _, c := range categories { if scores[c] >= classifyMultiLabelThreshold { labels = append(labels, c) } } // Sort labels by score desc, then category-list order for ties. sortClassifyLabelsByScore(labels, scores) return labels } // Single-label: top-1. bestCat := "" bestScore := -1.0 for _, c := range categories { if scores[c] > bestScore { bestScore = scores[c] bestCat = c } } // No category fit: an all-zero score set must not yield a false-positive // top-1 (the first category trivially beats the -1.0 sentinel). Returning // no label keeps "nothing matched" distinguishable from "category A won". if bestCat == "" || bestScore <= 0 { return nil } return []string{bestCat} } // sortClassifyLabelsByScore sorts labels desc by score. Stable on // ties (preserves category-list order). func sortClassifyLabelsByScore(labels []string, scores map[string]float64) { for i := 1; i < len(labels); i++ { j := i for j > 0 && scores[labels[j]] > scores[labels[j-1]] { labels[j], labels[j-1] = labels[j-1], labels[j] j-- } } } func marshalClassifyResult(r classifyResult) string { b, err := json.Marshal(r) if err != nil { return fmt.Sprintf(`{"error":"marshal_failed: %v"}`, err) } return string(b) }