executus/tools/classify.go

// Package tools — v12 classify.
//
// Classification primitive: text + categories → labels + per-category
// scores. Single-label mode (default) returns the top-1 category;
// multi-label mode returns every category whose score crosses the
// threshold.
//
// Why a dedicated tool (vs reusing extract_entities for one-of-N
// classification): classification has a typed result (labels[] +
// scores{}) that downstream agents consume programmatically. Folding
// it into extract_entities would force every author to re-spec the
// scoring schema.
//
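// Score normalisation: the LLM's reply is normalised so each score
// lands in [0, 1] (out-of-range values are clamped; missing or
// unparseable ones become 0). The single-label result returns scores
// for ALL categories so the author can read the distribution, and no
// label at all when every score is 0; multi-label returns labels[] of
// categories scoring at or above 0.5.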
//
// Test: classify_test.go covers single-label, multi-label, score
// normalisation, > 20 categories rejected, unknown category in the
// reply silently dropped.
package tools

import (
	"context"
	"encoding/json"
	"fmt"
	"math"
	"strconv"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/executus/llmmeta"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// Limits and thresholds used by the classify tool.
const (
	// classifyMaxInputBytes caps the text handed to the model, in bytes
	// (16 KiB). Longer input is truncated before the call.
	classifyMaxInputBytes = 16 << 10

	// classifyMaxCategories is the largest category list a single call
	// may supply; longer lists are rejected.
	classifyMaxCategories = 20

	// classifyMultiLabelThreshold is the score at or above which a
	// category appears in the labels[] array in multi-label mode.
	classifyMultiLabelThreshold = 0.5

	// classifyFallbackMaxPerRun is the per-run call cap used when no
	// ClassifyConfig is supplied.
	classifyFallbackMaxPerRun = 20
)

// ClassifyConfig is the narrow per-deployment config surface for the
// classify tool.
//
// The tool consults it only when it has to build its own fallback
// budget because no SearchBudget was injected: MaxPerRun then supplies
// the cap for the "classify" counter. A nil ClassifyConfig makes the
// tool fall back to classifyFallbackMaxPerRun.
type ClassifyConfig interface {
	// MaxPerRun returns the maximum number of classify calls allowed
	// within a single run.
	MaxPerRun(ctx context.Context) int
}

// classifyArgs is the LLM-facing param struct. The json names and
// description tags are what the calling model sees, so they are part of
// the tool's contract.
//
//   - Text is the input to classify; the handler rejects blank text and
//     truncates anything longer than classifyMaxInputBytes.
//   - Categories is the candidate label set; the handler trims and
//     de-duplicates the entries and rejects lists longer than
//     classifyMaxCategories.
//   - MultiLabel switches from the default top-1 result to "every
//     category that reaches the threshold".
type classifyArgs struct {
	Text       string   `json:"text" description:"The text to classify. Required. Capped at 16KB."`
	Categories []string `json:"categories" description:"List of categories to score the text against. Required. Max 20."`
	MultiLabel bool     `json:"multi_label,omitempty" description:"When true, returns every category scoring above 0.5. Default false → single-label (top-1) result."`
}

// classifyResult is the JSON payload the classify tool returns to the
// caller. Every field is omitted from the output when empty.
//
//   - Labels holds the selected categories (at most one in single-label
//     mode).
//   - Scores maps every requested category to its normalised score.
//   - ModelUsed names the model reported by the LLM helper.
//   - RawReply carries the model's raw text when its reply could not be
//     used as a JSON object.
//   - Error is a short machine-readable failure code or message; it is
//     empty on success.
//   - BudgetMsg is the human-readable explanation that accompanies a
//     budget-exceeded error.
type classifyResult struct {
	Labels    []string           `json:"labels,omitempty"`
	Scores    map[string]float64 `json:"scores,omitempty"`
	ModelUsed string             `json:"model_used,omitempty"`
	RawReply  string             `json:"raw_reply,omitempty"`
	Error     string             `json:"error,omitempty"`
	BudgetMsg string             `json:"budget_message,omitempty"`
}

// NewClassify builds the "classify" gated tool.
//
// helper performs the LLM call; when it is nil every invocation fails
// with a "not configured" error. budget enforces the per-run call cap;
// when it is nil an in-memory budget is created by the first invocation
// that reaches the budget gate, sized by cfg.MaxPerRun (or by
// classifyFallbackMaxPerRun when cfg is nil too).
//
// Validation problems, budget exhaustion and LLM/parse failures are
// reported inside the JSON result (its "error" field) together with a
// nil Go error. A non-nil Go error is returned only for a missing
// helper or an error from helper.Call.
//
// NOTE(review): the lazily created fallback budget is stored in the
// captured budget variable without synchronisation — confirm handlers
// are never invoked concurrently, or guard that assignment.
func NewClassify(helper *llmmeta.Helper, cfg ClassifyConfig, budget SearchBudget) tool.Tool {
	const (
		toolName     = "classify"
		systemPrompt = "You classify text into a fixed set of categories. Return ONLY JSON. Score each category in [0,1] (1 = perfect fit). Sum of all scores does NOT need to be 1 — high overlap across categories is allowed."
	)

	// failWith renders a result that carries nothing but an error code.
	failWith := func(code string) (string, error) {
		return marshalClassifyResult(classifyResult{Error: code}), nil
	}

	return tool.NewGatedTool[classifyArgs](
		toolName,
		"Classify text into one of N categories (or multiple via multi_label=true). Returns labels[] (top-1 by default) + scores{category: 0..1}. Counts against per-run and 7-day cost budgets.",
		tool.Permission{
			AuthoringRequirement: tool.RequirementAnyone,
			OperatesOn:           tool.ScopeCaller,
			SafeForShare:         true,
			Categories:           []string{"llm-meta", "cost-bearing"},
		},
		func(ctx context.Context, inv tool.Invocation, args classifyArgs) (string, error) {
			if helper == nil {
				return "", fmt.Errorf("classify: not configured")
			}

			// Cheap argument validation first, so a bad call never
			// touches the budget or the model.
			switch n := len(args.Categories); {
			case strings.TrimSpace(args.Text) == "":
				return failWith("text is empty")
			case n == 0:
				return failWith("categories is empty")
			case n > classifyMaxCategories:
				return failWith(fmt.Sprintf("too many categories (%d > %d)", n, classifyMaxCategories))
			}

			// Give the model a clean schema: whitespace-trimmed names,
			// blanks and repeats removed, first-seen order kept.
			unique := make([]string, 0, len(args.Categories))
			known := make(map[string]struct{}, len(args.Categories))
			for _, raw := range args.Categories {
				name := strings.TrimSpace(raw)
				if name == "" {
					continue
				}
				if _, dup := known[name]; dup {
					continue
				}
				known[name] = struct{}{}
				unique = append(unique, name)
			}
			if len(unique) == 0 {
				return failWith("categories has no non-empty entries")
			}

			text := args.Text
			if len(text) > classifyMaxInputBytes {
				text = truncateUTF8(text, classifyMaxInputBytes)
			}

			// Per-run budget gate. Without an injected budget, build an
			// in-memory one once and keep it for later invocations.
			if budget == nil {
				perRun := classifyFallbackMaxPerRun
				if cfg != nil {
					perRun = cfg.MaxPerRun(ctx)
				}
				budget = NewInMemorySearchBudget(map[string]int{toolName: perRun})
			}
			used, limit, exceeded := budget.CheckAndIncrement(ctx, inv.RunID, toolName)
			if exceeded {
				return marshalClassifyResult(classifyResult{
					Error:     "classify_budget_exceeded",
					BudgetMsg: fmt.Sprintf("per-run classify budget exceeded (%d/%d). Ask an admin to raise skills.classify.max_per_run.", used, limit),
				}), nil
			}

			res, err := helper.Call(ctx, llmmeta.CallSpec{
				Tier:                 "fast",
				SystemPrompt:         systemPrompt,
				UserPrompt:           buildClassifyPrompt(text, unique, args.MultiLabel),
				MaxOutputTokens:      2048,
				ResponseFormat:       "json",
				RetryOnMalformedJSON: true,
				ToolName:             toolName,
				RunID:                inv.RunID,
				SkillID:              inv.SkillID,
				CallerID:             inv.CallerID,
			})
			if err != nil {
				return "", err
			}
			if !res.Success {
				if res.ErrorKind != "" {
					return failWith(res.ErrorKind)
				}
				return failWith("llm_unavailable")
			}

			// failWithReply reports an unusable reply and attaches the
			// raw text and model name so the caller can inspect them.
			failWithReply := func(code string) (string, error) {
				return marshalClassifyResult(classifyResult{
					Error:     code,
					RawReply:  res.Text,
					ModelUsed: res.ModelUsed,
				}), nil
			}
			if res.ErrorKind == llmmeta.ErrorKindMalformedJSON || res.Parsed == nil {
				return failWithReply("classification_failed")
			}
			object, isObject := res.Parsed.(map[string]any)
			if !isObject {
				return failWithReply("classification_failed_not_object")
			}

			scores := normaliseClassifyScores(object, unique)
			return marshalClassifyResult(classifyResult{
				Labels:    selectClassifyLabels(scores, unique, args.MultiLabel),
				Scores:    scores,
				ModelUsed: res.ModelUsed,
			}), nil
		},
	)
}

// buildClassifyPrompt assembles the user message sent to the model: a
// bulleted list of the categories, the text itself, the required JSON
// reply shape, and a closing hint that depends on whether the caller
// asked for multi-label or single-label scoring.
func buildClassifyPrompt(text string, categories []string, multiLabel bool) string {
	modeHint := " Score each category; the highest-scoring one will be the chosen label."
	if multiLabel {
		modeHint = " The same text may score high in MULTIPLE categories — score each independently."
	}

	var prompt strings.Builder
	prompt.WriteString("Classify the text below.\n\nCategories:\n")
	for i := range categories {
		prompt.WriteString("- " + categories[i] + "\n")
	}
	prompt.WriteString("\nText:\n" + text)
	prompt.WriteString("\n\n" + `Return ONLY a JSON object: {"scores": {"<category>": <0..1 float>, ...}}.`)
	prompt.WriteString(modeHint)
	return prompt.String()
}

// normaliseClassifyScores turns the model's decoded reply into one score
// per requested category, each limited to the range [0, 1].
//
// The reply may be wrapped ({"scores": {...}}) or bare ({...}): some
// models drop the wrapping key and answer with the inner object
// directly. Only keys that exactly match a requested category are read;
// anything else in the reply is ignored. A category that is missing from
// the reply, or whose value cannot be coerced to a number, scores 0.
func normaliseClassifyScores(parsed map[string]any, categories []string) map[string]float64 {
	source := parsed
	if wrapped, isMap := parsed["scores"].(map[string]any); isMap {
		source = wrapped
	}

	result := make(map[string]float64, len(categories))
	for _, name := range categories {
		score := 0.0
		if raw, present := source[name]; present {
			if value, valid := coerceClassifyScore(raw); valid {
				switch {
				case value < 0:
					score = 0
				case value > 1:
					score = 1
				default:
					score = value
				}
			}
		}
		result[name] = score
	}
	return result
}

// coerceClassifyScore converts one decoded JSON value into a finite
// float64 score. It accepts float64, int, int64 and numeric strings; a
// string ending in "%" is read as a percentage ("85%" → 0.85).
//
// The value is returned as parsed — this function does not clamp it.
// The second result is false for unsupported types, for strings that are
// not a complete number, and for NaN or ±Inf.
//
// Why non-finite values are refused: strconv.ParseFloat accepts the
// literals "NaN" and "Inf"/"Infinity", NaN survives a [0, 1] clamp
// unchanged (every comparison against NaN is false), and encoding/json
// refuses to marshal NaN — so a single "NaN" score would otherwise turn
// a usable classification into a marshal failure.
func coerceClassifyScore(raw any) (float64, bool) {
	var score float64
	switch v := raw.(type) {
	case float64:
		score = v
	case int:
		score = float64(v)
	case int64:
		score = float64(v)
	case string:
		trimmed := strings.TrimSpace(v)
		hasPct := strings.HasSuffix(trimmed, "%")
		// strconv.ParseFloat (unlike fmt.Sscanf %f) rejects trailing
		// garbage, so "50extra" / "0.5x" are refused instead of being
		// silently parsed as 50 / 0.5.
		parsed, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimSuffix(trimmed, "%")), 64)
		if err != nil {
			return 0, false
		}
		if hasPct {
			parsed /= 100.0
		}
		score = parsed
	default:
		return 0, false
	}
	if math.IsNaN(score) || math.IsInf(score, 0) {
		return 0, false
	}
	return score, true
}

// selectClassifyLabels decides which categories are surfaced as labels.
//
// Multi-label mode returns every category whose score is at or above
// classifyMultiLabelThreshold, ordered by score descending; categories
// with equal scores keep their category-list order.
//
// Single-label mode returns the highest-scoring category, with the
// earliest category winning a tie. When no category scores above zero it
// returns nil, so "nothing matched" stays distinguishable from "the
// first category won".
func selectClassifyLabels(scores map[string]float64, categories []string, multiLabel bool) []string {
	if !multiLabel {
		// Start below any possible score so the first category always
		// becomes the initial candidate.
		winner, top := "", -1.0
		for i := range categories {
			if s := scores[categories[i]]; s > top {
				winner, top = categories[i], s
			}
		}
		if winner == "" || top <= 0 {
			return nil
		}
		return []string{winner}
	}

	var picked []string
	for i := range categories {
		if name := categories[i]; scores[name] >= classifyMultiLabelThreshold {
			picked = append(picked, name)
		}
	}
	sortClassifyLabelsByScore(picked, scores)
	return picked
}

// sortClassifyLabelsByScore reorders labels in place so that higher
// scores come first. It is an insertion sort that only moves a label
// past neighbours with a strictly lower score, so labels with equal
// scores keep their incoming relative order.
func sortClassifyLabelsByScore(labels []string, scores map[string]float64) {
	for next := 1; next < len(labels); next++ {
		current := labels[next]
		currentScore := scores[current]
		pos := next
		// Shift lower-scoring labels one slot right to open a gap.
		for ; pos > 0 && currentScore > scores[labels[pos-1]]; pos-- {
			labels[pos] = labels[pos-1]
		}
		labels[pos] = current
	}
}

// marshalClassifyResult encodes r as the JSON string returned to the
// caller. It never returns an error: if r itself cannot be encoded, the
// output is an error-only result whose message starts with
// "marshal_failed: ".
//
// The fallback is produced by the JSON encoder rather than by string
// formatting, so the embedded error text is always correctly escaped and
// the output stays valid JSON whatever the message contains.
func marshalClassifyResult(r classifyResult) string {
	b, err := json.Marshal(r)
	if err == nil {
		return string(b)
	}
	fallback, fallbackErr := json.Marshal(classifyResult{Error: "marshal_failed: " + err.Error()})
	if fallbackErr != nil {
		// An error-only result holds nothing but a string, so this is
		// not expected to happen; still answer with valid JSON.
		return `{"error":"marshal_failed"}`
	}
	return string(fallback)
}