executus/tools/extract_entities.go

// Package tools — v12 extract_entities.
//
// Structured-output workhorse: text + field schema → typed JSON
// object. The author specifies which fields they want and what
// types; the tool builds an appropriate prompt, asks for JSON, and
// validates + coerces the response back into the requested types.
//
// Why a structured-output tool (vs forcing the agent to write its
// own prompt): every agentic skill that needs to "pull X, Y, Z out
// of unstructured text" otherwise re-invents the same prompt-
// engineering pattern. extract_entities centralises it so authors
// just describe the schema.
//
// Type coercion: an LLM responding with "42" when an int field was
// requested is normal noise. The tool coerces strings to
// int/float/bool when possible; when coercion fails, a required field
// lands in missing_fields, while an optional field is silently left
// out of the entities map.
//
// Test: extract_entities_test.go covers happy path, missing optional
// field, missing required field surfaces in missing_fields, malformed
// JSON retry, second-attempt failure, type coercion (string→int,
// string→bool), unknown field type rejected at args validation.
package tools

import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/executus/llmmeta"
	"gitea.stevedudenhoeffer.com/steve/executus/tool"
)

// Limits applied by extract_entities.
const (
	// extractEntitiesMaxInputBytes is the hard cap, in bytes, on the
	// input text; longer input is cut down with truncateUTF8 before
	// the prompt is built.
	extractEntitiesMaxInputBytes = 32 << 10

	// extractEntitiesFallbackMaxPerRun is the per-run call cap used
	// when the tool builds its own budget and ExtractEntitiesConfig
	// is nil.
	extractEntitiesFallbackMaxPerRun = 10
)

// ExtractEntitiesConfig is the narrow per-deployment config surface
// extract_entities reads at execute time.
type ExtractEntitiesConfig interface {
	// MaxPerRun returns the per-run call cap. NewExtractEntities
	// consults it only when no SearchBudget was supplied and it has
	// to build its own in-memory budget; with a nil config the cap
	// is extractEntitiesFallbackMaxPerRun instead.
	MaxPerRun(ctx context.Context) int
}

// extractField is one row in the schema the agent supplies. The five
// supported types ('string', 'int', 'float', 'bool',
// 'list_of_strings') are the JSON-shape values we can validate +
// coerce reliably.
//
// Why an enum-shaped Type field (vs free-form): we need to know how
// to validate the LLM's reply. Free-form ("integer", "Number",
// "boolean") would invite typos that silently miss the validation.
//
// The struct tags below are part of the LLM-facing contract (they are
// runtime strings, not comments) and must not be edited casually.
type extractField struct {
	// Name is the key looked up in the model's reply and used in the
	// returned entities object. A blank name is rejected up front.
	Name        string `json:"name" description:"Field name to populate (e.g. 'author', 'year_published'). Becomes a key in the returned entities object."`
	// Description is passed through to the prompt verbatim.
	Description string `json:"description" description:"Short description of what to extract (e.g. 'the book author', 'the year the article was published'). Helps the model find the right value."`
	// Type selects the coercion rule. It is matched against
	// validExtractTypes after trimming whitespace and lower-casing.
	Type        string `json:"type" description:"One of: 'string', 'int', 'float', 'bool', 'list_of_strings'. Determines how the LLM's reply is validated and coerced."`
	// Required controls whether an absent, null or uncoercible value
	// is reported in missing_fields (true) or dropped quietly (false).
	Required    bool   `json:"required,omitempty" description:"When true, a missing/uncoercible value lands in missing_fields rather than skipping silently."`
}

// extractEntitiesArgs is the LLM-facing param struct. Both members
// are mandatory in practice: the handler answers with an error
// payload when Text is blank or Fields is empty.
type extractEntitiesArgs struct {
	// Text is the source material. Input longer than
	// extractEntitiesMaxInputBytes is truncated rather than rejected.
	Text   string         `json:"text" description:"The text to extract from. Required. Capped at 32KB."`
	// Fields is the extraction schema; every entry is validated
	// (non-blank name, supported type) before any LLM call is made.
	Fields []extractField `json:"fields" description:"Schema describing what to extract. Each field has name, description, type, and optional required flag."`
}

// extractEntitiesResult is the JSON payload the tool hands back to the
// agent. Every member is omitempty, so a given reply only carries the
// keys relevant to its outcome (success, validation error, budget
// refusal, or extraction failure).
type extractEntitiesResult struct {
	// Entities holds the successfully coerced values keyed by field
	// name.
	Entities      map[string]any `json:"entities,omitempty"`
	// MissingFields lists required fields that were absent, null, or
	// could not be coerced to the requested type.
	MissingFields []string       `json:"missing_fields,omitempty"`
	// ModelUsed echoes the ModelUsed value reported by the LLM helper.
	ModelUsed     string         `json:"model_used,omitempty"`
	// RawReply carries the model's unparsed text; it is only filled in
	// on the extraction-failure paths so the agent can salvage it.
	RawReply      string         `json:"raw_reply,omitempty"`
	// Error is a short error code or message; empty on success.
	Error         string         `json:"error,omitempty"`
	// BudgetMsg is the human-readable explanation that accompanies
	// the extract_entities_budget_exceeded error.
	BudgetMsg     string         `json:"budget_message,omitempty"`
}

// validExtractTypes enumerates every Type string a schema field may
// carry. Lookups are done on the trimmed, lower-cased type, and a
// field whose type is not listed here is refused during argument
// validation, before any LLM call.
var validExtractTypes = map[string]bool{
	"bool":            true, // JSON true/false
	"float":           true, // any JSON number
	"int":             true, // whole numbers only
	"list_of_strings": true, // JSON array of strings
	"string":          true, // free text
}

// NewExtractEntities constructs the extract_entities tool.
//
// helper performs the LLM call; with a nil helper every invocation
// fails with a "not configured" Go error. cfg may be nil and is only
// consulted for the per-run cap when the tool has to build its own
// budget. budget gates calls per run; when nil, an in-memory budget is
// created on the first invocation that reaches the budget gate and is
// reused afterwards.
//
// Argument problems, budget exhaustion and extraction failures are
// reported inside the returned JSON payload (its "error" key) together
// with a nil Go error. Only a missing helper or an error returned by
// helper.Call surfaces as a Go error.
func NewExtractEntities(helper *llmmeta.Helper, cfg ExtractEntitiesConfig, budget SearchBudget) tool.Tool {
	const toolName = "extract_entities"

	// reject wraps a caller-facing problem in an error-only payload.
	reject := func(reason string) (string, error) {
		return marshalExtractEntities(extractEntitiesResult{Error: reason}), nil
	}

	handler := func(ctx context.Context, inv tool.Invocation, args extractEntitiesArgs) (string, error) {
		if helper == nil {
			return "", fmt.Errorf("extract_entities: not configured")
		}

		// Cheap argument checks run first so that a malformed request
		// never consumes budget or pays for an LLM call.
		switch {
		case strings.TrimSpace(args.Text) == "":
			return reject("text is empty")
		case len(args.Fields) == 0:
			return reject("fields is empty")
		}
		for _, field := range args.Fields {
			if strings.TrimSpace(field.Name) == "" {
				return reject("field with empty name")
			}
			normalizedType := strings.ToLower(strings.TrimSpace(field.Type))
			if !validExtractTypes[normalizedType] {
				return reject(fmt.Sprintf("field %q has unsupported type %q (allowed: string|int|float|bool|list_of_strings)", field.Name, field.Type))
			}
		}

		// Oversized input is trimmed to the cap rather than refused.
		input := args.Text
		if len(input) > extractEntitiesMaxInputBytes {
			input = truncateUTF8(input, extractEntitiesMaxInputBytes)
		}

		// Per-run budget gate. Without an injected budget, a fallback
		// is built once and stored in the captured variable, so the
		// cap is whatever cfg reported at that first call.
		// NOTE(review): this lazy assignment is not synchronised —
		// confirm invocations of one tool instance cannot overlap.
		if budget == nil {
			perRun := extractEntitiesFallbackMaxPerRun
			if cfg != nil {
				perRun = cfg.MaxPerRun(ctx)
			}
			budget = NewInMemorySearchBudget(map[string]int{toolName: perRun})
		}
		used, limit, over := budget.CheckAndIncrement(ctx, inv.RunID, toolName)
		if over {
			return marshalExtractEntities(extractEntitiesResult{
				Error:     "extract_entities_budget_exceeded",
				BudgetMsg: fmt.Sprintf("per-run extract_entities budget exceeded (%d/%d). Ask an admin to raise skills.extract_entities.max_per_run.", used, limit),
			}), nil
		}

		reply, err := helper.Call(ctx, llmmeta.CallSpec{
			Tier:                 "fast",
			SystemPrompt:         "You extract structured data from unstructured text. Return ONLY valid JSON with the requested keys. If a value is not present in the text, omit the key. Do NOT invent values.",
			UserPrompt:           buildExtractPrompt(input, args.Fields),
			MaxOutputTokens:      4096,
			ResponseFormat:       "json",
			RetryOnMalformedJSON: true,
			ToolName:             toolName,
			RunID:                inv.RunID,
			SkillID:              inv.SkillID,
			CallerID:             inv.CallerID,
		})
		if err != nil {
			return "", err
		}
		if !reply.Success {
			// An unsuccessful call without a specific kind is reported
			// under a generic code.
			code := reply.ErrorKind
			if code == "" {
				code = "llm_unavailable"
			}
			return reject(code)
		}

		// The call can be flagged successful and still carry no usable
		// JSON (malformed even after the retry). Hand the raw text
		// back so the agent can try to salvage it.
		if reply.ErrorKind == llmmeta.ErrorKindMalformedJSON || reply.Parsed == nil {
			return marshalExtractEntities(extractEntitiesResult{
				Error:     "extraction_failed",
				RawReply:  reply.Text,
				ModelUsed: reply.ModelUsed,
			}), nil
		}

		// Only a JSON object can be matched against the schema.
		object, isObject := reply.Parsed.(map[string]any)
		if !isObject {
			return marshalExtractEntities(extractEntitiesResult{
				Error:     "extraction_failed_not_object",
				RawReply:  reply.Text,
				ModelUsed: reply.ModelUsed,
			}), nil
		}

		entities, missing := coerceExtractedEntities(object, args.Fields)
		return marshalExtractEntities(extractEntitiesResult{
			Entities:      entities,
			MissingFields: missing,
			ModelUsed:     reply.ModelUsed,
		}), nil
	}

	return tool.NewGatedTool[extractEntitiesArgs](
		toolName,
		"Extract structured fields from unstructured text via a fast LLM. Caller supplies a schema (each field has name + description + type + required); tool returns an entities object with values matching the requested types. Types: string, int, float, bool, list_of_strings. Counts against per-run and 7-day cost budgets.",
		tool.Permission{
			AuthoringRequirement: tool.RequirementAnyone,
			OperatesOn:           tool.ScopeCaller,
			SafeForShare:         true,
			Categories:           []string{"llm-meta", "cost-bearing"},
		},
		handler,
	)
}

// buildExtractPrompt renders the user message for the extraction
// call: a fixed instruction line, one "- name (type): description"
// bullet per schema field (suffixed with " [required]" when the field
// is required), and finally the source text under a "Text:" heading.
// Field names, types and descriptions are written exactly as supplied.
func buildExtractPrompt(text string, fields []extractField) string {
	const (
		preamble   = "Extract the following fields from the text below. Return a JSON object with the field names as keys.\n\nFields:\n"
		textHeader = "\nText:\n"
	)

	var prompt strings.Builder
	prompt.WriteString(preamble)
	for i := range fields {
		field := fields[i]
		marker := ""
		if field.Required {
			marker = " [required]"
		}
		fmt.Fprintf(&prompt, "- %s (%s): %s%s\n", field.Name, field.Type, field.Description, marker)
	}
	prompt.WriteString(textHeader)
	prompt.WriteString(text)
	return prompt.String()
}

// coerceExtractedEntities matches the model's reply against the schema.
// For each schema field, in schema order, the value found under the
// field's name is coerced to the requested type and stored in the
// returned map. A field whose value is absent, null, or cannot be
// coerced is left out of the map; if that field is required its name
// is also appended to the returned slice. Keys in parsed that the
// schema does not mention are ignored.
//
// The returned map is never nil; the slice is nil when nothing
// required is missing.
func coerceExtractedEntities(parsed map[string]any, fields []extractField) (map[string]any, []string) {
	accepted := make(map[string]any, len(fields))
	var absent []string

	for _, field := range fields {
		if raw, found := parsed[field.Name]; found && raw != nil {
			if coerced, good := coerceFieldValue(raw, field.Type); good {
				accepted[field.Name] = coerced
				continue
			}
		}
		// Reached for absent, null and uncoercible values alike; only
		// required fields are worth reporting.
		if field.Required {
			absent = append(absent, field.Name)
		}
	}
	return accepted, absent
}

// coerceFieldValue attempts to convert raw — a value decoded from the
// model's JSON reply — to the requested type. It returns (value, true)
// on success or (nil, false) on failure.
//
// fieldType is matched case-insensitively after trimming whitespace;
// an unrecognised type always fails. Successful results have these Go
// types: "string" → string, "int" → int64, "float" → float64,
// "bool" → bool, "list_of_strings" → []string.
//
// Why coerce (vs strict reject): LLMs frequently reply with strings
// that contain numbers ("42") or pseudo-booleans ("yes"). Strict
// rejection would force every author to clean the response themselves.
// Coercion is conservative — string "42" → int 42 succeeds; string
// "forty-two" → int 42 fails (the agent never asked for word-form
// parsing).
//
// Two guards keep results portable and re-encodable:
//   - "int" only accepts whole numbers inside the int64 range. Going
//     through int64(f) for an out-of-range f is implementation-defined
//     in Go, so the range is checked before converting.
//   - "float" rejects NaN and ±Inf (which strconv.ParseFloat happily
//     parses from "NaN"/"Inf"), because encoding/json cannot marshal
//     them and one such value would sink the whole result.
func coerceFieldValue(raw any, fieldType string) (any, bool) {
	switch strings.ToLower(strings.TrimSpace(fieldType)) {
	case "string":
		switch v := raw.(type) {
		case string:
			return v, true
		case float64:
			// Shortest decimal form, no exponent ("1965", "3.5").
			return strconv.FormatFloat(v, 'f', -1, 64), true
		case bool:
			return strconv.FormatBool(v), true
		}
		return nil, false

	case "int":
		switch v := raw.(type) {
		case float64:
			// JSON numbers are float64 by default.
			if n, ok := extractExactInt64(v); ok {
				return n, true
			}
		case string:
			s := strings.TrimSpace(v)
			if n, err := strconv.ParseInt(s, 10, 64); err == nil {
				return n, true
			}
			// Try float-string-with-zero-fractional ("42.0").
			if f, err := strconv.ParseFloat(s, 64); err == nil {
				if n, ok := extractExactInt64(f); ok {
					return n, true
				}
			}
		}
		return nil, false

	case "float":
		switch v := raw.(type) {
		case float64:
			if extractIsFinite(v) {
				return v, true
			}
		case string:
			if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil && extractIsFinite(f) {
				return f, true
			}
		}
		return nil, false

	case "bool":
		switch v := raw.(type) {
		case bool:
			return v, true
		case string:
			switch strings.ToLower(strings.TrimSpace(v)) {
			case "true", "yes", "1", "y":
				return true, true
			case "false", "no", "0", "n":
				return false, true
			}
		case float64:
			// Numeric truthiness: zero is false, anything else true.
			return v != 0, true
		}
		return nil, false

	case "list_of_strings":
		switch v := raw.(type) {
		case []any:
			out := make([]string, 0, len(v))
			for _, e := range v {
				s, ok := e.(string)
				if !ok {
					// Mixed-type lists fail the type contract.
					return nil, false
				}
				out = append(out, s)
			}
			return out, true
		case string:
			// Single-string can be lifted into a one-element list.
			return []string{v}, true
		}
		return nil, false
	}
	return nil, false
}

// extractExactInt64 converts f to int64 only when f is a whole number
// that int64 can represent exactly; NaN, ±Inf, fractional and
// out-of-range values report false.
func extractExactInt64(f float64) (int64, bool) {
	// 2^63 is exactly representable as a float64; valid inputs lie in
	// [-2^63, 2^63). Comparisons with NaN are false, so NaN is out too.
	const two63 = 0x1p63
	if f >= -two63 && f < two63 {
		if n := int64(f); float64(n) == f {
			return n, true
		}
	}
	return 0, false
}

// extractIsFinite reports whether f is neither NaN nor ±Inf.
func extractIsFinite(f float64) bool {
	// Largest finite float64 (same value as math.MaxFloat64). NaN
	// fails both comparisons; ±Inf fails one of them.
	const maxFinite = 0x1p1023 * (2 - 0x1p-52)
	return f >= -maxFinite && f <= maxFinite
}

// marshalExtractEntities encodes r as the JSON string returned to the
// agent. Should encoding fail, it returns a minimal
// {"error":"marshal_failed: …"} object instead. The message is itself
// JSON-encoded rather than spliced into a template, so an error text
// containing quotes, backslashes or control characters cannot produce
// an invalid payload.
func marshalExtractEntities(r extractEntitiesResult) string {
	b, err := json.Marshal(r)
	if err == nil {
		return string(b)
	}
	msg, msgErr := json.Marshal(fmt.Sprintf("marshal_failed: %v", err))
	if msgErr != nil {
		// Encoding a plain string is not expected to fail; keep a
		// constant, valid payload as the last resort.
		return `{"error":"marshal_failed"}`
	}
	return `{"error":` + string(msg) + `}`
}