// Package tools — research provider plumbing for v11.
//
// This file declares the narrow interfaces v11's research tools
// (web_search, read_page, read_video, read_pdf, verify_url, etc.) need
// at execute time. Production wiring lives in pkg/logic/mort.go and
// closes over the searcher chain, the extractor / chromedp client, the
// PDF extractor, and the yt-dlp wrapper.
//
// Why narrow interfaces (vs importing pkg/logic/searcher / extractor
// directly): the same cycle-break pattern used by KVStorage, FileStorage,
// HTTPConfigProvider — keeps pkg/skilltools/tools free of the wiring
// layer so tests can stub each dependency. Each provider is nil-safe:
// the tool surfaces "not configured" at first call rather than failing
// at registration.
//
// Test: each tool under pkg/skilltools/tools/ wired against these
// interfaces has its own *_test.go using the in-package fakes in
// research_providers_fakes_test.go.
package tools

import (
	"context"
	"errors"
	"time"
)

// PageCache is the narrow surface read_page (and read_pdf) consult to
// avoid re-fetching the same URL within the cache's TTL. Production
// wiring bridges this interface to the legacy *cache.Cache held by
// pkg/logic/query.System so a `.query foo.com` and a
// `.skill query foo.com` for the same URL share one cache slot.
//
// Why a narrow interface (vs importing the cache package directly):
// same cycle-break pattern as KVStorage / FileStorage / CitationStorage
// — keeps pkg/skilltools/tools free of the wiring layer. The legacy
// cache slot key is `sha256(url)`; the production adapter is
// responsible for hashing so this interface stays clean (raw URL in/out)
// and skill-tool authors never need to know the slot shape.
//
// nil-safe: a tool constructed with a nil PageCache simply skips the
// cache layer (it behaves as if every Get missed and every Set were a
// no-op).
//
// Neither method returns an error: caching is best-effort by contract,
// so a failing backing store degrades to "always miss" instead of
// failing the tool call.
//
// Test: tests pass a fake PageCache that records Get/Set calls and
// returns canned hits. See page_cache_test.go for the read_page hit /
// miss scenarios.
type PageCache interface {
	// Get returns the cached body for urlStr and true on hit, or
	// (nil, false) on miss. urlStr is the raw, unhashed URL.
	// Implementations MUST treat any backing-store error as a miss
	// (best-effort, never fail the caller).
	//
	// NOTE(review): the contract does not say whether the caller may
	// mutate the returned slice — confirm against the production
	// adapter before modifying it in place.
	Get(ctx context.Context, urlStr string) ([]byte, bool)

	// Set writes body under the slot for urlStr with the supplied TTL.
	// urlStr is the raw, unhashed URL; tools normally pass PageCacheTTL
	// as ttl. Implementations MUST swallow backing-store errors
	// (best-effort caching is correct: a write failure should not
	// propagate to the agent loop).
	//
	// NOTE(review): behaviour for ttl <= 0 is not specified here —
	// confirm with the adapter before relying on it.
	Set(ctx context.Context, urlStr string, body []byte, ttl time.Duration)
}

// PageCacheTTL is the project-wide default lifetime for entries that
// research tools write through a PageCache: one hour.
//
// The value deliberately matches the legacy `query.pageCacheTTL`
// constant so that a slot warmed by `.query` is still readable from
// `.skill query` (and the other way round) inside the same window.
//
// A tool that needs a different lifetime passes its own duration to
// PageCache.Set; the v11 / v-research tools all use this constant.
const PageCacheTTL = time.Hour

// PageExtractor is the narrow surface read_page needs at execute
// time. The production adapter wraps mort's existing extractor
// (Ollama web_fetch first, chromedp fallback on JS-heavy pages).
//
// nil-safe: a tool constructed with a nil PageExtractor surfaces
// "not configured" at first call.
//
// Why: read_page used to be a thin io.ReadAll over the URL — it
// missed JS rendering, didn't honour the v6 page cache, and could
// not surface the underlying provider name. v11 routes through this
// interface so the production wiring (mort.go) can plug in the
// existing query-side extractor without exposing query.Agent.
type PageExtractor interface {
	// ExtractPage fetches urlStr and extracts its readable text.
	//
	// Results:
	//   - text: the FULL extracted text — callers apply the v10
	//     byte-vs-reference cap before surfacing it to the agent.
	//   - finalURL: the URL after any redirects the extractor
	//     followed.
	//   - provider: name of the backend that produced the text
	//     ("ollama" | "chromedp" | "ytdlp").
	//   - err: non-nil on failure. ErrPageExtractionFailed is the
	//     sentinel for the case where both the Ollama and chromedp
	//     paths produce empty content.
	//
	// bypassCache=true skips any page cache and forces a fresh
	// extraction; false is the normal (default) value.
	//
	// NOTE(review): "ytdlp" is listed as a provider although the
	// adapter described on the type only mentions Ollama and
	// chromedp — confirm whether video URLs can be routed through
	// this interface or whether the value is stale.
	ExtractPage(ctx context.Context, urlStr string, bypassCache bool) (text string, finalURL string, provider string, err error)
}

// VideoTranscriber is the narrow surface read_video needs at
// execute time. Production wiring wraps internal/ytdlp.
//
// nil-safe: tool surfaces "not configured" at first call.
//
// Why a separate interface from PageExtractor: video is a different
// shape (transcript + metadata) and a different binary (yt-dlp).
// Keeping them distinct lets tests stub each independently.
type VideoTranscriber interface {
	// ExtractVideoTranscript returns the transcript text for the
	// video at urlStr together with best-effort metadata (title,
	// duration in seconds, channel — see VideoMeta).
	//
	// Implementations MUST return a non-empty transcript or an
	// error — empty-transcript success is interpreted by the tool
	// as a "transcript_unavailable" failure.
	// ErrVideoTranscriptUnavailable is the sentinel for "no
	// captions / transcript could be obtained".
	//
	// NOTE(review): whether meta is populated when err != nil is
	// not specified; treat it as meaningless on error unless the
	// adapter documents otherwise.
	ExtractVideoTranscript(ctx context.Context, urlStr string) (transcript string, meta VideoMeta, err error)
}

// VideoMeta is best-effort metadata returned alongside a video
// transcript. Any field may be empty/zero if the implementation
// could not extract it, so consumers must not treat a zero value as
// meaningful data.
//
// Fields:
//   - Title: the video's title.
//   - Channel: the channel the video belongs to.
//   - DurationSeconds: the video's length in whole seconds.
type VideoMeta struct {
	Title           string
	Channel         string
	DurationSeconds int
}

// PDFFetcher is the narrow surface read_pdf needs at execute time.
// Production wiring uses an HTTP-aware fetcher that HEAD-validates
// content-type before downloading the body.
//
// nil-safe: tool surfaces "not configured" at first call.
//
// Why: a tool that just embedded PDF extraction would couple
// fetching + parsing. Splitting the fetch (allowlist + SSRF +
// HEAD check) from the extract (page-level parsing) keeps each
// step testable and lets the same fetcher serve verify_url one
// day if we want a PDF-aware fast path.
type PDFFetcher interface {
	// FetchPDF downloads the PDF at urlStr (after HEAD-validating
	// content-type) and returns the raw bytes plus the final URL.
	//
	// HEAD-validation rejects a URL only when BOTH signals fail:
	// the Content-Type is not a PDF mime AND the path does not end
	// in .pdf. Either signal on its own is enough to proceed with
	// the download. The rejection is reported with the ErrPDFNotPDF
	// sentinel so the tool can surface "url_is_not_a_pdf" instead
	// of a generic transport error.
	FetchPDF(ctx context.Context, urlStr string) (body []byte, finalURL string, err error)
}

// PDFExtractor parses PDF bytes into plain text + page count.
// Production wires internal.ExtractPDFText.
//
// Why split from PDFFetcher: tests want to vary the fetch (mock
// server returning bytes) without rebuilding the extractor.
type PDFExtractor interface {
	// ExtractPDFText returns the concatenated plain-text content
	// of the PDF in body along with the page count. The caller
	// applies any per-page cap and the v10 byte-vs-reference cap on
	// the result.
	//
	// ErrPDFEncrypted is the sentinel for a password-protected PDF
	// that refuses extraction.
	//
	// NOTE(review): maxPages and truncated are not described in
	// this file. Presumably maxPages bounds how many pages are
	// extracted (cf. ResearchConfig.PDFMaxPages) and truncated
	// reports that the document had more pages than that — confirm
	// against internal.ExtractPDFText. It is likewise unconfirmed
	// whether pageCount is the document total or the number of
	// pages actually extracted.
	ExtractPDFText(ctx context.Context, body []byte, maxPages int) (text string, pageCount int, truncated bool, err error)
}

// HEADChecker is the narrow surface verify_url needs at execute
// time. Production wiring uses the same SSRF-pinned transport as
// http_get so the security envelope is consistent.
//
// Why a separate interface (vs reusing HTTPConfigProvider+doHTTP):
// verify_url's contract is simpler — HEAD only, no body bytes
// returned, and the agent only cares about reachable / status /
// final URL / content-type. A bespoke surface lets the production
// adapter optimise for that path (no body buffer, no body close).
//
// NOTE(review): unlike the sibling providers, no nil-safe behaviour
// is documented for this interface — confirm how verify_url handles
// a nil HEADChecker.
type HEADChecker interface {
	// HEAD performs a HEAD request against urlStr (with SSRF +
	// allowlist enforcement).
	//
	// Results:
	//   - finalURL: the URL after any redirects.
	//   - status: the HTTP status code.
	//   - contentType: the Content-Type response header.
	//   - reachable, err: reachable=false with a non-nil err means a
	//     transport failure (DNS, TCP, allowlist rejection).
	//     reachable=true with any HTTP status (including 4xx/5xx)
	//     is the success shape — the agent decides whether the URL
	//     is "real".
	//
	// The values of finalURL, status and contentType on a transport
	// failure are not specified; callers should ignore them when
	// reachable is false.
	HEAD(ctx context.Context, urlStr string) (finalURL string, status int, contentType string, reachable bool, err error)
}

// CitationStorage is the narrow surface cite() needs at execute
// time. Production wires *skills.System.Storage(); tests stub.
//
// nil-safe: tool surfaces "not configured" at first call.
//
// Why a narrow interface (vs importing pkg/logic/skills): same
// cycle constraint as KVStorage / FileStorage. Production adapter
// in mort.go bridges to skills.Storage's RecordCitation /
// ListCitations methods AND a separate URL-history tracker.
//
// Two responsibilities, deliberately separate:
//
//  1. RecordCitation / ListCitations write and read rows in
//     skill_run_sources — this is the user-visible citations table
//     for the Sources panel and CSV export. ONLY rows the agent
//     successfully cited via cite() land here.
//  2. RecordURLTouch / GetTouchedURLs maintains a per-run set of
//     URLs the agent has interacted with (web_search results,
//     read_page input, read_pdf input, read_video input). cite()
//     reads this set to reject claims for URLs the agent never
//     touched. This set lives in a different table or scope from
//     the citations table — it's working state, not a record.
type CitationStorage interface {
	// RecordCitation appends one (run_id, url, claim, cited_at)
	// row to the citations table (skill_run_sources). The method
	// takes no timestamp: cited_at is stamped by the storage layer
	// with time.Now(). The caller has already verified the URL is
	// in the touched-URL set (via GetTouchedURLs); this method is
	// the persistence step only and performs no such check itself.
	RecordCitation(ctx context.Context, runID, url, claim string) error

	// RecordURLTouch records that the agent has interacted with
	// `url` during `runID`. Called by web_search (per result),
	// read_page, read_pdf, and read_video. Idempotent — repeat
	// calls for the same (run_id, url) are no-ops at the storage
	// layer.
	RecordURLTouch(ctx context.Context, runID, url string) error

	// GetTouchedURLs returns the set of URLs the run has
	// interacted with, keyed by URL. Used by cite() to verify that
	// a claim's URL is one the agent actually visited. Empty for a
	// fresh run — cite() then rejects every claim with
	// "url_not_in_run_history".
	GetTouchedURLs(ctx context.Context, runID string) (map[string]struct{}, error)

	// ListCitations returns all citations recorded for the run, in
	// insertion order. Powers the /skills/{id}/runs/{run_id}
	// Sources panel.
	ListCitations(ctx context.Context, runID string) ([]CitationRow, error)
}

// CitationRow mirrors the skill_run_sources row shape as returned by
// CitationStorage.ListCitations. run_id is not a field because it is
// implicit in the query that produced the row.
//
// Fields:
//   - URL: the URL the agent submitted to cite().
//   - Claim: the claim text the agent submitted alongside the URL.
//   - CitedAt: wall-clock time of the insert, in unix seconds.
type CitationRow struct {
	URL     string
	Claim   string
	CitedAt int64 // unix-seconds; storage adapter normalises from time.Time
}

// CurrentTimeProvider exposes the per-user timezone lookup used when
// reporting the current time. Production wiring closes over the
// bot's member-config getter.
//
// Only the timezone lookup is declared here: there is no clock
// ("now") method on this interface.
// NOTE(review): callers presumably take the instant itself from
// server time — confirm against NewNow.
//
// nil-safe: a tool constructed with a nil provider falls back to
// server-time + UTC (current behaviour of NewNow before v11).
type CurrentTimeProvider interface {
	// UserTimezone returns the IANA timezone name configured for
	// the given Discord member ID, or "" when the member has no
	// timezone configured. Callers treat "" as "UTC".
	UserTimezone(ctx context.Context, memberID string) string
}

// SearchBudget is the narrow surface web_search reads at execute
// time to honour skills.web_search.max_per_run.
//
// Production wiring closes over a per-run counter held by the
// executor. nil-safe: tool falls back to a built-in package
// counter (process-wide, NOT per-run) — useful for tests but NOT
// production-correct because budget bleeds across runs. The
// production adapter MUST be wired.
type SearchBudget interface {
	// CheckAndIncrement returns the current count AFTER incrementing
	// for the given runID, the configured max, and exceeded=true
	// when the call would exceed the cap.
	//
	// There is deliberately no error return: exceeding the budget
	// is an expected outcome, and the handler turns exceeded=true
	// into a clean "search_budget_exceeded" string so the agent
	// can react.
	//
	// NOTE(review): kind is not described anywhere in this file —
	// presumably it selects which per-tool budget is charged;
	// confirm against the production adapter.
	CheckAndIncrement(ctx context.Context, runID, kind string) (count, max int, exceeded bool)
}

// ResearchConfig is the narrow surface the research and HTTP tools
// (web_search, read_page, read_video, read_pdf, verify_url, http_get,
// http_post) read at execute time for per-tool budget caps and
// inline-vs-file_id thresholds. Production wiring closes over the
// relevant convars.
//
// Every method takes a ctx and returns the current value, so a
// convar change can take effect without re-registering the tools.
//
// nil-safe: tools fall back to package defaults.
type ResearchConfig interface {
	// MaxInlineBytes returns the cap above which extracted text is
	// persisted as a file_id under run-scope (v10 byte-vs-reference
	// principle). Default 12 KiB.
	MaxInlineBytes(ctx context.Context) int

	// PDFMaxPages returns the cap on pages extracted from a PDF
	// before truncation. Default 50.
	PDFMaxPages(ctx context.Context) int

	// WebSearchEnabled is the master switch for web_search.
	WebSearchEnabled(ctx context.Context) bool

	// WebSearchMaxPerRun is the per-run search cap.
	WebSearchMaxPerRun(ctx context.Context) int

	// ReadPageMaxPerRun is the per-run page-read cap.
	ReadPageMaxPerRun(ctx context.Context) int

	// VideoMaxPerRun is the per-run video-read cap.
	VideoMaxPerRun(ctx context.Context) int

	// VerifyURLMaxPerRun is the per-run HEAD-check cap.
	VerifyURLMaxPerRun(ctx context.Context) int

	// ReadPDFMaxPerRun is the per-run PDF-read cap.
	ReadPDFMaxPerRun(ctx context.Context) int

	// HTTPGetMaxPerRun (v15.2) is the per-run http_get cap. The agent
	// otherwise can retry-storm through random URLs and bloat its own
	// context with each tool result. Default 20.
	HTTPGetMaxPerRun(ctx context.Context) int

	// HTTPPostMaxPerRun (v15.2) is the per-run http_post cap. Default 20.
	HTTPPostMaxPerRun(ctx context.Context) int

	// WebSearchAugmentThreshold is the minimum number of primary
	// (Ollama) results required to skip the secondary (DDG/Brave)
	// search. When the primary backend returns fewer than this many
	// results, the augmented searcher also queries the secondary and
	// merges both result sets. Default 5.
	WebSearchAugmentThreshold(ctx context.Context) int

	// NOTE: ReplyChainDepthMax is intentionally NOT declared on this
	// interface. Callers that need it reach into the convar reader
	// directly; this comment only marks where a future per-tool cap
	// of that shape would go.
}

// Sentinel errors shared by the research providers. Match them with
// errors.Is so adapters remain free to wrap them with extra context.
var (
	// ErrPageExtractionFailed is what a PageExtractor reports when
	// the Ollama path and the chromedp path both yield empty content.
	ErrPageExtractionFailed = errors.New("page extraction failed: empty content")

	// ErrVideoTranscriptUnavailable is what a VideoTranscriber
	// reports when no captions / transcript could be obtained.
	ErrVideoTranscriptUnavailable = errors.New("video transcript unavailable")

	// ErrPDFNotPDF is what a PDFFetcher reports when the HEAD
	// response carries a non-PDF content-type AND the URL path has
	// no .pdf extension. It lets the tool surface a clean
	// "url_is_not_a_pdf" rejection rather than a generic transport
	// error.
	ErrPDFNotPDF = errors.New("url does not serve a PDF")

	// ErrPDFEncrypted is what a PDFExtractor reports when the PDF is
	// password-protected and refuses extraction. It lets the tool
	// surface a clean "pdf_encrypted" rejection.
	ErrPDFEncrypted = errors.New("pdf is encrypted")
)