// go-llm/v2/classify.go

package llm

import (
	"context"
	"errors"
	"strings"

	anth "github.com/liushuangls/go-anthropic/v2"
	"github.com/openai/openai-go"

	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/openaicompat"
)

// ErrKind classifies a provider error for failover decision-making.
//
// Why: failover must decide, per error, whether to retry the same model
// (transient), bench it as broken (auth/model dead), or fail over without
// benching (this request's fault). Without a classifier every error looks
// the same and we'd either thrash a dead model or bench a healthy one.
// What: an int-backed enum of the four outcomes the failover algorithm
// distinguishes. The zero value is ErrUnknown.
// Test: see classify_test.go — every branch is table-tested with faked SDK errors.
type ErrKind int

const (
	// ErrUnknown is an unrecognized error; it is also what Classify reports
	// for a nil error and for context.Canceled. Failover treats it as
	// transient (conservative — retry then fail over), EXCEPT
	// context.Canceled, which the caller special-cases as an abort.
	ErrUnknown ErrKind = iota
	// ErrTransient is a temporary failure (HTTP 408/409/429, server errors
	// such as 500/502/503/504, or an exceeded context deadline): retry, then
	// bench-and-fail-over if retries are exhausted.
	ErrTransient
	// ErrAuthDead is an auth failure or model-not-found (401/403/404): the
	// model is unusable; bench immediately and fail over.
	ErrAuthDead
	// ErrRequestSpecific is the caller's fault for THIS request (400/413/422,
	// unsupported feature): fail over to try a more capable model, but do NOT
	// bench — the model itself is healthy.
	ErrRequestSpecific
)

// classifyStatus maps an HTTP status code to an ErrKind.
//
// Why: openai-go and anthropic RequestError both expose a numeric StatusCode;
// centralizing the mapping keeps the per-SDK branches thin and consistent.
// What: 408/409/429 and every 5xx (500-599) → transient, 401/403/404 →
// auth-dead, 400/413/422 → request-specific, anything else → unknown. The
// whole 5xx range is covered rather than a hand-picked list so that less
// common server-side statuses (e.g. 529 "overloaded", 52x from proxies) are
// not reported as unknown.
// Test: covered indirectly via Classify table tests for each SDK.
func classifyStatus(code int) ErrKind {
	// Any server-side failure is worth a retry: the request itself was
	// acceptable, the upstream just could not serve it right now.
	if code >= 500 && code <= 599 {
		return ErrTransient
	}
	switch code {
	case 408, 409, 429:
		return ErrTransient
	case 401, 403, 404:
		return ErrAuthDead
	case 400, 413, 422:
		return ErrRequestSpecific
	default:
		return ErrUnknown
	}
}

// Classify inspects a provider error and returns its ErrKind.
//
// Why: the failover composite needs typed, status-code-aware classification to
// retry/bench/skip correctly across the anthropic, openai-compat, and ollama
// providers, each of which surfaces errors differently.
// What: checks, in order: nil and context.Canceled (ErrUnknown),
// context.DeadlineExceeded (ErrTransient), the openaicompat
// FeatureUnsupportedError (ErrRequestSpecific), anthropic's typed Is*Err
// helpers, numeric status codes (anthropic RequestError, openai-go Error), and
// finally an ollama HTTP-string fallback. An anthropic APIError whose type is
// not recognized falls back to the status code of an anthropic RequestError
// in the same error chain, if there is one. Unrecognized errors are
// ErrUnknown.
// Test: classify_test.go faked SDK errors exercise every branch.
func Classify(err error) ErrKind {
	if err == nil {
		return ErrUnknown
	}

	// context.Canceled is reported as ErrUnknown here; the failover algorithm
	// special-cases it as an abort before consulting the kind.
	if errors.Is(err, context.Canceled) {
		return ErrUnknown
	}
	if errors.Is(err, context.DeadlineExceeded) {
		return ErrTransient
	}

	// FeatureUnsupportedError is a permanent, request-shaped failure.
	var featErr *openaicompat.FeatureUnsupportedError
	if errors.As(err, &featErr) {
		return ErrRequestSpecific
	}

	// Anthropic APIError: prefer the typed helpers (no StatusCode available
	// on the APIError itself).
	var apiErr *anth.APIError
	if errors.As(err, &apiErr) {
		switch {
		case apiErr.IsRateLimitErr(), apiErr.IsOverloadedErr(), apiErr.IsApiErr():
			return ErrTransient
		case apiErr.IsAuthenticationErr(), apiErr.IsPermissionErr(), apiErr.IsNotFoundErr():
			return ErrAuthDead
		case apiErr.IsTooLargeErr(), apiErr.IsInvalidRequestErr():
			return ErrRequestSpecific
		default:
			// Unrecognized error type: rather than giving up, use the HTTP
			// status when a RequestError in the same chain carries one.
			var wrapping *anth.RequestError
			if errors.As(err, &wrapping) {
				return classifyStatus(wrapping.StatusCode)
			}
			return ErrUnknown
		}
	}

	// Anthropic RequestError: status-code based.
	var anthReqErr *anth.RequestError
	if errors.As(err, &anthReqErr) {
		return classifyStatus(anthReqErr.StatusCode)
	}

	// openai-go (openai/deepseek/moonshot/xai/groq): status-code based.
	var oaiErr *openai.Error
	if errors.As(err, &oaiErr) {
		return classifyStatus(oaiErr.StatusCode)
	}

	// Ollama: no typed status — fall back to its "ollama: HTTP <code>:" string.
	// The helper already yields ErrUnknown when no status can be parsed.
	return classifyOllamaString(err.Error())
}

// classifyOllamaString classifies an error message by the HTTP status embedded
// in ollama's error string format ("ollama: HTTP <code>: ...").
//
// Why: the ollama provider stringifies errors without a typed status code, so
// the message text is the only place the status can be recovered from.
// What: takes the text after the first "HTTP " in msg, reads at most three
// leading decimal digits as the status, and maps it through classifyStatus;
// returns ErrUnknown when the marker is absent or no digit follows it.
// Test: classify_test.go ollama cases cover 5xx/429/401/404/400.
func classifyOllamaString(msg string) ErrKind {
	_, tail, found := strings.Cut(msg, "HTTP ")
	if !found {
		return ErrUnknown
	}

	// Accumulate the status while scanning; stop at the first non-digit or
	// once three digits have been consumed.
	status, digits := 0, 0
	for digits < len(tail) && digits < 3 {
		c := tail[digits]
		if c < '0' || c > '9' {
			break
		}
		status = status*10 + int(c-'0')
		digits++
	}
	if digits == 0 {
		return ErrUnknown
	}
	return classifyStatus(status)
}

// extractStatus makes a best-effort attempt to recover an HTTP status code
// from a provider error for structured logging; 0 means "none available".
//
// Why: log lines benefit from the numeric status even though classification
// may use typed helpers; this keeps that detail out of the hot path.
// What: returns the StatusCode of an anthropic RequestError or an openai-go
// Error found in the chain (checked in that order); otherwise parses up to
// three digits following the first "HTTP " in the error message; returns 0 for
// a nil error or when nothing can be parsed.
// Test: covered indirectly via failover log assertions / manual inspection.
func extractStatus(err error) int {
	if err == nil {
		return 0
	}

	var fromAnthropic *anth.RequestError
	if errors.As(err, &fromAnthropic) {
		return fromAnthropic.StatusCode
	}

	var fromOpenAI *openai.Error
	if errors.As(err, &fromOpenAI) {
		return fromOpenAI.StatusCode
	}

	// No typed status: fall back to the "HTTP <code>" text form.
	_, tail, found := strings.Cut(err.Error(), "HTTP ")
	if !found {
		return 0
	}
	// With no leading digit the accumulator stays 0, which is exactly the
	// "no status" result.
	status := 0
	for i := 0; i < len(tail) && i < 3; i++ {
		c := tail[i]
		if c < '0' || c > '9' {
			break
		}
		status = status*10 + int(c-'0')
	}
	return status
}

// IsTransient reports whether an error should be retried/failed-over rather
// than treated as a hard, model-specific failure.
//
// Why: callers (and failover) want a one-call "is this worth retrying?" check
// that is conservative about unknown errors.
// What: returns true for ErrTransient and ErrUnknown (conservative), false for
// ErrAuthDead and ErrRequestSpecific. Two inputs that Classify reports as
// ErrUnknown are deliberately NOT considered transient: a nil error (nothing
// failed, so there is nothing to retry) and context.Canceled (the caller
// aborted; retrying would work against that).
// Test: TestIsTransient asserts 503→true, unknown→true, 401→false, 400→false.
func IsTransient(err error) bool {
	// Classify folds both of these into ErrUnknown, so they must be filtered
	// out before the conservative "unknown means retry" rule applies.
	if err == nil || errors.Is(err, context.Canceled) {
		return false
	}
	switch Classify(err) {
	case ErrTransient, ErrUnknown:
		return true
	default:
		return false
	}
}