Files
steve ae8e194fad feat(failover): model failover chains via comma-separated specs
Parse("a,b,c") now returns one composite *llm.Model that tries each model
in order, retrying transient failures, benching dead models, and failing
over to the next. Comma-free specs are completely unchanged.

- classify.go: Classify(err) ErrKind + IsTransient(err) error classifier
  mapping anthropic (typed Is*Err helpers + RequestError status),
  openai-go (*openai.Error status), openaicompat.FeatureUnsupportedError,
  context errors, and ollama "HTTP <code>" strings to
  transient/auth-dead/request-specific/unknown.
- failover.go: failoverProvider (satisfies provider.Provider) wrapped into a
  *Model via NewClient. Process-wide mutex-guarded modelHealth bench
  registry keyed by concrete spec, with cooldowns and a control API
  (ListBenched/BenchModel/UnbenchModel/IsBenched). NewFailoverModel +
  ParseChain constructors, FailoverOption config, FailoverObserver (carries
  the full request), and configurable package-level defaults.
- parse.go: comma-aware Parse splits into a failover chain; alias/resolver
  targets that expand to comma chains are routed through the comma-aware
  path and flattened.

All access to global health is mutex-guarded; tests reset it via
resetHealthForTest and pass under go test -race.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-01 00:30:08 +02:00

208 lines
6.7 KiB
Go

package llm
import (
"context"
"errors"
"strings"
anth "github.com/liushuangls/go-anthropic/v2"
"github.com/openai/openai-go"
"gitea.stevedudenhoeffer.com/steve/go-llm/v2/openaicompat"
)
// ErrKind is the failover-relevant category assigned to a provider error.
//
// Why: the failover loop has to react differently depending on whether an
// error is worth retrying on the same model, means the model is unusable, or
// was caused by the request itself. Treating them all alike would either keep
// hammering a dead model or bench a healthy one.
// What: a small integer enum with one value per outcome; the zero value is
// ErrUnknown.
// Test: classify_test.go table-tests each kind with faked SDK errors.
type ErrKind int

const (
	// ErrUnknown means no classifier branch recognized the error. Failover
	// handles it conservatively, like a transient failure (retry, then move
	// on). context.Canceled also maps here; callers are expected to treat
	// cancellation as an abort before they look at the kind.
	ErrUnknown ErrKind = iota

	// ErrTransient means a temporary condition such as a 429, a 5xx, or a
	// timeout. Retry the same model; once retries run out, bench it and
	// fail over.
	ErrTransient

	// ErrAuthDead means authentication failed or the model does not exist
	// (401/403/404). The model cannot serve anything, so bench it right away
	// and fail over.
	ErrAuthDead

	// ErrRequestSpecific means THIS request is at fault (400/413/422, or a
	// feature the model does not support). Fail over so a more capable model
	// can try, but do NOT bench: the model itself is healthy.
	ErrRequestSpecific
)
// classifyStatus maps an HTTP status code to an ErrKind.
//
// Why: openai-go and anthropic RequestError both expose a numeric StatusCode;
// centralizing the mapping keeps the per-SDK branches thin and consistent.
// What: 408/409/429 and every 5xx (500-599) → transient, 401/403/404 →
// auth-dead, 400/413/422 → request-specific, anything else → unknown.
// The whole 5xx range is covered, not just 500/502/503/504, so that
// vendor-specific server-side codes (for example 529 "overloaded" or the 52x
// codes emitted by some proxies) are retried like any other server failure.
// Test: covered indirectly via Classify table tests for each SDK.
func classifyStatus(code int) ErrKind {
	switch {
	case code == 408, code == 409, code == 429:
		// Timeout, conflict, and rate limiting are all worth retrying.
		return ErrTransient
	case code >= 500 && code <= 599:
		// Any server-side failure is treated as temporary.
		return ErrTransient
	case code == 401, code == 403, code == 404:
		return ErrAuthDead
	case code == 400, code == 413, code == 422:
		return ErrRequestSpecific
	default:
		return ErrUnknown
	}
}
// Classify maps a provider error onto the ErrKind the failover logic acts on.
//
// Why: the anthropic, openai-compatible, and ollama providers each surface
// failures in a different shape, and the failover composite needs a single
// answer to decide between retrying, benching, and skipping.
// What: checks, in this order: nil and context.Canceled (both ErrUnknown),
// context.DeadlineExceeded (ErrTransient), openaicompat's
// FeatureUnsupportedError (ErrRequestSpecific), anthropic APIError through its
// typed Is*Err helpers, anthropic RequestError and openai-go Error through
// their StatusCode, and finally an "HTTP <code>" substring in the error text.
// Anything that matches none of these is ErrUnknown.
// Test: classify_test.go drives every branch with faked SDK errors.
func Classify(err error) ErrKind {
	// Cancellation is deliberately reported as ErrUnknown: callers are
	// expected to treat it as an abort before they consult the kind.
	if err == nil || errors.Is(err, context.Canceled) {
		return ErrUnknown
	}
	if errors.Is(err, context.DeadlineExceeded) {
		return ErrTransient
	}

	// An unsupported feature is permanent for this request shape, not a sign
	// that the model is unhealthy.
	var unsupported *openaicompat.FeatureUnsupportedError
	if errors.As(err, &unsupported) {
		return ErrRequestSpecific
	}

	// Anthropic APIError carries no status code here, so rely on its typed
	// helpers.
	var anthAPI *anth.APIError
	if errors.As(err, &anthAPI) {
		if anthAPI.IsRateLimitErr() || anthAPI.IsOverloadedErr() || anthAPI.IsApiErr() {
			return ErrTransient
		}
		if anthAPI.IsAuthenticationErr() || anthAPI.IsPermissionErr() || anthAPI.IsNotFoundErr() {
			return ErrAuthDead
		}
		if anthAPI.IsTooLargeErr() || anthAPI.IsInvalidRequestErr() {
			return ErrRequestSpecific
		}
		return ErrUnknown
	}

	// Anthropic RequestError exposes the HTTP status directly.
	var anthReq *anth.RequestError
	if errors.As(err, &anthReq) {
		return classifyStatus(anthReq.StatusCode)
	}

	// openai-go errors (openai/deepseek/moonshot/xai/groq) do the same.
	var openaiErr *openai.Error
	if errors.As(err, &openaiErr) {
		return classifyStatus(openaiErr.StatusCode)
	}

	// Ollama has no typed status; parse its "ollama: HTTP <code>:" text. The
	// helper already yields ErrUnknown when nothing recognizable is found.
	return classifyOllamaString(err.Error())
}
// classifyOllamaString classifies an error message by the HTTP status embedded
// in ollama's "ollama: HTTP <code>: ..." format.
//
// Why: the ollama provider stringifies its errors without a typed status
// code, so the message text is the only thing left to classify on.
// What: takes the text after the first "HTTP " marker, reads up to three
// leading decimal digits as the status, and maps it with classifyStatus.
// Returns ErrUnknown when the marker is absent or no digit follows it.
// Test: classify_test.go ollama cases cover 5xx/429/401/404/400.
func classifyOllamaString(msg string) ErrKind {
	_, tail, found := strings.Cut(msg, "HTTP ")
	if !found {
		return ErrUnknown
	}

	// Accumulate at most three digits; stop at the first non-digit byte.
	status, digits := 0, 0
	for ; digits < 3 && digits < len(tail); digits++ {
		c := tail[digits]
		if c < '0' || c > '9' {
			break
		}
		status = status*10 + int(c-'0')
	}
	if digits == 0 {
		return ErrUnknown
	}
	return classifyStatus(status)
}
// extractStatus returns the HTTP status code carried by a provider error, or
// 0 when none can be found. It is a best-effort helper for structured logging.
//
// Why: a numeric status is useful in log lines even when classification went
// through typed helpers, and keeping the lookup here keeps it out of the
// classification path.
// What: uses the StatusCode of an anthropic RequestError or an openai-go
// Error when one is in the chain; otherwise reads up to three digits after
// the first "HTTP " marker in the error text; otherwise returns 0.
// Test: covered indirectly via failover log assertions / manual inspection.
func extractStatus(err error) int {
	if err == nil {
		return 0
	}

	var fromAnthropic *anth.RequestError
	if errors.As(err, &fromAnthropic) {
		return fromAnthropic.StatusCode
	}

	var fromOpenAI *openai.Error
	if errors.As(err, &fromOpenAI) {
		return fromOpenAI.StatusCode
	}

	// Fall back to the "HTTP <code>" text used by stringified errors.
	_, tail, found := strings.Cut(err.Error(), "HTTP ")
	if !found {
		return 0
	}
	// With no leading digit the loop never accumulates, so status stays 0,
	// which is exactly the "no status available" result.
	status := 0
	for i := 0; i < 3 && i < len(tail); i++ {
		c := tail[i]
		if c < '0' || c > '9' {
			break
		}
		status = status*10 + int(c-'0')
	}
	return status
}
// IsTransient reports whether an error should be retried/failed-over rather
// than treated as a hard, model-specific failure.
//
// Why: callers (and failover) want a one-call "is this worth retrying?" check
// that is conservative about unknown errors.
// What: returns false for a nil error (nothing failed, so there is nothing to
// retry), true for ErrTransient and ErrUnknown (conservative), and false for
// ErrAuthDead and ErrRequestSpecific.
// Test: TestIsTransient asserts 503→true, unknown→true, 401→false, 400→false.
func IsTransient(err error) bool {
	// Classify reports nil as ErrUnknown, which would otherwise make a
	// successful call look retryable; rule it out explicitly.
	if err == nil {
		return false
	}
	switch Classify(err) {
	case ErrTransient, ErrUnknown:
		return true
	default:
		return false
	}
}