package llm

import (
	"context"
	"errors"
	"strings"

	anth "github.com/liushuangls/go-anthropic/v2"
	"github.com/openai/openai-go"

	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/openaicompat"
)

// ErrKind classifies a provider error for failover decision-making.
//
// Why: failover must decide, per error, whether to retry the same model
// (transient), bench it as broken (auth/model dead), or fail over without
// benching (this request's fault). Without a classifier every error looks
// the same and we'd either thrash a dead model or bench a healthy one.
// What: an enum of the four outcomes the failover algorithm distinguishes.
// The zero value is ErrUnknown.
// Test: see classify_test.go — every branch is table-tested with faked SDK errors.
type ErrKind int

const (
	// ErrUnknown is an unrecognized error. Failover treats it as transient
	// (conservative — retry then fail over), EXCEPT context.Canceled which
	// the caller special-cases as an abort.
	ErrUnknown ErrKind = iota
	// ErrTransient is a temporary failure (429/5xx/timeout): retry, then
	// bench-and-fail-over if retries are exhausted.
	ErrTransient
	// ErrAuthDead is an auth failure or model-not-found (401/403/404): the
	// model is unusable; bench immediately and fail over.
	ErrAuthDead
	// ErrRequestSpecific is the caller's fault for THIS request (400/413/422,
	// unsupported feature): fail over to try a more capable model, but do NOT
	// bench — the model itself is healthy.
	ErrRequestSpecific
)

// classifyStatus maps an HTTP status code to an ErrKind.
//
// Why: openai-go and anthropic RequestError both expose a numeric StatusCode;
// centralizing the mapping keeps the per-SDK branches thin and consistent.
// What: 408/409/429/5xx → transient, 401/403/404 → auth-dead, 400/413/422 →
// request-specific, anything else → unknown.
// Test: covered indirectly via Classify table tests for each SDK.
func classifyStatus(code int) ErrKind { switch code { case 408, 409, 429, 500, 502, 503, 504: return ErrTransient case 401, 403, 404: return ErrAuthDead case 400, 413, 422: return ErrRequestSpecific default: return ErrUnknown } } // Classify inspects a provider error and returns its ErrKind. // // Why: the failover composite needs typed, status-code-aware classification to // retry/bench/skip correctly across the anthropic, openai-compat, and ollama // providers, each of which surfaces errors differently. // What: prefers anthropic's typed Is*Err helpers, falls back to numeric status // codes (openai-go, anthropic RequestError), then the openaicompat // FeatureUnsupportedError, context errors, and finally an ollama HTTP-string // fallback; unrecognized errors are ErrUnknown. // Test: classify_test.go faked SDK errors exercise every branch. func Classify(err error) ErrKind { if err == nil { return ErrUnknown } // context.Canceled is reported as ErrUnknown here; the failover algorithm // special-cases it as an abort before consulting the kind. if errors.Is(err, context.Canceled) { return ErrUnknown } if errors.Is(err, context.DeadlineExceeded) { return ErrTransient } // FeatureUnsupportedError is a permanent, request-shaped failure. var featErr *openaicompat.FeatureUnsupportedError if errors.As(err, &featErr) { return ErrRequestSpecific } // Anthropic APIError: prefer the typed helpers (no StatusCode available). var apiErr *anth.APIError if errors.As(err, &apiErr) { switch { case apiErr.IsRateLimitErr(), apiErr.IsOverloadedErr(), apiErr.IsApiErr(): return ErrTransient case apiErr.IsAuthenticationErr(), apiErr.IsPermissionErr(), apiErr.IsNotFoundErr(): return ErrAuthDead case apiErr.IsTooLargeErr(), apiErr.IsInvalidRequestErr(): return ErrRequestSpecific default: return ErrUnknown } } // Anthropic RequestError: status-code based. 
var anthReqErr *anth.RequestError if errors.As(err, &anthReqErr) { return classifyStatus(anthReqErr.StatusCode) } // openai-go (openai/deepseek/moonshot/xai/groq): status-code based. var oaiErr *openai.Error if errors.As(err, &oaiErr) { return classifyStatus(oaiErr.StatusCode) } // Ollama: no typed status — fall back to its "ollama: HTTP :" string. if k := classifyOllamaString(err.Error()); k != ErrUnknown { return k } return ErrUnknown } // classifyOllamaString extracts an HTTP status from ollama's error string // format ("ollama: HTTP : ...") and classifies it. // // Why: the ollama provider stringifies errors without a typed status code, so // failover can only classify by parsing the message. // What: looks for "HTTP " in the message and maps the code; returns // ErrUnknown when no recognizable status is present. // Test: classify_test.go ollama cases cover 5xx/429/401/404/400. func classifyOllamaString(msg string) ErrKind { const marker = "HTTP " idx := strings.Index(msg, marker) if idx < 0 { return ErrUnknown } rest := msg[idx+len(marker):] // Read up to 3 leading digits. end := 0 for end < len(rest) && end < 3 && rest[end] >= '0' && rest[end] <= '9' { end++ } if end == 0 { return ErrUnknown } code := 0 for i := 0; i < end; i++ { code = code*10 + int(rest[i]-'0') } return classifyStatus(code) } // extractStatus best-effort pulls an HTTP status code out of a provider error // for structured logging. Returns 0 when none is available. // // Why: log lines benefit from the numeric status even though classification // may use typed helpers; this keeps that detail out of the hot path. // What: checks anthropic RequestError and openai-go Error StatusCode fields, // then parses ollama's "HTTP " string; returns 0 otherwise. // Test: covered indirectly via failover log assertions / manual inspection. 
func extractStatus(err error) int { if err == nil { return 0 } var anthReqErr *anth.RequestError if errors.As(err, &anthReqErr) { return anthReqErr.StatusCode } var oaiErr *openai.Error if errors.As(err, &oaiErr) { return oaiErr.StatusCode } const marker = "HTTP " msg := err.Error() if idx := strings.Index(msg, marker); idx >= 0 { rest := msg[idx+len(marker):] code, n := 0, 0 for n < len(rest) && n < 3 && rest[n] >= '0' && rest[n] <= '9' { code = code*10 + int(rest[n]-'0') n++ } if n > 0 { return code } } return 0 } // IsTransient reports whether an error should be retried/failed-over rather // than treated as a hard, model-specific failure. // // Why: callers (and failover) want a one-call "is this worth retrying?" check // that is conservative about unknown errors. // What: returns true for ErrTransient and ErrUnknown (conservative), false for // ErrAuthDead and ErrRequestSpecific. // Test: TestIsTransient asserts 503→true, unknown→true, 401→false, 400→false. func IsTransient(err error) bool { switch Classify(err) { case ErrTransient, ErrUnknown: return true default: return false } }