package llm

import (
	"context"
	"errors"
	"fmt"
	"net"
	"net/http"
	"strings"
	"syscall"
)

// ErrorClass buckets errors for retry/failover decisions.
type ErrorClass int

const (
	// ClassTransient errors may succeed on retry or on another target:
	// rate limits, server errors, timeouts, connection failures.
	ClassTransient ErrorClass = iota
	// ClassPermanent errors will not improve on retry of the same request:
	// malformed requests, auth failures, model-not-found.
	ClassPermanent
)

// ErrModelNotFound marks a permanent "this target does not know this model"
// condition. Chains advance past it without penalizing the target's health.
var ErrModelNotFound = errors.New("model not found")

// APIError is a structured provider error carrying enough context to
// classify it and to debug it.
type APIError struct {
	// Provider and Model identify the target that failed.
	Provider string
	Model    string
	// Status is the HTTP status code, or 0 when the failure was not an HTTP
	// response (connection error, decode error, ...).
	Status int
	// Code is the provider-specific error code, when one was supplied.
	Code string
	// Message is the provider's human-readable error message.
	Message string
	// Err is the wrapped underlying cause, if any.
	Err error
}

// Error renders the error as "provider/model", followed by whichever of the
// optional parts are set, in this order: ": HTTP <status>", " [<code>]",
// ": <message>", ": <cause>".
func (e *APIError) Error() string {
	var b strings.Builder
	fmt.Fprintf(&b, "%s/%s", e.Provider, e.Model)
	if e.Status != 0 {
		fmt.Fprintf(&b, ": HTTP %d", e.Status)
	}
	if e.Code != "" {
		fmt.Fprintf(&b, " [%s]", e.Code)
	}
	if e.Message != "" {
		fmt.Fprintf(&b, ": %s", e.Message)
	}
	if e.Err != nil {
		fmt.Fprintf(&b, ": %v", e.Err)
	}
	return b.String()
}

// Unwrap returns the wrapped cause when there is one. With no cause, an
// HTTP 404 unwraps to ErrModelNotFound; any other status unwraps to nil.
func (e *APIError) Unwrap() error {
	if e.Err != nil {
		return e.Err
	}
	if e.Status == http.StatusNotFound {
		return ErrModelNotFound
	}
	return nil
}

// Is reports whether e should be treated as target by errors.Is. Every
// HTTP 404 matches ErrModelNotFound, including one that also carries a
// wrapped cause: Unwrap alone would surface only the cause in that case and
// hide the sentinel from callers.
func (e *APIError) Is(target error) bool {
	return target == ErrModelNotFound && e.Status == http.StatusNotFound
}

// Classify buckets an error as transient or permanent.
//
// The default policy (overridable via health configuration):
//   - context.Canceled is permanent — the caller gave up; retrying defies
//     their intent. context.DeadlineExceeded is transient.
//   - Network timeouts, refused/reset connections, and DNS failures are
//     transient ("high demand" conditions).
//   - HTTP 400/401/403/404/405/422 (and ErrModelNotFound) are permanent;
//     408/429 and all 5xx are transient.
//   - Anything unrecognized is transient: when in doubt, failing over to the
//     next target in a chain can only help availability.
//
// A nil error is reported as ClassTransient, the zero value of ErrorClass.
func Classify(err error) ErrorClass {
	if err == nil {
		return ClassTransient
	}

	// Sentinel and errno checks run before the HTTP status inspection, so
	// the order of these cases is significant.
	switch {
	case errors.Is(err, context.Canceled):
		return ClassPermanent
	case errors.Is(err, context.DeadlineExceeded):
		return ClassTransient
	case errors.Is(err, ErrModelNotFound):
		return ClassPermanent
	case errors.Is(err, syscall.ECONNREFUSED), errors.Is(err, syscall.ECONNRESET):
		return ClassTransient
	}

	// Any net.Error in the chain counts as a transient network condition.
	var netErr net.Error
	if errors.As(err, &netErr) {
		return ClassTransient
	}

	var apiErr *APIError
	if errors.As(err, &apiErr) && apiErr.Status != 0 {
		switch {
		case apiErr.Status == http.StatusRequestTimeout, // 408
			apiErr.Status == http.StatusTooManyRequests, // 429
			apiErr.Status >= 500:
			return ClassTransient
		case apiErr.Status >= 400:
			return ClassPermanent
		}
	}

	// Unrecognized errors, and statuses below 400, default to transient.
	return ClassTransient
}