feat(ollama): add automatic retry with exponential backoff for transient HTTP errors
CI / Build, Test & Lint (push) Successful in 10m50s
CI / Build, Test & Lint (push) Successful in 10m50s
Ollama Cloud returns HTTP 503 when the model is temporarily overloaded, 429 on rate limit, and 502 on upstream failures. These are transient conditions that resolve on retry. Previously they bubbled up as hard errors, forcing callers to implement their own retry logic. The retry is implemented at the HTTP transport level in doChatRequest, so both Complete and Stream benefit transparently. Strategy: up to 3 retries with exponential backoff (1s, 2s, 4s); a parseable Retry-After header overrides the backoff delay on any retried status (typically 429); context cancellation is checked between retries. Non-transient errors (400, 401, 403, 404, 500) are never retried. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+93
-11
@@ -12,8 +12,11 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
|
||||
)
|
||||
@@ -25,6 +28,22 @@ const DefaultLocalBaseURL = "http://localhost:11434"
|
||||
// DefaultCloudBaseURL is the default base URL for Ollama Cloud.
|
||||
const DefaultCloudBaseURL = "https://ollama.com"
|
||||
|
||||
const (
	// retryMaxAttempts caps how many times a request is re-sent after a
	// transient HTTP status (502, 503, 429). Counting the initial try, a
	// request is sent at most retryMaxAttempts+1 times.
	retryMaxAttempts = 3

	// retryBaseDelay is the starting point for exponential backoff: the wait
	// before retry k (counting from 0) is retryBaseDelay * 2^k, which gives
	// 1s, 2s, 4s.
	retryBaseDelay = 1 * time.Second
)
|
||||
|
||||
// isTransientHTTPStatus classifies an HTTP status code for retry purposes.
// It returns true for 429 (Too Many Requests), 502 (Bad Gateway) and 503
// (Service Unavailable), and false for every other code.
func isTransientHTTPStatus(code int) bool {
	switch code {
	case http.StatusTooManyRequests, // 429
		http.StatusBadGateway, // 502
		http.StatusServiceUnavailable: // 503
		return true
	default:
		return false
	}
}
|
||||
|
||||
// Provider implements provider.Provider over Ollama's native /api/chat
|
||||
// endpoint. An empty apiKey means local-mode (no Authorization header sent);
|
||||
// a non-empty apiKey is sent as a Bearer token (cloud-mode).
|
||||
@@ -32,6 +51,10 @@ type Provider struct {
|
||||
apiKey string
|
||||
baseURL string
|
||||
client *http.Client
|
||||
|
||||
// retryBaseDelayOverride, when non-zero, replaces retryBaseDelay for
|
||||
// testing. Production code leaves this at the zero value.
|
||||
retryBaseDelayOverride time.Duration
|
||||
}
|
||||
|
||||
// newNative constructs a native Ollama provider. Callers should use the
|
||||
@@ -420,22 +443,81 @@ func (p *Provider) buildChatRequest(req provider.Request, stream bool) ([]byte,
|
||||
}
|
||||
|
||||
// doChatRequest POSTs the wire body to /api/chat and returns the raw HTTP
|
||||
// response. The caller is responsible for closing the response body.
|
||||
// response. Transient HTTP errors (502, 503, 429) are retried with exponential
|
||||
// backoff up to retryMaxAttempts times. The caller is responsible for closing
|
||||
// the response body.
|
||||
func (p *Provider) doChatRequest(ctx context.Context, body []byte) (*http.Response, error) {
|
||||
url := strings.TrimRight(p.baseURL, "/") + "/api/chat"
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ollama: build request: %w", err)
|
||||
|
||||
for attempt := 0; ; attempt++ {
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ollama: build request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
if p.apiKey != "" {
|
||||
httpReq.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||
}
|
||||
|
||||
resp, err := p.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ollama: HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// On success or non-transient error, return immediately.
|
||||
if !isTransientHTTPStatus(resp.StatusCode) || attempt >= retryMaxAttempts {
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// Transient error — drain and close the body before retrying.
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
|
||||
delay := retryBackoff(attempt, resp.Header, p.retryBaseDelayOverride)
|
||||
slog.Info("ollama: retrying after transient HTTP error",
|
||||
"status", resp.StatusCode,
|
||||
"attempt", attempt+1,
|
||||
"max_attempts", retryMaxAttempts,
|
||||
"delay", delay,
|
||||
"body", truncateBody(respBody, 200),
|
||||
)
|
||||
|
||||
// Wait for backoff or context cancellation.
|
||||
timer := time.NewTimer(delay)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
timer.Stop()
|
||||
return nil, ctx.Err()
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
if p.apiKey != "" {
|
||||
httpReq.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||
}
|
||||
|
||||
// retryBackoff computes the delay before the next retry attempt. It uses
|
||||
// exponential backoff (base * 2^attempt), but respects the Retry-After header
|
||||
// when present (for 429 responses). baseOverride, when non-zero, replaces the
|
||||
// package-level retryBaseDelay constant (used by tests to avoid real waits).
|
||||
func retryBackoff(attempt int, header http.Header, baseOverride time.Duration) time.Duration {
|
||||
// Check Retry-After header (seconds value or HTTP-date; we only parse seconds).
|
||||
if ra := header.Get("Retry-After"); ra != "" {
|
||||
if secs, err := strconv.Atoi(ra); err == nil && secs > 0 {
|
||||
return time.Duration(secs) * time.Second
|
||||
}
|
||||
}
|
||||
resp, err := p.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ollama: HTTP request: %w", err)
|
||||
base := retryBaseDelay
|
||||
if baseOverride > 0 {
|
||||
base = baseOverride
|
||||
}
|
||||
return resp, nil
|
||||
return base * (1 << attempt)
|
||||
}
|
||||
|
||||
// truncateBody renders b as a string for log output of error response bodies.
//
// If b is at most maxLen bytes long it is returned unchanged. Otherwise the
// result is a prefix of b no longer than maxLen bytes, followed by "...". The
// cut point is moved back (by at most three bytes) so it never lands in the
// middle of a multi-byte UTF-8 sequence, which would put invalid UTF-8 into
// the log line. A negative maxLen is treated as zero.
func truncateBody(b []byte, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(b) <= maxLen {
		return string(b)
	}

	// b[cut] is the first byte that would be dropped. If it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx), the rune it belongs to started
	// earlier, so back up to that rune's first byte. A rune has at most three
	// continuation bytes, which bounds the walk even for non-UTF-8 input.
	cut := maxLen
	for back := 0; back < 3 && cut > 0 && b[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return string(b[:cut]) + "..."
}
|
||||
|
||||
// convertMessage maps a provider.Message into a native wire message.
|
||||
|
||||
Reference in New Issue
Block a user