feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real
SQLite-backed job queue and a single worker loop. Every /api/chat request
now creates a job row, blocks until the worker completes it, and returns
the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt,
  ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with a Notifier that signals
  job completion to the synchronous handler; retry on ConnectionError,
  terminal fail on HTTPError; crash recovery resets interrupted jobs on
  startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate
  removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
+23
View File
@@ -11,6 +11,7 @@ package config
import (
"fmt"
"os"
"strconv"
"time"
)
@@ -39,6 +40,14 @@ type Config struct {
// WebhookSecret is an optional HMAC key for signing webhook payloads.
WebhookSecret string
// MaxAttempts is the maximum number of retry attempts for a job before it is
// marked as failed (default 3).
MaxAttempts int
// JobTTL is how long terminal jobs are retained before the pruner deletes them
// (default 24h).
JobTTL time.Duration
}
// Load reads configuration from environment variables and returns a validated Config.
@@ -64,6 +73,20 @@ func Load() (Config, error) {
}
cfg.PollInterval = dur
maxAttemptsStr := envOr("FOREMAN_MAX_ATTEMPTS", "3")
maxAttempts, err := strconv.Atoi(maxAttemptsStr)
if err != nil {
return Config{}, fmt.Errorf("invalid FOREMAN_MAX_ATTEMPTS %q: %w", maxAttemptsStr, err)
}
cfg.MaxAttempts = maxAttempts
jobTTLStr := envOr("FOREMAN_JOB_TTL", "24h")
jobTTL, err := time.ParseDuration(jobTTLStr)
if err != nil {
return Config{}, fmt.Errorf("invalid FOREMAN_JOB_TTL %q: %w", jobTTLStr, err)
}
cfg.JobTTL = jobTTL
if cfg.OllamaURL == "" {
return Config{}, fmt.Errorf("FOREMAN_OLLAMA_URL is required")
}