feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real
SQLite-backed job queue and single worker loop. Every /api/chat request
now creates a job row, blocks until the worker completes it or the
request is cancelled, and transparently returns the completed job's
result to the caller as a single application/json body.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt,
  ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync
  handler completion signaling; retry on ConnectionError, terminal fail
  on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate
  removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
+105 -82
View File
@@ -3,54 +3,72 @@
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
// centralizing routing and middleware here keeps cmd/foreman thin.
// What: creates a stdlib net/http server with health checks, optional bearer-token
// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
// artifact serving.
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
// verify 401 without it; test Ollama passthrough routes.
// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
package server
import (
"bufio"
"crypto/rand"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"strings"
"time"
"github.com/oklog/ulid/v2"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
)
// scannerBufSize is the buffer size, in bytes, for the NDJSON scanner (4 MiB).
const scannerBufSize = 4 * 1024 * 1024
// Server holds the HTTP server and its dependencies.
type Server struct {
cfg config.Config
store *store.Store
client ollama.Client
inventory *ollama.ModelInventory
chatGate chan struct{}
mux *http.ServeMux
logger *slog.Logger
cfg config.Config
store *store.Store
client ollama.Client
inventory *ollama.ModelInventory
notifier *worker.Notifier
workerRef *worker.Worker
dispatcher *webhook.Dispatcher
mux *http.ServeMux
logger *slog.Logger
}
// New creates a new Server with the given configuration, store, Ollama client,
// and model inventory. The mux is populated with all routes.
// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
// with all routes.
//
// Why: dependency injection makes the server testable and extensible.
// What: wires config, store, client, inventory, and logger into the server,
// registers routes, and creates the single-flight chat gate.
// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
// logger into the server, registers all routes.
// Test: create with New, use httptest to exercise routes.
func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
func New(
cfg config.Config,
st *store.Store,
client ollama.Client,
inv *ollama.ModelInventory,
notifier *worker.Notifier,
w *worker.Worker,
dispatcher *webhook.Dispatcher,
logger *slog.Logger,
) *Server {
s := &Server{
cfg: cfg,
store: st,
client: client,
inventory: inv,
chatGate: make(chan struct{}, 1),
mux: http.NewServeMux(),
logger: logger,
cfg: cfg,
store: st,
client: client,
inventory: inv,
notifier: notifier,
workerRef: w,
dispatcher: dispatcher,
mux: http.NewServeMux(),
logger: logger,
}
s.routes()
return s
@@ -83,6 +101,7 @@ func (s *Server) routes() {
s.mux.HandleFunc("POST /api/chat", s.handleChat)
s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
s.registerJobRoutes()
}
// healthResponse is the JSON shape returned by /healthz.
@@ -170,15 +189,16 @@ func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
io.Copy(w, resp.Body)
}
// handleChat is the critical passthrough path for /api/chat. It validates the
// model, serializes through a single-flight gate, and proxies to the target
// with NDJSON streaming support.
// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
// the SQLite queue and blocks until the worker completes it, then returns the
// result as if it came directly from Ollama.
//
// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
// What: validates model, acquires the chat gate, proxies to the target, streams
// NDJSON chunks back if streaming, releases the gate on completion.
// Test: verify model validation (404 on unknown), serialization (two concurrent
// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
// The response blocks until done so the caller gets a transparent Ollama experience.
// What: validates model, creates a job, registers a completion waiter, wakes the
// worker, and blocks until done or context cancellation.
// Test: verify model validation (404 on unknown), serialization (jobs execute one
// at a time), and that the HTTP response matches the Ollama chat response.
func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
body, err := io.ReadAll(r.Body)
if err != nil {
@@ -186,10 +206,9 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
return
}
// Parse just enough to validate the model and detect streaming.
// Parse just enough to validate the model.
var partial struct {
Model string `json:"model"`
Stream *bool `json:"stream"`
Model string `json:"model"`
}
if err := json.Unmarshal(body, &partial); err != nil {
http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
@@ -211,64 +230,68 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
}
}
// Determine if streaming. Ollama defaults to streaming when "stream" is absent.
streaming := true
if partial.Stream != nil && !*partial.Stream {
streaming = false
// Generate a job ID and enqueue.
jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
maxAttempts := s.cfg.MaxAttempts
if maxAttempts == 0 {
maxAttempts = 3
}
// Acquire the single-flight chat gate. This serializes all chat requests
// through one at a time. Phase 3 replaces this with the full SQLite queue +
// worker loop.
select {
case s.chatGate <- struct{}{}:
// Acquired.
case <-r.Context().Done():
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
job := store.Job{
ID: jobID,
Model: partial.Model,
Payload: json.RawMessage(body),
MaxAttempts: maxAttempts,
}
if _, err := s.store.CreateJob(job); err != nil {
s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
return
}
defer func() { <-s.chatGate }()
// Proxy to the target.
resp, err := s.client.RawChat(r.Context(), body)
if err != nil {
s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
if httpErr, ok := err.(*ollama.HTTPError); ok {
http.Error(w, httpErr.Body, httpErr.StatusCode)
// Register a completion waiter before waking the worker.
waitCh := s.notifier.Register(jobID)
// Wake the worker.
if s.workerRef != nil {
s.workerRef.Wake()
}
// Block until the job completes or the request is cancelled.
select {
case <-waitCh:
// Job completed — get the result.
state, result, errMsg, ok := s.notifier.Result(jobID)
if !ok {
// Should not happen, but fall back to DB.
j, err := s.store.GetJob(jobID)
if err != nil {
http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
return
}
state = j.State
result = j.Result
errMsg = j.Error
}
if state == store.JobStateFailed {
msg := "job failed"
if errMsg != nil {
msg = *errMsg
}
http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
return
}
http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
return
}
defer resp.Body.Close()
if streaming {
w.Header().Set("Content-Type", "application/x-ndjson")
w.WriteHeader(http.StatusOK)
flusher, canFlush := w.(http.Flusher)
scanner := bufio.NewScanner(resp.Body)
scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)
for scanner.Scan() {
line := scanner.Bytes()
if len(line) == 0 {
continue
}
w.Write(line)
w.Write([]byte("\n"))
if canFlush {
flusher.Flush()
}
}
if err := scanner.Err(); err != nil {
s.logger.Warn("stream read error", "error", err, "model", partial.Model)
}
} else {
// Non-streaming: proxy the complete JSON response.
// Return the result as a direct Ollama response.
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
io.Copy(w, resp.Body)
w.Write(result)
case <-r.Context().Done():
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
}
}