feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real
SQLite-backed job queue and single worker loop. Every /api/chat request
now creates a job row, blocks until the worker completes it, and returns
the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt,
  ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync
  handler completion signaling; retry on ConnectionError, terminal fail
  on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate
  removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
+105 -82
View File
@@ -3,54 +3,72 @@
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
// centralizing routing and middleware here keeps cmd/foreman thin.
// What: creates a stdlib net/http server with health checks, optional bearer-token
// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
// artifact serving.
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
// verify 401 without it; test Ollama passthrough routes.
// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
package server
import (
"bufio"
"crypto/rand"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"strings"
"time"
"github.com/oklog/ulid/v2"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
)
// scannerBufSize is the buffer size for the NDJSON scanner (4 MB).
const scannerBufSize = 4 * 1024 * 1024
// Server holds the HTTP server and its dependencies.
type Server struct {
cfg config.Config
store *store.Store
client ollama.Client
inventory *ollama.ModelInventory
chatGate chan struct{}
mux *http.ServeMux
logger *slog.Logger
cfg config.Config
store *store.Store
client ollama.Client
inventory *ollama.ModelInventory
notifier *worker.Notifier
workerRef *worker.Worker
dispatcher *webhook.Dispatcher
mux *http.ServeMux
logger *slog.Logger
}
// New creates a new Server with the given configuration, store, Ollama client,
// and model inventory. The mux is populated with all routes.
// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
// with all routes.
//
// Why: dependency injection makes the server testable and extensible.
// What: wires config, store, client, inventory, and logger into the server,
// registers routes, and creates the single-flight chat gate.
// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
// logger into the server, registers all routes.
// Test: create with New, use httptest to exercise routes.
func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
func New(
cfg config.Config,
st *store.Store,
client ollama.Client,
inv *ollama.ModelInventory,
notifier *worker.Notifier,
w *worker.Worker,
dispatcher *webhook.Dispatcher,
logger *slog.Logger,
) *Server {
s := &Server{
cfg: cfg,
store: st,
client: client,
inventory: inv,
chatGate: make(chan struct{}, 1),
mux: http.NewServeMux(),
logger: logger,
cfg: cfg,
store: st,
client: client,
inventory: inv,
notifier: notifier,
workerRef: w,
dispatcher: dispatcher,
mux: http.NewServeMux(),
logger: logger,
}
s.routes()
return s
@@ -83,6 +101,7 @@ func (s *Server) routes() {
s.mux.HandleFunc("POST /api/chat", s.handleChat)
s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
s.registerJobRoutes()
}
// healthResponse is the JSON shape returned by /healthz.
@@ -170,15 +189,16 @@ func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
io.Copy(w, resp.Body)
}
// handleChat is the critical passthrough path for /api/chat. It validates the
// model, serializes through a single-flight gate, and proxies to the target
// with NDJSON streaming support.
// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
// the SQLite queue and blocks until the worker completes it, then returns the
// result as if it came directly from Ollama.
//
// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
// What: validates model, acquires the chat gate, proxies to the target, streams
// NDJSON chunks back if streaming, releases the gate on completion.
// Test: verify model validation (404 on unknown), serialization (two concurrent
// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
// The response blocks until done so the caller gets a transparent Ollama experience.
// What: validates model, creates a job, registers a completion waiter, wakes the
// worker, and blocks until done or context cancellation.
// Test: verify model validation (404 on unknown), serialization (jobs execute one
// at a time), and that the HTTP response matches the Ollama chat response.
func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
body, err := io.ReadAll(r.Body)
if err != nil {
@@ -186,10 +206,9 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
return
}
// Parse just enough to validate the model and detect streaming.
// Parse just enough to validate the model.
var partial struct {
Model string `json:"model"`
Stream *bool `json:"stream"`
Model string `json:"model"`
}
if err := json.Unmarshal(body, &partial); err != nil {
http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
@@ -211,64 +230,68 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
}
}
// Determine if streaming. Ollama defaults to streaming when "stream" is absent.
streaming := true
if partial.Stream != nil && !*partial.Stream {
streaming = false
// Generate a job ID and enqueue.
jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
maxAttempts := s.cfg.MaxAttempts
if maxAttempts == 0 {
maxAttempts = 3
}
// Acquire the single-flight chat gate. This serializes all chat requests
// through one at a time. Phase 3 replaces this with the full SQLite queue +
// worker loop.
select {
case s.chatGate <- struct{}{}:
// Acquired.
case <-r.Context().Done():
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
job := store.Job{
ID: jobID,
Model: partial.Model,
Payload: json.RawMessage(body),
MaxAttempts: maxAttempts,
}
if _, err := s.store.CreateJob(job); err != nil {
s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
return
}
defer func() { <-s.chatGate }()
// Proxy to the target.
resp, err := s.client.RawChat(r.Context(), body)
if err != nil {
s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
if httpErr, ok := err.(*ollama.HTTPError); ok {
http.Error(w, httpErr.Body, httpErr.StatusCode)
// Register a completion waiter before waking the worker.
waitCh := s.notifier.Register(jobID)
// Wake the worker.
if s.workerRef != nil {
s.workerRef.Wake()
}
// Block until the job completes or the request is cancelled.
select {
case <-waitCh:
// Job completed — get the result.
state, result, errMsg, ok := s.notifier.Result(jobID)
if !ok {
// Should not happen, but fall back to DB.
j, err := s.store.GetJob(jobID)
if err != nil {
http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
return
}
state = j.State
result = j.Result
errMsg = j.Error
}
if state == store.JobStateFailed {
msg := "job failed"
if errMsg != nil {
msg = *errMsg
}
http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
return
}
http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
return
}
defer resp.Body.Close()
if streaming {
w.Header().Set("Content-Type", "application/x-ndjson")
w.WriteHeader(http.StatusOK)
flusher, canFlush := w.(http.Flusher)
scanner := bufio.NewScanner(resp.Body)
scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)
for scanner.Scan() {
line := scanner.Bytes()
if len(line) == 0 {
continue
}
w.Write(line)
w.Write([]byte("\n"))
if canFlush {
flusher.Flush()
}
}
if err := scanner.Err(); err != nil {
s.logger.Warn("stream read error", "error", err, "model", partial.Model)
}
} else {
// Non-streaming: proxy the complete JSON response.
// Return the result as a direct Ollama response.
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
io.Copy(w, resp.Body)
w.Write(result)
case <-r.Context().Done():
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
}
}
+75 -83
View File
@@ -19,11 +19,13 @@ import (
"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
)
// newTestServer creates a Server backed by a temp-dir SQLite store, a stub client,
// and a pre-populated inventory.
func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Server {
// and a pre-populated inventory. It constructs the worker but does not start it;
// newTestServerWithInventory is the helper that launches the worker loop.
func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
t.Helper()
dbPath := filepath.Join(t.TempDir(), "test.db")
st, err := store.Open(dbPath)
@@ -32,19 +34,30 @@ func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Serve
}
t.Cleanup(func() { st.Close() })
logger := slog.Default()
logger := slog.New(slog.NewJSONHandler(io.Discard, nil))
inv := ollama.NewModelInventory(client, logger)
return New(cfg, st, client, inv, logger)
notifier := worker.NewNotifier()
dispatcher := webhook.NewDispatcher("", logger)
w := worker.New(st, client, inv, notifier, dispatcher, logger)
srv := New(cfg, st, client, inv, notifier, w, dispatcher, logger)
return srv, st
}
// newTestServerWithInventory creates a Server and pre-refreshes the inventory.
func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) *Server {
// Also starts a worker goroutine.
func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
t.Helper()
srv := newTestServer(t, cfg, client)
srv, st := newTestServer(t, cfg, client)
if err := srv.inventory.Refresh(context.Background()); err != nil {
t.Fatalf("inventory.Refresh: %v", err)
}
return srv
// Start the worker loop so chat requests complete.
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
go srv.workerRef.Run(ctx)
return srv, st
}
func TestHealthz_OK(t *testing.T) {
@@ -52,7 +65,7 @@ func TestHealthz_OK(t *testing.T) {
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -81,7 +94,7 @@ func TestHealthz_NoAuthRequired(t *testing.T) {
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
Token: "secret-token",
}, stub)
@@ -100,7 +113,7 @@ func TestAuth_RequiredWhenTokenSet(t *testing.T) {
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
Token: "secret-token",
}, stub)
@@ -159,7 +172,7 @@ func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -182,7 +195,7 @@ func TestTags_ReturnsCachedModels(t *testing.T) {
},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -215,7 +228,7 @@ func TestPs_ReturnsCachedRunningModels(t *testing.T) {
},
},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -245,7 +258,7 @@ func TestChat_UnknownModel404(t *testing.T) {
},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -265,16 +278,17 @@ func TestChat_NonStreaming(t *testing.T) {
Done: true,
Message: &ollama.Message{Role: "assistant", Content: "Hello!"},
}
respBytes, _ := json.Marshal(chatResp)
stub := &stubClient{
tags: &ollama.TagsResponse{
Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
},
ps: &ollama.PsResponse{},
rawChatResp: newRawResponse(200, "application/json", respBytes),
ps: &ollama.PsResponse{},
chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
return &chatResp, nil, nil
},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -284,7 +298,7 @@ func TestChat_NonStreaming(t *testing.T) {
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
t.Fatalf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
}
ct := rec.Header().Get("Content-Type")
@@ -301,60 +315,6 @@ func TestChat_NonStreaming(t *testing.T) {
}
}
// TestChat_Streaming posts a chat request that carries no "stream" field and
// checks that the response is NDJSON, that all three chunks served by the stub
// arrive, and that the final chunk is marked done.
func TestChat_Streaming(t *testing.T) {
// Build NDJSON chunks: two partial assistant messages followed by a terminal
// done=true chunk.
chunks := []ollama.ChatResponse{
{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "Hel"}},
{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "lo"}},
{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
}
// Serialize each chunk as one JSON document per line.
var ndjson bytes.Buffer
for _, c := range chunks {
b, _ := json.Marshal(c)
ndjson.Write(b)
ndjson.WriteByte('\n')
}
// The stub advertises the model in its tag list and answers the raw chat call
// with the canned NDJSON payload.
stub := &stubClient{
tags: &ollama.TagsResponse{
Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
},
ps: &ollama.PsResponse{},
rawChatResp: newRawResponse(200, "application/x-ndjson", ndjson.Bytes()),
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
// The body deliberately omits "stream"; the test expects an NDJSON reply for it.
body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}]}`
req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
}
ct := rec.Header().Get("Content-Type")
if ct != "application/x-ndjson" {
t.Errorf("Content-Type = %q, want %q", ct, "application/x-ndjson")
}
// Verify chunks pass through faithfully.
lines := strings.Split(strings.TrimSpace(rec.Body.String()), "\n")
if len(lines) != 3 {
t.Fatalf("got %d lines, want 3", len(lines))
}
// Only the last line is decoded; it must be the terminal chunk.
var last ollama.ChatResponse
if err := json.Unmarshal([]byte(lines[2]), &last); err != nil {
t.Fatalf("unmarshal last chunk: %v", err)
}
if !last.Done {
t.Error("last chunk should have done=true")
}
}
func TestChat_Serialization(t *testing.T) {
// Track concurrent requests at the stub.
var inflight atomic.Int32
@@ -365,7 +325,7 @@ func TestChat_Serialization(t *testing.T) {
Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
},
ps: &ollama.PsResponse{},
rawChatFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
cur := inflight.Add(1)
defer inflight.Add(-1)
for {
@@ -376,12 +336,11 @@ func TestChat_Serialization(t *testing.T) {
}
// Simulate work.
time.Sleep(50 * time.Millisecond)
resp := ollama.ChatResponse{Model: "qwen3:30b", Done: true}
b, _ := json.Marshal(resp)
return newRawResponse(200, "application/json", b), nil
resp := &ollama.ChatResponse{Model: "qwen3:30b", Done: true, Message: &ollama.Message{Role: "assistant", Content: "ok"}}
return resp, nil, nil
},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -395,14 +354,14 @@ func TestChat_Serialization(t *testing.T) {
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Errorf("status = %d, want %d", rec.Code, http.StatusOK)
t.Errorf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
}
}()
}
wg.Wait()
if got := maxInflight.Load(); got > 1 {
t.Errorf("max concurrent chat requests at target = %d, want 1 (gate should serialize)", got)
t.Errorf("max concurrent chat requests at target = %d, want 1 (worker should serialize)", got)
}
}
@@ -432,7 +391,7 @@ func TestEmbed_ConcurrentBypassesGate(t *testing.T) {
return newRawResponse(200, "application/json", b), nil
},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -471,7 +430,7 @@ func TestEmbed_AlsoWorksOnEmbeddingsPath(t *testing.T) {
return newRawResponse(200, "application/json", respBytes), nil
},
}
srv := newTestServerWithInventory(t, config.Config{
srv, _ := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -490,7 +449,7 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
tagsErr: fmt.Errorf("connection refused"),
ps: &ollama.PsResponse{},
}
srv := newTestServer(t, config.Config{
srv, _ := newTestServer(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
@@ -514,6 +473,35 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
}
}
// TestChat_ContextCancellation verifies that a /api/chat request whose context
// expires while the job is still outstanding is answered with
// 503 Service Unavailable.
//
// The stub's chat hook does not return until the context it was handed is done,
// and the request itself carries a 200ms deadline.
func TestChat_ContextCancellation(t *testing.T) {
	// Chat function that blocks until its context ends, simulating a slow worker.
	stub := &stubClient{
		tags: &ollama.TagsResponse{
			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
		},
		ps: &ollama.PsResponse{},
		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
			<-ctx.Done()
			return nil, nil, ctx.Err()
		},
	}
	srv, _ := newTestServerWithInventory(t, config.Config{
		OllamaURL: "http://localhost:11434",
	}, stub)

	// Short deadline on the request so the test finishes quickly.
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()

	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
	req := httptest.NewRequestWithContext(ctx, http.MethodPost, "/api/chat", strings.NewReader(body))
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, req)

	// Include the response body on failure, matching the other chat tests, so an
	// unexpected status can be diagnosed from the test output alone.
	if rec.Code != http.StatusServiceUnavailable {
		t.Fatalf("status = %d, want %d; body: %s", rec.Code, http.StatusServiceUnavailable, rec.Body.String())
	}
}
// --- Stub client for testing ---
// stubClient implements ollama.Client for testing.
@@ -523,6 +511,7 @@ type stubClient struct {
ps *ollama.PsResponse
psErr error
chatFunc func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error)
rawChatResp *http.Response
rawChatFunc func(ctx context.Context, body []byte) (*http.Response, error)
@@ -531,6 +520,9 @@ type stubClient struct {
}
// Chat forwards the call to the chatFunc hook when a test has installed one.
// With no hook set it returns nil results and an error stating that the stub
// does not implement Chat.
func (s *stubClient) Chat(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
	hook := s.chatFunc
	if hook == nil {
		return nil, nil, fmt.Errorf("stubClient.Chat not implemented")
	}
	return hook(ctx, req, stream)
}