feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real SQLite-backed job queue and single worker loop. Every /api/chat request now creates a job row, blocks until the worker completes it, and returns the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt, ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync handler completion signaling; retry on ConnectionError, terminal fail on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
@@ -3,54 +3,72 @@
 // Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
 // centralizing routing and middleware here keeps cmd/foreman thin.
 // What: creates a stdlib net/http server with health checks, optional bearer-token
-// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
+// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
+// artifact serving.
 // Test: start the server with httptest, hit /healthz, verify 200; set a token,
-// verify 401 without it; test Ollama passthrough routes.
+// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
 package server

 import (
-	"bufio"
+	"crypto/rand"
 	"encoding/json"
+	"fmt"
 	"io"
 	"log/slog"
 	"net/http"
 	"strings"
+	"time"
+
+	"github.com/oklog/ulid/v2"

 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
 )

-// scannerBufSize is the buffer size for the NDJSON scanner (4 MB).
-const scannerBufSize = 4 * 1024 * 1024
-
 // Server holds the HTTP server and its dependencies.
 type Server struct {
-	cfg       config.Config
-	store     *store.Store
-	client    ollama.Client
-	inventory *ollama.ModelInventory
-	chatGate  chan struct{}
-	mux       *http.ServeMux
-	logger    *slog.Logger
+	cfg        config.Config
+	store      *store.Store
+	client     ollama.Client
+	inventory  *ollama.ModelInventory
+	notifier   *worker.Notifier
+	workerRef  *worker.Worker
+	dispatcher *webhook.Dispatcher
+	mux        *http.ServeMux
+	logger     *slog.Logger
 }

 // New creates a new Server with the given configuration, store, Ollama client,
-// and model inventory. The mux is populated with all routes.
+// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
+// with all routes.
 //
 // Why: dependency injection makes the server testable and extensible.
-// What: wires config, store, client, inventory, and logger into the server,
-// registers routes, and creates the single-flight chat gate.
+// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
+// logger into the server, registers all routes.
 // Test: create with New, use httptest to exercise routes.
-func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
+func New(
+	cfg config.Config,
+	st *store.Store,
+	client ollama.Client,
+	inv *ollama.ModelInventory,
+	notifier *worker.Notifier,
+	w *worker.Worker,
+	dispatcher *webhook.Dispatcher,
+	logger *slog.Logger,
+) *Server {
 	s := &Server{
-		cfg:       cfg,
-		store:     st,
-		client:    client,
-		inventory: inv,
-		chatGate:  make(chan struct{}, 1),
-		mux:       http.NewServeMux(),
-		logger:    logger,
+		cfg:        cfg,
+		store:      st,
+		client:     client,
+		inventory:  inv,
+		notifier:   notifier,
+		workerRef:  w,
+		dispatcher: dispatcher,
+		mux:        http.NewServeMux(),
+		logger:     logger,
 	}
 	s.routes()
 	return s
@@ -83,6 +101,7 @@ func (s *Server) routes() {
 	s.mux.HandleFunc("POST /api/chat", s.handleChat)
 	s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
 	s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
+	s.registerJobRoutes()
 }

 // healthResponse is the JSON shape returned by /healthz.
@@ -170,15 +189,16 @@ func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
 	io.Copy(w, resp.Body)
 }

-// handleChat is the critical passthrough path for /api/chat. It validates the
-// model, serializes through a single-flight gate, and proxies to the target
-// with NDJSON streaming support.
+// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
+// the SQLite queue and blocks until the worker completes it, then returns the
+// result as if it came directly from Ollama.
 //
 // Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
-// What: validates model, acquires the chat gate, proxies to the target, streams
-// NDJSON chunks back if streaming, releases the gate on completion.
-// Test: verify model validation (404 on unknown), serialization (two concurrent
-// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
+// The response blocks until done so the caller gets a transparent Ollama experience.
+// What: validates model, creates a job, registers a completion waiter, wakes the
+// worker, and blocks until done or context cancellation.
+// Test: verify model validation (404 on unknown), serialization (jobs execute one
+// at a time), and that the HTTP response matches the Ollama chat response.
 func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 	body, err := io.ReadAll(r.Body)
 	if err != nil {
@@ -186,10 +206,9 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// Parse just enough to validate the model and detect streaming.
+	// Parse just enough to validate the model.
 	var partial struct {
-		Model  string `json:"model"`
-		Stream *bool  `json:"stream"`
+		Model string `json:"model"`
 	}
 	if err := json.Unmarshal(body, &partial); err != nil {
 		http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
@@ -211,64 +230,68 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 		}
 	}

-	// Determine if streaming. Ollama defaults to streaming when "stream" is absent.
-	streaming := true
-	if partial.Stream != nil && !*partial.Stream {
-		streaming = false
+	// Generate a job ID and enqueue.
+	jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
+
+	maxAttempts := s.cfg.MaxAttempts
+	if maxAttempts == 0 {
+		maxAttempts = 3
 	}

-	// Acquire the single-flight chat gate. This serializes all chat requests
-	// through one at a time. Phase 3 replaces this with the full SQLite queue +
-	// worker loop.
-	select {
-	case s.chatGate <- struct{}{}:
-		// Acquired.
-	case <-r.Context().Done():
-		http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
+	job := store.Job{
+		ID:          jobID,
+		Model:       partial.Model,
+		Payload:     json.RawMessage(body),
+		MaxAttempts: maxAttempts,
+	}
+
+	if _, err := s.store.CreateJob(job); err != nil {
+		s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
+		http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
 		return
 	}
-	defer func() { <-s.chatGate }()

-	// Proxy to the target.
-	resp, err := s.client.RawChat(r.Context(), body)
-	if err != nil {
-		s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
-		if httpErr, ok := err.(*ollama.HTTPError); ok {
-			http.Error(w, httpErr.Body, httpErr.StatusCode)
+	// Register a completion waiter before waking the worker.
+	waitCh := s.notifier.Register(jobID)
+
+	// Wake the worker.
+	if s.workerRef != nil {
+		s.workerRef.Wake()
+	}
+
+	// Block until the job completes or the request is cancelled.
+	select {
+	case <-waitCh:
+		// Job completed — get the result.
+		state, result, errMsg, ok := s.notifier.Result(jobID)
+		if !ok {
+			// Should not happen, but fall back to DB.
+			j, err := s.store.GetJob(jobID)
+			if err != nil {
+				http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
+				return
+			}
+			state = j.State
+			result = j.Result
+			errMsg = j.Error
+		}
+
+		if state == store.JobStateFailed {
+			msg := "job failed"
+			if errMsg != nil {
+				msg = *errMsg
+			}
+			http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
 			return
 		}
-		http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
-		return
-	}
-	defer resp.Body.Close()

-	if streaming {
-		w.Header().Set("Content-Type", "application/x-ndjson")
-		w.WriteHeader(http.StatusOK)
-
-		flusher, canFlush := w.(http.Flusher)
-		scanner := bufio.NewScanner(resp.Body)
-		scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)
-
-		for scanner.Scan() {
-			line := scanner.Bytes()
-			if len(line) == 0 {
-				continue
-			}
-			w.Write(line)
-			w.Write([]byte("\n"))
-			if canFlush {
-				flusher.Flush()
-			}
-		}
-		if err := scanner.Err(); err != nil {
-			s.logger.Warn("stream read error", "error", err, "model", partial.Model)
-		}
-	} else {
-		// Non-streaming: proxy the complete JSON response.
+		// Return the result as a direct Ollama response.
 		w.Header().Set("Content-Type", "application/json")
 		w.WriteHeader(http.StatusOK)
-		io.Copy(w, resp.Body)
+		w.Write(result)
+
+	case <-r.Context().Done():
+		http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
 	}
 }