feat: add durable queue, single worker, and drain-by-model scheduling
Replace the Phase 2 in-flight chat gate (buffered channel) with a real SQLite-backed job queue and single worker loop. Every /api/chat request now creates a job row, blocks until the worker completes it, and returns the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt, ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync handler completion signaling; retry on ConnectionError, terminal fail on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+105
-82
@@ -3,54 +3,72 @@
|
||||
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
|
||||
// centralizing routing and middleware here keeps cmd/foreman thin.
|
||||
// What: creates a stdlib net/http server with health checks, optional bearer-token
|
||||
// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
|
||||
// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
|
||||
// artifact serving.
|
||||
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
|
||||
// verify 401 without it; test Ollama passthrough routes.
|
||||
// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
|
||||
package server
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"crypto/rand"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
|
||||
"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
|
||||
"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
|
||||
"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
|
||||
"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
|
||||
)
|
||||
|
||||
// scannerBufSize is the buffer size for the NDJSON scanner (4 MB).
|
||||
const scannerBufSize = 4 * 1024 * 1024
|
||||
|
||||
// Server holds the HTTP server and its dependencies.
|
||||
type Server struct {
|
||||
cfg config.Config
|
||||
store *store.Store
|
||||
client ollama.Client
|
||||
inventory *ollama.ModelInventory
|
||||
chatGate chan struct{}
|
||||
mux *http.ServeMux
|
||||
logger *slog.Logger
|
||||
cfg config.Config
|
||||
store *store.Store
|
||||
client ollama.Client
|
||||
inventory *ollama.ModelInventory
|
||||
notifier *worker.Notifier
|
||||
workerRef *worker.Worker
|
||||
dispatcher *webhook.Dispatcher
|
||||
mux *http.ServeMux
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// New creates a new Server with the given configuration, store, Ollama client,
|
||||
// and model inventory. The mux is populated with all routes.
|
||||
// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
|
||||
// with all routes.
|
||||
//
|
||||
// Why: dependency injection makes the server testable and extensible.
|
||||
// What: wires config, store, client, inventory, and logger into the server,
|
||||
// registers routes, and creates the single-flight chat gate.
|
||||
// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
|
||||
// logger into the server, and registers all routes.
|
||||
// Test: create with New, use httptest to exercise routes.
|
||||
func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
|
||||
func New(
|
||||
cfg config.Config,
|
||||
st *store.Store,
|
||||
client ollama.Client,
|
||||
inv *ollama.ModelInventory,
|
||||
notifier *worker.Notifier,
|
||||
w *worker.Worker,
|
||||
dispatcher *webhook.Dispatcher,
|
||||
logger *slog.Logger,
|
||||
) *Server {
|
||||
s := &Server{
|
||||
cfg: cfg,
|
||||
store: st,
|
||||
client: client,
|
||||
inventory: inv,
|
||||
chatGate: make(chan struct{}, 1),
|
||||
mux: http.NewServeMux(),
|
||||
logger: logger,
|
||||
cfg: cfg,
|
||||
store: st,
|
||||
client: client,
|
||||
inventory: inv,
|
||||
notifier: notifier,
|
||||
workerRef: w,
|
||||
dispatcher: dispatcher,
|
||||
mux: http.NewServeMux(),
|
||||
logger: logger,
|
||||
}
|
||||
s.routes()
|
||||
return s
|
||||
@@ -83,6 +101,7 @@ func (s *Server) routes() {
|
||||
s.mux.HandleFunc("POST /api/chat", s.handleChat)
|
||||
s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
|
||||
s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
|
||||
s.registerJobRoutes()
|
||||
}
|
||||
|
||||
// healthResponse is the JSON shape returned by /healthz.
|
||||
@@ -170,15 +189,16 @@ func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
|
||||
io.Copy(w, resp.Body)
|
||||
}
|
||||
|
||||
// handleChat is the critical passthrough path for /api/chat. It validates the
|
||||
// model, serializes through a single-flight gate, and proxies to the target
|
||||
// with NDJSON streaming support.
|
||||
// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
|
||||
// the SQLite queue and blocks until the worker completes it, then returns the
|
||||
// result as if it came directly from Ollama.
|
||||
//
|
||||
// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
|
||||
// What: validates model, acquires the chat gate, proxies to the target, streams
|
||||
// NDJSON chunks back if streaming, releases the gate on completion.
|
||||
// Test: verify model validation (404 on unknown), serialization (two concurrent
|
||||
// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
|
||||
// The response blocks until done so the caller gets a transparent Ollama experience.
|
||||
// What: validates model, creates a job, registers a completion waiter, wakes the
|
||||
// worker, and blocks until done or context cancellation.
|
||||
// Test: verify model validation (404 on unknown), serialization (jobs execute one
|
||||
// at a time), and that the HTTP response matches the Ollama chat response.
|
||||
func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
|
||||
body, err := io.ReadAll(r.Body)
|
||||
if err != nil {
|
||||
@@ -186,10 +206,9 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Parse just enough to validate the model and detect streaming.
|
||||
// Parse just enough to validate the model.
|
||||
var partial struct {
|
||||
Model string `json:"model"`
|
||||
Stream *bool `json:"stream"`
|
||||
Model string `json:"model"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &partial); err != nil {
|
||||
http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
|
||||
@@ -211,64 +230,68 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// Determine if streaming. Ollama defaults to streaming when "stream" is absent.
|
||||
streaming := true
|
||||
if partial.Stream != nil && !*partial.Stream {
|
||||
streaming = false
|
||||
// Generate a job ID and enqueue.
|
||||
jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
|
||||
|
||||
maxAttempts := s.cfg.MaxAttempts
|
||||
if maxAttempts == 0 {
|
||||
maxAttempts = 3
|
||||
}
|
||||
|
||||
// Acquire the single-flight chat gate. This serializes all chat requests
|
||||
// through one at a time. Phase 3 replaces this with the full SQLite queue +
|
||||
// worker loop.
|
||||
select {
|
||||
case s.chatGate <- struct{}{}:
|
||||
// Acquired.
|
||||
case <-r.Context().Done():
|
||||
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
|
||||
job := store.Job{
|
||||
ID: jobID,
|
||||
Model: partial.Model,
|
||||
Payload: json.RawMessage(body),
|
||||
MaxAttempts: maxAttempts,
|
||||
}
|
||||
|
||||
if _, err := s.store.CreateJob(job); err != nil {
|
||||
s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
|
||||
http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer func() { <-s.chatGate }()
|
||||
|
||||
// Proxy to the target.
|
||||
resp, err := s.client.RawChat(r.Context(), body)
|
||||
if err != nil {
|
||||
s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
|
||||
if httpErr, ok := err.(*ollama.HTTPError); ok {
|
||||
http.Error(w, httpErr.Body, httpErr.StatusCode)
|
||||
// Register a completion waiter before waking the worker.
|
||||
waitCh := s.notifier.Register(jobID)
|
||||
|
||||
// Wake the worker.
|
||||
if s.workerRef != nil {
|
||||
s.workerRef.Wake()
|
||||
}
|
||||
|
||||
// Block until the job completes or the request is cancelled.
|
||||
select {
|
||||
case <-waitCh:
|
||||
// Job completed — get the result.
|
||||
state, result, errMsg, ok := s.notifier.Result(jobID)
|
||||
if !ok {
|
||||
// Should not happen, but fall back to DB.
|
||||
j, err := s.store.GetJob(jobID)
|
||||
if err != nil {
|
||||
http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
state = j.State
|
||||
result = j.Result
|
||||
errMsg = j.Error
|
||||
}
|
||||
|
||||
if state == store.JobStateFailed {
|
||||
msg := "job failed"
|
||||
if errMsg != nil {
|
||||
msg = *errMsg
|
||||
}
|
||||
http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if streaming {
|
||||
w.Header().Set("Content-Type", "application/x-ndjson")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
flusher, canFlush := w.(http.Flusher)
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
w.Write(line)
|
||||
w.Write([]byte("\n"))
|
||||
if canFlush {
|
||||
flusher.Flush()
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
s.logger.Warn("stream read error", "error", err, "model", partial.Model)
|
||||
}
|
||||
} else {
|
||||
// Non-streaming: proxy the complete JSON response.
|
||||
// Return the result as a direct Ollama response.
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
io.Copy(w, resp.Body)
|
||||
w.Write(result)
|
||||
|
||||
case <-r.Context().Done():
|
||||
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user