6fd050855a
Replace the Phase 2 in-flight chat gate (buffered channel) with a real SQLite-backed job queue and single worker loop. Every /api/chat request now creates a job row, blocks until the worker completes it, and returns the result transparently. Key changes: - internal/store: NextJob (drain-by-model ordering), IncrementAttempt, ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma - internal/worker: single-threaded worker loop with Notifier for sync handler completion signaling; retry on ConnectionError, terminal fail on HTTPError; crash recovery resets interrupted jobs on startup - internal/webhook: dispatcher infrastructure for async webhook delivery - internal/server: chat handler rewritten to enqueue+wait; old chatGate removed; embeddings remain direct concurrent proxies (ADR-0013) - internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
329 lines
10 KiB
Go
329 lines
10 KiB
Go
// Package server provides the HTTP API for the foreman daemon.
|
|
//
|
|
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
|
|
// centralizing routing and middleware here keeps cmd/foreman thin.
|
|
// What: creates a stdlib net/http server with health checks, optional bearer-token
|
|
// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
|
|
// artifact serving.
|
|
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
|
|
// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
|
|
package server
|
|
|
|
import (
	"crypto/rand"
	"crypto/subtle"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strings"
	"time"

	"github.com/oklog/ulid/v2"

	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
)
|
|
|
|
// Server holds the HTTP server and its dependencies.
|
|
type Server struct {
|
|
cfg config.Config
|
|
store *store.Store
|
|
client ollama.Client
|
|
inventory *ollama.ModelInventory
|
|
notifier *worker.Notifier
|
|
workerRef *worker.Worker
|
|
dispatcher *webhook.Dispatcher
|
|
mux *http.ServeMux
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// New creates a new Server with the given configuration, store, Ollama client,
|
|
// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
|
|
// with all routes.
|
|
//
|
|
// Why: dependency injection makes the server testable and extensible.
|
|
// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
|
|
// logger into the server, registers all routes.
|
|
// Test: create with New, use httptest to exercise routes.
|
|
func New(
|
|
cfg config.Config,
|
|
st *store.Store,
|
|
client ollama.Client,
|
|
inv *ollama.ModelInventory,
|
|
notifier *worker.Notifier,
|
|
w *worker.Worker,
|
|
dispatcher *webhook.Dispatcher,
|
|
logger *slog.Logger,
|
|
) *Server {
|
|
s := &Server{
|
|
cfg: cfg,
|
|
store: st,
|
|
client: client,
|
|
inventory: inv,
|
|
notifier: notifier,
|
|
workerRef: w,
|
|
dispatcher: dispatcher,
|
|
mux: http.NewServeMux(),
|
|
logger: logger,
|
|
}
|
|
s.routes()
|
|
return s
|
|
}
|
|
|
|
// Handler returns the server's http.Handler, with auth middleware applied.
|
|
//
|
|
// Why: allows httptest usage in tests without starting a real listener.
|
|
// What: wraps the mux with optional bearer-token middleware.
|
|
// Test: call Handler(), use httptest.NewServer, exercise endpoints.
|
|
func (s *Server) Handler() http.Handler {
|
|
var h http.Handler = s.mux
|
|
if s.cfg.Token != "" {
|
|
h = s.authMiddleware(h)
|
|
}
|
|
return h
|
|
}
|
|
|
|
// ListenAndServe starts the HTTP server on the configured address.
|
|
func (s *Server) ListenAndServe() error {
|
|
s.logger.Info("starting server", "addr", s.cfg.Addr)
|
|
return http.ListenAndServe(s.cfg.Addr, s.Handler())
|
|
}
|
|
|
|
// routes registers all HTTP routes on the mux.
|
|
func (s *Server) routes() {
|
|
s.mux.HandleFunc("GET /healthz", s.handleHealthz)
|
|
s.mux.HandleFunc("GET /api/tags", s.handleTags)
|
|
s.mux.HandleFunc("GET /api/ps", s.handlePs)
|
|
s.mux.HandleFunc("POST /api/chat", s.handleChat)
|
|
s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
|
|
s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
|
|
s.registerJobRoutes()
|
|
}
|
|
|
|
// healthResponse is the JSON document served by /healthz.
type healthResponse struct {
	// Status is the overall liveness string; the handler always sends "ok".
	Status string `json:"status"`
	// Degraded mirrors the model inventory's degraded flag.
	Degraded bool `json:"degraded"`
}
|
|
|
|
// handleHealthz returns the daemon's health status, including the poller's
|
|
// degraded flag so probes and operators can see target connectivity.
|
|
//
|
|
// Why: load balancers and operators need a single endpoint for health.
|
|
// What: returns 200 with a JSON body including the degraded flag from the poller.
|
|
// Test: set up a server with a degraded inventory, assert degraded=true in response.
|
|
func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
|
|
degraded := false
|
|
if s.inventory != nil {
|
|
degraded = s.inventory.Degraded()
|
|
}
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
json.NewEncoder(w).Encode(healthResponse{
|
|
Status: "ok",
|
|
Degraded: degraded,
|
|
})
|
|
}
|
|
|
|
// handleTags returns the cached model inventory as Ollama-format JSON.
|
|
//
|
|
// Why: foreman's /api/tags must be indistinguishable from Ollama's /api/tags.
|
|
// What: returns the poller's cached TagsResponse.
|
|
// Test: populate the inventory, GET /api/tags, assert the response matches.
|
|
func (s *Server) handleTags(w http.ResponseWriter, r *http.Request) {
|
|
models := s.inventory.Models()
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(ollama.TagsResponse{Models: models})
|
|
}
|
|
|
|
// handlePs returns the cached running models from the poller.
|
|
//
|
|
// Why: foreman's /api/ps lets callers see what's resident on the target.
|
|
// What: returns the poller's cached PsResponse.
|
|
// Test: populate the inventory with running models, GET /api/ps, assert match.
|
|
func (s *Server) handlePs(w http.ResponseWriter, r *http.Request) {
|
|
running := s.inventory.ResidentModels()
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(ollama.PsResponse{Models: running})
|
|
}
|
|
|
|
// handleEmbed proxies embedding requests directly and concurrently to the target.
|
|
// These bypass any serialization gate per ADR-0013.
|
|
//
|
|
// Why: embeddings hit the always-resident embedder and must not wait behind chat jobs.
|
|
// What: reads the request body, proxies to the target, and returns the response.
|
|
// Test: send concurrent embed requests, assert they all complete without serialization.
|
|
func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
|
|
body, err := io.ReadAll(r.Body)
|
|
if err != nil {
|
|
http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
resp, err := s.client.RawEmbed(r.Context(), body)
|
|
if err != nil {
|
|
s.logger.Error("embed proxy failed", "error", err)
|
|
if httpErr, ok := err.(*ollama.HTTPError); ok {
|
|
http.Error(w, httpErr.Body, httpErr.StatusCode)
|
|
return
|
|
}
|
|
http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Copy response headers and body.
|
|
for k, vv := range resp.Header {
|
|
for _, v := range vv {
|
|
w.Header().Add(k, v)
|
|
}
|
|
}
|
|
if w.Header().Get("Content-Type") == "" {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
}
|
|
w.WriteHeader(resp.StatusCode)
|
|
io.Copy(w, resp.Body)
|
|
}
|
|
|
|
// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
|
|
// the SQLite queue and blocks until the worker completes it, then returns the
|
|
// result as if it came directly from Ollama.
|
|
//
|
|
// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
|
|
// The response blocks until done so the caller gets a transparent Ollama experience.
|
|
// What: validates model, creates a job, registers a completion waiter, wakes the
|
|
// worker, and blocks until done or context cancellation.
|
|
// Test: verify model validation (404 on unknown), serialization (jobs execute one
|
|
// at a time), and that the HTTP response matches the Ollama chat response.
|
|
func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
|
|
body, err := io.ReadAll(r.Body)
|
|
if err != nil {
|
|
http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Parse just enough to validate the model.
|
|
var partial struct {
|
|
Model string `json:"model"`
|
|
}
|
|
if err := json.Unmarshal(body, &partial); err != nil {
|
|
http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
|
|
return
|
|
}
|
|
if partial.Model == "" {
|
|
http.Error(w, `{"error":"model is required"}`, http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate the model exists. One re-poll on miss (ADR-0007).
|
|
if !s.inventory.HasModel(partial.Model) {
|
|
if err := s.inventory.Refresh(r.Context()); err != nil {
|
|
s.logger.Warn("model re-poll failed", "error", err)
|
|
}
|
|
if !s.inventory.HasModel(partial.Model) {
|
|
http.Error(w, `{"error":"model not found"}`, http.StatusNotFound)
|
|
return
|
|
}
|
|
}
|
|
|
|
// Generate a job ID and enqueue.
|
|
jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
|
|
|
|
maxAttempts := s.cfg.MaxAttempts
|
|
if maxAttempts == 0 {
|
|
maxAttempts = 3
|
|
}
|
|
|
|
job := store.Job{
|
|
ID: jobID,
|
|
Model: partial.Model,
|
|
Payload: json.RawMessage(body),
|
|
MaxAttempts: maxAttempts,
|
|
}
|
|
|
|
if _, err := s.store.CreateJob(job); err != nil {
|
|
s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
|
|
http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
// Register a completion waiter before waking the worker.
|
|
waitCh := s.notifier.Register(jobID)
|
|
|
|
// Wake the worker.
|
|
if s.workerRef != nil {
|
|
s.workerRef.Wake()
|
|
}
|
|
|
|
// Block until the job completes or the request is cancelled.
|
|
select {
|
|
case <-waitCh:
|
|
// Job completed — get the result.
|
|
state, result, errMsg, ok := s.notifier.Result(jobID)
|
|
if !ok {
|
|
// Should not happen, but fall back to DB.
|
|
j, err := s.store.GetJob(jobID)
|
|
if err != nil {
|
|
http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
|
|
return
|
|
}
|
|
state = j.State
|
|
result = j.Result
|
|
errMsg = j.Error
|
|
}
|
|
|
|
if state == store.JobStateFailed {
|
|
msg := "job failed"
|
|
if errMsg != nil {
|
|
msg = *errMsg
|
|
}
|
|
http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
|
|
return
|
|
}
|
|
|
|
// Return the result as a direct Ollama response.
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
w.Write(result)
|
|
|
|
case <-r.Context().Done():
|
|
http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
|
|
}
|
|
}
|
|
|
|
// authMiddleware validates the Authorization: Bearer <token> header on all
|
|
// requests except /healthz. Returns 401 if the token is missing or wrong.
|
|
func (s *Server) authMiddleware(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
// /healthz is always public so load balancers and probes work without auth.
|
|
if r.URL.Path == "/healthz" {
|
|
next.ServeHTTP(w, r)
|
|
return
|
|
}
|
|
|
|
auth := r.Header.Get("Authorization")
|
|
if auth == "" {
|
|
http.Error(w, `{"error":"missing authorization header"}`, http.StatusUnauthorized)
|
|
return
|
|
}
|
|
|
|
const prefix = "Bearer "
|
|
if !strings.HasPrefix(auth, prefix) {
|
|
http.Error(w, `{"error":"invalid authorization header"}`, http.StatusUnauthorized)
|
|
return
|
|
}
|
|
|
|
token := strings.TrimPrefix(auth, prefix)
|
|
if token != s.cfg.Token {
|
|
http.Error(w, `{"error":"invalid token"}`, http.StatusUnauthorized)
|
|
return
|
|
}
|
|
|
|
next.ServeHTTP(w, r)
|
|
})
|
|
}
|