// Package server provides the HTTP API for the foreman daemon.
//
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
// centralizing routing and middleware here keeps cmd/foreman thin.
// What: creates a stdlib net/http server with health checks, optional bearer-token
// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
// artifact serving.
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
package server

import (
	"crypto/rand"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strings"
	"time"

	"github.com/oklog/ulid/v2"

	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
)

// Server holds the HTTP server and its dependencies.
type Server struct {
	// cfg supplies the listen address (Addr), the optional bearer token
	// (Token; empty disables auth) and the job retry budget (MaxAttempts).
	cfg config.Config
	// store is the job queue; handleChat creates jobs in it and reads them
	// back when the notifier has no result.
	store *store.Store
	// client talks to the target; handleEmbed proxies through it.
	client ollama.Client
	// inventory is the cached model list used by /api/tags, /api/ps and
	// model validation in /api/chat. handleHealthz tolerates it being nil.
	inventory *ollama.ModelInventory
	// notifier hands out per-job completion channels and results.
	notifier *worker.Notifier
	// workerRef is woken after a job is enqueued; handleChat skips the wake
	// when it is nil.
	workerRef *worker.Worker
	// dispatcher is stored by New; it is not referenced by the handlers in
	// this part of the file (presumably used by the /jobs routes — confirm).
	dispatcher *webhook.Dispatcher
	// mux carries every route registered by routes().
	mux *http.ServeMux
	// logger receives structured log output from the server and handlers.
	logger *slog.Logger
}

// New creates a new Server with the given configuration, store, Ollama client,
// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
// with all routes.
//
// Why: dependency injection makes the server testable and extensible.
// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
// logger into the server, registers all routes.
// Test: create with New, use httptest to exercise routes.
func New( cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, notifier *worker.Notifier, w *worker.Worker, dispatcher *webhook.Dispatcher, logger *slog.Logger, ) *Server { s := &Server{ cfg: cfg, store: st, client: client, inventory: inv, notifier: notifier, workerRef: w, dispatcher: dispatcher, mux: http.NewServeMux(), logger: logger, } s.routes() return s } // Handler returns the server's http.Handler, with auth middleware applied. // // Why: allows httptest usage in tests without starting a real listener. // What: wraps the mux with optional bearer-token middleware. // Test: call Handler(), use httptest.NewServer, exercise endpoints. func (s *Server) Handler() http.Handler { var h http.Handler = s.mux if s.cfg.Token != "" { h = s.authMiddleware(h) } return h } // ListenAndServe starts the HTTP server on the configured address. func (s *Server) ListenAndServe() error { s.logger.Info("starting server", "addr", s.cfg.Addr) return http.ListenAndServe(s.cfg.Addr, s.Handler()) } // routes registers all HTTP routes on the mux. func (s *Server) routes() { s.mux.HandleFunc("GET /healthz", s.handleHealthz) s.mux.HandleFunc("GET /api/tags", s.handleTags) s.mux.HandleFunc("GET /api/ps", s.handlePs) s.mux.HandleFunc("POST /api/chat", s.handleChat) s.mux.HandleFunc("POST /api/embed", s.handleEmbed) s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed) s.registerJobRoutes() } // healthResponse is the JSON shape returned by /healthz. type healthResponse struct { Status string `json:"status"` Degraded bool `json:"degraded"` } // handleHealthz returns the daemon's health status, including the poller's // degraded flag so probes and operators can see target connectivity. // // Why: load balancers and operators need a single endpoint for health. // What: returns 200 with a JSON body including the degraded flag from the poller. // Test: set up a server with a degraded inventory, assert degraded=true in response. 
func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) { degraded := false if s.inventory != nil { degraded = s.inventory.Degraded() } w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(healthResponse{ Status: "ok", Degraded: degraded, }) } // handleTags returns the cached model inventory as Ollama-format JSON. // // Why: foreman's /api/tags must be indistinguishable from Ollama's /api/tags. // What: returns the poller's cached TagsResponse. // Test: populate the inventory, GET /api/tags, assert the response matches. func (s *Server) handleTags(w http.ResponseWriter, r *http.Request) { models := s.inventory.Models() w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(ollama.TagsResponse{Models: models}) } // handlePs returns the cached running models from the poller. // // Why: foreman's /api/ps lets callers see what's resident on the target. // What: returns the poller's cached PsResponse. // Test: populate the inventory with running models, GET /api/ps, assert match. func (s *Server) handlePs(w http.ResponseWriter, r *http.Request) { running := s.inventory.ResidentModels() w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(ollama.PsResponse{Models: running}) } // handleEmbed proxies embedding requests directly and concurrently to the target. // These bypass any serialization gate per ADR-0013. // // Why: embeddings hit the always-resident embedder and must not wait behind chat jobs. // What: reads the request body, proxies to the target, and returns the response. // Test: send concurrent embed requests, assert they all complete without serialization. 
func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) { body, err := io.ReadAll(r.Body) if err != nil { http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest) return } resp, err := s.client.RawEmbed(r.Context(), body) if err != nil { s.logger.Error("embed proxy failed", "error", err) if httpErr, ok := err.(*ollama.HTTPError); ok { http.Error(w, httpErr.Body, httpErr.StatusCode) return } http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway) return } defer resp.Body.Close() // Copy response headers and body. for k, vv := range resp.Header { for _, v := range vv { w.Header().Add(k, v) } } if w.Header().Get("Content-Type") == "" { w.Header().Set("Content-Type", "application/json") } w.WriteHeader(resp.StatusCode) io.Copy(w, resp.Body) } // handleChat is the synchronous passthrough for /api/chat. It enqueues a job in // the SQLite queue and blocks until the worker completes it, then returns the // result as if it came directly from Ollama. // // Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003). // The response blocks until done so the caller gets a transparent Ollama experience. // What: validates model, creates a job, registers a completion waiter, wakes the // worker, and blocks until done or context cancellation. // Test: verify model validation (404 on unknown), serialization (jobs execute one // at a time), and that the HTTP response matches the Ollama chat response. func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) { body, err := io.ReadAll(r.Body) if err != nil { http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest) return } // Parse just enough to validate the model. 
var partial struct { Model string `json:"model"` } if err := json.Unmarshal(body, &partial); err != nil { http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest) return } if partial.Model == "" { http.Error(w, `{"error":"model is required"}`, http.StatusBadRequest) return } // Validate the model exists. One re-poll on miss (ADR-0007). if !s.inventory.HasModel(partial.Model) { if err := s.inventory.Refresh(r.Context()); err != nil { s.logger.Warn("model re-poll failed", "error", err) } if !s.inventory.HasModel(partial.Model) { http.Error(w, `{"error":"model not found"}`, http.StatusNotFound) return } } // Generate a job ID and enqueue. jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String() maxAttempts := s.cfg.MaxAttempts if maxAttempts == 0 { maxAttempts = 3 } job := store.Job{ ID: jobID, Model: partial.Model, Payload: json.RawMessage(body), MaxAttempts: maxAttempts, } if _, err := s.store.CreateJob(job); err != nil { s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model) http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError) return } // Register a completion waiter before waking the worker. waitCh := s.notifier.Register(jobID) // Wake the worker. if s.workerRef != nil { s.workerRef.Wake() } // Block until the job completes or the request is cancelled. select { case <-waitCh: // Job completed — get the result. state, result, errMsg, ok := s.notifier.Result(jobID) if !ok { // Should not happen, but fall back to DB. j, err := s.store.GetJob(jobID) if err != nil { http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError) return } state = j.State result = j.Result errMsg = j.Error } if state == store.JobStateFailed { msg := "job failed" if errMsg != nil { msg = *errMsg } http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway) return } // Return the result as a direct Ollama response. 
w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) w.Write(result) case <-r.Context().Done(): http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable) } } // authMiddleware validates the Authorization: Bearer header on all // requests except /healthz. Returns 401 if the token is missing or wrong. func (s *Server) authMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // /healthz is always public so load balancers and probes work without auth. if r.URL.Path == "/healthz" { next.ServeHTTP(w, r) return } auth := r.Header.Get("Authorization") if auth == "" { http.Error(w, `{"error":"missing authorization header"}`, http.StatusUnauthorized) return } const prefix = "Bearer " if !strings.HasPrefix(auth, prefix) { http.Error(w, `{"error":"invalid authorization header"}`, http.StatusUnauthorized) return } token := strings.TrimPrefix(auth, prefix) if token != s.cfg.Token { http.Error(w, `{"error":"invalid token"}`, http.StatusUnauthorized) return } next.ServeHTTP(w, r) }) }