feat: add Ollama target client, model poller, and native passthrough

Phase 2 of foreman: the daemon now acts as a transparent Ollama proxy.

- internal/ollama: Client interface and HTTP implementation for chat (streaming + non-streaming), embed, tags, and ps, with auth forwarding, NDJSON streaming via bufio.Scanner, and connection vs. HTTP error classification via custom error types.
- internal/ollama: ModelInventory with background poller for /api/tags and /api/ps, degraded mode on target unreachable with model retention, and automatic recovery on reconnect.
- internal/server: Passthrough routes (/api/chat, /api/tags, /api/ps, /api/embed, /api/embeddings) with model validation, chat serialization gate (capacity-1 channel), concurrent embedding bypass (ADR-0013), NDJSON streaming with per-chunk flush, and degraded health reporting.
- cmd/foreman: Full serve wiring with Ollama client, poller goroutine, embedder warmup (keep_alive:-1), and signal-based shutdown.

The Mac is now usable as a go-llm target through foreman.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:07:33 -04:00
parent 9cdf4b2472
commit 27f196d333
10 changed files with 1877 additions and 39 deletions
@@ -3,41 +3,54 @@
 // Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
 // centralizing routing and middleware here keeps cmd/foreman thin.
 // What: creates a stdlib net/http server with health checks, optional bearer-token
-// auth, and an extensible mux for later phases.
+// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
 // Test: start the server with httptest, hit /healthz, verify 200; set a token,
-// verify 401 without it.
+// verify 401 without it; test Ollama passthrough routes.
 package server

 import (
+	"bufio"
 	"encoding/json"
+	"io"
 	"log/slog"
 	"net/http"
 	"strings"

 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
 )

+// scannerBufSize is the largest single NDJSON line the streaming scanner will
+// accept (4 MB). It is passed as the max-token argument to bufio.Scanner.Buffer;
+// a longer line makes the scan stop with an error instead of being forwarded.
+const scannerBufSize = 4 * 1024 * 1024
+
 // Server holds the HTTP server and its dependencies.
 type Server struct {
-	cfg    config.Config
-	store  *store.Store
-	mux    *http.ServeMux
-	logger *slog.Logger
+	cfg       config.Config
+	store     *store.Store
+	client    ollama.Client          // proxies chat and embed requests to the Ollama target
+	inventory *ollama.ModelInventory // cached model list and degraded flag; handleHealthz tolerates nil
+	chatGate  chan struct{}          // capacity-1 channel; holding its slot serializes chat requests
+	mux       *http.ServeMux
+	logger    *slog.Logger
 }

-// New creates a new Server with the given configuration and store. The mux is
-// populated with initial routes; callers can add more before calling ListenAndServe.
+// New creates a new Server with the given configuration, store, Ollama client,
+// and model inventory. The mux is populated with all routes.
 //
 // Why: dependency injection makes the server testable and extensible.
-// What: wires config, store, and logger into the server, registers routes.
+// What: wires config, store, client, inventory, and logger into the server,
+// registers routes, and creates the single-flight chat gate.
 // Test: create with New, use httptest to exercise routes.
-func New(cfg config.Config, st *store.Store, logger *slog.Logger) *Server {
+func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
 	s := &Server{
-		cfg:    cfg,
-		store:  st,
-		mux:    http.NewServeMux(),
-		logger: logger,
+		cfg:       cfg,
+		store:     st,
+		client:    client,
+		inventory: inv,
+		chatGate:  make(chan struct{}, 1),
+		mux:       http.NewServeMux(),
+		logger:    logger,
 	}
 	s.routes()
 	return s
@@ -65,6 +78,11 @@ func (s *Server) ListenAndServe() error {
 // routes registers all HTTP routes on the mux.
 func (s *Server) routes() {
 	s.mux.HandleFunc("GET /healthz", s.handleHealthz)
+	// Ollama-compatible surface. Tags and ps are answered from the poller's
+	// cache; chat and both embed paths are proxied to the target. The two
+	// embed paths share a single handler.
+	s.mux.HandleFunc("GET /api/tags", s.handleTags)
+	s.mux.HandleFunc("GET /api/ps", s.handlePs)
+	s.mux.HandleFunc("POST /api/chat", s.handleChat)
+	s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
+	s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
 }

 // healthResponse is the JSON shape returned by /healthz.
@@ -73,17 +91,187 @@ type healthResponse struct {
 	Degraded bool   `json:"degraded"`
 }

-// handleHealthz returns the daemon's health status. The degraded flag is a
-// placeholder for the model poller's connectivity state (Phase 2).
+// handleHealthz returns the daemon's health status, including the poller's
+// degraded flag so probes and operators can see target connectivity.
+//
+// Why: load balancers and operators need a single endpoint for health.
+// What: returns 200 with a JSON body including the degraded flag from the poller.
+// Test: set up a server with a degraded inventory, assert degraded=true in response.
 func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
+	// A server built without an inventory reports degraded=false. The status
+	// code is 200 either way; only the JSON body carries the degraded flag.
+	degraded := false
+	if s.inventory != nil {
+		degraded = s.inventory.Degraded()
+	}
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(http.StatusOK)
+	// NOTE(review): the Encode error is dropped, so a failed write to the
+	// client goes unlogged.
 	json.NewEncoder(w).Encode(healthResponse{
 		Status:   "ok",
-		Degraded: false,
+		Degraded: degraded,
 	})
 }

+// handleTags returns the cached model inventory as Ollama-format JSON.
+//
+// Why: foreman's /api/tags must be indistinguishable from Ollama's /api/tags.
+// What: returns the poller's cached TagsResponse, or 503 with a JSON error when
+// the server was built without an inventory (handleHealthz tolerates that case,
+// so this handler must not dereference a nil inventory either).
+// Test: populate the inventory, GET /api/tags, assert the response matches;
+// build a server with a nil inventory, assert 503.
+func (s *Server) handleTags(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+	if s.inventory == nil {
+		w.WriteHeader(http.StatusServiceUnavailable)
+		// Best effort: nothing useful can be done if the client is already gone.
+		_, _ = io.WriteString(w, `{"error":"model inventory unavailable"}`+"\n")
+		return
+	}
+	if err := json.NewEncoder(w).Encode(ollama.TagsResponse{Models: s.inventory.Models()}); err != nil {
+		// The status line may already be on the wire, so the failure is only logged.
+		s.logger.Warn("tags response write failed", "error", err)
+	}
+}
+
+// handlePs returns the cached running models from the poller.
+//
+// Why: foreman's /api/ps lets callers see what's resident on the target.
+// What: returns the poller's cached PsResponse, or 503 with a JSON error when
+// the server was built without an inventory (handleHealthz tolerates that case,
+// so this handler must not dereference a nil inventory either).
+// Test: populate the inventory with running models, GET /api/ps, assert match;
+// build a server with a nil inventory, assert 503.
+func (s *Server) handlePs(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+	if s.inventory == nil {
+		w.WriteHeader(http.StatusServiceUnavailable)
+		// Best effort: nothing useful can be done if the client is already gone.
+		_, _ = io.WriteString(w, `{"error":"model inventory unavailable"}`+"\n")
+		return
+	}
+	if err := json.NewEncoder(w).Encode(ollama.PsResponse{Models: s.inventory.ResidentModels()}); err != nil {
+		// The status line may already be on the wire, so the failure is only logged.
+		s.logger.Warn("ps response write failed", "error", err)
+	}
+}
+
+// handleEmbed proxies embedding requests directly and concurrently to the target.
+// These bypass any serialization gate per ADR-0013.
+//
+// Why: embeddings hit the always-resident embedder and must not wait behind chat jobs.
+// What: reads the request body, proxies to the target, and relays the target's
+// status, end-to-end headers, and body. Errors are returned as JSON bodies with
+// a JSON Content-Type.
+// Test: send concurrent embed requests, assert they all complete without serialization.
+//
+// NOTE(review): /api/embed and /api/embeddings both land here and both go
+// through RawEmbed; confirm the client forwards to the path whose request and
+// response shape the caller expects.
+func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
+	// fail sends a JSON error with a matching Content-Type. http.Error is not
+	// used because it labels the body text/plain even though it is JSON.
+	fail := func(status int, body string) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(status)
+		// Best effort: nothing useful can be done if the client is already gone.
+		_, _ = io.WriteString(w, body+"\n")
+	}
+
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		fail(http.StatusBadRequest, `{"error":"failed to read request body"}`)
+		return
+	}
+
+	resp, err := s.client.RawEmbed(r.Context(), body)
+	if err != nil {
+		s.logger.Error("embed proxy failed", "error", err)
+		if httpErr, ok := err.(*ollama.HTTPError); ok {
+			// Relay the target's own status and body (presumably its JSON error).
+			fail(httpErr.StatusCode, httpErr.Body)
+			return
+		}
+		fail(http.StatusBadGateway, `{"error":"target unreachable"}`)
+		return
+	}
+	defer resp.Body.Close()
+
+	// Copy end-to-end response headers. Hop-by-hop headers describe the
+	// foreman<->target connection only and must not be forwarded to the client.
+	for k, vv := range resp.Header {
+		switch k {
+		case "Connection", "Keep-Alive", "Proxy-Authenticate", "Proxy-Authorization",
+			"Te", "Trailer", "Transfer-Encoding", "Upgrade":
+			continue
+		}
+		for _, v := range vv {
+			w.Header().Add(k, v)
+		}
+	}
+	if w.Header().Get("Content-Type") == "" {
+		w.Header().Set("Content-Type", "application/json")
+	}
+	w.WriteHeader(resp.StatusCode)
+	if _, err := io.Copy(w, resp.Body); err != nil {
+		// The status line is already sent, so a broken relay can only be logged.
+		s.logger.Warn("embed response copy failed", "error", err)
+	}
+}
+
+// handleChat is the critical passthrough path for /api/chat. It validates the
+// model, serializes through a single-flight gate, and proxies to the target
+// with NDJSON streaming support.
+//
+// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
+// What: validates model, acquires the chat gate, proxies to the target, streams
+// NDJSON chunks back if streaming, releases the gate on completion. Errors are
+// returned as JSON bodies with a JSON Content-Type. A failed write to the client
+// ends the relay immediately so the gate is not held for a dead connection.
+// Test: verify model validation (404 on unknown), serialization (two concurrent
+// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
+func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
+	// fail sends a JSON error with a matching Content-Type. http.Error is not
+	// used because it labels the body text/plain even though it is JSON.
+	fail := func(status int, body string) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(status)
+		// Best effort: nothing useful can be done if the client is already gone.
+		_, _ = io.WriteString(w, body+"\n")
+	}
+
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		fail(http.StatusBadRequest, `{"error":"failed to read request body"}`)
+		return
+	}
+
+	// Parse just enough to validate the model and detect streaming.
+	var partial struct {
+		Model  string `json:"model"`
+		Stream *bool  `json:"stream"`
+	}
+	if err := json.Unmarshal(body, &partial); err != nil {
+		fail(http.StatusBadRequest, `{"error":"invalid JSON body"}`)
+		return
+	}
+	if partial.Model == "" {
+		fail(http.StatusBadRequest, `{"error":"model is required"}`)
+		return
+	}
+
+	// handleHealthz tolerates a server built without an inventory; fail cleanly
+	// here rather than dereferencing nil.
+	if s.inventory == nil {
+		fail(http.StatusServiceUnavailable, `{"error":"model inventory unavailable"}`)
+		return
+	}
+
+	// Validate the model exists. One re-poll on miss (ADR-0007).
+	if !s.inventory.HasModel(partial.Model) {
+		if err := s.inventory.Refresh(r.Context()); err != nil {
+			s.logger.Warn("model re-poll failed", "error", err)
+		}
+		if !s.inventory.HasModel(partial.Model) {
+			fail(http.StatusNotFound, `{"error":"model not found"}`)
+			return
+		}
+	}
+
+	// Ollama defaults to streaming when "stream" is absent; only an explicit
+	// false turns it off.
+	streaming := partial.Stream == nil || *partial.Stream
+
+	// Acquire the single-flight chat gate. This serializes all chat requests
+	// through one at a time. Phase 3 replaces this with the full SQLite queue +
+	// worker loop.
+	select {
+	case s.chatGate <- struct{}{}:
+		// Acquired.
+	case <-r.Context().Done():
+		fail(http.StatusServiceUnavailable, `{"error":"request cancelled while waiting"}`)
+		return
+	}
+	defer func() { <-s.chatGate }()
+
+	// Proxy to the target.
+	resp, err := s.client.RawChat(r.Context(), body)
+	if err != nil {
+		s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
+		if httpErr, ok := err.(*ollama.HTTPError); ok {
+			// Relay the target's own status and body (presumably its JSON error).
+			fail(httpErr.StatusCode, httpErr.Body)
+			return
+		}
+		fail(http.StatusBadGateway, `{"error":"target unreachable"}`)
+		return
+	}
+	defer resp.Body.Close()
+
+	if !streaming {
+		// Non-streaming: proxy the complete JSON response.
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		if _, err := io.Copy(w, resp.Body); err != nil {
+			s.logger.Warn("chat response copy failed", "error", err, "model", partial.Model)
+		}
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/x-ndjson")
+	w.WriteHeader(http.StatusOK)
+
+	flusher, canFlush := w.(http.Flusher)
+	scanner := bufio.NewScanner(resp.Body)
+	// Start with a small buffer and let the scanner grow it on demand up to
+	// scannerBufSize, instead of allocating the full maximum for every request.
+	scanner.Buffer(make([]byte, 0, 64*1024), scannerBufSize)
+
+	newline := []byte("\n")
+	for scanner.Scan() {
+		line := scanner.Bytes()
+		if len(line) == 0 {
+			continue
+		}
+		// Stop as soon as the client can no longer be written to; continuing
+		// would keep the chat gate held while relaying to nobody.
+		if _, err := w.Write(line); err != nil {
+			s.logger.Warn("stream write error", "error", err, "model", partial.Model)
+			return
+		}
+		if _, err := w.Write(newline); err != nil {
+			s.logger.Warn("stream write error", "error", err, "model", partial.Model)
+			return
+		}
+		if canFlush {
+			flusher.Flush()
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		s.logger.Warn("stream read error", "error", err, "model", partial.Model)
+	}
+}
+
 // authMiddleware validates the Authorization: Bearer <token> header on all
 // requests except /healthz. Returns 401 if the token is missing or wrong.
 func (s *Server) authMiddleware(next http.Handler) http.Handler {