feat: add Ollama target client, model poller, and native passthrough

Phase 2 of foreman: the daemon now acts as a transparent Ollama proxy.

- internal/ollama: Client interface and HTTP implementation for chat
  (streaming and non-streaming), embed, tags, and ps, with auth forwarding,
  NDJSON streaming via bufio.Scanner, and connection-vs-HTTP error
  classification via custom error types.
- internal/ollama: ModelInventory with background poller for /api/tags
  and /api/ps, degraded mode on target unreachable with model retention,
  automatic recovery on reconnect.
- internal/server: Passthrough routes (/api/chat, /api/tags, /api/ps,
  /api/embed, /api/embeddings) with model validation, chat serialization
  gate (capacity-1 channel), concurrent embedding bypass (ADR-0013),
  NDJSON streaming with per-chunk flush, and degraded health reporting.
- cmd/foreman: Full serve wiring with Ollama client, poller goroutine,
  embedder warmup (keep_alive:-1), and signal-based shutdown.

The Mac is now usable as a go-llm target through foreman.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 18:07:33 -04:00
parent 9cdf4b2472
commit 27f196d333
10 changed files with 1877 additions and 39 deletions
+205 -17
View File
@@ -3,41 +3,54 @@
// Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
// centralizing routing and middleware here keeps cmd/foreman thin.
// What: creates a stdlib net/http server with health checks, optional bearer-token
// auth, and an extensible mux for later phases.
// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
// Test: start the server with httptest, hit /healthz, verify 200; set a token,
// verify 401 without it.
// verify 401 without it; test Ollama passthrough routes.
package server
import (
	"bufio"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
)
// scannerBufSize is the buffer size for the NDJSON scanner (4 MB, i.e.
// 4 * 1024 * 1024 bytes). handleChat passes it to bufio.Scanner.Buffer as both
// the initial buffer capacity and the maximum token size, so a single streamed
// line that does not fit stops the scan with bufio.ErrTooLong.
const scannerBufSize = 4 * 1024 * 1024
// Server holds the HTTP server and its dependencies.
//
// NOTE(review): the field list below repeats cfg, store, mux and logger; it
// looks like the pre-change and post-change lists of a diff shown back to
// back. Confirm against the real file before relying on this layout.
type Server struct {
cfg config.Config
store *store.Store
mux *http.ServeMux
logger *slog.Logger
// cfg is the daemon configuration passed to New.
cfg config.Config
// store is the persistence handle passed to New.
store *store.Store
// client talks to the Ollama target; handlers call RawChat and RawEmbed on it.
client ollama.Client
// inventory is the cached model list behind /api/tags, /api/ps, the degraded
// flag on /healthz, and the model check in handleChat.
inventory *ollama.ModelInventory
// chatGate is created with capacity 1; handleChat sends to acquire it and
// receives to release it, so one chat is proxied at a time.
chatGate chan struct{}
// mux holds the routes registered by routes().
mux *http.ServeMux
// logger receives handler warnings and errors.
logger *slog.Logger
}
// New creates a new Server with the given configuration and store. The mux is
// populated with initial routes; callers can add more before calling ListenAndServe.
// New creates a new Server with the given configuration, store, Ollama client,
// and model inventory. The mux is populated with all routes.
//
// Why: dependency injection makes the server testable and extensible.
// What: wires config, store, and logger into the server, registers routes.
// What: wires config, store, client, inventory, and logger into the server,
// registers routes, and creates the single-flight chat gate.
// Test: create with New, use httptest to exercise routes.
func New(cfg config.Config, st *store.Store, logger *slog.Logger) *Server {
func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
s := &Server{
cfg: cfg,
store: st,
mux: http.NewServeMux(),
logger: logger,
cfg: cfg,
store: st,
client: client,
inventory: inv,
chatGate: make(chan struct{}, 1),
mux: http.NewServeMux(),
logger: logger,
}
s.routes()
return s
@@ -65,6 +78,11 @@ func (s *Server) ListenAndServe() error {
// routes installs every endpoint the server exposes onto s.mux: the health
// probe plus the Ollama-compatible passthrough endpoints. Both embedding
// paths share the same handler.
func (s *Server) routes() {
	endpoints := []struct {
		pattern string
		handle  func(http.ResponseWriter, *http.Request)
	}{
		{"GET /healthz", s.handleHealthz},
		{"GET /api/tags", s.handleTags},
		{"GET /api/ps", s.handlePs},
		{"POST /api/chat", s.handleChat},
		{"POST /api/embed", s.handleEmbed},
		{"POST /api/embeddings", s.handleEmbed},
	}
	for _, ep := range endpoints {
		s.mux.HandleFunc(ep.pattern, ep.handle)
	}
}
// healthResponse is the JSON shape returned by /healthz.
@@ -73,17 +91,187 @@ type healthResponse struct {
Degraded bool `json:"degraded"`
}
// handleHealthz returns the daemon's health status, including the poller's
// degraded flag so probes and operators can see target connectivity.
//
// Why: load balancers and operators need a single endpoint for health.
// What: returns 200 with a JSON body including the degraded flag from the poller.
// Test: set up a server with a degraded inventory, assert degraded=true in response.
//
// NOTE(review): the struct literal below lists the Degraded key twice
// (false, then degraded); this looks like the old and new lines of a diff
// shown together. Confirm against the real file.
func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
// A nil inventory is reported as not degraded instead of being dereferenced.
degraded := false
if s.inventory != nil {
degraded = s.inventory.Degraded()
}
w.Header().Set("Content-Type", "application/json")
// The status code is 200 regardless of the flag; only the body carries it.
w.WriteHeader(http.StatusOK)
// The error returned by Encode is not checked here.
json.NewEncoder(w).Encode(healthResponse{
Status: "ok",
Degraded: false,
Degraded: degraded,
})
}
// handleTags returns the cached model inventory as Ollama-format JSON.
//
// Why: foreman's /api/tags must be indistinguishable from Ollama's /api/tags.
// What: returns the poller's cached model list wrapped in a TagsResponse. The
// "models" value is always encoded from a non-nil slice, and a server built
// without an inventory answers with an empty list instead of panicking.
// Test: populate the inventory, GET /api/tags, assert the response matches;
// with nothing cached, assert the body carries an empty array.
func (s *Server) handleTags(w http.ResponseWriter, r *http.Request) {
	// Start from an empty, non-nil slice: encoding/json writes a nil slice as
	// null, which clients iterating over "models" do not expect.
	models := []ollama.ModelInfo{}
	if s.inventory != nil { // same nil guard handleHealthz applies
		if cached := s.inventory.Models(); cached != nil {
			models = cached
		}
	}

	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(ollama.TagsResponse{Models: models}); err != nil {
		// Headers are already out, so the failure can only be recorded.
		s.logger.Warn("write tags response failed", "error", err)
	}
}
// handlePs returns the cached running models from the poller.
//
// Why: foreman's /api/ps lets callers see what's resident on the target.
// What: returns the poller's cached resident models wrapped in a PsResponse.
// The "models" value is always encoded from a non-nil slice, and a server
// built without an inventory answers with an empty list instead of panicking.
// Test: populate the inventory with running models, GET /api/ps, assert match;
// with nothing resident, assert the body carries an empty array.
func (s *Server) handlePs(w http.ResponseWriter, r *http.Request) {
	// Start from an empty, non-nil slice: encoding/json writes a nil slice as
	// null, which clients iterating over "models" do not expect.
	running := []ollama.RunningModel{}
	if s.inventory != nil { // same nil guard handleHealthz applies
		if cached := s.inventory.ResidentModels(); cached != nil {
			running = cached
		}
	}

	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(ollama.PsResponse{Models: running}); err != nil {
		// Headers are already out, so the failure can only be recorded.
		s.logger.Warn("write ps response failed", "error", err)
	}
}
// handleEmbed proxies embedding requests directly and concurrently to the target.
// These bypass any serialization gate per ADR-0013.
//
// Why: embeddings hit the always-resident embedder and must not wait behind chat jobs.
// What: reads the request body, forwards it with RawEmbed, and relays the
// target's status, end-to-end headers, and body. An *ollama.HTTPError anywhere
// in the error chain is relayed with its own status and body; any other
// failure becomes 502.
// Test: send concurrent embed requests, assert they all complete without serialization.
func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest)
		return
	}

	resp, err := s.client.RawEmbed(r.Context(), body)
	if err != nil {
		s.logger.Error("embed proxy failed", "error", err)
		// errors.As also matches an HTTPError that was wrapped with %w on its
		// way up; a plain type assertion would downgrade that case to 502.
		var httpErr *ollama.HTTPError
		if errors.As(err, &httpErr) {
			http.Error(w, httpErr.Body, httpErr.StatusCode)
			return
		}
		http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
		return
	}
	defer resp.Body.Close()

	// Relay end-to-end headers only. Hop-by-hop headers describe the
	// foreman<->target connection and must not leak onto the client connection
	// (a copied "Connection: close" would make this server drop the client).
	for name, values := range resp.Header {
		switch http.CanonicalHeaderKey(name) {
		case "Connection", "Proxy-Connection", "Keep-Alive", "Proxy-Authenticate",
			"Proxy-Authorization", "Te", "Trailer", "Transfer-Encoding", "Upgrade":
			continue
		}
		for _, v := range values {
			w.Header().Add(name, v)
		}
	}
	if w.Header().Get("Content-Type") == "" {
		w.Header().Set("Content-Type", "application/json")
	}
	w.WriteHeader(resp.StatusCode)

	if _, err := io.Copy(w, resp.Body); err != nil {
		// The status line is already sent; record the truncated relay.
		s.logger.Warn("embed response copy failed", "error", err)
	}
}
// handleChat is the critical passthrough path for /api/chat. It validates the
// model, serializes through a single-flight gate, and proxies to the target
// with NDJSON streaming support.
//
// Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
// What: validates model, acquires the chat gate, proxies to the target, streams
// NDJSON chunks back if streaming, releases the gate on completion. The
// target's status code is relayed (as handleEmbed does), an *ollama.HTTPError
// anywhere in the error chain is relayed with its own status and body, and a
// failed write to the client ends the relay so the gate is released promptly.
// Test: verify model validation (404 on unknown), serialization (two concurrent
// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		http.Error(w, `{"error":"failed to read request body"}`, http.StatusBadRequest)
		return
	}

	// Parse just enough to validate the model and detect streaming.
	var partial struct {
		Model  string `json:"model"`
		Stream *bool  `json:"stream"`
	}
	if err := json.Unmarshal(body, &partial); err != nil {
		http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
		return
	}
	if partial.Model == "" {
		http.Error(w, `{"error":"model is required"}`, http.StatusBadRequest)
		return
	}

	// Validate the model exists. One re-poll on miss (ADR-0007).
	if !s.inventory.HasModel(partial.Model) {
		if err := s.inventory.Refresh(r.Context()); err != nil {
			s.logger.Warn("model re-poll failed", "error", err)
		}
		if !s.inventory.HasModel(partial.Model) {
			http.Error(w, `{"error":"model not found"}`, http.StatusNotFound)
			return
		}
	}

	// Ollama defaults to streaming when "stream" is absent.
	streaming := partial.Stream == nil || *partial.Stream

	// Acquire the single-flight chat gate. This serializes all chat requests
	// through one at a time. Phase 3 replaces this with the full SQLite queue +
	// worker loop.
	select {
	case s.chatGate <- struct{}{}:
		// Acquired.
	case <-r.Context().Done():
		http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
		return
	}
	defer func() { <-s.chatGate }()

	// Proxy to the target.
	resp, err := s.client.RawChat(r.Context(), body)
	if err != nil {
		s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
		// errors.As also matches an HTTPError that was wrapped with %w on its
		// way up; a plain type assertion would downgrade that case to 502.
		var httpErr *ollama.HTTPError
		if errors.As(err, &httpErr) {
			http.Error(w, httpErr.Body, httpErr.StatusCode)
			return
		}
		http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
		return
	}
	defer resp.Body.Close()

	// Relay the target's status instead of masking it as 200. Hand-built
	// responses without a final status code keep the previous 200 behaviour,
	// because WriteHeader rejects out-of-range codes.
	status := resp.StatusCode
	if status < 200 || status > 999 {
		status = http.StatusOK
	}

	if !streaming {
		// Non-streaming: proxy the complete JSON response.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(status)
		if _, err := io.Copy(w, resp.Body); err != nil {
			s.logger.Warn("chat response copy failed", "error", err, "model", partial.Model)
		}
		return
	}

	w.Header().Set("Content-Type", "application/x-ndjson")
	w.WriteHeader(status)

	flusher, canFlush := w.(http.Flusher)
	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)

	newline := []byte("\n")
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue
		}
		// A failed write means the client is gone; stop relaying so the gate
		// is handed to the next request instead of draining the target.
		if _, err := w.Write(line); err != nil {
			s.logger.Warn("stream write failed", "error", err, "model", partial.Model)
			return
		}
		if _, err := w.Write(newline); err != nil {
			s.logger.Warn("stream write failed", "error", err, "model", partial.Model)
			return
		}
		// Flush per chunk so tokens reach the client as they are produced.
		if canFlush {
			flusher.Flush()
		}
	}
	if err := scanner.Err(); err != nil {
		s.logger.Warn("stream read error", "error", err, "model", partial.Model)
	}
}
// authMiddleware validates the Authorization: Bearer <token> header on all
// requests except /healthz. Returns 401 if the token is missing or wrong.
func (s *Server) authMiddleware(next http.Handler) http.Handler {
+461 -19
View File
@@ -1,19 +1,29 @@
package server
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
)
// newTestServer creates a Server backed by a temp-dir SQLite store.
func newTestServer(t *testing.T, cfg config.Config) *Server {
// newTestServer creates a Server backed by a temp-dir SQLite store, a stub client,
// and a pre-populated inventory.
func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Server {
t.Helper()
dbPath := filepath.Join(t.TempDir(), "test.db")
st, err := store.Open(dbPath)
@@ -23,13 +33,28 @@ func newTestServer(t *testing.T, cfg config.Config) *Server {
t.Cleanup(func() { st.Close() })
logger := slog.Default()
return New(cfg, st, logger)
inv := ollama.NewModelInventory(client, logger)
return New(cfg, st, client, inv, logger)
}
// newTestServerWithInventory builds a test Server via newTestServer and runs
// one inventory refresh against the supplied client, failing the test
// immediately if that refresh reports an error.
func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) *Server {
	t.Helper()

	server := newTestServer(t, cfg, client)
	err := server.inventory.Refresh(context.Background())
	if err != nil {
		t.Fatalf("inventory.Refresh: %v", err)
	}
	return server
}
func TestHealthz_OK(t *testing.T) {
srv := newTestServer(t, config.Config{
stub := &stubClient{
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
})
}, stub)
req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
rec := httptest.NewRecorder()
@@ -52,12 +77,15 @@ func TestHealthz_OK(t *testing.T) {
}
func TestHealthz_NoAuthRequired(t *testing.T) {
srv := newTestServer(t, config.Config{
stub := &stubClient{
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
Token: "secret-token",
})
}, stub)
// /healthz should work without any auth header even when token is configured.
req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
@@ -68,16 +96,20 @@ func TestHealthz_NoAuthRequired(t *testing.T) {
}
func TestAuth_RequiredWhenTokenSet(t *testing.T) {
srv := newTestServer(t, config.Config{
stub := &stubClient{
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
Token: "secret-token",
})
}, stub)
tests := []struct {
name string
path string
auth string
want int
name string
path string
auth string
want int
}{
{
name: "no auth header",
@@ -123,13 +155,14 @@ func TestAuth_RequiredWhenTokenSet(t *testing.T) {
}
func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
srv := newTestServer(t, config.Config{
stub := &stubClient{
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
// Token intentionally empty.
})
}, stub)
// Without a configured token, any request should pass auth (even to a
// nonexistent route, which returns 404 rather than 401).
req := httptest.NewRequest(http.MethodGet, "/some-route", nil)
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
@@ -138,3 +171,412 @@ func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
t.Error("should not require auth when no token is configured")
}
}
func TestTags_ReturnsCachedModels(t *testing.T) {
stub := &stubClient{
tags: &ollama.TagsResponse{
Models: []ollama.ModelInfo{
{Name: "qwen3:30b", Model: "qwen3:30b", Size: 19000000000},
{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
},
},
ps: &ollama.PsResponse{},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
req := httptest.NewRequest(http.MethodGet, "/api/tags", nil)
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
}
var resp ollama.TagsResponse
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
t.Fatalf("decode: %v", err)
}
if len(resp.Models) != 2 {
t.Fatalf("got %d models, want 2", len(resp.Models))
}
if resp.Models[0].Name != "qwen3:30b" {
t.Errorf("first model = %q, want %q", resp.Models[0].Name, "qwen3:30b")
}
}
func TestPs_ReturnsCachedRunningModels(t *testing.T) {
stub := &stubClient{
tags: &ollama.TagsResponse{},
ps: &ollama.PsResponse{
Models: []ollama.RunningModel{
{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
},
},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
req := httptest.NewRequest(http.MethodGet, "/api/ps", nil)
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
}
var resp ollama.PsResponse
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
t.Fatalf("decode: %v", err)
}
if len(resp.Models) != 1 {
t.Fatalf("got %d models, want 1", len(resp.Models))
}
}
// TestChat_UnknownModel404 checks that POST /api/chat for a model missing from
// the inventory is answered with 404.
func TestChat_UnknownModel404(t *testing.T) {
	client := &stubClient{
		tags: &ollama.TagsResponse{Models: []ollama.ModelInfo{{Name: "qwen3:30b"}}},
		ps:   &ollama.PsResponse{},
	}
	srv := newTestServerWithInventory(t, config.Config{OllamaURL: "http://localhost:11434"}, client)

	const payload = `{"model":"nonexistent-model","messages":[{"role":"user","content":"hi"}],"stream":false}`
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(payload)))

	if got, want := rec.Code, http.StatusNotFound; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestChat_NonStreaming checks that a chat request with "stream":false gets
// the target's JSON document back with status 200 and an application/json
// content type.
func TestChat_NonStreaming(t *testing.T) {
	// Marshal error deliberately ignored, as in the fixture's original form.
	upstream, _ := json.Marshal(ollama.ChatResponse{
		Model:   "qwen3:30b",
		Done:    true,
		Message: &ollama.Message{Role: "assistant", Content: "Hello!"},
	})

	client := &stubClient{
		tags:        &ollama.TagsResponse{Models: []ollama.ModelInfo{{Name: "qwen3:30b"}}},
		ps:          &ollama.PsResponse{},
		rawChatResp: newRawResponse(200, "application/json", upstream),
	}
	srv := newTestServerWithInventory(t, config.Config{OllamaURL: "http://localhost:11434"}, client)

	const payload = `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(payload)))

	if got, want := rec.Code, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if ct := rec.Header().Get("Content-Type"); ct != "application/json" {
		t.Errorf("Content-Type = %q, want %q", ct, "application/json")
	}

	var got ollama.ChatResponse
	if err := json.NewDecoder(rec.Body).Decode(&got); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if contentOK := got.Message != nil && got.Message.Content == "Hello!"; !contentOK {
		t.Errorf("response content = %v, want Hello!", got.Message)
	}
}
// TestChat_Streaming checks that a chat request without a "stream" field is
// relayed as NDJSON: status 200, an application/x-ndjson content type, one
// output line per upstream chunk, and a final chunk with done=true.
func TestChat_Streaming(t *testing.T) {
	// Encoder.Encode emits each value followed by a newline, which is exactly
	// the NDJSON framing the target produces.
	var upstream bytes.Buffer
	enc := json.NewEncoder(&upstream)
	for _, chunk := range []ollama.ChatResponse{
		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "Hel"}},
		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "lo"}},
		{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
	} {
		_ = enc.Encode(chunk)
	}

	client := &stubClient{
		tags:        &ollama.TagsResponse{Models: []ollama.ModelInfo{{Name: "qwen3:30b"}}},
		ps:          &ollama.PsResponse{},
		rawChatResp: newRawResponse(200, "application/x-ndjson", upstream.Bytes()),
	}
	srv := newTestServerWithInventory(t, config.Config{OllamaURL: "http://localhost:11434"}, client)

	const payload = `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}]}`
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(payload)))

	if got, want := rec.Code, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if ct := rec.Header().Get("Content-Type"); ct != "application/x-ndjson" {
		t.Errorf("Content-Type = %q, want %q", ct, "application/x-ndjson")
	}

	// Verify chunks pass through faithfully.
	relayed := strings.Split(strings.TrimSpace(rec.Body.String()), "\n")
	if n := len(relayed); n != 3 {
		t.Fatalf("got %d lines, want 3", n)
	}
	var final ollama.ChatResponse
	if err := json.Unmarshal([]byte(relayed[2]), &final); err != nil {
		t.Fatalf("unmarshal last chunk: %v", err)
	}
	if !final.Done {
		t.Error("last chunk should have done=true")
	}
}
func TestChat_Serialization(t *testing.T) {
// Track concurrent requests at the stub.
var inflight atomic.Int32
var maxInflight atomic.Int32
stub := &stubClient{
tags: &ollama.TagsResponse{
Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
},
ps: &ollama.PsResponse{},
rawChatFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
cur := inflight.Add(1)
defer inflight.Add(-1)
for {
old := maxInflight.Load()
if cur <= old || maxInflight.CompareAndSwap(old, cur) {
break
}
}
// Simulate work.
time.Sleep(50 * time.Millisecond)
resp := ollama.ChatResponse{Model: "qwen3:30b", Done: true}
b, _ := json.Marshal(resp)
return newRawResponse(200, "application/json", b), nil
},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
var wg sync.WaitGroup
for i := 0; i < 3; i++ {
wg.Add(1)
go func() {
defer wg.Done()
body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Errorf("status = %d, want %d", rec.Code, http.StatusOK)
}
}()
}
wg.Wait()
if got := maxInflight.Load(); got > 1 {
t.Errorf("max concurrent chat requests at target = %d, want 1 (gate should serialize)", got)
}
}
func TestEmbed_ConcurrentBypassesGate(t *testing.T) {
// Track concurrent embed requests.
var inflight atomic.Int32
var maxInflight atomic.Int32
stub := &stubClient{
tags: &ollama.TagsResponse{
Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
},
ps: &ollama.PsResponse{},
rawEmbedFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
cur := inflight.Add(1)
defer inflight.Add(-1)
for {
old := maxInflight.Load()
if cur <= old || maxInflight.CompareAndSwap(old, cur) {
break
}
}
// Simulate some work so concurrent requests overlap.
time.Sleep(50 * time.Millisecond)
resp := ollama.EmbedResponse{Model: "nomic-embed-text", Embeddings: [][]float64{{0.1, 0.2}}}
b, _ := json.Marshal(resp)
return newRawResponse(200, "application/json", b), nil
},
}
srv := newTestServerWithInventory(t, config.Config{
OllamaURL: "http://localhost:11434",
}, stub)
var wg sync.WaitGroup
for i := 0; i < 5; i++ {
wg.Add(1)
go func() {
defer wg.Done()
body := `{"model":"nomic-embed-text","input":"test"}`
req := httptest.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(body))
rec := httptest.NewRecorder()
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Errorf("embed status = %d, want %d", rec.Code, http.StatusOK)
}
}()
}
wg.Wait()
if got := maxInflight.Load(); got < 2 {
t.Errorf("max concurrent embed requests = %d, want >= 2 (embeds should run in parallel)", got)
}
}
// TestEmbed_AlsoWorksOnEmbeddingsPath checks that POST /api/embeddings reaches
// the embed stub and answers 200.
func TestEmbed_AlsoWorksOnEmbeddingsPath(t *testing.T) {
	upstream, _ := json.Marshal(ollama.EmbedResponse{
		Model:      "nomic-embed-text",
		Embeddings: [][]float64{{0.1, 0.2, 0.3}},
	})

	client := &stubClient{
		tags: &ollama.TagsResponse{},
		ps:   &ollama.PsResponse{},
		rawEmbedFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
			return newRawResponse(200, "application/json", upstream), nil
		},
	}
	srv := newTestServerWithInventory(t, config.Config{OllamaURL: "http://localhost:11434"}, client)

	const payload = `{"model":"nomic-embed-text","input":"test"}`
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/api/embeddings", strings.NewReader(payload)))

	if got, want := rec.Code, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestHealthz_DegradedFromInventory checks that /healthz still answers 200 but
// reports degraded=true after an inventory refresh against a failing client.
func TestHealthz_DegradedFromInventory(t *testing.T) {
	client := &stubClient{
		tagsErr: fmt.Errorf("connection refused"),
		ps:      &ollama.PsResponse{},
	}
	srv := newTestServer(t, config.Config{OllamaURL: "http://localhost:11434"}, client)

	// Refresh will fail, setting degraded = true; the returned error is not
	// the subject of this test.
	_ = srv.inventory.Refresh(context.Background())

	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/healthz", nil))

	if got, want := rec.Code, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	var health healthResponse
	if err := json.NewDecoder(rec.Body).Decode(&health); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if !health.Degraded {
		t.Error("expected degraded=true when inventory poll failed")
	}
}
// --- Stub client for testing ---
// stubClient is a canned ollama.Client for handler tests. Tags and Ps return
// the configured value unless the matching error field is set. RawChat and
// RawEmbed prefer the configured function, then the configured response, and
// otherwise fail with a "not configured" error. Chat and Embed always fail.
type stubClient struct {
	tags    *ollama.TagsResponse
	tagsErr error

	ps    *ollama.PsResponse
	psErr error

	rawChatResp *http.Response
	rawChatFunc func(ctx context.Context, body []byte) (*http.Response, error)

	rawEmbedResp *http.Response
	rawEmbedFunc func(ctx context.Context, body []byte) (*http.Response, error)
}

// Chat is not used by the handlers under test and always returns an error.
func (s *stubClient) Chat(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
	return nil, nil, fmt.Errorf("stubClient.Chat not implemented")
}

// Embed is not used by the handlers under test and always returns an error.
func (s *stubClient) Embed(ctx context.Context, req ollama.EmbedRequest) (*ollama.EmbedResponse, error) {
	return nil, fmt.Errorf("stubClient.Embed not implemented")
}

// Tags returns tagsErr when set, otherwise the configured tags value.
func (s *stubClient) Tags(ctx context.Context) (*ollama.TagsResponse, error) {
	if s.tagsErr == nil {
		return s.tags, nil
	}
	return nil, s.tagsErr
}

// Ps returns psErr when set, otherwise the configured ps value.
func (s *stubClient) Ps(ctx context.Context) (*ollama.PsResponse, error) {
	if s.psErr == nil {
		return s.ps, nil
	}
	return nil, s.psErr
}

// RawChat delegates to rawChatFunc if present, else hands back rawChatResp,
// else reports that nothing was configured.
func (s *stubClient) RawChat(ctx context.Context, body []byte) (*http.Response, error) {
	switch {
	case s.rawChatFunc != nil:
		return s.rawChatFunc(ctx, body)
	case s.rawChatResp != nil:
		return s.rawChatResp, nil
	default:
		return nil, fmt.Errorf("stubClient.RawChat not configured")
	}
}

// RawEmbed delegates to rawEmbedFunc if present, else hands back rawEmbedResp,
// else reports that nothing was configured.
func (s *stubClient) RawEmbed(ctx context.Context, body []byte) (*http.Response, error) {
	switch {
	case s.rawEmbedFunc != nil:
		return s.rawEmbedFunc(ctx, body)
	case s.rawEmbedResp != nil:
		return s.rawEmbedResp, nil
	default:
		return nil, fmt.Errorf("stubClient.RawEmbed not configured")
	}
}
// newRawResponse assembles a bare *http.Response for tests: the given status
// code, a single Content-Type header, and a body that reads back the supplied
// bytes. No other response fields are populated.
func newRawResponse(status int, contentType string, body []byte) *http.Response {
	header := make(http.Header, 1)
	header["Content-Type"] = []string{contentType}

	resp := new(http.Response)
	resp.StatusCode = status
	resp.Header = header
	resp.Body = io.NopCloser(bytes.NewReader(body))
	return resp
}