feat: add Ollama target client, model poller, and native passthrough

Phase 2 of foreman: the daemon now acts as a transparent Ollama proxy.

- internal/ollama: Client interface and HTTP implementation for chat (streaming + non-streaming), embed, tags, and ps, with auth forwarding, NDJSON streaming via bufio.Scanner, and connection vs. HTTP error classification via custom error types.
- internal/ollama: ModelInventory with a background poller for /api/tags and /api/ps, degraded mode when the target is unreachable (with model retention), and automatic recovery on reconnect.
- internal/server: Passthrough routes (/api/chat, /api/tags, /api/ps, /api/embed, /api/embeddings) with model validation, a chat serialization gate (capacity-1 channel), concurrent embedding bypass (ADR-0013), NDJSON streaming with per-chunk flush, and degraded health reporting.
- cmd/foreman: Full serve wiring with the Ollama client, poller goroutine, embedder warmup (keep_alive:-1), and signal-based shutdown.

The Mac is now usable as a go-llm target through foreman.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:07:33 -04:00
parent 9cdf4b2472
commit 27f196d333
10 changed files with 1877 additions and 39 deletions
@@ -1,19 +1,29 @@
 package server

 import (
+	"bytes"
+	"context"
 	"encoding/json"
+	"fmt"
+	"io"
 	"log/slog"
 	"net/http"
 	"net/http/httptest"
 	"path/filepath"
+	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
+	"time"

 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
 )

-// newTestServer creates a Server backed by a temp-dir SQLite store.
-func newTestServer(t *testing.T, cfg config.Config) *Server {
+// newTestServer creates a Server backed by a temp-dir SQLite store, a stub client,
+// and a pre-populated inventory.
+func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Server {
 	t.Helper()
 	dbPath := filepath.Join(t.TempDir(), "test.db")
 	st, err := store.Open(dbPath)
@@ -23,13 +33,28 @@ func newTestServer(t *testing.T, cfg config.Config) *Server {
 	t.Cleanup(func() { st.Close() })

 	logger := slog.Default()
-	return New(cfg, st, logger)
+	inv := ollama.NewModelInventory(client, logger)
+	return New(cfg, st, client, inv, logger)
+}
+
+// newTestServerWithInventory builds a Server through newTestServer and then runs
+// one inventory refresh, aborting the test if that refresh returns an error.
+func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) *Server {
+	t.Helper()
+
+	s := newTestServer(t, cfg, client)
+	refreshErr := s.inventory.Refresh(context.Background())
+	if refreshErr != nil {
+		t.Fatalf("inventory.Refresh: %v", refreshErr)
+	}
+	return s
 }

 func TestHealthz_OK(t *testing.T) {
-	srv := newTestServer(t, config.Config{
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
-	})
+	}, stub)

 	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
 	rec := httptest.NewRecorder()
@@ -52,12 +77,15 @@ func TestHealthz_OK(t *testing.T) {
 }

 func TestHealthz_NoAuthRequired(t *testing.T) {
-	srv := newTestServer(t, config.Config{
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
-	})
+	}, stub)

-	// /healthz should work without any auth header even when token is configured.
 	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
 	rec := httptest.NewRecorder()
 	srv.Handler().ServeHTTP(rec, req)
@@ -68,16 +96,20 @@ func TestHealthz_NoAuthRequired(t *testing.T) {
 }

 func TestAuth_RequiredWhenTokenSet(t *testing.T) {
-	srv := newTestServer(t, config.Config{
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
-	})
+	}, stub)

 	tests := []struct {
-		name   string
-		path   string
-		auth   string
-		want   int
+		name string
+		path string
+		auth string
+		want int
 	}{
 		{
 			name: "no auth header",
@@ -123,13 +155,14 @@ func TestAuth_RequiredWhenTokenSet(t *testing.T) {
 }

 func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
-	srv := newTestServer(t, config.Config{
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
-		// Token intentionally empty.
-	})
+	}, stub)

-	// Without a configured token, any request should pass auth (even to a
-	// nonexistent route, which returns 404 rather than 401).
 	req := httptest.NewRequest(http.MethodGet, "/some-route", nil)
 	rec := httptest.NewRecorder()
 	srv.Handler().ServeHTTP(rec, req)
@@ -138,3 +171,412 @@ func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
 		t.Error("should not require auth when no token is configured")
 	}
 }
+
+// TestTags_ReturnsCachedModels checks that GET /api/tags answers 200 with the
+// two models the stub client reported at refresh time, and that the first of
+// them is qwen3:30b.
+func TestTags_ReturnsCachedModels(t *testing.T) {
+	models := []ollama.ModelInfo{
+		{Name: "qwen3:30b", Model: "qwen3:30b", Size: 19000000000},
+		{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
+	}
+	client := &stubClient{
+		tags: &ollama.TagsResponse{Models: models},
+		ps:   &ollama.PsResponse{},
+	}
+	cfg := config.Config{OllamaURL: "http://localhost:11434"}
+	srv := newTestServerWithInventory(t, cfg, client)
+
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/tags", nil))
+
+	if got, want := rec.Code, http.StatusOK; got != want {
+		t.Fatalf("status = %d, want %d", got, want)
+	}
+
+	var decoded ollama.TagsResponse
+	if err := json.NewDecoder(rec.Body).Decode(&decoded); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if n := len(decoded.Models); n != 2 {
+		t.Fatalf("got %d models, want 2", n)
+	}
+	if first := decoded.Models[0].Name; first != "qwen3:30b" {
+		t.Errorf("first model = %q, want %q", first, "qwen3:30b")
+	}
+}
+
+// TestPs_ReturnsCachedRunningModels checks that GET /api/ps answers 200 and
+// lists exactly one model, matching what the stub client reported at refresh time.
+func TestPs_ReturnsCachedRunningModels(t *testing.T) {
+	running := []ollama.RunningModel{
+		{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
+	}
+	client := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{Models: running},
+	}
+	cfg := config.Config{OllamaURL: "http://localhost:11434"}
+	srv := newTestServerWithInventory(t, cfg, client)
+
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ps", nil))
+
+	if got, want := rec.Code, http.StatusOK; got != want {
+		t.Fatalf("status = %d, want %d", got, want)
+	}
+
+	var decoded ollama.PsResponse
+	if err := json.NewDecoder(rec.Body).Decode(&decoded); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if n := len(decoded.Models); n != 1 {
+		t.Fatalf("got %d models, want 1", n)
+	}
+}
+
+// TestChat_UnknownModel404 posts a chat request naming a model that is absent
+// from the stub's tag list and expects the server to answer 404.
+func TestChat_UnknownModel404(t *testing.T) {
+	client := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps: &ollama.PsResponse{},
+	}
+	cfg := config.Config{OllamaURL: "http://localhost:11434"}
+	srv := newTestServerWithInventory(t, cfg, client)
+
+	const payload = `{"model":"nonexistent-model","messages":[{"role":"user","content":"hi"}],"stream":false}`
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(payload)))
+
+	if got, want := rec.Code, http.StatusNotFound; got != want {
+		t.Fatalf("status = %d, want %d", got, want)
+	}
+}
+
+// TestChat_NonStreaming posts a stream=false chat request for a known model and
+// expects the stub's canned JSON response to come back with status 200, an
+// application/json Content-Type, and the assistant message content intact.
+func TestChat_NonStreaming(t *testing.T) {
+	chatResp := ollama.ChatResponse{
+		Model:   "qwen3:30b",
+		Done:    true,
+		Message: &ollama.Message{Role: "assistant", Content: "Hello!"},
+	}
+	// A marshal failure would otherwise surface later as a confusing decode error.
+	respBytes, err := json.Marshal(chatResp)
+	if err != nil {
+		t.Fatalf("marshal chat response: %v", err)
+	}
+
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps:          &ollama.PsResponse{},
+		rawChatResp: newRawResponse(200, "application/json", respBytes),
+	}
+	srv := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
+	req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
+	}
+
+	ct := rec.Header().Get("Content-Type")
+	if ct != "application/json" {
+		t.Errorf("Content-Type = %q, want %q", ct, "application/json")
+	}
+
+	var got ollama.ChatResponse
+	if err := json.NewDecoder(rec.Body).Decode(&got); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if got.Message == nil || got.Message.Content != "Hello!" {
+		t.Errorf("response content = %v, want Hello!", got.Message)
+	}
+}
+
+// TestChat_Streaming posts a chat request without a "stream" field for a known
+// model, has the stub answer with three NDJSON chunks, and expects the response
+// to be 200, typed application/x-ndjson, to contain three lines, and to end
+// with a chunk whose done flag is true.
+func TestChat_Streaming(t *testing.T) {
+	// Build NDJSON chunks.
+	chunks := []ollama.ChatResponse{
+		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "Hel"}},
+		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "lo"}},
+		{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
+	}
+	var ndjson bytes.Buffer
+	enc := json.NewEncoder(&ndjson)
+	for _, c := range chunks {
+		// Encode writes the JSON value followed by a newline: one NDJSON record.
+		if err := enc.Encode(c); err != nil {
+			t.Fatalf("encode chunk: %v", err)
+		}
+	}
+
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps:          &ollama.PsResponse{},
+		rawChatResp: newRawResponse(200, "application/x-ndjson", ndjson.Bytes()),
+	}
+	srv := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}]}`
+	req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
+	}
+
+	ct := rec.Header().Get("Content-Type")
+	if ct != "application/x-ndjson" {
+		t.Errorf("Content-Type = %q, want %q", ct, "application/x-ndjson")
+	}
+
+	// Verify chunks pass through faithfully.
+	lines := strings.Split(strings.TrimSpace(rec.Body.String()), "\n")
+	if len(lines) != 3 {
+		t.Fatalf("got %d lines, want 3", len(lines))
+	}
+
+	var last ollama.ChatResponse
+	if err := json.Unmarshal([]byte(lines[2]), &last); err != nil {
+		t.Fatalf("unmarshal last chunk: %v", err)
+	}
+	if !last.Done {
+		t.Error("last chunk should have done=true")
+	}
+}
+
+// TestChat_Serialization fires three concurrent non-streaming chat requests and
+// expects that no more than one of them is ever inside the stub's RawChat at a
+// time, i.e. that the server serializes chat calls to the target.
+func TestChat_Serialization(t *testing.T) {
+	// Track concurrent requests at the stub.
+	var inflight atomic.Int32
+	var maxInflight atomic.Int32
+
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps: &ollama.PsResponse{},
+		rawChatFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
+			cur := inflight.Add(1)
+			defer inflight.Add(-1)
+			// Raise the high-water mark to cur unless a higher value is already recorded;
+			// retry when another goroutine changes it between Load and CompareAndSwap.
+			for {
+				old := maxInflight.Load()
+				if cur <= old || maxInflight.CompareAndSwap(old, cur) {
+					break
+				}
+			}
+			// Simulate work so overlapping calls would be observed.
+			time.Sleep(50 * time.Millisecond)
+			b, err := json.Marshal(ollama.ChatResponse{Model: "qwen3:30b", Done: true})
+			if err != nil {
+				return nil, fmt.Errorf("marshal stub chat response: %w", err)
+			}
+			return newRawResponse(200, "application/json", b), nil
+		},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	var wg sync.WaitGroup
+	for i := 0; i < 3; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
+			req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
+			rec := httptest.NewRecorder()
+			srv.Handler().ServeHTTP(rec, req)
+			// Errorf (not Fatalf) because this runs outside the test goroutine.
+			if rec.Code != http.StatusOK {
+				t.Errorf("status = %d, want %d", rec.Code, http.StatusOK)
+			}
+		}()
+	}
+	wg.Wait()
+
+	if got := maxInflight.Load(); got > 1 {
+		t.Errorf("max concurrent chat requests at target = %d, want 1 (gate should serialize)", got)
+	}
+}
+
+// TestEmbed_ConcurrentBypassesGate fires five concurrent POST /api/embed
+// requests and expects at least two of them to be inside the stub's RawEmbed at
+// the same time, i.e. that embed calls are not serialized.
+func TestEmbed_ConcurrentBypassesGate(t *testing.T) {
+	// Track concurrent embed requests.
+	var inflight atomic.Int32
+	var maxInflight atomic.Int32
+
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps: &ollama.PsResponse{},
+		rawEmbedFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
+			cur := inflight.Add(1)
+			defer inflight.Add(-1)
+			// Raise the high-water mark to cur unless a higher value is already recorded;
+			// retry when another goroutine changes it between Load and CompareAndSwap.
+			for {
+				old := maxInflight.Load()
+				if cur <= old || maxInflight.CompareAndSwap(old, cur) {
+					break
+				}
+			}
+			// Simulate some work so concurrent requests overlap.
+			time.Sleep(50 * time.Millisecond)
+			resp := ollama.EmbedResponse{Model: "nomic-embed-text", Embeddings: [][]float64{{0.1, 0.2}}}
+			b, err := json.Marshal(resp)
+			if err != nil {
+				return nil, fmt.Errorf("marshal stub embed response: %w", err)
+			}
+			return newRawResponse(200, "application/json", b), nil
+		},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	var wg sync.WaitGroup
+	for i := 0; i < 5; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			body := `{"model":"nomic-embed-text","input":"test"}`
+			req := httptest.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(body))
+			rec := httptest.NewRecorder()
+			srv.Handler().ServeHTTP(rec, req)
+			// Errorf (not Fatalf) because this runs outside the test goroutine.
+			if rec.Code != http.StatusOK {
+				t.Errorf("embed status = %d, want %d", rec.Code, http.StatusOK)
+			}
+		}()
+	}
+	wg.Wait()
+
+	if got := maxInflight.Load(); got < 2 {
+		t.Errorf("max concurrent embed requests = %d, want >= 2 (embeds should run in parallel)", got)
+	}
+}
+
+// TestEmbed_AlsoWorksOnEmbeddingsPath posts an embed request to /api/embeddings
+// and expects a 200, with the stub's RawEmbed supplying the upstream response.
+func TestEmbed_AlsoWorksOnEmbeddingsPath(t *testing.T) {
+	embedResp := ollama.EmbedResponse{
+		Model:      "nomic-embed-text",
+		Embeddings: [][]float64{{0.1, 0.2, 0.3}},
+	}
+	// Fail fast on a marshal error instead of serving an empty stub body.
+	respBytes, err := json.Marshal(embedResp)
+	if err != nil {
+		t.Fatalf("marshal embed response: %v", err)
+	}
+
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{},
+		ps:   &ollama.PsResponse{},
+		rawEmbedFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
+			return newRawResponse(200, "application/json", respBytes), nil
+		},
+	}
+	srv := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	body := `{"model":"nomic-embed-text","input":"test"}`
+	req := httptest.NewRequest(http.MethodPost, "/api/embeddings", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
+	}
+}
+
+// TestHealthz_DegradedFromInventory makes the stub's Tags call fail, runs one
+// inventory refresh, and expects /healthz to still answer 200 while reporting
+// degraded=true in its body.
+func TestHealthz_DegradedFromInventory(t *testing.T) {
+	client := &stubClient{
+		tagsErr: fmt.Errorf("connection refused"),
+		ps:      &ollama.PsResponse{},
+	}
+	cfg := config.Config{OllamaURL: "http://localhost:11434"}
+	srv := newTestServer(t, cfg, client)
+
+	// The refresh result is deliberately discarded: this test only cares about
+	// the degraded state observed through /healthz afterwards.
+	_ = srv.inventory.Refresh(context.Background())
+
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/healthz", nil))
+
+	if got, want := rec.Code, http.StatusOK; got != want {
+		t.Fatalf("status = %d, want %d", got, want)
+	}
+
+	var health healthResponse
+	if err := json.NewDecoder(rec.Body).Decode(&health); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if !health.Degraded {
+		t.Error("expected degraded=true when inventory poll failed")
+	}
+}
+
+// --- Stub client for testing ---
+
+// stubClient implements ollama.Client for testing.
+//
+// Every field is optional; each test sets only the ones its scenario needs.
+type stubClient struct {
+	// Canned results for Tags and Ps. When the matching *Err field is non-nil,
+	// the method returns that error instead of the canned value.
+	tags    *ollama.TagsResponse
+	tagsErr error
+	ps      *ollama.PsResponse
+	psErr   error
+
+	// RawChat calls rawChatFunc when it is set; otherwise it returns rawChatResp.
+	rawChatResp *http.Response
+	rawChatFunc func(ctx context.Context, body []byte) (*http.Response, error)
+
+	// RawEmbed calls rawEmbedFunc when it is set; otherwise it returns rawEmbedResp.
+	rawEmbedResp *http.Response
+	rawEmbedFunc func(ctx context.Context, body []byte) (*http.Response, error)
+}
+
+// Chat is a placeholder on the stub: it ignores its arguments and always
+// returns a "not implemented" error.
+func (s *stubClient) Chat(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+	notImplemented := fmt.Errorf("stubClient.Chat not implemented")
+	return nil, nil, notImplemented
+}
+
+// Embed is a placeholder on the stub: it ignores its arguments and always
+// returns a "not implemented" error.
+func (s *stubClient) Embed(ctx context.Context, req ollama.EmbedRequest) (*ollama.EmbedResponse, error) {
+	notImplemented := fmt.Errorf("stubClient.Embed not implemented")
+	return nil, notImplemented
+}
+
+// Tags returns the configured tagsErr when it is non-nil, otherwise the canned
+// tags value.
+func (s *stubClient) Tags(ctx context.Context) (*ollama.TagsResponse, error) {
+	if err := s.tagsErr; err != nil {
+		return nil, err
+	}
+	return s.tags, nil
+}
+
+// Ps returns the configured psErr when it is non-nil, otherwise the canned ps
+// value.
+func (s *stubClient) Ps(ctx context.Context) (*ollama.PsResponse, error) {
+	if err := s.psErr; err != nil {
+		return nil, err
+	}
+	return s.ps, nil
+}
+
+// RawChat delegates to rawChatFunc when one is set, falls back to the canned
+// rawChatResp, and otherwise reports that the stub was not configured.
+func (s *stubClient) RawChat(ctx context.Context, body []byte) (*http.Response, error) {
+	switch {
+	case s.rawChatFunc != nil:
+		return s.rawChatFunc(ctx, body)
+	case s.rawChatResp != nil:
+		return s.rawChatResp, nil
+	default:
+		return nil, fmt.Errorf("stubClient.RawChat not configured")
+	}
+}
+
+// RawEmbed delegates to rawEmbedFunc when one is set, falls back to the canned
+// rawEmbedResp, and otherwise reports that the stub was not configured.
+func (s *stubClient) RawEmbed(ctx context.Context, body []byte) (*http.Response, error) {
+	switch {
+	case s.rawEmbedFunc != nil:
+		return s.rawEmbedFunc(ctx, body)
+	case s.rawEmbedResp != nil:
+		return s.rawEmbedResp, nil
+	default:
+		return nil, fmt.Errorf("stubClient.RawEmbed not configured")
+	}
+}
+
+// newRawResponse assembles a bare-bones *http.Response for tests: only the
+// status code, a single Content-Type header, and a readable body are populated.
+func newRawResponse(status int, contentType string, body []byte) *http.Response {
+	hdr := make(http.Header, 1)
+	hdr["Content-Type"] = []string{contentType}
+
+	resp := &http.Response{StatusCode: status, Header: hdr}
+	resp.Body = io.NopCloser(bytes.NewReader(body))
+	return resp
+}