feat: add Ollama target client, model poller, and native passthrough

Phase 2 of foreman: the daemon now acts as a transparent Ollama proxy. - internal/ollama: Client interface and HTTP implementation for chat (streaming + non-streaming), embed, tags, ps with auth forwarding, NDJSON streaming via bufio.Scanner, and connection vs HTTP error classification via custom error types. - internal/ollama: ModelInventory with background poller for /api/tags and /api/ps, degraded mode on target unreachable with model retention, automatic recovery on reconnect. - internal/server: Passthrough routes (/api/chat, /api/tags, /api/ps, /api/embed, /api/embeddings) with model validation, chat serialization gate (capacity-1 channel), concurrent embedding bypass (ADR-0013), NDJSON streaming with per-chunk flush, and degraded health reporting. - cmd/foreman: Full serve wiring with Ollama client, poller goroutine, embedder warmup (keep_alive:-1), and signal-based shutdown. The Mac is now usable as a go-llm target through foreman. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:07:33 -04:00
parent 9cdf4b2472
commit 27f196d333
10 changed files with 1877 additions and 39 deletions
@@ -0,0 +1,326 @@
+package ollama
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"strings"
+)
+
+// scannerBufSize is the buffer size for the NDJSON scanner (4 MB).
+// Large enough to handle big tool-call payloads in a single line.
+const scannerBufSize = 4 * 1024 * 1024
+
+// Client defines the interface for communicating with an Ollama target.
+//
+// Why: an interface allows the worker loop, passthrough handlers, and tests to
+// share a single contract and swap in stubs.
+// What: covers the four Ollama endpoints foreman uses (chat, embed, tags, and ps)
+// as parsed calls, plus raw pass-through variants of chat and embed.
+// Test: implement with a stub HTTP server; verify round-trip for each method.
+type Client interface {
+	// Chat sends a chat request. When stream is false, returns (*ChatResponse, nil, nil).
+	// When stream is true, returns (nil, <-chan ChatResponse, nil) with chunks delivered
+	// on the channel. The channel is closed when the stream ends.
+	Chat(ctx context.Context, req ChatRequest, stream bool) (*ChatResponse, <-chan ChatResponse, error)
+
+	// Embed sends an embedding request to /api/embed.
+	Embed(ctx context.Context, req EmbedRequest) (*EmbedResponse, error)
+
+	// Tags returns the list of installed models from /api/tags.
+	Tags(ctx context.Context) (*TagsResponse, error)
+
+	// Ps returns the list of currently-loaded models from /api/ps.
+	Ps(ctx context.Context) (*PsResponse, error)
+
+	// RawChat performs a raw proxied chat request, returning the http.Response for
+	// the caller to stream directly to a downstream client. The caller is responsible
+	// for closing the response body.
+	RawChat(ctx context.Context, body []byte) (*http.Response, error)
+
+	// RawEmbed performs a raw proxied embed request, returning the http.Response.
+	// The caller is responsible for closing the response body.
+	RawEmbed(ctx context.Context, body []byte) (*http.Response, error)
+}
+
+// httpClient is the concrete implementation of Client backed by net/http.
+type httpClient struct {
+	baseURL    string       // target base URL; NewClient strips trailing slashes
+	token      string       // bearer token; when empty no Authorization header is set
+	httpClient *http.Client // client used to execute every request
+}
+
+// NewClient builds the HTTP-backed Client for an Ollama target.
+//
+// Why: keeps base URL normalization, the auth token, and the underlying
+// http.Client in one place.
+// What: strips every trailing "/" from baseURL so endpoint paths can be appended
+// directly, then returns a Client bound to that URL and token.
+// Test: create with a httptest.Server URL, call Tags, verify correct request path.
+func NewClient(baseURL, token string) Client {
+	c := new(httpClient)
+	c.baseURL = strings.TrimRight(baseURL, "/")
+	c.token = token
+	c.httpClient = new(http.Client)
+	return c
+}
+
+// Chat sends a POST /api/chat to the Ollama target.
+//
+// Why: the worker loop and sync passthrough both need structured chat access.
+// What: POSTs the chat request, returns either a single response or a channel of
+// streamed chunks depending on the stream parameter. The request's Stream field
+// is always overwritten with the stream argument. Transport failures go through
+// wrapConnErr; non-2xx responses are returned as *HTTPError carrying at most
+// 4096 bytes of the response body.
+// In streaming mode a background goroutine owns the response body and closes
+// the channel when the stream ends or ctx is cancelled, so a caller that stops
+// reading early must cancel ctx to release the goroutine and the connection.
+// Test: stub a /api/chat endpoint returning NDJSON or a single JSON object; verify
+// both streaming and non-streaming paths.
+func (c *httpClient) Chat(ctx context.Context, req ChatRequest, stream bool) (*ChatResponse, <-chan ChatResponse, error) {
+	// req is a copy, so pointing at the stream parameter does not touch the caller's value.
+	req.Stream = &stream
+
+	body, err := json.Marshal(req)
+	if err != nil {
+		return nil, nil, fmt.Errorf("marshal chat request: %w", err)
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/api/chat", bytes.NewReader(body))
+	if err != nil {
+		return nil, nil, fmt.Errorf("create chat request: %w", err)
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+	c.setAuth(httpReq)
+
+	resp, err := c.httpClient.Do(httpReq)
+	if err != nil {
+		return nil, nil, c.wrapConnErr(err)
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		defer resp.Body.Close()
+		// Best-effort read of a bounded error body; a read failure just yields a shorter message.
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, nil, &HTTPError{StatusCode: resp.StatusCode, Body: string(errBody)}
+	}
+
+	if !stream {
+		defer resp.Body.Close()
+		var chatResp ChatResponse
+		if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil {
+			return nil, nil, fmt.Errorf("decode chat response: %w", err)
+		}
+		return &chatResp, nil, nil
+	}
+
+	// Streaming: read NDJSON lines and send on channel.
+	ch := make(chan ChatResponse, 64)
+	go func() {
+		defer close(ch)
+		defer resp.Body.Close()
+
+		scanner := bufio.NewScanner(resp.Body)
+		// Start with a small buffer and let the scanner grow it on demand; lines
+		// up to scannerBufSize are still accepted.
+		scanner.Buffer(make([]byte, 0, 64*1024), scannerBufSize)
+		for scanner.Scan() {
+			line := scanner.Bytes()
+			if len(line) == 0 {
+				continue
+			}
+			var chunk ChatResponse
+			if err := json.Unmarshal(line, &chunk); err != nil {
+				// Best-effort: lines that are not valid JSON are skipped.
+				continue
+			}
+			// Never block forever on a consumer that went away: give up as soon
+			// as the request context is cancelled.
+			select {
+			case ch <- chunk:
+			case <-ctx.Done():
+				return
+			}
+		}
+		// A scanner error (read failure or over-long line) ends the stream; the
+		// channel type has no error slot, so it shows up only as an early close.
+	}()
+
+	return nil, ch, nil
+}
+
+// Embed issues POST /api/embed against the Ollama target.
+//
+// Why: embedding requests bypass the queue and go directly to the target (ADR-0013).
+// What: marshals req, POSTs it as JSON with the auth header applied, and decodes
+// the reply. Transport failures go through wrapConnErr; a non-2xx status yields
+// *HTTPError with at most 4096 bytes of the body.
+// Test: stub /api/embed, send a request, verify embeddings in the response.
+func (c *httpClient) Embed(ctx context.Context, req EmbedRequest) (*EmbedResponse, error) {
+	payload, err := json.Marshal(req)
+	if err != nil {
+		return nil, fmt.Errorf("marshal embed request: %w", err)
+	}
+
+	const endpoint = "/api/embed"
+	outbound, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+endpoint, bytes.NewReader(payload))
+	if err != nil {
+		return nil, fmt.Errorf("create embed request: %w", err)
+	}
+	outbound.Header.Set("Content-Type", "application/json")
+	c.setAuth(outbound)
+
+	resp, err := c.httpClient.Do(outbound)
+	if err != nil {
+		return nil, c.wrapConnErr(err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; !(code >= 200 && code < 300) {
+		// Read failures are ignored on purpose: the snippet is diagnostic only.
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, &HTTPError{StatusCode: code, Body: string(snippet)}
+	}
+
+	result := new(EmbedResponse)
+	if err := json.NewDecoder(resp.Body).Decode(result); err != nil {
+		return nil, fmt.Errorf("decode embed response: %w", err)
+	}
+	return result, nil
+}
+
+// Tags issues GET /api/tags against the Ollama target.
+//
+// Why: the model poller needs the installed model list for inventory and validation.
+// What: sends the request with the auth header applied and decodes the JSON
+// reply. Transport failures go through wrapConnErr; a non-2xx status yields
+// *HTTPError with at most 4096 bytes of the body.
+// Test: stub /api/tags with a model list, verify Tags() returns it.
+func (c *httpClient) Tags(ctx context.Context) (*TagsResponse, error) {
+	const endpoint = "/api/tags"
+	outbound, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("create tags request: %w", err)
+	}
+	c.setAuth(outbound)
+
+	resp, err := c.httpClient.Do(outbound)
+	if err != nil {
+		return nil, c.wrapConnErr(err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; !(code >= 200 && code < 300) {
+		// Read failures are ignored on purpose: the snippet is diagnostic only.
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, &HTTPError{StatusCode: code, Body: string(snippet)}
+	}
+
+	result := new(TagsResponse)
+	if err := json.NewDecoder(resp.Body).Decode(result); err != nil {
+		return nil, fmt.Errorf("decode tags response: %w", err)
+	}
+	return result, nil
+}
+
+// Ps issues GET /api/ps against the Ollama target.
+//
+// Why: the poller and scheduler need to know which models are currently loaded.
+// What: sends the request with the auth header applied and decodes the JSON
+// reply. Transport failures go through wrapConnErr; a non-2xx status yields
+// *HTTPError with at most 4096 bytes of the body.
+// Test: stub /api/ps with running models, verify Ps() returns them.
+func (c *httpClient) Ps(ctx context.Context) (*PsResponse, error) {
+	const endpoint = "/api/ps"
+	outbound, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("create ps request: %w", err)
+	}
+	c.setAuth(outbound)
+
+	resp, err := c.httpClient.Do(outbound)
+	if err != nil {
+		return nil, c.wrapConnErr(err)
+	}
+	defer resp.Body.Close()
+
+	if code := resp.StatusCode; !(code >= 200 && code < 300) {
+		// Read failures are ignored on purpose: the snippet is diagnostic only.
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, &HTTPError{StatusCode: code, Body: string(snippet)}
+	}
+
+	result := new(PsResponse)
+	if err := json.NewDecoder(resp.Body).Decode(result); err != nil {
+		return nil, fmt.Errorf("decode ps response: %w", err)
+	}
+	return result, nil
+}
+
+// RawChat forwards an already-encoded body to POST /api/chat and hands back the
+// unparsed http.Response.
+//
+// Why: the passthrough handler needs raw access to the response body for NDJSON
+// streaming without double-parsing.
+// What: on a 2xx status the response is returned untouched and the caller must
+// close its body. On any other status the body is closed here and *HTTPError
+// (with at most 4096 bytes of the body) is returned instead. Transport failures
+// go through wrapConnErr.
+// Test: stub /api/chat, call RawChat, verify response status and body forwarding.
+func (c *httpClient) RawChat(ctx context.Context, body []byte) (*http.Response, error) {
+	outbound, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/api/chat", bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("create raw chat request: %w", err)
+	}
+	outbound.Header.Set("Content-Type", "application/json")
+	c.setAuth(outbound)
+
+	resp, err := c.httpClient.Do(outbound)
+	if err != nil {
+		return nil, c.wrapConnErr(err)
+	}
+
+	if code := resp.StatusCode; code >= 200 && code < 300 {
+		return resp, nil
+	}
+
+	defer resp.Body.Close()
+	// Read failures are ignored on purpose: the snippet is diagnostic only.
+	snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+	return nil, &HTTPError{StatusCode: resp.StatusCode, Body: string(snippet)}
+}
+
+// RawEmbed forwards an already-encoded body to POST /api/embed and hands back
+// the unparsed http.Response.
+//
+// Why: the embed passthrough handler proxies the raw body/response without parsing.
+// What: on a 2xx status the response is returned untouched and the caller must
+// close its body. On any other status the body is closed here and *HTTPError
+// (with at most 4096 bytes of the body) is returned instead. Transport failures
+// go through wrapConnErr.
+// Test: stub /api/embed, call RawEmbed, verify response forwarding.
+func (c *httpClient) RawEmbed(ctx context.Context, body []byte) (*http.Response, error) {
+	outbound, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/api/embed", bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("create raw embed request: %w", err)
+	}
+	outbound.Header.Set("Content-Type", "application/json")
+	c.setAuth(outbound)
+
+	resp, err := c.httpClient.Do(outbound)
+	if err != nil {
+		return nil, c.wrapConnErr(err)
+	}
+
+	if code := resp.StatusCode; code >= 200 && code < 300 {
+		return resp, nil
+	}
+
+	defer resp.Body.Close()
+	// Read failures are ignored on purpose: the snippet is diagnostic only.
+	snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+	return nil, &HTTPError{StatusCode: resp.StatusCode, Body: string(snippet)}
+}
+
+// setAuth attaches "Authorization: Bearer <token>" to req; with no token
+// configured the request is left untouched.
+func (c *httpClient) setAuth(req *http.Request) {
+	if c.token == "" {
+		return
+	}
+	req.Header.Set("Authorization", "Bearer "+c.token)
+}
+
+// wrapConnErr checks if the error is a network-level failure and wraps it in a
+// ConnectionError. Non-network errors are returned as-is.
+func (c *httpClient) wrapConnErr(err error) error {
+	if err == nil {
+		return nil
+	}
+	// Check for common network error types.
+	if _, ok := err.(*net.OpError); ok {
+		return &ConnectionError{URL: c.baseURL, Err: err}
+	}
+	if _, ok := err.(net.Error); ok {
+		return &ConnectionError{URL: c.baseURL, Err: err}
+	}
+	// Also catch connection refused, DNS errors, etc. that might be wrapped.
+	if isConnectionError(err) {
+		return &ConnectionError{URL: c.baseURL, Err: err}
+	}
+	return err
+}
+
+// isConnectionError reports whether err's message contains one of the known
+// connection-level failure phrases (refused, DNS miss, unreachable, timeout, dial).
+func isConnectionError(err error) bool {
+	text := err.Error()
+	for _, marker := range [...]string{
+		"connection refused",
+		"no such host",
+		"network is unreachable",
+		"i/o timeout",
+		"dial tcp",
+	} {
+		if strings.Contains(text, marker) {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,279 @@
+package ollama
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+)
+
+// TestTags_ParsesResponse serves a two-model /api/tags reply from a stub server
+// and checks that Tags hits that path and decodes the models in order.
+func TestTags_ParsesResponse(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/tags" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(TagsResponse{
+			Models: []ModelInfo{
+				{Name: "qwen3:30b", Model: "qwen3:30b", Size: 19000000000},
+				{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
+			},
+		})
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, err := client.Tags(context.Background())
+	if err != nil {
+		t.Fatalf("Tags: %v", err)
+	}
+	if len(resp.Models) != 2 {
+		t.Fatalf("got %d models, want 2", len(resp.Models))
+	}
+	if resp.Models[0].Name != "qwen3:30b" {
+		t.Errorf("first model = %q, want %q", resp.Models[0].Name, "qwen3:30b")
+	}
+}
+
+// TestPs_ParsesResponse serves a one-model /api/ps reply from a stub server and
+// checks that Ps hits that path and decodes the running model list.
+func TestPs_ParsesResponse(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/ps" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(PsResponse{
+			Models: []RunningModel{
+				{Name: "nomic-embed-text", Model: "nomic-embed-text", Size: 300000000},
+			},
+		})
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, err := client.Ps(context.Background())
+	if err != nil {
+		t.Fatalf("Ps: %v", err)
+	}
+	if len(resp.Models) != 1 {
+		t.Fatalf("got %d models, want 1", len(resp.Models))
+	}
+}
+
+// TestChat_NonStreaming checks the stream=false path of Chat: the server must
+// not receive stream=true, and the client must return a parsed response with a
+// nil channel.
+func TestChat_NonStreaming(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		var req ChatRequest
+		if err := json.Unmarshal(body, &req); err != nil {
+			t.Errorf("unmarshal: %v", err)
+		}
+		// NOTE(review): a missing "stream" field also passes this check.
+		if req.Stream != nil && *req.Stream {
+			t.Error("expected stream=false")
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(ChatResponse{
+			Model:   "qwen3:30b",
+			Done:    true,
+			Message: &Message{Role: "assistant", Content: "Hello!"},
+		})
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, ch, err := client.Chat(context.Background(), ChatRequest{
+		Model:    "qwen3:30b",
+		Messages: []Message{{Role: "user", Content: "hi"}},
+	}, false)
+	if err != nil {
+		t.Fatalf("Chat: %v", err)
+	}
+	if ch != nil {
+		t.Error("expected nil channel for non-streaming")
+	}
+	if resp == nil {
+		t.Fatal("expected non-nil response")
+	}
+	if resp.Message.Content != "Hello!" {
+		t.Errorf("content = %q, want %q", resp.Message.Content, "Hello!")
+	}
+}
+
+// TestChat_Streaming checks the stream=true path of Chat: the stub writes three
+// NDJSON lines, flushing after each, and the test expects exactly three chunks
+// on the channel with the last one marked done.
+func TestChat_Streaming(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/x-ndjson")
+		flusher := w.(http.Flusher)
+
+		chunks := []ChatResponse{
+			{Model: "qwen3:30b", Done: false, Message: &Message{Role: "assistant", Content: "Hel"}},
+			{Model: "qwen3:30b", Done: false, Message: &Message{Role: "assistant", Content: "lo"}},
+			{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
+		}
+		for _, c := range chunks {
+			b, _ := json.Marshal(c)
+			w.Write(b)
+			w.Write([]byte("\n"))
+			flusher.Flush()
+		}
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, ch, err := client.Chat(context.Background(), ChatRequest{
+		Model:    "qwen3:30b",
+		Messages: []Message{{Role: "user", Content: "hi"}},
+	}, true)
+	if err != nil {
+		t.Fatalf("Chat: %v", err)
+	}
+	if resp != nil {
+		t.Error("expected nil response for streaming")
+	}
+	if ch == nil {
+		t.Fatal("expected non-nil channel for streaming")
+	}
+
+	// Ranging until the channel closes also proves the client closes it at end of stream.
+	var collected []ChatResponse
+	for c := range ch {
+		collected = append(collected, c)
+	}
+	if len(collected) != 3 {
+		t.Fatalf("got %d chunks, want 3", len(collected))
+	}
+	if !collected[2].Done {
+		t.Error("last chunk should have done=true")
+	}
+}
+
+// TestEmbed_ParsesResponse serves a single three-dimensional embedding from a
+// stub /api/embed and checks that Embed hits that path and decodes the vector.
+func TestEmbed_ParsesResponse(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/embed" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(EmbedResponse{
+			Model:      "nomic-embed-text",
+			Embeddings: [][]float64{{0.1, 0.2, 0.3}},
+		})
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, err := client.Embed(context.Background(), EmbedRequest{
+		Model: "nomic-embed-text",
+		Input: json.RawMessage(`"test input"`),
+	})
+	if err != nil {
+		t.Fatalf("Embed: %v", err)
+	}
+	if len(resp.Embeddings) != 1 {
+		t.Fatalf("got %d embeddings, want 1", len(resp.Embeddings))
+	}
+	if len(resp.Embeddings[0]) != 3 {
+		t.Errorf("embedding dim = %d, want 3", len(resp.Embeddings[0]))
+	}
+}
+
+// TestClient_SetsAuthToken checks that a client built with a token sends it as
+// an "Authorization: Bearer <token>" header.
+func TestClient_SetsAuthToken(t *testing.T) {
+	var gotAuth string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotAuth = r.Header.Get("Authorization")
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(TagsResponse{})
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "my-secret-token")
+	_, err := client.Tags(context.Background())
+	if err != nil {
+		t.Fatalf("Tags: %v", err)
+	}
+	if gotAuth != "Bearer my-secret-token" {
+		t.Errorf("auth header = %q, want %q", gotAuth, "Bearer my-secret-token")
+	}
+}
+
+// TestClient_ConnectionError checks that a failed dial surfaces as *ConnectionError.
+func TestClient_ConnectionError(t *testing.T) {
+	// Dial local port 1, where no server is expected to be listening.
+	client := NewClient("http://127.0.0.1:1", "")
+	_, err := client.Tags(context.Background())
+	if err == nil {
+		t.Fatal("expected error for unreachable target")
+	}
+
+	var connErr *ConnectionError
+	if !errors.As(err, &connErr) {
+		t.Errorf("expected *ConnectionError, got %T: %v", err, err)
+	}
+}
+
+// TestClient_HTTPError checks that a non-2xx reply surfaces as *HTTPError
+// carrying the upstream status code.
+func TestClient_HTTPError(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, `{"error":"bad model"}`, http.StatusBadRequest)
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	_, err := client.Tags(context.Background())
+	if err == nil {
+		t.Fatal("expected error for 400 response")
+	}
+
+	var httpErr *HTTPError
+	if !errors.As(err, &httpErr) {
+		// Fatal, not Error: httpErr is nil here and the check below would panic.
+		t.Fatalf("expected *HTTPError, got %T: %v", err, err)
+	}
+	if httpErr.StatusCode != http.StatusBadRequest {
+		t.Errorf("status = %d, want %d", httpErr.StatusCode, http.StatusBadRequest)
+	}
+}
+
+// TestRawChat_ForwardsBody checks that RawChat delivers the caller's bytes to
+// the target unchanged.
+func TestRawChat_ForwardsBody(t *testing.T) {
+	var gotBody []byte
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotBody, _ = io.ReadAll(r.Body)
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(`{"done":true}`))
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	body := []byte(`{"model":"qwen3:30b","messages":[{"role":"user","content":"test"}]}`)
+	resp, err := client.RawChat(context.Background(), body)
+	if err != nil {
+		t.Fatalf("RawChat: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if string(gotBody) != string(body) {
+		t.Errorf("body mismatch: got %s, want %s", gotBody, body)
+	}
+}
+
+// TestRawEmbed_ForwardsBody checks that RawEmbed posts the caller's bytes
+// unchanged to /api/embed, issues exactly one request, and returns the 200
+// response.
+func TestRawEmbed_ForwardsBody(t *testing.T) {
+	body := []byte(`{"model":"nomic-embed-text","input":"test"}`)
+
+	var callCount atomic.Int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		callCount.Add(1)
+		if r.URL.Path != "/api/embed" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		// Compare inside the handler so no variable is shared across goroutines.
+		got, err := io.ReadAll(r.Body)
+		if err != nil {
+			t.Errorf("read request body: %v", err)
+		}
+		if string(got) != string(body) {
+			t.Errorf("body mismatch: got %s, want %s", got, body)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(fmt.Sprintf(`{"model":"nomic-embed-text","embeddings":[[0.%d]]}`, callCount.Load())))
+	}))
+	defer srv.Close()
+
+	client := NewClient(srv.URL, "")
+	resp, err := client.RawEmbed(context.Background(), body)
+	if err != nil {
+		t.Fatalf("RawEmbed: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("status = %d, want %d", resp.StatusCode, http.StatusOK)
+	}
+	if n := callCount.Load(); n != 1 {
+		t.Errorf("call count = %d, want 1", n)
+	}
+}
@@ -0,0 +1,41 @@
+package ollama
+
+import (
+	"fmt"
+)
+
+// ConnectionError wraps a network-level error when the Ollama target is unreachable.
+// Phase 3 uses this to distinguish connection failures (retry-eligible) from HTTP
+// errors (usually not retryable).
+//
+// Why: callers must differentiate "target is down" from "target returned 4xx/5xx"
+// to decide on retry strategy.
+// What: wraps a net-level error and satisfies the error and Unwrap interfaces.
+// Test: create a ConnectionError, verify errors.Is/As can match it.
+type ConnectionError struct {
+	URL string // base URL of the target the request was sent to
+	Err error  // underlying error; exposed through Unwrap
+}
+
+// Error formats the target URL together with the underlying error.
+func (e *ConnectionError) Error() string {
+	return fmt.Sprintf("connection to ollama target %s failed: %v", e.URL, e.Err)
+}
+
+// Unwrap returns the underlying error so errors.Is and errors.As can inspect it.
+func (e *ConnectionError) Unwrap() error {
+	return e.Err
+}
+
+// HTTPError represents a non-2xx HTTP response from the Ollama target.
+//
+// Why: callers need the status code to distinguish client errors (4xx) from
+// server errors (5xx) and decide on retry logic.
+// What: holds the HTTP status code and response body for error diagnosis.
+// Test: create an HTTPError with status 500, verify Error() includes the code.
+type HTTPError struct {
+	StatusCode int    // HTTP status code returned by the target
+	Body       string // response body text as captured by the caller that built the error
+}
+
+// Error formats the status code followed by the captured body.
+func (e *HTTPError) Error() string {
+	return fmt.Sprintf("ollama target returned HTTP %d: %s", e.StatusCode, e.Body)
+}
@@ -0,0 +1,167 @@
+package ollama
+
+import (
+	"context"
+	"log/slog"
+	"sync"
+	"time"
+)
+
+// ModelInventory maintains an in-memory cache of the Ollama target's installed
+// and running models, refreshed by a background poller.
+//
+// Why: foreman needs a reasonably fresh view of installed models for validation,
+// passthrough, and scheduling without hitting the target on every request.
+// What: wraps a mutex-protected model list updated by a polling goroutine.
+// Test: create with a stub client, poll, verify Models()/HasModel()/Degraded().
+type ModelInventory struct {
+	client Client       // source of /api/tags and /api/ps data
+	logger *slog.Logger // receives poll-failure warnings
+
+	mu            sync.RWMutex   // guards every field below
+	models        []ModelInfo    // installed models from the last successful tags poll
+	runningModels []RunningModel // loaded models from the last successful ps poll
+	lastPoll      time.Time      // time of the last successful tags poll
+	degraded      bool           // true when the most recent tags poll failed
+}
+
+// NewModelInventory builds an empty ModelInventory around the given client.
+//
+// Why: separates construction from the poll loop so callers can control lifecycle.
+// What: stores the client and logger only; no request is made until Refresh or
+// Start is called.
+// Test: create, call Refresh manually, assert Models() is populated.
+func NewModelInventory(client Client, logger *slog.Logger) *ModelInventory {
+	inv := new(ModelInventory)
+	inv.client = client
+	inv.logger = logger
+	return inv
+}
+
+// Models returns a snapshot of the cached installed-model list.
+//
+// Why: the /api/tags handler and model validation need the cached list.
+// What: copies the slice under a read lock, so the caller may modify the result
+// freely. The result is non-nil even when the cache is empty.
+// Test: Refresh, call Models(), verify the returned slice matches.
+func (inv *ModelInventory) Models() []ModelInfo {
+	inv.mu.RLock()
+	defer inv.mu.RUnlock()
+	snapshot := make([]ModelInfo, 0, len(inv.models))
+	return append(snapshot, inv.models...)
+}
+
+// HasModel reports whether the cached inventory contains a model named name.
+//
+// Why: job submission validates the model name before queuing.
+// What: linear scan under a read lock; the comparison is an exact string match
+// on the Name field.
+// Test: Refresh with known models, assert HasModel returns true/false correctly.
+func (inv *ModelInventory) HasModel(name string) bool {
+	inv.mu.RLock()
+	defer inv.mu.RUnlock()
+	for i := range inv.models {
+		if inv.models[i].Name == name {
+			return true
+		}
+	}
+	return false
+}
+
+// ResidentModels returns a snapshot of the models reported by the last
+// successful /api/ps poll.
+//
+// Why: the scheduler (Phase 3) uses this to decide if a model swap is needed.
+// What: copies the slice under a read lock, so the caller may modify the result
+// freely. The result is non-nil even when nothing is cached.
+// Test: Refresh, call ResidentModels(), verify it matches /api/ps output.
+func (inv *ModelInventory) ResidentModels() []RunningModel {
+	inv.mu.RLock()
+	defer inv.mu.RUnlock()
+	snapshot := make([]RunningModel, 0, len(inv.runningModels))
+	return append(snapshot, inv.runningModels...)
+}
+
+// LastPoll reports when the inventory was last refreshed successfully.
+//
+// Why: health/diagnostics use this to judge staleness.
+// What: reads lastPoll under a read lock; the zero time means no poll has
+// succeeded yet.
+// Test: Refresh, assert LastPoll is non-zero and recent.
+func (inv *ModelInventory) LastPoll() time.Time {
+	inv.mu.RLock()
+	polledAt := inv.lastPoll
+	inv.mu.RUnlock()
+	return polledAt
+}
+
+// Degraded reports whether the most recent poll attempt failed to reach the target.
+//
+// Why: the /healthz endpoint surfaces this to operators and probes.
+// What: reads the degraded flag under a read lock.
+// Test: Refresh with an unreachable stub, assert Degraded() is true; then with a
+// reachable stub, assert it clears.
+func (inv *ModelInventory) Degraded() bool {
+	inv.mu.RLock()
+	isDegraded := inv.degraded
+	inv.mu.RUnlock()
+	return isDegraded
+}
+
+// Refresh performs an immediate poll of /api/tags and /api/ps on the target.
+//
+// Why: called by the poller goroutine and on-demand (e.g., model-miss re-check).
+// What: fetches tags first. If that fails, the inventory is marked degraded, the
+// cached lists are kept, /api/ps is not attempted, and the tags error is
+// returned. If tags succeeds, the model list and lastPoll are updated and the
+// degraded flag is cleared; a ps failure at that point is only logged and the
+// previous running-model list is kept.
+// Test: stub both endpoints, call Refresh, verify Models() and ResidentModels() match.
+func (inv *ModelInventory) Refresh(ctx context.Context) error {
+	tags, err := inv.client.Tags(ctx)
+	if err != nil {
+		inv.mu.Lock()
+		inv.degraded = true
+		retained := len(inv.models)
+		inv.mu.Unlock()
+
+		// Log after unlocking so a slow log handler never stalls readers.
+		inv.logger.Warn("model poll failed: tags",
+			"error", err,
+			"retained_models", retained,
+		)
+		return err
+	}
+
+	// Only ask for running models once the target has answered the tags call;
+	// the result was discarded on tags failure anyway.
+	ps, psErr := inv.client.Ps(ctx)
+
+	inv.mu.Lock()
+	inv.models = tags.Models
+	inv.lastPoll = time.Now()
+	inv.degraded = false
+	if psErr == nil {
+		inv.runningModels = ps.Models
+	}
+	inv.mu.Unlock()
+
+	if psErr != nil {
+		// Tags worked but ps failed — log but don't mark degraded for ps alone.
+		inv.logger.Warn("model poll partial: ps failed", "error", psErr)
+	}
+	return nil
+}
+
+// Start runs the polling loop and returns only once ctx is cancelled, so it is
+// meant to be launched in its own goroutine.
+//
+// Why: continuous polling keeps the model inventory fresh for validation and scheduling.
+// What: refreshes once immediately, then once per tick of interval; failures are
+// logged and the loop keeps going.
+// Test: start with a short interval and cancelled context, verify it exits cleanly.
+func (inv *ModelInventory) Start(ctx context.Context, interval time.Duration) {
+	// Prime the cache before waiting for the first tick.
+	if err := inv.Refresh(ctx); err != nil {
+		inv.logger.Warn("initial model poll failed", "error", err)
+	}
+
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+
+	done := ctx.Done()
+	for {
+		select {
+		case <-done:
+			return
+		case <-tick.C:
+		}
+		if err := inv.Refresh(ctx); err != nil {
+			inv.logger.Warn("periodic model poll failed", "error", err)
+		}
+	}
+}
@@ -0,0 +1,201 @@
+package ollama
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// mockClient implements Client for inventory testing. Only Tags and Ps are
+// scriptable (through tagsFn and psFn); every other method returns a
+// "not implemented" error.
+type mockClient struct {
+	tagsFn func(ctx context.Context) (*TagsResponse, error)
+	psFn   func(ctx context.Context) (*PsResponse, error)
+}
+
+// Chat is not used by the inventory and always fails.
+func (m *mockClient) Chat(ctx context.Context, req ChatRequest, stream bool) (*ChatResponse, <-chan ChatResponse, error) {
+	return nil, nil, fmt.Errorf("not implemented")
+}
+
+// Embed is not used by the inventory and always fails.
+func (m *mockClient) Embed(ctx context.Context, req EmbedRequest) (*EmbedResponse, error) {
+	return nil, fmt.Errorf("not implemented")
+}
+
+// Tags delegates to tagsFn, which must be set.
+func (m *mockClient) Tags(ctx context.Context) (*TagsResponse, error) {
+	return m.tagsFn(ctx)
+}
+
+// Ps delegates to psFn, which must be set.
+func (m *mockClient) Ps(ctx context.Context) (*PsResponse, error) {
+	return m.psFn(ctx)
+}
+
+// RawChat is not used by the inventory and always fails.
+func (m *mockClient) RawChat(ctx context.Context, body []byte) (*http.Response, error) {
+	return nil, fmt.Errorf("not implemented")
+}
+
+// RawEmbed is not used by the inventory and always fails.
+func (m *mockClient) RawEmbed(ctx context.Context, body []byte) (*http.Response, error) {
+	return nil, fmt.Errorf("not implemented")
+}
+
+// TestInventory_RefreshPopulatesModels checks that one successful Refresh fills
+// Models, HasModel and ResidentModels, leaves the inventory non-degraded, and
+// records a poll time.
+func TestInventory_RefreshPopulatesModels(t *testing.T) {
+	client := &mockClient{
+		tagsFn: func(ctx context.Context) (*TagsResponse, error) {
+			return &TagsResponse{
+				Models: []ModelInfo{
+					{Name: "qwen3:30b"},
+					{Name: "nomic-embed-text"},
+				},
+			}, nil
+		},
+		psFn: func(ctx context.Context) (*PsResponse, error) {
+			return &PsResponse{
+				Models: []RunningModel{
+					{Name: "nomic-embed-text"},
+				},
+			}, nil
+		},
+	}
+
+	inv := NewModelInventory(client, slog.Default())
+	if err := inv.Refresh(context.Background()); err != nil {
+		t.Fatalf("Refresh: %v", err)
+	}
+
+	models := inv.Models()
+	if len(models) != 2 {
+		t.Fatalf("got %d models, want 2", len(models))
+	}
+
+	if !inv.HasModel("qwen3:30b") {
+		t.Error("HasModel(qwen3:30b) = false, want true")
+	}
+	if inv.HasModel("nonexistent") {
+		t.Error("HasModel(nonexistent) = true, want false")
+	}
+
+	resident := inv.ResidentModels()
+	if len(resident) != 1 {
+		t.Fatalf("got %d resident models, want 1", len(resident))
+	}
+
+	if inv.Degraded() {
+		t.Error("degraded should be false after successful refresh")
+	}
+	if inv.LastPoll().IsZero() {
+		t.Error("lastPoll should be non-zero after refresh")
+	}
+}
+
+// TestInventory_DegradedOnFailure checks that a failed tags poll following a
+// successful one marks the inventory degraded while keeping the cached models.
+func TestInventory_DegradedOnFailure(t *testing.T) {
+	callCount := 0
+	client := &mockClient{
+		tagsFn: func(ctx context.Context) (*TagsResponse, error) {
+			// The first call succeeds; every later call fails.
+			callCount++
+			if callCount == 1 {
+				return &TagsResponse{
+					Models: []ModelInfo{{Name: "qwen3:30b"}},
+				}, nil
+			}
+			return nil, fmt.Errorf("connection refused")
+		},
+		psFn: func(ctx context.Context) (*PsResponse, error) {
+			return &PsResponse{}, nil
+		},
+	}
+
+	inv := NewModelInventory(client, slog.Default())
+
+	// First refresh succeeds.
+	if err := inv.Refresh(context.Background()); err != nil {
+		t.Fatalf("first Refresh: %v", err)
+	}
+	if inv.Degraded() {
+		t.Error("should not be degraded after first successful poll")
+	}
+
+	// Second refresh fails — should retain models but mark degraded.
+	if err := inv.Refresh(context.Background()); err == nil {
+		t.Fatal("expected error on second refresh")
+	}
+	if !inv.Degraded() {
+		t.Error("should be degraded after failed poll")
+	}
+
+	// Models should be retained.
+	if !inv.HasModel("qwen3:30b") {
+		t.Error("should retain models after failed poll")
+	}
+}
+
+// TestInventory_RecoveryFromDegraded checks that the degraded flag set by a
+// failed poll is cleared again by the next successful one.
+func TestInventory_RecoveryFromDegraded(t *testing.T) {
+	failing := true
+	client := &mockClient{
+		tagsFn: func(ctx context.Context) (*TagsResponse, error) {
+			if failing {
+				return nil, fmt.Errorf("connection refused")
+			}
+			return &TagsResponse{
+				Models: []ModelInfo{{Name: "qwen3:30b"}},
+			}, nil
+		},
+		psFn: func(ctx context.Context) (*PsResponse, error) {
+			return &PsResponse{}, nil
+		},
+	}
+
+	inv := NewModelInventory(client, slog.Default())
+
+	// First refresh fails and must report the failure to the caller.
+	if err := inv.Refresh(context.Background()); err == nil {
+		t.Fatal("expected error on first refresh")
+	}
+	if !inv.Degraded() {
+		t.Error("should be degraded after failed poll")
+	}
+
+	// Target recovers.
+	failing = false
+	if err := inv.Refresh(context.Background()); err != nil {
+		t.Fatalf("recovery Refresh: %v", err)
+	}
+	if inv.Degraded() {
+		t.Error("should not be degraded after successful poll")
+	}
+}
+
+// TestInventory_StartAndCancel runs Start with a 10ms interval, cancels the
+// context after 50ms, and checks that Start returns and polled at least twice.
+func TestInventory_StartAndCancel(t *testing.T) {
+	pollCount := 0
+	client := &mockClient{
+		tagsFn: func(ctx context.Context) (*TagsResponse, error) {
+			pollCount++
+			return &TagsResponse{}, nil
+		},
+		psFn: func(ctx context.Context) (*PsResponse, error) {
+			return &PsResponse{}, nil
+		},
+	}
+
+	inv := NewModelInventory(client, slog.Default())
+	ctx, cancel := context.WithCancel(context.Background())
+
+	done := make(chan struct{})
+	go func() {
+		inv.Start(ctx, 10*time.Millisecond)
+		close(done)
+	}()
+
+	// Let it poll a few times.
+	time.Sleep(50 * time.Millisecond)
+	cancel()
+
+	select {
+	case <-done:
+		// Clean exit.
+	case <-time.After(2 * time.Second):
+		t.Fatal("Start did not exit after context cancellation")
+	}
+
+	// pollCount is read only after done is closed, i.e. after the poller has stopped.
+	if pollCount < 2 {
+		t.Errorf("poll count = %d, want >= 2 (initial + at least one tick)", pollCount)
+	}
+}
@@ -0,0 +1,99 @@
+// Package ollama provides a client for talking to an Ollama target.
+//
+// Why: foreman needs a clean, testable interface to the Ollama HTTP API so the
+// worker loop and passthrough handlers share a single client contract.
+// What: defines wire types matching Ollama's native JSON API and a Client
+// interface for chat, embed, tags, and ps operations.
+// Test: use a stub HTTP server that returns canned Ollama JSON; verify the client
+// parses responses and surfaces errors correctly.
+package ollama
+
+import (
+	"encoding/json"
+	"time"
+)
+
+// ChatRequest is the JSON body for POST /api/chat.
+//
+// Fields use json.RawMessage where polymorphism or pass-through fidelity is
+// required, so this package never has to interpret their shape. Stream is a
+// pointer so that an absent "stream" key (nil) can be told apart from an
+// explicit false. Model and Messages are always encoded; every other field is
+// omitted when unset.
+type ChatRequest struct {
+	Model     string          `json:"model"`
+	Messages  []Message       `json:"messages"`
+	Stream    *bool           `json:"stream,omitempty"`
+	Tools     json.RawMessage `json:"tools,omitempty"`
+	Options   json.RawMessage `json:"options,omitempty"`
+	KeepAlive json.RawMessage `json:"keep_alive,omitempty"`
+	Think     json.RawMessage `json:"think,omitempty"`
+	Format    json.RawMessage `json:"format,omitempty"`
+	Template  string          `json:"template,omitempty"`
+	System    string          `json:"system,omitempty"`
+	Context   json.RawMessage `json:"context,omitempty"`
+}
+
+// Message is a single message in a chat conversation; Role and Content are always encoded, the other fields only when set.
+type Message struct {
+	Role      string          `json:"role"`                 // always emitted, even when empty
+	Content   string          `json:"content"`              // always emitted, even when empty
+	Images    []string        `json:"images,omitempty"`     // presumably base64-encoded image data; confirm against the Ollama API
+	ToolCalls json.RawMessage `json:"tool_calls,omitempty"` // kept as raw JSON, not decoded by this type
+	Thinking  json.RawMessage `json:"thinking,omitempty"`   // kept as raw JSON, not decoded by this type
+}
+
+// ChatResponse is the JSON response from POST /api/chat.
+//
+// Done is always encoded; every other field is omitted when it holds its zero
+// value. The duration and count fields are carried as plain integers exactly
+// as received (Ollama documents the durations as nanoseconds).
+//
+// PromptEvalDuration mirrors the prompt_eval_duration metric that Ollama
+// reports next to prompt_eval_count; without the field, that value would be
+// dropped whenever a decoded response is re-encoded for a client.
+type ChatResponse struct {
+	Model              string          `json:"model,omitempty"`
+	CreatedAt          string          `json:"created_at,omitempty"`
+	Message            *Message        `json:"message,omitempty"`
+	Done               bool            `json:"done"`
+	DoneReason         string          `json:"done_reason,omitempty"`
+	TotalDuration      int64           `json:"total_duration,omitempty"`
+	LoadDuration       int64           `json:"load_duration,omitempty"`
+	PromptEvalCount    int             `json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration int64           `json:"prompt_eval_duration,omitempty"`
+	EvalCount          int             `json:"eval_count,omitempty"`
+	EvalDuration       int64           `json:"eval_duration,omitempty"`
+	Context            json.RawMessage `json:"context,omitempty"`
+}
+
+// EmbedRequest is the JSON body for POST /api/embed (and /api/embeddings); Model and Input are always encoded.
+type EmbedRequest struct {
+	Model     string          `json:"model"`                // always emitted, even when empty
+	Input     json.RawMessage `json:"input"`                // raw JSON; NOTE(review): legacy /api/embeddings reportedly uses "prompt" — confirm callers handle that
+	KeepAlive json.RawMessage `json:"keep_alive,omitempty"` // raw JSON, omitted when empty
+	Options   json.RawMessage `json:"options,omitempty"`    // raw JSON, omitted when empty
+}
+
+// EmbedResponse is the JSON response from POST /api/embed; both fields are omitted on encode when empty.
+type EmbedResponse struct {
+	Model      string      `json:"model,omitempty"`      // omitted when empty
+	Embeddings [][]float64 `json:"embeddings,omitempty"` // slice of float64 vectors; presumably one per input — confirm
+}
+
+// ModelInfo describes an installed model, as returned by GET /api/tags; only Details is optional on encode.
+type ModelInfo struct {
+	Name       string          `json:"name"`              // model name, e.g. "qwen3:30b"
+	Model      string          `json:"model"`             // NOTE(review): presumably mirrors Name — confirm against target output
+	ModifiedAt time.Time       `json:"modified_at"`       // decoded by time.Time's JSON unmarshalling (RFC 3339)
+	Size       int64           `json:"size"`              // presumably bytes — TODO confirm
+	Digest     string          `json:"digest"`            // carried as an opaque string
+	Details    json.RawMessage `json:"details,omitempty"` // kept as raw JSON; omitted when empty
+}
+
+// TagsResponse is the JSON response from GET /api/tags, wrapping the list of installed models.
+type TagsResponse struct {
+	Models []ModelInfo `json:"models"` // no omitempty: a nil slice encodes as null
+}
+
+// RunningModel describes a currently-loaded model, as returned by GET /api/ps; Details and SizeVRAM are optional on encode.
+type RunningModel struct {
+	Name      string          `json:"name"`                // model name as reported by the target
+	Model     string          `json:"model"`               // NOTE(review): presumably mirrors Name — confirm against target output
+	Size      int64           `json:"size"`                // presumably bytes — TODO confirm
+	Digest    string          `json:"digest"`              // carried as an opaque string
+	ExpiresAt time.Time       `json:"expires_at"`          // decoded by time.Time's JSON unmarshalling (RFC 3339)
+	Details   json.RawMessage `json:"details,omitempty"`   // kept as raw JSON; omitted when empty
+	SizeVRAM  int64           `json:"size_vram,omitempty"` // omitted on encode when zero
+}
+
+// PsResponse is the JSON response from GET /api/ps, wrapping the list of currently-loaded models.
+type PsResponse struct {
+	Models []RunningModel `json:"models"` // no omitempty: a nil slice encodes as null
+}