feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real
SQLite-backed job queue and single worker loop. Every /api/chat request
now creates a job row, blocks until the worker completes it, and returns
the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt,
  ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync
  handler completion signaling; retry on ConnectionError, terminal fail
  on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate
  removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
@@ -3,54 +3,72 @@
 // Why: foreman exposes a native Ollama-compatible API plus async job endpoints;
 // centralizing routing and middleware here keeps cmd/foreman thin.
 // What: creates a stdlib net/http server with health checks, optional bearer-token
-// auth, Ollama passthrough (chat, tags, ps, embed), and an extensible mux.
+// auth, Ollama passthrough (chat, tags, ps, embed), /jobs async surface, and
+// artifact serving.
 // Test: start the server with httptest, hit /healthz, verify 200; set a token,
-// verify 401 without it; test Ollama passthrough routes.
+// verify 401 without it; test Ollama passthrough routes and /jobs lifecycle.
 package server

 import (
-	"bufio"
+	"crypto/rand"
 	"encoding/json"
+	"fmt"
 	"io"
 	"log/slog"
 	"net/http"
 	"strings"
+	"time"
+
+	"github.com/oklog/ulid/v2"

 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
 )

-// scannerBufSize is the buffer size for the NDJSON scanner (4 MB).
-const scannerBufSize = 4 * 1024 * 1024
-
 // Server holds the HTTP server and its dependencies.
 type Server struct {
-	cfg       config.Config
-	store     *store.Store
-	client    ollama.Client
-	inventory *ollama.ModelInventory
-	chatGate  chan struct{}
-	mux       *http.ServeMux
-	logger    *slog.Logger
+	cfg        config.Config
+	store      *store.Store
+	client     ollama.Client
+	inventory  *ollama.ModelInventory
+	notifier   *worker.Notifier
+	workerRef  *worker.Worker
+	dispatcher *webhook.Dispatcher
+	mux        *http.ServeMux
+	logger     *slog.Logger
 }

 // New creates a new Server with the given configuration, store, Ollama client,
-// and model inventory. The mux is populated with all routes.
+// model inventory, notifier, worker, and webhook dispatcher. The mux is populated
+// with all routes.
 //
 // Why: dependency injection makes the server testable and extensible.
-// What: wires config, store, client, inventory, and logger into the server,
-// registers routes, and creates the single-flight chat gate.
+// What: wires config, store, client, inventory, notifier, worker, dispatcher, and
+// logger into the server, and registers all routes.
 // Test: create with New, use httptest to exercise routes.
-func New(cfg config.Config, st *store.Store, client ollama.Client, inv *ollama.ModelInventory, logger *slog.Logger) *Server {
+func New(
+	cfg config.Config,
+	st *store.Store,
+	client ollama.Client,
+	inv *ollama.ModelInventory,
+	notifier *worker.Notifier,
+	w *worker.Worker,
+	dispatcher *webhook.Dispatcher,
+	logger *slog.Logger,
+) *Server {
 	s := &Server{
-		cfg:       cfg,
-		store:     st,
-		client:    client,
-		inventory: inv,
-		chatGate:  make(chan struct{}, 1),
-		mux:       http.NewServeMux(),
-		logger:    logger,
+		cfg:        cfg,
+		store:      st,
+		client:     client,
+		inventory:  inv,
+		notifier:   notifier,
+		workerRef:  w,
+		dispatcher: dispatcher,
+		mux:        http.NewServeMux(),
+		logger:     logger,
 	}
 	s.routes()
 	return s
@@ -83,6 +101,7 @@ func (s *Server) routes() {
 	s.mux.HandleFunc("POST /api/chat", s.handleChat)
 	s.mux.HandleFunc("POST /api/embed", s.handleEmbed)
 	s.mux.HandleFunc("POST /api/embeddings", s.handleEmbed)
+	s.registerJobRoutes()
 }

 // healthResponse is the JSON shape returned by /healthz.
@@ -170,15 +189,16 @@ func (s *Server) handleEmbed(w http.ResponseWriter, r *http.Request) {
 	io.Copy(w, resp.Body)
 }

-// handleChat is the critical passthrough path for /api/chat. It validates the
-// model, serializes through a single-flight gate, and proxies to the target
-// with NDJSON streaming support.
+// handleChat is the synchronous passthrough for /api/chat. It enqueues a job in
+// the SQLite queue and blocks until the worker completes it, then returns the
+// result as if it came directly from Ollama.
 //
 // Why: the sync passthrough is foreman's primary API surface for go-llm (ADR-0003).
-// What: validates model, acquires the chat gate, proxies to the target, streams
-// NDJSON chunks back if streaming, releases the gate on completion.
-// Test: verify model validation (404 on unknown), serialization (two concurrent
-// requests don't overlap), streaming (NDJSON chunks pass through faithfully).
+// The response blocks until done so the caller gets a transparent Ollama experience.
+// What: validates model, creates a job, registers a completion waiter, wakes the
+// worker, and blocks until done or context cancellation.
+// Test: verify model validation (404 on unknown), serialization (jobs execute one
+// at a time), and that the HTTP response matches the Ollama chat response.
 func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 	body, err := io.ReadAll(r.Body)
 	if err != nil {
@@ -186,10 +206,9 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// Parse just enough to validate the model and detect streaming.
+	// Parse just enough to validate the model.
 	var partial struct {
-		Model  string `json:"model"`
-		Stream *bool  `json:"stream"`
+		Model string `json:"model"`
 	}
 	if err := json.Unmarshal(body, &partial); err != nil {
 		http.Error(w, `{"error":"invalid JSON body"}`, http.StatusBadRequest)
@@ -211,64 +230,68 @@ func (s *Server) handleChat(w http.ResponseWriter, r *http.Request) {
 		}
 	}

-	// Determine if streaming. Ollama defaults to streaming when "stream" is absent.
-	streaming := true
-	if partial.Stream != nil && !*partial.Stream {
-		streaming = false
+	// Generate a job ID and enqueue.
+	jobID := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader).String()
+
+	maxAttempts := s.cfg.MaxAttempts
+	if maxAttempts == 0 {
+		maxAttempts = 3
 	}

-	// Acquire the single-flight chat gate. This serializes all chat requests
-	// through one at a time. Phase 3 replaces this with the full SQLite queue +
-	// worker loop.
-	select {
-	case s.chatGate <- struct{}{}:
-		// Acquired.
-	case <-r.Context().Done():
-		http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
+	job := store.Job{
+		ID:          jobID,
+		Model:       partial.Model,
+		Payload:     json.RawMessage(body),
+		MaxAttempts: maxAttempts,
+	}
+
+	if _, err := s.store.CreateJob(job); err != nil {
+		s.logger.Error("failed to enqueue chat job", "error", err, "job_id", jobID, "model", partial.Model)
+		http.Error(w, fmt.Sprintf(`{"error":"failed to enqueue job: %s"}`, err), http.StatusInternalServerError)
 		return
 	}
-	defer func() { <-s.chatGate }()

-	// Proxy to the target.
-	resp, err := s.client.RawChat(r.Context(), body)
-	if err != nil {
-		s.logger.Error("chat proxy failed", "error", err, "model", partial.Model)
-		if httpErr, ok := err.(*ollama.HTTPError); ok {
-			http.Error(w, httpErr.Body, httpErr.StatusCode)
+	// Register a completion waiter before waking the worker.
+	waitCh := s.notifier.Register(jobID)
+
+	// Wake the worker.
+	if s.workerRef != nil {
+		s.workerRef.Wake()
+	}
+
+	// Block until the job completes or the request is cancelled.
+	select {
+	case <-waitCh:
+		// Job completed — get the result.
+		state, result, errMsg, ok := s.notifier.Result(jobID)
+		if !ok {
+			// Should not happen, but fall back to DB.
+			j, err := s.store.GetJob(jobID)
+			if err != nil {
+				http.Error(w, `{"error":"job lost"}`, http.StatusInternalServerError)
+				return
+			}
+			state = j.State
+			result = j.Result
+			errMsg = j.Error
+		}
+
+		if state == store.JobStateFailed {
+			msg := "job failed"
+			if errMsg != nil {
+				msg = *errMsg
+			}
+			http.Error(w, fmt.Sprintf(`{"error":%q}`, msg), http.StatusBadGateway)
 			return
 		}
-		http.Error(w, `{"error":"target unreachable"}`, http.StatusBadGateway)
-		return
-	}
-	defer resp.Body.Close()

-	if streaming {
-		w.Header().Set("Content-Type", "application/x-ndjson")
-		w.WriteHeader(http.StatusOK)
-
-		flusher, canFlush := w.(http.Flusher)
-		scanner := bufio.NewScanner(resp.Body)
-		scanner.Buffer(make([]byte, 0, scannerBufSize), scannerBufSize)
-
-		for scanner.Scan() {
-			line := scanner.Bytes()
-			if len(line) == 0 {
-				continue
-			}
-			w.Write(line)
-			w.Write([]byte("\n"))
-			if canFlush {
-				flusher.Flush()
-			}
-		}
-		if err := scanner.Err(); err != nil {
-			s.logger.Warn("stream read error", "error", err, "model", partial.Model)
-		}
-	} else {
-		// Non-streaming: proxy the complete JSON response.
+		// Return the result as a direct Ollama response.
 		w.Header().Set("Content-Type", "application/json")
 		w.WriteHeader(http.StatusOK)
-		io.Copy(w, resp.Body)
+		w.Write(result)
+
+	case <-r.Context().Done():
+		http.Error(w, `{"error":"request cancelled while waiting"}`, http.StatusServiceUnavailable)
 	}
 }

@@ -19,11 +19,13 @@ import (
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
 )

 // newTestServer creates a Server backed by a temp-dir SQLite store, a stub client,
-// and a pre-populated inventory.
-func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Server {
+// and a model inventory. It constructs a worker but does not start its loop.
+func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
 	t.Helper()
 	dbPath := filepath.Join(t.TempDir(), "test.db")
 	st, err := store.Open(dbPath)
@@ -32,19 +34,30 @@ func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Serve
 	}
 	t.Cleanup(func() { st.Close() })

-	logger := slog.Default()
+	logger := slog.New(slog.NewJSONHandler(io.Discard, nil))
 	inv := ollama.NewModelInventory(client, logger)
-	return New(cfg, st, client, inv, logger)
+	notifier := worker.NewNotifier()
+	dispatcher := webhook.NewDispatcher("", logger)
+	w := worker.New(st, client, inv, notifier, dispatcher, logger)
+	srv := New(cfg, st, client, inv, notifier, w, dispatcher, logger)
+	return srv, st
 }

 // newTestServerWithInventory creates a Server and pre-refreshes the inventory.
-func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) *Server {
+// Also starts a worker goroutine.
+func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
 	t.Helper()
-	srv := newTestServer(t, cfg, client)
+	srv, st := newTestServer(t, cfg, client)
 	if err := srv.inventory.Refresh(context.Background()); err != nil {
 		t.Fatalf("inventory.Refresh: %v", err)
 	}
-	return srv
+
+	// Start the worker loop so chat requests complete.
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	go srv.workerRef.Run(ctx)
+
+	return srv, st
 }

 func TestHealthz_OK(t *testing.T) {
@@ -52,7 +65,7 @@ func TestHealthz_OK(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -81,7 +94,7 @@ func TestHealthz_NoAuthRequired(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
 	}, stub)
@@ -100,7 +113,7 @@ func TestAuth_RequiredWhenTokenSet(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
 	}, stub)
@@ -159,7 +172,7 @@ func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -182,7 +195,7 @@ func TestTags_ReturnsCachedModels(t *testing.T) {
 		},
 		ps: &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -215,7 +228,7 @@ func TestPs_ReturnsCachedRunningModels(t *testing.T) {
 			},
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -245,7 +258,7 @@ func TestChat_UnknownModel404(t *testing.T) {
 		},
 		ps: &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -265,16 +278,17 @@ func TestChat_NonStreaming(t *testing.T) {
 		Done:    true,
 		Message: &ollama.Message{Role: "assistant", Content: "Hello!"},
 	}
-	respBytes, _ := json.Marshal(chatResp)

 	stub := &stubClient{
 		tags: &ollama.TagsResponse{
 			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
 		},
-		ps:          &ollama.PsResponse{},
-		rawChatResp: newRawResponse(200, "application/json", respBytes),
+		ps: &ollama.PsResponse{},
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+			return &chatResp, nil, nil
+		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -284,7 +298,7 @@ func TestChat_NonStreaming(t *testing.T) {
 	srv.Handler().ServeHTTP(rec, req)

 	if rec.Code != http.StatusOK {
-		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
+		t.Fatalf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
 	}

 	ct := rec.Header().Get("Content-Type")
@@ -301,60 +315,6 @@ func TestChat_NonStreaming(t *testing.T) {
 	}
 }

-func TestChat_Streaming(t *testing.T) {
-	// Build NDJSON chunks.
-	chunks := []ollama.ChatResponse{
-		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "Hel"}},
-		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "lo"}},
-		{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
-	}
-	var ndjson bytes.Buffer
-	for _, c := range chunks {
-		b, _ := json.Marshal(c)
-		ndjson.Write(b)
-		ndjson.WriteByte('\n')
-	}
-
-	stub := &stubClient{
-		tags: &ollama.TagsResponse{
-			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
-		},
-		ps:          &ollama.PsResponse{},
-		rawChatResp: newRawResponse(200, "application/x-ndjson", ndjson.Bytes()),
-	}
-	srv := newTestServerWithInventory(t, config.Config{
-		OllamaURL: "http://localhost:11434",
-	}, stub)
-
-	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}]}`
-	req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
-	rec := httptest.NewRecorder()
-	srv.Handler().ServeHTTP(rec, req)
-
-	if rec.Code != http.StatusOK {
-		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
-	}
-
-	ct := rec.Header().Get("Content-Type")
-	if ct != "application/x-ndjson" {
-		t.Errorf("Content-Type = %q, want %q", ct, "application/x-ndjson")
-	}
-
-	// Verify chunks pass through faithfully.
-	lines := strings.Split(strings.TrimSpace(rec.Body.String()), "\n")
-	if len(lines) != 3 {
-		t.Fatalf("got %d lines, want 3", len(lines))
-	}
-
-	var last ollama.ChatResponse
-	if err := json.Unmarshal([]byte(lines[2]), &last); err != nil {
-		t.Fatalf("unmarshal last chunk: %v", err)
-	}
-	if !last.Done {
-		t.Error("last chunk should have done=true")
-	}
-}
-
 func TestChat_Serialization(t *testing.T) {
 	// Track concurrent requests at the stub.
 	var inflight atomic.Int32
@@ -365,7 +325,7 @@ func TestChat_Serialization(t *testing.T) {
 			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
 		},
 		ps: &ollama.PsResponse{},
-		rawChatFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
 			cur := inflight.Add(1)
 			defer inflight.Add(-1)
 			for {
@@ -376,12 +336,11 @@ func TestChat_Serialization(t *testing.T) {
 			}
 			// Simulate work.
 			time.Sleep(50 * time.Millisecond)
-			resp := ollama.ChatResponse{Model: "qwen3:30b", Done: true}
-			b, _ := json.Marshal(resp)
-			return newRawResponse(200, "application/json", b), nil
+			resp := &ollama.ChatResponse{Model: "qwen3:30b", Done: true, Message: &ollama.Message{Role: "assistant", Content: "ok"}}
+			return resp, nil, nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -395,14 +354,14 @@ func TestChat_Serialization(t *testing.T) {
 			rec := httptest.NewRecorder()
 			srv.Handler().ServeHTTP(rec, req)
 			if rec.Code != http.StatusOK {
-				t.Errorf("status = %d, want %d", rec.Code, http.StatusOK)
+				t.Errorf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
 			}
 		}()
 	}
 	wg.Wait()

 	if got := maxInflight.Load(); got > 1 {
-		t.Errorf("max concurrent chat requests at target = %d, want 1 (gate should serialize)", got)
+		t.Errorf("max concurrent chat requests at target = %d, want 1 (worker should serialize)", got)
 	}
 }

@@ -432,7 +391,7 @@ func TestEmbed_ConcurrentBypassesGate(t *testing.T) {
 			return newRawResponse(200, "application/json", b), nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -471,7 +430,7 @@ func TestEmbed_AlsoWorksOnEmbeddingsPath(t *testing.T) {
 			return newRawResponse(200, "application/json", respBytes), nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -490,7 +449,7 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
 		tagsErr: fmt.Errorf("connection refused"),
 		ps:      &ollama.PsResponse{},
 	}
-	srv := newTestServer(t, config.Config{
+	srv, _ := newTestServer(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -514,6 +473,35 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
 	}
 }

+func TestChat_ContextCancellation(t *testing.T) {
+	// Chat function that blocks until its context is cancelled, simulating a slow worker.
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps: &ollama.PsResponse{},
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+			<-ctx.Done()
+			return nil, nil, ctx.Err()
+		},
+	}
+	srv, _ := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
+	req := httptest.NewRequestWithContext(ctx, http.MethodPost, "/api/chat", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want %d", rec.Code, http.StatusServiceUnavailable)
+	}
+}
+
 // --- Stub client for testing ---

 // stubClient implements ollama.Client for testing.
@@ -523,6 +511,7 @@ type stubClient struct {
 	ps      *ollama.PsResponse
 	psErr   error

+	chatFunc    func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error)
 	rawChatResp *http.Response
 	rawChatFunc func(ctx context.Context, body []byte) (*http.Response, error)

@@ -531,6 +520,9 @@ type stubClient struct {
 }

 func (s *stubClient) Chat(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+	if s.chatFunc != nil {
+		return s.chatFunc(ctx, req, stream)
+	}
 	return nil, nil, fmt.Errorf("stubClient.Chat not implemented")
 }