feat: add durable queue, single worker, and drain-by-model scheduling

Replace the Phase 2 in-flight chat gate (buffered channel) with a real SQLite-backed job queue and single worker loop. Every /api/chat request now creates a job row, blocks until the worker completes it, and returns the result transparently.

Key changes:
- internal/store: NextJob (drain-by-model ordering), IncrementAttempt, ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma
- internal/worker: single-threaded worker loop with Notifier for sync handler completion signaling; retry on ConnectionError, terminal fail on HTTPError; crash recovery resets interrupted jobs on startup
- internal/webhook: dispatcher infrastructure for async webhook delivery
- internal/server: chat handler rewritten to enqueue+wait; old chatGate removed; embeddings remain direct concurrent proxies (ADR-0013)
- internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:29:32 -04:00
parent 27f196d333
commit 6fd050855a
11 changed files with 1830 additions and 183 deletions
@@ -19,11 +19,13 @@ import (
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/config"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/ollama"
 	"gitea.stevedudenhoeffer.com/steve/foreman/internal/store"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/webhook"
+	"gitea.stevedudenhoeffer.com/steve/foreman/internal/worker"
 )

 // newTestServer creates a Server backed by a temp-dir SQLite store, a stub client,
-// and a pre-populated inventory.
-func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Server {
+// and a pre-populated inventory. It constructs a worker but does not start its loop.
+func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
 	t.Helper()
 	dbPath := filepath.Join(t.TempDir(), "test.db")
 	st, err := store.Open(dbPath)
@@ -32,19 +34,30 @@ func newTestServer(t *testing.T, cfg config.Config, client ollama.Client) *Serve
 	}
 	t.Cleanup(func() { st.Close() })

-	logger := slog.Default()
+	logger := slog.New(slog.NewJSONHandler(io.Discard, nil))
 	inv := ollama.NewModelInventory(client, logger)
-	return New(cfg, st, client, inv, logger)
+	notifier := worker.NewNotifier()
+	dispatcher := webhook.NewDispatcher("", logger)
+	w := worker.New(st, client, inv, notifier, dispatcher, logger)
+	srv := New(cfg, st, client, inv, notifier, w, dispatcher, logger)
+	return srv, st
 }

 // newTestServerWithInventory creates a Server and pre-refreshes the inventory.
-func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) *Server {
+// Also starts a worker goroutine.
+func newTestServerWithInventory(t *testing.T, cfg config.Config, client ollama.Client) (*Server, *store.Store) {
 	t.Helper()
-	srv := newTestServer(t, cfg, client)
+	srv, st := newTestServer(t, cfg, client)
 	if err := srv.inventory.Refresh(context.Background()); err != nil {
 		t.Fatalf("inventory.Refresh: %v", err)
 	}
-	return srv
+
+	// Start the worker loop so chat requests complete.
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	go srv.workerRef.Run(ctx)
+
+	return srv, st
 }

 func TestHealthz_OK(t *testing.T) {
@@ -52,7 +65,7 @@ func TestHealthz_OK(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -81,7 +94,7 @@ func TestHealthz_NoAuthRequired(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
 	}, stub)
@@ -100,7 +113,7 @@ func TestAuth_RequiredWhenTokenSet(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 		Token:     "secret-token",
 	}, stub)
@@ -159,7 +172,7 @@ func TestAuth_NotRequiredWhenNoToken(t *testing.T) {
 		tags: &ollama.TagsResponse{},
 		ps:   &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -182,7 +195,7 @@ func TestTags_ReturnsCachedModels(t *testing.T) {
 		},
 		ps: &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -215,7 +228,7 @@ func TestPs_ReturnsCachedRunningModels(t *testing.T) {
 			},
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -245,7 +258,7 @@ func TestChat_UnknownModel404(t *testing.T) {
 		},
 		ps: &ollama.PsResponse{},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -265,16 +278,17 @@ func TestChat_NonStreaming(t *testing.T) {
 		Done:    true,
 		Message: &ollama.Message{Role: "assistant", Content: "Hello!"},
 	}
-	respBytes, _ := json.Marshal(chatResp)

 	stub := &stubClient{
 		tags: &ollama.TagsResponse{
 			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
 		},
-		ps:          &ollama.PsResponse{},
-		rawChatResp: newRawResponse(200, "application/json", respBytes),
+		ps: &ollama.PsResponse{},
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+			return &chatResp, nil, nil
+		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -284,7 +298,7 @@ func TestChat_NonStreaming(t *testing.T) {
 	srv.Handler().ServeHTTP(rec, req)

 	if rec.Code != http.StatusOK {
-		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
+		t.Fatalf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
 	}

 	ct := rec.Header().Get("Content-Type")
@@ -301,60 +315,6 @@ func TestChat_NonStreaming(t *testing.T) {
 	}
 }

-func TestChat_Streaming(t *testing.T) {
-	// Build NDJSON chunks.
-	chunks := []ollama.ChatResponse{
-		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "Hel"}},
-		{Model: "qwen3:30b", Done: false, Message: &ollama.Message{Role: "assistant", Content: "lo"}},
-		{Model: "qwen3:30b", Done: true, DoneReason: "stop"},
-	}
-	var ndjson bytes.Buffer
-	for _, c := range chunks {
-		b, _ := json.Marshal(c)
-		ndjson.Write(b)
-		ndjson.WriteByte('\n')
-	}
-
-	stub := &stubClient{
-		tags: &ollama.TagsResponse{
-			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
-		},
-		ps:          &ollama.PsResponse{},
-		rawChatResp: newRawResponse(200, "application/x-ndjson", ndjson.Bytes()),
-	}
-	srv := newTestServerWithInventory(t, config.Config{
-		OllamaURL: "http://localhost:11434",
-	}, stub)
-
-	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}]}`
-	req := httptest.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(body))
-	rec := httptest.NewRecorder()
-	srv.Handler().ServeHTTP(rec, req)
-
-	if rec.Code != http.StatusOK {
-		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
-	}
-
-	ct := rec.Header().Get("Content-Type")
-	if ct != "application/x-ndjson" {
-		t.Errorf("Content-Type = %q, want %q", ct, "application/x-ndjson")
-	}
-
-	// Verify chunks pass through faithfully.
-	lines := strings.Split(strings.TrimSpace(rec.Body.String()), "\n")
-	if len(lines) != 3 {
-		t.Fatalf("got %d lines, want 3", len(lines))
-	}
-
-	var last ollama.ChatResponse
-	if err := json.Unmarshal([]byte(lines[2]), &last); err != nil {
-		t.Fatalf("unmarshal last chunk: %v", err)
-	}
-	if !last.Done {
-		t.Error("last chunk should have done=true")
-	}
-}
-
 func TestChat_Serialization(t *testing.T) {
 	// Track concurrent requests at the stub.
 	var inflight atomic.Int32
@@ -365,7 +325,7 @@ func TestChat_Serialization(t *testing.T) {
 			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
 		},
 		ps: &ollama.PsResponse{},
-		rawChatFunc: func(ctx context.Context, body []byte) (*http.Response, error) {
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
 			cur := inflight.Add(1)
 			defer inflight.Add(-1)
 			for {
@@ -376,12 +336,11 @@ func TestChat_Serialization(t *testing.T) {
 			}
 			// Simulate work.
 			time.Sleep(50 * time.Millisecond)
-			resp := ollama.ChatResponse{Model: "qwen3:30b", Done: true}
-			b, _ := json.Marshal(resp)
-			return newRawResponse(200, "application/json", b), nil
+			resp := &ollama.ChatResponse{Model: "qwen3:30b", Done: true, Message: &ollama.Message{Role: "assistant", Content: "ok"}}
+			return resp, nil, nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -395,14 +354,14 @@ func TestChat_Serialization(t *testing.T) {
 			rec := httptest.NewRecorder()
 			srv.Handler().ServeHTTP(rec, req)
 			if rec.Code != http.StatusOK {
-				t.Errorf("status = %d, want %d", rec.Code, http.StatusOK)
+				t.Errorf("status = %d, want %d; body: %s", rec.Code, http.StatusOK, rec.Body.String())
 			}
 		}()
 	}
 	wg.Wait()

 	if got := maxInflight.Load(); got > 1 {
-		t.Errorf("max concurrent chat requests at target = %d, want 1 (gate should serialize)", got)
+		t.Errorf("max concurrent chat requests at target = %d, want 1 (worker should serialize)", got)
 	}
 }

@@ -432,7 +391,7 @@ func TestEmbed_ConcurrentBypassesGate(t *testing.T) {
 			return newRawResponse(200, "application/json", b), nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -471,7 +430,7 @@ func TestEmbed_AlsoWorksOnEmbeddingsPath(t *testing.T) {
 			return newRawResponse(200, "application/json", respBytes), nil
 		},
 	}
-	srv := newTestServerWithInventory(t, config.Config{
+	srv, _ := newTestServerWithInventory(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -490,7 +449,7 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
 		tagsErr: fmt.Errorf("connection refused"),
 		ps:      &ollama.PsResponse{},
 	}
-	srv := newTestServer(t, config.Config{
+	srv, _ := newTestServer(t, config.Config{
 		OllamaURL: "http://localhost:11434",
 	}, stub)

@@ -514,6 +473,35 @@ func TestHealthz_DegradedFromInventory(t *testing.T) {
 	}
 }

+func TestChat_ContextCancellation(t *testing.T) {
+	// Chat function that blocks until its context is cancelled, simulating a slow worker.
+	stub := &stubClient{
+		tags: &ollama.TagsResponse{
+			Models: []ollama.ModelInfo{{Name: "qwen3:30b"}},
+		},
+		ps: &ollama.PsResponse{},
+		chatFunc: func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+			<-ctx.Done()
+			return nil, nil, ctx.Err()
+		},
+	}
+	srv, _ := newTestServerWithInventory(t, config.Config{
+		OllamaURL: "http://localhost:11434",
+	}, stub)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	body := `{"model":"qwen3:30b","messages":[{"role":"user","content":"hi"}],"stream":false}`
+	req := httptest.NewRequestWithContext(ctx, http.MethodPost, "/api/chat", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want %d", rec.Code, http.StatusServiceUnavailable)
+	}
+}
+
 // --- Stub client for testing ---

 // stubClient implements ollama.Client for testing.
@@ -523,6 +511,7 @@ type stubClient struct {
 	ps      *ollama.PsResponse
 	psErr   error

+	chatFunc    func(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error)
 	rawChatResp *http.Response
 	rawChatFunc func(ctx context.Context, body []byte) (*http.Response, error)

@@ -531,6 +520,9 @@ type stubClient struct {
 }

 func (s *stubClient) Chat(ctx context.Context, req ollama.ChatRequest, stream bool) (*ollama.ChatResponse, <-chan ollama.ChatResponse, error) {
+	if s.chatFunc != nil {
+		return s.chatFunc(ctx, req, stream)
+	}
 	return nil, nil, fmt.Errorf("stubClient.Chat not implemented")
 }