feat: add durable queue, single worker, and drain-by-model scheduling
Replace the Phase 2 in-flight chat gate (buffered channel) with a real SQLite-backed job queue and single worker loop. Every /api/chat request now creates a job row, blocks until the worker completes it, and returns the result transparently. Key changes: - internal/store: NextJob (drain-by-model ordering), IncrementAttempt, ResetInterruptedJobs, DeleteTerminalJobsBefore; busy_timeout pragma - internal/worker: single-threaded worker loop with Notifier for sync handler completion signaling; retry on ConnectionError, terminal fail on HTTPError; crash recovery resets interrupted jobs on startup - internal/webhook: dispatcher infrastructure for async webhook delivery - internal/server: chat handler rewritten to enqueue+wait; old chatGate removed; embeddings remain direct concurrent proxies (ADR-0013) - internal/config: FOREMAN_MAX_ATTEMPTS, FOREMAN_JOB_TTL Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
// Package webhook delivers state-change events to job webhook URLs.
//
// Why: async job callers need push notification of state transitions without
// polling (ADR-0005). Delivery must never block or fail the job itself.
// What: fires HTTP POSTs with JSON payloads to configured webhook URLs, retrying
// with exponential backoff. Optionally signs payloads with HMAC-SHA256.
// Test: spin up an in-test HTTP server, fire events, and verify receipt, retries
// on 500, and HMAC signature verification.
|
||||
package webhook
|
||||
|
||||
import (
	"bytes"
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"
	"net/url"
	"time"
)
|
||||
|
||||
// Event is the JSON document POSTed to a job's webhook URL for one state
// transition. Every field is always emitted (none use omitempty); nil Result,
// Artifacts, and Error values are encoded as JSON null.
type Event struct {
	// Which job changed, and the transition being reported.
	JobID         string    `json:"job_id"`
	State         string    `json:"state"`
	PreviousState string    `json:"previous_state"`
	Timestamp     time.Time `json:"timestamp"`

	// Context for the job at the time of the transition.
	Model   string `json:"model"`
	Attempt int    `json:"attempt"`

	// Outcome data. Result and Artifacts are embedded verbatim as pre-encoded
	// JSON, so they must already hold valid JSON when non-nil.
	Result    json.RawMessage `json:"result"`
	Artifacts json.RawMessage `json:"artifacts"`
	Error     *string         `json:"error"`
}
|
||||
|
||||
// Dispatcher sends webhook events to job-specified URLs.
//
// It bundles the optional signing secret, the HTTP client used for delivery,
// the logger, and the retry policy. Construct one with NewDispatcher.
type Dispatcher struct {
	secret     string       // HMAC key; when empty, no X-Foreman-Signature header is sent
	httpClient *http.Client // client used for every delivery attempt
	logger     *slog.Logger // receives delivery, retry, and failure logs

	maxRetries int           // retries after the first attempt (total tries = maxRetries+1)
	baseDelay  time.Duration // wait before the first retry; doubles on each later retry
}

// NewDispatcher creates a new webhook dispatcher.
//
// Why: centralizes webhook delivery config (secret, retry policy) in one place.
// What: returns a Dispatcher ready to fire events asynchronously, using a 10s
// per-request timeout, up to 5 retries, and a 1s initial backoff. A nil logger
// is replaced with slog.Default() so that logging from the background delivery
// goroutine can never dereference a nil *slog.Logger and crash the process.
// Test: create with a secret, fire an event, verify HMAC header; create with a
// nil logger and verify a usable logger is installed.
func NewDispatcher(secret string, logger *slog.Logger) *Dispatcher {
	const (
		requestTimeout = 10 * time.Second
		retryLimit     = 5
		initialBackoff = 1 * time.Second
	)

	if logger == nil {
		logger = slog.Default()
	}

	return &Dispatcher{
		secret:     secret,
		httpClient: &http.Client{Timeout: requestTimeout},
		logger:     logger,
		maxRetries: retryLimit,
		baseDelay:  initialBackoff,
	}
}
|
||||
|
||||
// Fire sends a webhook event to the given URL in a background goroutine. It never
|
||||
// blocks the caller and never returns an error — failed deliveries are logged and
|
||||
// dropped per ADR-0005.
|
||||
//
|
||||
// Why: webhook failures must never block or fail the worker loop.
|
||||
// What: marshals the event, spawns a goroutine that retries with backoff.
|
||||
// Test: fire an event at a 500-returning server, verify retries happen then stop.
|
||||
func (d *Dispatcher) Fire(url string, event Event) {
|
||||
go d.deliver(url, event)
|
||||
}
|
||||
|
||||
// deliver attempts to POST the event with retries and backoff.
|
||||
func (d *Dispatcher) deliver(url string, event Event) {
|
||||
body, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
d.logger.Error("webhook marshal failed", "error", err, "job_id", event.JobID)
|
||||
return
|
||||
}
|
||||
|
||||
for attempt := 0; attempt <= d.maxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
delay := d.baseDelay * (1 << (attempt - 1))
|
||||
time.Sleep(delay)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
d.logger.Error("webhook request creation failed",
|
||||
"error", err, "url", url, "job_id", event.JobID)
|
||||
return
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
if d.secret != "" {
|
||||
sig := computeHMAC(body, d.secret)
|
||||
req.Header.Set("X-Foreman-Signature", "sha256="+sig)
|
||||
}
|
||||
|
||||
resp, err := d.httpClient.Do(req)
|
||||
if err != nil {
|
||||
d.logger.Warn("webhook delivery failed",
|
||||
"error", err, "url", url, "job_id", event.JobID,
|
||||
"attempt", attempt+1, "max", d.maxRetries+1)
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
|
||||
d.logger.Debug("webhook delivered",
|
||||
"url", url, "job_id", event.JobID, "state", event.State)
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Warn("webhook non-2xx response",
|
||||
"status", resp.StatusCode, "url", url, "job_id", event.JobID,
|
||||
"attempt", attempt+1, "max", d.maxRetries+1)
|
||||
}
|
||||
|
||||
d.logger.Error("webhook delivery exhausted retries",
|
||||
"url", url, "job_id", event.JobID, "state", event.State)
|
||||
}
|
||||
|
||||
// computeHMAC returns the lowercase hex encoding of the HMAC-SHA256 of body,
// keyed with key.
func computeHMAC(body []byte, key string) string {
	h := hmac.New(sha256.New, []byte(key))
	h.Write(body) // hash.Hash writes never return an error

	digest := h.Sum(nil)
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest)
	return string(encoded)
}
|
||||
|
||||
// VerifySignature reports whether signature is the "sha256=<hex>" HMAC-SHA256
// of body under secret. Exported for use by webhook receivers.
//
// Why: webhook consumers need to verify authenticity of incoming payloads.
// What: rejects values that lack the "sha256=" prefix or have nothing after it,
// then recomputes the hex digest and compares it to the supplied one in
// constant time. The comparison is on the hex text, so it is case-sensitive
// (lowercase hex is expected).
// Test: sign a body, verify with correct and incorrect secrets.
func VerifySignature(body []byte, signature, secret string) bool {
	const prefix = "sha256="
	if len(signature) <= len(prefix) || signature[:len(prefix)] != prefix {
		return false
	}

	mac := hmac.New(sha256.New, []byte(secret))
	mac.Write(body)
	want := hex.EncodeToString(mac.Sum(nil))
	got := signature[len(prefix):]

	return hmac.Equal([]byte(want), []byte(got))
}
|
||||
|
||||
// FormatArtifacts formats artifact metadata for webhook payloads. Artifacts
// whose Data is present and whose Size is at most 256 KiB are inlined; all
// others get a URL reference of the form /jobs/<jobID>/artifacts/<name>.
//
// Why: webhook bodies must stay bounded per ADR-0006 (~256KB threshold).
// What: returns a JSON array with one object per artifact (name, content_type,
// size, and either data or url), or nil when there are no artifacts. jobID and
// the artifact name are percent-escaped as URL path segments, so names that
// contain spaces, '#', '?', '%' or '/' still produce a valid reference.
// Test: create artifacts above and below threshold, verify inline vs URL in output.
func FormatArtifacts(jobID string, artifacts []ArtifactMeta) json.RawMessage {
	if len(artifacts) == 0 {
		return nil
	}

	// Largest declared artifact size that is still embedded in the payload.
	const inlineLimit = 256 * 1024

	type artifactOut struct {
		Name        string `json:"name"`
		ContentType string `json:"content_type"`
		Size        int64  `json:"size"`
		Data        string `json:"data,omitempty"`
		URL         string `json:"url,omitempty"`
	}

	out := make([]artifactOut, len(artifacts))
	for i, a := range artifacts {
		entry := artifactOut{
			Name:        a.Name,
			ContentType: a.ContentType,
			Size:        a.Size,
		}
		if a.Data != nil && a.Size <= inlineLimit {
			// NOTE(review): Data is emitted as a JSON string, and encoding/json
			// replaces invalid UTF-8 with U+FFFD, so non-text bytes would not
			// round-trip — confirm that only textual artifacts are inlined.
			entry.Data = string(a.Data)
		} else {
			entry.URL = fmt.Sprintf("/jobs/%s/artifacts/%s",
				url.PathEscape(jobID), url.PathEscape(a.Name))
		}
		out[i] = entry
	}

	// Marshal cannot fail here: artifactOut contains only strings and an int64.
	b, _ := json.Marshal(out)
	return json.RawMessage(b)
}

// ArtifactMeta holds artifact info for webhook formatting.
type ArtifactMeta struct {
	Name        string // artifact name; reported as "name" and used in the reference URL
	ContentType string // reported as "content_type"
	Size        int64  // declared size, compared against the inline threshold
	Data        []byte // content to inline; nil forces a URL reference
}
|
||||
Reference in New Issue
Block a user