fix: resolve flaky TestWebhook_LifecycleEvents caused by async delivery ordering
The test assumed webhook events arrive in wall-clock order (queued first, done last), but dispatcher.Fire spawns a goroutine per event with no ordering guarantee. On a single-core CI runner the "queued" goroutine was routinely preempted before making its HTTP POST, letting the "loading"/"working"/"done" goroutines land first.

Fix: wait until a "done" event appears in the received set (proving all prior transitions have been dispatched by the worker), then assert that "queued" and "done" each appear exactly once rather than checking positional order.

Reproduced with:

    GOMAXPROCS=1 go test -race -count=100 -run TestWebhook_LifecycleEvents ./internal/server/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -317,14 +317,23 @@ func TestWebhook_LifecycleEvents(t *testing.T) {
|
||||
var submitResp jobSubmitResponse
|
||||
json.NewDecoder(submitRec.Body).Decode(&submitResp)
|
||||
|
||||
// Wait for webhooks to arrive.
|
||||
// Wait until we see a "done" event. Since all webhooks are delivered in
|
||||
// background goroutines there is no guaranteed wall-clock ordering between
|
||||
// "queued", "loading"/"working", and "done". Waiting for "done" to appear
|
||||
// is the only reliable signal that all prior events have been dispatched
|
||||
// (the worker fires them in order before completing).
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
n := len(events)
|
||||
found := false
|
||||
for _, e := range events {
|
||||
if e.State == "done" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
// We expect at least: queued, working (or loading), done.
|
||||
if n >= 3 {
|
||||
if found {
|
||||
break
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
@@ -333,11 +342,12 @@ func TestWebhook_LifecycleEvents(t *testing.T) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
// Verify we received at least: queued, working (or loading), done.
|
||||
if len(events) < 3 {
|
||||
t.Fatalf("received %d webhook events, want >= 3", len(events))
|
||||
}
|
||||
|
||||
// Verify all events have the correct job_id.
|
||||
// Verify all events have the correct job_id and model.
|
||||
for i, e := range events {
|
||||
if e.JobID != submitResp.JobID {
|
||||
t.Errorf("event[%d].job_id = %q, want %q", i, e.JobID, submitResp.JobID)
|
||||
@@ -347,15 +357,18 @@ func TestWebhook_LifecycleEvents(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// First event should be "queued".
|
||||
if events[0].State != "queued" {
|
||||
t.Errorf("first event state = %q, want %q", events[0].State, "queued")
|
||||
// Verify that "queued" and "done" each appear exactly once across all events.
|
||||
// We do not assert wall-clock arrival order because all deliveries are async
|
||||
// goroutines that may be scheduled in any order by the Go runtime scheduler.
|
||||
stateCount := make(map[string]int)
|
||||
for _, e := range events {
|
||||
stateCount[e.State]++
|
||||
}
|
||||
|
||||
// Last event should be "done".
|
||||
lastEvent := events[len(events)-1]
|
||||
if lastEvent.State != "done" {
|
||||
t.Errorf("last event state = %q, want %q", lastEvent.State, "done")
|
||||
if stateCount["queued"] != 1 {
|
||||
t.Errorf("expected exactly 1 'queued' event, got %d", stateCount["queued"])
|
||||
}
|
||||
if stateCount["done"] != 1 {
|
||||
t.Errorf("expected exactly 1 'done' event, got %d", stateCount["done"])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user