feat(chain): fail over on empty/degenerate responses

A failover chain previously treated a successful-but-empty completion (no content parts and no tool calls — a "stop with nothing") as a valid result and returned it. The agent loop then ended the run with empty output, and the configured backup models were never tried because no error was raised. This let a single flaky model silently terminate an agent/skill run with no answer (observed in the wild with ollama-cloud/glm-5.2 returning empty completions right after a large tool/think turn).

- Add llm.ErrEmptyResponse (classified transient) and Response.IsEmpty(): true only when there are no tool calls and no meaningful content (no parts, or whitespace-only text). A media/image part counts as content, so image-only responses are NOT empty.
- chain.Generate converts an empty completion into ErrEmptyResponse so the chain fails over to the next target. Unlike an ordinary transient it is NOT retried on the same target (the model just produced it; these calls are expensive) — the chain penalizes health (so a persistently-empty target benches) and advances immediately.
- When every target returns empty the call fails with ErrChainExhausted joined to ErrEmptyResponse — a visible error instead of a hollow success. Single-element chains therefore also surface empties as errors.

Stream path is unchanged (can't inspect content before the consumer reads it).

Tests: Response.IsEmpty table; chain fails over past an empty head; all-empty chain returns ErrChainExhausted/ErrEmptyResponse; repeated empties bench the target across requests. Full suite green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 10:35:07 -04:00
parent 3e81fbd540
commit 74474c6da0
6 changed files with 217 additions and 1 deletions
@@ -0,0 +1,112 @@
+package majordomo
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+)
+
+// TestChainFailsOverOnEmptyResponse: the head target succeeds but hands back
+// a response with nothing in it (only a finish reason is set). The chain must
+// count that as a failure of that target and move on, so the caller receives
+// the second target's answer. The empty head may be called exactly once — an
+// expensive call is not repeated against the same target.
+func TestChainFailsOverOnEmptyResponse(t *testing.T) {
+	reg := newTestRegistry(t)
+	provider := fake.New("fp")
+	reg.RegisterProvider(provider)
+
+	// Head: a response carrying only FinishReason. Tail: real text.
+	emptyReply := fake.ReplyWith(llm.Response{FinishReason: llm.FinishStop})
+	provider.Enqueue("empty", emptyReply)
+	provider.Enqueue("good", fake.Reply("real answer"))
+
+	chain, parseErr := reg.Parse("fp/empty,fp/good")
+	if parseErr != nil {
+		t.Fatalf("Parse: %v", parseErr)
+	}
+
+	resp, genErr := generate(t, chain)
+	if genErr != nil {
+		t.Fatalf("Generate: %v", genErr)
+	}
+
+	// The answer must come from the second target, and be attributed to it.
+	if text := resp.Text(); text != "real answer" {
+		t.Errorf("text = %q, want real answer", text)
+	}
+	if served := resp.Model; served != "fp/good" {
+		t.Errorf("resp.Model = %q, want fp/good (the serving target)", served)
+	}
+
+	// A second call to the head would mean the chain retried it in place.
+	if calls := provider.CallCount("empty"); calls != 1 {
+		t.Errorf("empty target call count = %d, want 1 (no same-target retry)", calls)
+	}
+}
+
+// TestChainAllEmptyReturnsError: if no target in the chain produces content,
+// the caller must see an error that matches both ErrChainExhausted and
+// ErrEmptyResponse rather than a "successful" empty completion. The chain
+// used here has a single target, so its one empty reply exhausts the chain.
+func TestChainAllEmptyReturnsError(t *testing.T) {
+	reg := newTestRegistry(t)
+	provider := fake.New("fp")
+	reg.RegisterProvider(provider)
+
+	// fake.Reply("") yields a single empty text part → degenerate.
+	provider.Enqueue("a", fake.Reply(""))
+
+	chain, parseErr := reg.Parse("fp/a")
+	if parseErr != nil {
+		t.Fatalf("Parse: %v", parseErr)
+	}
+
+	resp, genErr := generate(t, chain)
+	if genErr == nil {
+		t.Fatalf("want error, got resp=%v", resp)
+	}
+
+	// Both sentinels must be reachable through the returned error.
+	wrapsEmpty := errors.Is(genErr, llm.ErrEmptyResponse)
+	if !wrapsEmpty {
+		t.Errorf("err = %v, want it to wrap ErrEmptyResponse", genErr)
+	}
+	wrapsExhausted := errors.Is(genErr, ErrChainExhausted)
+	if !wrapsExhausted {
+		t.Errorf("err = %v, want it to wrap ErrChainExhausted", genErr)
+	}
+}
+
+// TestEmptyResponseBenchesTargetAcrossRequests: repeated empty responses
+// penalize the target's health like any transient failure, so a
+// persistently-empty head gets benched and is skipped on later requests
+// (rather than stalling every request before failing over).
+func TestEmptyResponseBenchesTargetAcrossRequests(t *testing.T) {
+	clock := newFakeClock()
+	r := newTestRegistry(t, WithClock(clock.Now))
+	fp := fake.New("fp")
+	r.RegisterProvider(fp)
+
+	// Head returns empty on the first two requests (benches at the default
+	// 2-consecutive-failure threshold); tail always answers.
+	fp.Enqueue("head", fake.Reply(""), fake.Reply(""))
+	fp.Enqueue("tail", fake.Reply("t1"), fake.Reply("t2"), fake.Reply("t3"))
+
+	m, err := r.Parse("fp/head,fp/tail")
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+
+	// mustAnswer issues one request through the chain and fails the test
+	// unless it succeeds with the expected text. The Generate error is checked
+	// before the response is touched, so a failed request reports its real
+	// cause instead of being masked by a bogus text mismatch (or by a crash
+	// on an unusable response).
+	mustAnswer := func(label, want string) {
+		t.Helper()
+		resp, genErr := generate(t, m)
+		if genErr != nil {
+			t.Fatalf("%s: Generate: %v", label, genErr)
+		}
+		if got := resp.Text(); got != want {
+			t.Fatalf("%s: got %q, want %s", label, got, want)
+		}
+	}
+
+	// Each of the first two requests hits the empty head and is served by
+	// the tail.
+	mustAnswer("request 1", "t1")
+	mustAnswer("request 2", "t2")
+
+	// Two empties benched the head: the third request must skip it entirely.
+	clock.Advance(1 * time.Second) // still within the cooldown window
+	callsBefore := fp.CallCount("head")
+	mustAnswer("request 3", "t3")
+	if got := fp.CallCount("head"); got != callsBefore {
+		t.Errorf("benched empty head must not be called during cooldown (calls %d -> %d)", callsBefore, got)
+	}
+	if r.Health().Available("fp/head") {
+		t.Error("head should be benched after two consecutive empty responses")
+	}
+}