feat(chain): fail over on empty/degenerate responses

A failover chain previously treated a successful-but-empty completion (no content parts and no tool calls — a "stop with nothing") as a valid result and returned it. The agent loop then ended the run with empty output, and the configured backup models were never tried because no error was raised. This let a single flaky model silently terminate an agent/skill run with no answer (observed in the wild with ollama-cloud/glm-5.2 returning empty completions right after a large tool/think turn). - Add llm.ErrEmptyResponse (classified transient) and Response.IsEmpty(): true only when there are no tool calls and no meaningful content (no parts, or whitespace-only text). A media/image part counts as content, so image-only responses are NOT empty. - chain.Generate converts an empty completion into ErrEmptyResponse so the chain fails over to the next target. Unlike an ordinary transient it is NOT retried on the same target (the model just produced it; these calls are expensive) — the chain penalizes health (so a persistently-empty target benches) and advances immediately. - When every target returns empty the call fails with ErrChainExhausted joined to ErrEmptyResponse — a visible error instead of a hollow success. Single-element chains therefore also surface empties as errors. Stream path is unchanged (can't inspect content before the consumer reads it). Tests: Response.IsEmpty table; chain fails over past an empty head; all-empty chain returns ErrChainExhausted/ErrEmptyResponse; repeated empties bench the target across requests. Full suite green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 10:35:07 -04:00
parent 3e81fbd540
commit 74474c6da0
6 changed files with 217 additions and 1 deletions
@@ -90,6 +90,15 @@ process env eagerly; unknown names also resolve lazily at Parse time
  benched for base 5s × 2^n, capped 5m. Success fully resets. Chains skip
  benched targets; 404 advances penalty-free; auth/malformed fail fast
  (configurable); exhaustion returns a joined error naming every target.
 - **Empty response = failover.** A target that returns *without error* but
  with no usable output — no content parts and no tool calls (`Response.IsEmpty`;
  a media/image part counts as content) — is treated as a per-target failure
  (`llm.ErrEmptyResponse`, classified transient). Unlike an ordinary
  transient it is **not** retried on the same target (the model just did
  this; the call is expensive): the chain penalizes health and advances
  immediately. If every target comes back empty the call fails with
  `ErrChainExhausted` rather than a hollow "successful" empty completion, so
  a single flaky model can't silently end an agent run with nothing.
 - Tracker is in-memory, process-local, clock-injected. No persistence.
 ## House conventions (mirror foreman)
@@ -64,10 +64,24 @@ func (c *chain) Capabilities() llm.Capabilities {
 }
 // Generate tries each target per the chain semantics above.
 //
 // A target that returns, without error, an empty/degenerate response (no
 // content and no tool calls — see Response.IsEmpty) is treated as a
 // per-target failure (llm.ErrEmptyResponse): the chain benches it and
 // advances to the next element. This stops a single flaky model from
 // silently ending an agent run with nothing; if every target comes back
 // empty the call fails with ErrChainExhausted rather than a hollow success.
 func (c *chain) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
 	req = req.Apply(opts...)
 	return chainDo(ctx, c, req, func(ctx context.Context, t chainTarget, nreq llm.Request) (*llm.Response, error) {
-		return t.model.Generate(ctx, nreq)
+		resp, err := t.model.Generate(ctx, nreq)
 		if err != nil {
 			return nil, err
 		}
 		if resp.IsEmpty() {
 			return nil, llm.ErrEmptyResponse
 		}
 		return resp, nil
 	})
 }
@@ -124,6 +138,18 @@ func chainDo[T any](ctx context.Context, c *chain, req llm.Request, attempt func
 				return result, nil
 			}
 			if errors.Is(err, llm.ErrEmptyResponse) {
 				// The target returned successfully but with nothing usable.
 				// Don't spend an (expensive) same-target retry — it just did
 				// this. Penalize health so a persistently-empty target
 				// benches and is skipped next time, then advance to the next
 				// element immediately.
 				benched := c.tracker.ReportFailure(t.key)
 				observe(FailoverEvent{Target: t.key, Err: err, Class: llm.ClassTransient, Attempt: attemptN, Benched: benched})
 				failures = append(failures, fmt.Errorf("%s: %w", t.key, err))
 				break
 			}
 			class := c.cfg.classify(err)
 			if class == llm.ClassPermanent {
 				observe(FailoverEvent{Target: t.key, Err: err, Class: class, Attempt: attemptN})
@@ -0,0 +1,112 @@
 package majordomo
 import (
 	"errors"
 	"testing"
 	"time"
 	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
 	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
 )
 // TestChainFailsOverOnEmptyResponse: a target that returns successfully but
 // with an empty/degenerate response (no content, no tool calls) is treated
 // as a per-target failure — the chain advances to the next element, which
 // answers. The empty head is tried exactly once (no wasteful same-target
 // retry of an expensive call).
 func TestChainFailsOverOnEmptyResponse(t *testing.T) {
 	r := newTestRegistry(t)
 	fp := fake.New("fp")
 	r.RegisterProvider(fp)
 	fp.Enqueue("empty", fake.ReplyWith(llm.Response{FinishReason: llm.FinishStop}))
 	fp.Enqueue("good", fake.Reply("real answer"))
 	m, err := r.Parse("fp/empty,fp/good")
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
 	resp, err := generate(t, m)
 	if err != nil {
 		t.Fatalf("Generate: %v", err)
 	}
 	if resp.Text() != "real answer" {
 		t.Errorf("text = %q, want real answer", resp.Text())
 	}
 	if resp.Model != "fp/good" {
 		t.Errorf("resp.Model = %q, want fp/good (the serving target)", resp.Model)
 	}
 	if n := fp.CallCount("empty"); n != 1 {
 		t.Errorf("empty target call count = %d, want 1 (no same-target retry)", n)
 	}
 }
 // TestChainAllEmptyReturnsError: when every target comes back empty the call
 // fails with ErrChainExhausted joined to ErrEmptyResponse — a visible error,
 // never a hollow "successful" empty completion.
 func TestChainAllEmptyReturnsError(t *testing.T) {
 	r := newTestRegistry(t)
 	fp := fake.New("fp")
 	r.RegisterProvider(fp)
 	// fake.Reply("") yields a single empty text part → degenerate.
 	fp.Enqueue("a", fake.Reply(""))
 	m, err := r.Parse("fp/a")
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
 	resp, err := generate(t, m)
 	if err == nil {
 		t.Fatalf("want error, got resp=%v", resp)
 	}
 	if !errors.Is(err, llm.ErrEmptyResponse) {
 		t.Errorf("err = %v, want it to wrap ErrEmptyResponse", err)
 	}
 	if !errors.Is(err, ErrChainExhausted) {
 		t.Errorf("err = %v, want it to wrap ErrChainExhausted", err)
 	}
 }
 // TestEmptyResponseBenchesTargetAcrossRequests: repeated empty responses
 // penalize the target's health like any transient failure, so a
 // persistently-empty head gets benched and is skipped on later requests
 // (rather than stalling every request before failing over).
 func TestEmptyResponseBenchesTargetAcrossRequests(t *testing.T) {
 	clock := newFakeClock()
 	r := newTestRegistry(t, WithClock(clock.Now))
 	fp := fake.New("fp")
 	r.RegisterProvider(fp)
 	// Head returns empty on the first two requests (benches at the default
 	// 2-consecutive-failure threshold); tail always answers.
 	fp.Enqueue("head", fake.Reply(""), fake.Reply(""))
 	fp.Enqueue("tail", fake.Reply("t1"), fake.Reply("t2"), fake.Reply("t3"))
 	m, err := r.Parse("fp/head,fp/tail")
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
 	if resp, _ := generate(t, m); resp.Text() != "t1" {
 		t.Fatalf("request 1: got %q, want t1", resp.Text())
 	}
 	if resp, _ := generate(t, m); resp.Text() != "t2" {
 		t.Fatalf("request 2: got %q, want t2", resp.Text())
 	}
 	// Two empties benched the head: the third request must skip it entirely.
 	clock.Advance(1 * time.Second) // still within the cooldown window
 	callsBefore := fp.CallCount("head")
 	if resp, _ := generate(t, m); resp.Text() != "t3" {
 		t.Fatalf("request 3: got %q, want t3", resp.Text())
 	}
 	if got := fp.CallCount("head"); got != callsBefore {
 		t.Errorf("benched empty head must not be called during cooldown (calls %d -> %d)", callsBefore, got)
 	}
 	if r.Health().Available("fp/head") {
 		t.Error("head should be benched after two consecutive empty responses")
 	}
 }
@@ -33,6 +33,15 @@ var ErrModelNotFound = errors.New("model not found")
 // well be able to serve the request.
 var ErrUnsupported = errors.New("request unsupported by target")
 // ErrEmptyResponse marks a provider call that returned, without error, no
 // usable output — no content parts and no tool calls (a "stop with
 // nothing"). It is never a valid completion: an agent step needs either a
 // final answer or a tool call, and a one-shot Generate needs content. A
 // chain treats it as a per-target failure and fails over to the next
 // element (benching the empty target) so a single flaky model cannot
 // silently end a run with nothing. See chain.Generate / Response.IsEmpty.
 var ErrEmptyResponse = errors.New("model returned an empty response")
 // APIError is a structured provider error carrying enough context to
 // classify it and to debug it.
 type APIError struct {
@@ -106,6 +115,11 @@ func Classify(err error) ErrorClass {
 	if errors.Is(err, ErrModelNotFound) || errors.Is(err, ErrUnsupported) {
 		return ClassPermanent
 	}
 	if errors.Is(err, ErrEmptyResponse) {
 		// An empty completion may be a one-off provider hiccup; another
 		// target (or, rarely, a retry) can produce real output.
 		return ClassTransient
 	}
 	if errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.ECONNRESET) {
 		return ClassTransient
 	}
@@ -86,3 +86,29 @@ func (r *Response) Text() string {
 func (r *Response) Message() Message {
 	return Message{Role: RoleAssistant, Parts: r.Parts, ToolCalls: r.ToolCalls}
 }
 // IsEmpty reports whether the response carries no usable output: no tool
 // calls and no meaningful content (no parts at all, or only whitespace
 // text). A media/image part — or any non-text part — counts as content, so
 // an image-only response is NOT empty. A "stop with nothing" like this is
 // never a valid completion for an agent step or a Generate call; failover
 // chains treat it as a per-target failure (see ErrEmptyResponse).
 func (r *Response) IsEmpty() bool {
 	if r == nil {
 		return true
 	}
 	if len(r.ToolCalls) > 0 {
 		return false
 	}
 	for _, p := range r.Parts {
 		if t, ok := p.(TextPart); ok {
 			if strings.TrimSpace(t.Text) != "" {
 				return false
 			}
 			continue
 		}
 		// Any non-text part (image/media) is meaningful output.
 		return false
 	}
 	return true
 }
@@ -0,0 +1,29 @@
 package llm
 import "testing"
 func TestResponseIsEmpty(t *testing.T) {
 	img := ImagePart{MIME: "image/png", Data: []byte{0x89, 0x50, 0x4e, 0x47}}
 	tests := []struct {
 		name string
 		resp *Response
 		want bool
 	}{
 		{"nil response", nil, true},
 		{"no parts, no tool calls", &Response{FinishReason: FinishStop}, true},
 		{"single empty text part", &Response{Parts: []Part{Text("")}}, true},
 		{"whitespace-only text", &Response{Parts: []Part{Text("  \n\t ")}}, true},
 		{"real text", &Response{Parts: []Part{Text("hello")}}, false},
 		{"tool call, no text", &Response{ToolCalls: []ToolCall{{ID: "1", Name: "x"}}}, false},
 		{"image only", &Response{Parts: []Part{img}}, false},
 		{"empty text but a tool call", &Response{Parts: []Part{Text("")}, ToolCalls: []ToolCall{{ID: "1", Name: "x"}}}, false},
 		{"whitespace text plus an image", &Response{Parts: []Part{Text("  "), img}}, false},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			if got := tt.resp.IsEmpty(); got != tt.want {
 				t.Errorf("IsEmpty() = %v, want %v", got, tt.want)
 			}
 		})
 	}
 }