diff --git a/CLAUDE.md b/CLAUDE.md index ac56915..104ddfb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -90,6 +90,15 @@ process env eagerly; unknown names also resolve lazily at Parse time benched for base 5s × 2^n, capped 5m. Success fully resets. Chains skip benched targets; 404 advances penalty-free; auth/malformed fail fast (configurable); exhaustion returns a joined error naming every target. +- **Empty response = failover.** A target that returns *without error* but + with no usable output — no content parts and no tool calls (`Response.IsEmpty`; + a media/image part counts as content) — is treated as a per-target failure + (`llm.ErrEmptyResponse`, classified transient). Unlike an ordinary + transient it is **not** retried on the same target (the model just did + this; the call is expensive): the chain penalizes health and advances + immediately. If every target comes back empty the call fails with + `ErrChainExhausted` rather than a hollow "successful" empty completion, so + a single flaky model can't silently end an agent run with nothing. - Tracker is in-memory, process-local, clock-injected. No persistence. ## House conventions (mirror foreman) diff --git a/chain.go b/chain.go index a86d87f..04264f0 100644 --- a/chain.go +++ b/chain.go @@ -64,10 +64,24 @@ func (c *chain) Capabilities() llm.Capabilities { } // Generate tries each target per the chain semantics above. +// +// A target that returns, without error, an empty/degenerate response (no +// content and no tool calls — see Response.IsEmpty) is treated as a +// per-target failure (llm.ErrEmptyResponse): the chain benches it and +// advances to the next element. This stops a single flaky model from +// silently ending an agent run with nothing; if every target comes back +// empty the call fails with ErrChainExhausted rather than a hollow success. func (c *chain) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { req = req.Apply(opts...) return chainDo(ctx, c, req, func(ctx context.Context, t chainTarget, nreq llm.Request) (*llm.Response, error) { - return t.model.Generate(ctx, nreq) + resp, err := t.model.Generate(ctx, nreq) + if err != nil { + return nil, err + } + if resp.IsEmpty() { + return nil, llm.ErrEmptyResponse + } + return resp, nil }) } @@ -124,6 +138,18 @@ func chainDo[T any](ctx context.Context, c *chain, req llm.Request, attempt func return result, nil } + if errors.Is(err, llm.ErrEmptyResponse) { + // The target returned successfully but with nothing usable. + // Don't spend an (expensive) same-target retry — it just did + // this. Penalize health so a persistently-empty target + // benches and is skipped next time, then advance to the next + // element immediately. + benched := c.tracker.ReportFailure(t.key) + observe(FailoverEvent{Target: t.key, Err: err, Class: llm.ClassTransient, Attempt: attemptN, Benched: benched}) + failures = append(failures, fmt.Errorf("%s: %w", t.key, err)) + break + } + class := c.cfg.classify(err) if class == llm.ClassPermanent { observe(FailoverEvent{Target: t.key, Err: err, Class: class, Attempt: attemptN}) diff --git a/failover_empty_test.go b/failover_empty_test.go new file mode 100644 index 0000000..bc7e6cb --- /dev/null +++ b/failover_empty_test.go @@ -0,0 +1,112 @@ +package majordomo + +import ( + "errors" + "testing" + "time" + + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake" +) + +// TestChainFailsOverOnEmptyResponse: a target that returns successfully but +// with an empty/degenerate response (no content, no tool calls) is treated +// as a per-target failure — the chain advances to the next element, which +// answers. The empty head is tried exactly once (no wasteful same-target +// retry of an expensive call). +func TestChainFailsOverOnEmptyResponse(t *testing.T) { + r := newTestRegistry(t) + fp := fake.New("fp") + r.RegisterProvider(fp) + + fp.Enqueue("empty", fake.ReplyWith(llm.Response{FinishReason: llm.FinishStop})) + fp.Enqueue("good", fake.Reply("real answer")) + + m, err := r.Parse("fp/empty,fp/good") + if err != nil { + t.Fatalf("Parse: %v", err) + } + + resp, err := generate(t, m) + if err != nil { + t.Fatalf("Generate: %v", err) + } + if resp.Text() != "real answer" { + t.Errorf("text = %q, want real answer", resp.Text()) + } + if resp.Model != "fp/good" { + t.Errorf("resp.Model = %q, want fp/good (the serving target)", resp.Model) + } + if n := fp.CallCount("empty"); n != 1 { + t.Errorf("empty target call count = %d, want 1 (no same-target retry)", n) + } +} + +// TestChainAllEmptyReturnsError: when every target comes back empty the call +// fails with ErrChainExhausted joined to ErrEmptyResponse — a visible error, +// never a hollow "successful" empty completion. +func TestChainAllEmptyReturnsError(t *testing.T) { + r := newTestRegistry(t) + fp := fake.New("fp") + r.RegisterProvider(fp) + + // fake.Reply("") yields a single empty text part → degenerate. + fp.Enqueue("a", fake.Reply("")) + + m, err := r.Parse("fp/a") + if err != nil { + t.Fatalf("Parse: %v", err) + } + + resp, err := generate(t, m) + if err == nil { + t.Fatalf("want error, got resp=%v", resp) + } + if !errors.Is(err, llm.ErrEmptyResponse) { + t.Errorf("err = %v, want it to wrap ErrEmptyResponse", err) + } + if !errors.Is(err, ErrChainExhausted) { + t.Errorf("err = %v, want it to wrap ErrChainExhausted", err) + } +} + +// TestEmptyResponseBenchesTargetAcrossRequests: repeated empty responses +// penalize the target's health like any transient failure, so a +// persistently-empty head gets benched and is skipped on later requests +// (rather than stalling every request before failing over). +func TestEmptyResponseBenchesTargetAcrossRequests(t *testing.T) { + clock := newFakeClock() + r := newTestRegistry(t, WithClock(clock.Now)) + fp := fake.New("fp") + r.RegisterProvider(fp) + + // Head returns empty on the first two requests (benches at the default + // 2-consecutive-failure threshold); tail always answers. + fp.Enqueue("head", fake.Reply(""), fake.Reply("")) + fp.Enqueue("tail", fake.Reply("t1"), fake.Reply("t2"), fake.Reply("t3")) + + m, err := r.Parse("fp/head,fp/tail") + if err != nil { + t.Fatalf("Parse: %v", err) + } + + if resp, _ := generate(t, m); resp.Text() != "t1" { + t.Fatalf("request 1: got %q, want t1", resp.Text()) + } + if resp, _ := generate(t, m); resp.Text() != "t2" { + t.Fatalf("request 2: got %q, want t2", resp.Text()) + } + + // Two empties benched the head: the third request must skip it entirely. + clock.Advance(1 * time.Second) // still within the cooldown window + callsBefore := fp.CallCount("head") + if resp, _ := generate(t, m); resp.Text() != "t3" { + t.Fatalf("request 3: got %q, want t3", resp.Text()) + } + if got := fp.CallCount("head"); got != callsBefore { + t.Errorf("benched empty head must not be called during cooldown (calls %d -> %d)", callsBefore, got) + } + if r.Health().Available("fp/head") { + t.Error("head should be benched after two consecutive empty responses") + } +} diff --git a/llm/errors.go b/llm/errors.go index cd78ef5..12807be 100644 --- a/llm/errors.go +++ b/llm/errors.go @@ -33,6 +33,15 @@ var ErrModelNotFound = errors.New("model not found") // well be able to serve the request. var ErrUnsupported = errors.New("request unsupported by target") +// ErrEmptyResponse marks a provider call that returned, without error, no +// usable output — no content parts and no tool calls (a "stop with +// nothing"). It is never a valid completion: an agent step needs either a +// final answer or a tool call, and a one-shot Generate needs content. A +// chain treats it as a per-target failure and fails over to the next +// element (benching the empty target) so a single flaky model cannot +// silently end a run with nothing. See chain.Generate / Response.IsEmpty. +var ErrEmptyResponse = errors.New("model returned an empty response") + // APIError is a structured provider error carrying enough context to // classify it and to debug it. type APIError struct { @@ -106,6 +115,11 @@ func Classify(err error) ErrorClass { if errors.Is(err, ErrModelNotFound) || errors.Is(err, ErrUnsupported) { return ClassPermanent } + if errors.Is(err, ErrEmptyResponse) { + // An empty completion may be a one-off provider hiccup; another + // target (or, rarely, a retry) can produce real output. + return ClassTransient + } if errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.ECONNRESET) { return ClassTransient } diff --git a/llm/response.go b/llm/response.go index 068b9e0..524a670 100644 --- a/llm/response.go +++ b/llm/response.go @@ -86,3 +86,29 @@ func (r *Response) Text() string { func (r *Response) Message() Message { return Message{Role: RoleAssistant, Parts: r.Parts, ToolCalls: r.ToolCalls} } + +// IsEmpty reports whether the response carries no usable output: no tool +// calls and no meaningful content (no parts at all, or only whitespace +// text). A media/image part — or any non-text part — counts as content, so +// an image-only response is NOT empty. A "stop with nothing" like this is +// never a valid completion for an agent step or a Generate call; failover +// chains treat it as a per-target failure (see ErrEmptyResponse). +func (r *Response) IsEmpty() bool { + if r == nil { + return true + } + if len(r.ToolCalls) > 0 { + return false + } + for _, p := range r.Parts { + if t, ok := p.(TextPart); ok { + if strings.TrimSpace(t.Text) != "" { + return false + } + continue + } + // Any non-text part (image/media) is meaningful output. + return false + } + return true +} diff --git a/llm/response_test.go b/llm/response_test.go new file mode 100644 index 0000000..250d11d --- /dev/null +++ b/llm/response_test.go @@ -0,0 +1,29 @@ +package llm + +import "testing" + +func TestResponseIsEmpty(t *testing.T) { + img := ImagePart{MIME: "image/png", Data: []byte{0x89, 0x50, 0x4e, 0x47}} + tests := []struct { + name string + resp *Response + want bool + }{ + {"nil response", nil, true}, + {"no parts, no tool calls", &Response{FinishReason: FinishStop}, true}, + {"single empty text part", &Response{Parts: []Part{Text("")}}, true}, + {"whitespace-only text", &Response{Parts: []Part{Text(" \n\t ")}}, true}, + {"real text", &Response{Parts: []Part{Text("hello")}}, false}, + {"tool call, no text", &Response{ToolCalls: []ToolCall{{ID: "1", Name: "x"}}}, false}, + {"image only", &Response{Parts: []Part{img}}, false}, + {"empty text but a tool call", &Response{Parts: []Part{Text("")}, ToolCalls: []ToolCall{{ID: "1", Name: "x"}}}, false}, + {"whitespace text plus an image", &Response{Parts: []Part{Text(" "), img}}, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.resp.IsEmpty(); got != tt.want { + t.Errorf("IsEmpty() = %v, want %v", got, tt.want) + } + }) + } +}