Files
llama-swap/internal/router/scheduler/serial_test.go
T
steve 542b79dacf
Validate JSON Schema / validate-schema (push) Successful in 9m53s
Linux CI / run-tests (push) Failing after 15m57s
Windows CI / run-tests (push) Has been cancelled
internal/router/scheduler: add serial scheduler, default on this fork
Add a strict one-model-at-a-time scheduler. Requests run in exact
arrival order; at most one runs at a time; switching to a different
model evicts every other running model first so a single model occupies
memory at a time. Unlike fifo it never reorders or batches same-model
requests, and it ignores group/matrix co-residency entirely, making the
single-model guarantee a property of the scheduler rather than the config.

- new Serial scheduler implementing the Scheduler interface
- register "serial" in scheduler.New; default routing.scheduler.use to
  "serial" at config load (fifo still selectable for upstream behavior)
- update config schema, example config, and config defaults tests

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 12:17:32 -04:00

392 lines
12 KiB
Go

package scheduler
import (
"errors"
"io"
"testing"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/process"
)
// Every Serial method runs on the router's single run-loop goroutine, so the
// tests in this file call them directly and synchronously. They reuse
// fakeEffects and the req/reqCh helpers from fifo_test.go. OnSwapDone signals
// that a load completed and OnServeDone that a served request finished — the
// same events the run loop delivers.

// newSerial builds a Serial scheduler named "test" that reports through eff
// and writes its log output to io.Discard.
func newSerial(eff Effects) *Serial {
	sink := logmon.NewWriter(io.Discard)
	return NewSerial("test", sink, eff)
}
// lastStart returns the newest StartSwap record held by eff. It fails the
// test immediately when nothing has been recorded yet.
func lastStart(t *testing.T, eff *fakeEffects) startRec {
	t.Helper()
	n := len(eff.starts)
	if n == 0 {
		t.Fatal("no StartSwap recorded")
	}
	return eff.starts[n-1]
}
// sameSet reports whether a and b contain the same strings with the same
// multiplicities, regardless of order. A nil slice and an empty slice are
// treated as equal.
func sameSet(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	// Tally a, then consume the tally with b. Because the lengths match, b
	// running out of credit for any element is the only way to differ.
	remaining := make(map[string]int, len(a))
	for _, item := range a {
		remaining[item]++
	}
	for _, item := range b {
		if remaining[item] == 0 {
			return false
		}
		remaining[item]--
	}
	return true
}
// servedOrder lists, in recorded order, the model ID of each grant in eff
// that carries no error and has its serve flag set.
func servedOrder(eff *fakeEffects) []string {
	var models []string
	for _, grant := range eff.grants {
		if grant.err != nil || !grant.serve {
			continue // error grant or non-serve grant: not a successful serve
		}
		models = append(models, grant.model)
	}
	return models
}
// TestSerial_FastPath_AlreadyLoaded checks that a request for a model whose
// state is already StateReady is served without any StartSwap call.
func TestSerial_FastPath_AlreadyLoaded(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateReady
	sched := newSerial(fx)

	sched.OnRequest(req("a"))

	if n := len(fx.starts); n != 0 {
		t.Errorf("StartSwap calls=%d want 0 (already loaded, no swap)", n)
	}
	if n := fx.served("a"); n != 1 {
		t.Errorf("served(a)=%d want 1", n)
	}
}
// TestSerial_ColdStart_LoadsThenServes checks that a request for a stopped
// model triggers exactly one StartSwap and is only served once OnSwapDone
// reports the load finished.
func TestSerial_ColdStart_LoadsThenServes(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a"))
	if n := fx.startsFor("a"); n != 1 {
		t.Fatalf("StartSwap(a)=%d want 1", n)
	}
	if n := fx.served("a"); n != 0 {
		t.Errorf("served(a)=%d want 0 before load completes", n)
	}

	// Mark the model ready and deliver the load-complete event.
	fx.states["a"] = process.StateReady
	sched.OnSwapDone(SwapDone{ModelID: "a"})
	if n := fx.served("a"); n != 1 {
		t.Errorf("served(a)=%d want 1 after load", n)
	}
}
// TestSerial_UnknownModel checks that a request for a model with no recorded
// state starts no swap and is answered with a single error grant wrapping
// ErrModelNotFound.
func TestSerial_UnknownModel(t *testing.T) {
	fx := newFakeEffects() // no states => unknown
	sched := newSerial(fx)

	sched.OnRequest(req("ghost"))

	if n := len(fx.starts); n != 0 {
		t.Errorf("StartSwap calls=%d want 0", n)
	}
	if n := fx.errored("ghost"); n != 1 {
		t.Fatalf("errored(ghost)=%d want 1", n)
	}
	// Safe to index: the Fatalf above guarantees at least one grant exists.
	if err := fx.grants[0].err; !errors.Is(err, ErrModelNotFound) {
		t.Errorf("err=%v want ErrModelNotFound", err)
	}
}
// TestSerial_EvictsEveryOtherModel checks that loading a stopped model while
// two other models are ready produces a StartSwap whose eviction list holds
// both of them.
func TestSerial_EvictsEveryOtherModel(t *testing.T) {
	fx := newFakeEffects()
	fx.states["x"] = process.StateReady // already running
	fx.states["y"] = process.StateReady // also running (e.g. left over)
	fx.states["a"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a"))

	swap := lastStart(t, fx)
	if swap.model != "a" {
		t.Fatalf("loading %s want a", swap.model)
	}
	if want := []string{"x", "y"}; !sameSet(swap.evict, want) {
		t.Errorf("evict=%v want [x y] (serial evicts ALL other models)", swap.evict)
	}
}
// TestSerial_OneJobAtATime checks that a second request is held back while
// the first is still being served, and that its load begins only after the
// first request's OnServeDone arrives.
func TestSerial_OneJobAtATime(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateReady
	fx.states["b"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a")) // granted right away
	sched.OnRequest(req("b")) // has to wait: a is still serving

	if n := fx.startsFor("b"); n != 0 {
		t.Fatalf("StartSwap(b)=%d want 0 while a is serving", n)
	}
	if n := fx.served("a"); n != 1 {
		t.Fatalf("served(a)=%d want 1", n)
	}

	// Once a is done, b is allowed to load and a becomes the eviction target.
	sched.OnServeDone(ServeDoneEvent{ModelID: "a"})
	if n := fx.startsFor("b"); n != 1 {
		t.Fatalf("StartSwap(b)=%d want 1 after a finished", n)
	}
	swap := lastStart(t, fx)
	if !sameSet(swap.evict, []string{"a"}) {
		t.Errorf("b evict=%v want [a]", swap.evict)
	}
}
// TestSerial_SameModelConsecutive_NoReload checks that two back-to-back
// requests for the same model are served one after the other using a single
// StartSwap: the second request must not trigger a reload.
func TestSerial_SameModelConsecutive_NoReload(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a")) // triggers the cold load
	sched.OnRequest(req("a")) // waits behind the first request

	fx.states["a"] = process.StateReady
	sched.OnSwapDone(SwapDone{ModelID: "a"}) // load done: first request is served
	if n := fx.served("a"); n != 1 {
		t.Fatalf("served(a)=%d want 1 (one at a time)", n)
	}

	sched.OnServeDone(ServeDoneEvent{ModelID: "a"}) // first finished: second is served
	if n := fx.served("a"); n != 2 {
		t.Fatalf("served(a)=%d want 2", n)
	}
	if n := fx.startsFor("a"); n != 1 {
		t.Errorf("StartSwap(a)=%d want 1 (second request must not reload)", n)
	}
}
// TestSerial_StrictArrivalOrder covers the scheduler's central guarantee:
// requests for qwen36, qwen35, sdxl and qwen36 again are executed in exactly
// that order, with the previously loaded model evicted at every switch. The
// final qwen36 request forces a reload even though qwen36 ran first.
func TestSerial_StrictArrivalOrder(t *testing.T) {
	fx := newFakeEffects()
	for _, id := range []string{"qwen36", "qwen35", "sdxl"} {
		fx.states[id] = process.StateStopped
	}
	sched := newSerial(fx)

	// The arrival sequence doubles as the expected serve order.
	arrival := []string{"qwen36", "qwen35", "sdxl", "qwen36"}
	for _, id := range arrival {
		sched.OnRequest(req(id))
	}

	// Just the head of the queue may be loading; everything else waits.
	if n := len(fx.starts); n != 1 || fx.starts[0].model != "qwen36" {
		t.Fatalf("starts=%+v want only [qwen36] loading first", fx.starts)
	}

	// advance asserts that the latest StartSwap targets model with the
	// expected evictions, applies those state changes to the fake, and then
	// delivers the load-done and serve-done events so the scheduler moves on
	// to the next queued request.
	advance := func(model string, wantEvict []string) {
		t.Helper()
		swap := lastStart(t, fx)
		if swap.model != model {
			t.Fatalf("loading %q want %q", swap.model, model)
		}
		if !sameSet(swap.evict, wantEvict) {
			t.Fatalf("loading %q evict=%v want %v", model, swap.evict, wantEvict)
		}
		// Make the fake reflect the eviction and the completed load.
		for _, victim := range swap.evict {
			fx.states[victim] = process.StateStopped
		}
		fx.states[model] = process.StateReady
		sched.OnSwapDone(SwapDone{ModelID: model})
		sched.OnServeDone(ServeDoneEvent{ModelID: model})
	}

	advance("qwen36", nil)                // cold load, nothing else is running
	advance("qwen35", []string{"qwen36"}) // qwen36 is evicted
	advance("sdxl", []string{"qwen35"})   // qwen35 is evicted
	advance("qwen36", []string{"sdxl"})   // qwen36 is reloaded, sdxl is evicted

	if got := servedOrder(fx); !sameOrder(got, arrival) {
		t.Fatalf("serve order=%v want %v", got, arrival)
	}
}
// sameOrder reports whether a and b have the same length and hold equal
// strings at every index. A nil slice and an empty slice compare equal.
func sameOrder(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for i, v := range a {
		if b[i] != v {
			return false
		}
	}
	return true
}
// TestSerial_SwapError_FailsCallerAndAdvances checks that when the active
// load reports an error, the waiting caller receives an error grant and the
// next queued request starts loading.
func TestSerial_SwapError_FailsCallerAndAdvances(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	fx.states["b"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a"))
	sched.OnRequest(req("b")) // waits behind a

	// Fail a's load: a's caller gets the error and b takes over.
	loadErr := errors.New("boom")
	sched.OnSwapDone(SwapDone{ModelID: "a", Err: loadErr})

	if n := fx.errored("a"); n != 1 {
		t.Fatalf("errored(a)=%d want 1", n)
	}
	if n := fx.startsFor("b"); n != 1 {
		t.Fatalf("StartSwap(b)=%d want 1 after a's load failed", n)
	}
}
// TestSerial_GrantServeFalse_Advances checks that if the serve grant for the
// active request reports false (its caller has disconnected by serve time),
// the scheduler moves on to the next queued request.
func TestSerial_GrantServeFalse_Advances(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	fx.states["b"] = process.StateStopped
	fx.serveResult["a"] = false // the caller for a has gone by grant time
	sched := newSerial(fx)

	sched.OnRequest(req("a"))
	sched.OnRequest(req("b")) // waits in the queue

	fx.states["a"] = process.StateReady
	sched.OnSwapDone(SwapDone{ModelID: "a"}) // grant is refused, so b is next

	if n := fx.served("a"); n != 0 {
		t.Errorf("served(a)=%d want 0 (caller gone)", n)
	}
	if n := fx.startsFor("b"); n != 1 {
		t.Fatalf("StartSwap(b)=%d want 1 (advanced after gone caller)", n)
	}
}
// TestSerial_OnCancel_QueuedRequest checks that cancelling a request that is
// still waiting removes it from the queue, so no StartSwap is issued for its
// model once the active request completes.
func TestSerial_OnCancel_QueuedRequest(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	fx.states["b"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(reqCh("a")) // a begins loading
	pending := reqCh("b")
	sched.OnRequest(pending) // b waits behind a
	if n := len(sched.queued); n != 1 {
		t.Fatalf("queued=%d want 1", n)
	}

	sched.OnCancel(pending)
	if n := len(sched.queued); n != 0 {
		t.Fatalf("queued=%d want 0 after cancel", n)
	}

	// Finish a's load and serve; with b cancelled there is nothing to start.
	fx.states["a"] = process.StateReady
	sched.OnSwapDone(SwapDone{ModelID: "a"})
	sched.OnServeDone(ServeDoneEvent{ModelID: "a"})
	if n := fx.startsFor("b"); n != 0 {
		t.Errorf("StartSwap(b)=%d want 0 (cancelled before its turn)", n)
	}
}
// TestSerial_OnShutdown_FailsQueuedAndActiveLoad checks that OnShutdown
// produces an error grant for the request that is loading and for each
// request still queued, and leaves the queue empty.
func TestSerial_OnShutdown_FailsQueuedAndActiveLoad(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	fx.states["b"] = process.StateStopped
	fx.states["c"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a")) // becomes the active load
	sched.OnRequest(req("b")) // waits
	sched.OnRequest(req("c")) // waits

	sched.OnShutdown(errors.New("shutting down"))

	// NOTE(review): errored("") presumably counts error grants for every
	// model — confirm against fakeEffects in fifo_test.go.
	if n := fx.errored(""); n != 3 {
		t.Errorf("error grants=%d want 3 (active load + 2 queued)", n)
	}
	if n := len(sched.queued); n != 0 {
		t.Errorf("queued=%d want 0 after shutdown", n)
	}
}
// TestSerial_OnUnload_WhileServing checks that unloading the model that is
// currently serving does not strand the queue. OnUnload stops the process but
// the active request still ends through OnServeDone, and only then does the
// next queued request start.
func TestSerial_OnUnload_WhileServing(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateReady
	fx.states["b"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a")) // a is ready, so this is granted right away
	sched.OnRequest(req("b")) // waits behind a
	if n := fx.served("a"); n != 1 {
		t.Fatalf("served(a)=%d want 1", n)
	}

	// Unloading a mid-serve stops its process. The queue has to stay put
	// because the active serve has not reported completion yet.
	sched.OnUnload([]string{"a"}, time.Second)
	if len(fx.stops) != 1 || !sameSet(fx.stops[0].ids, []string{"a"}) {
		t.Errorf("StopProcesses=%+v want one call stopping [a]", fx.stops)
	}
	if n := fx.startsFor("b"); n != 0 {
		t.Fatalf("StartSwap(b)=%d want 0 before the serving request ends", n)
	}

	// The handler for the killed process returns, so OnServeDone moves on to b.
	fx.states["a"] = process.StateStopped
	sched.OnServeDone(ServeDoneEvent{ModelID: "a"})
	if n := fx.startsFor("b"); n != 1 {
		t.Fatalf("StartSwap(b)=%d want 1 after the serving request ended", n)
	}
}
// TestSerial_OnUnload_DropsQueuedAndStops checks that unloading a model whose
// load is in progress fails that load's caller, issues one StopProcesses call
// for the model, and lets the next queued request start.
func TestSerial_OnUnload_DropsQueuedAndStops(t *testing.T) {
	fx := newFakeEffects()
	fx.states["a"] = process.StateStopped
	fx.states["b"] = process.StateStopped
	sched := newSerial(fx)

	sched.OnRequest(req("a")) // becomes the active load
	sched.OnRequest(req("b")) // waits

	// Unloading a fails its in-flight load and stops the process.
	sched.OnUnload([]string{"a"}, time.Second)

	if n := fx.errored("a"); n != 1 {
		t.Errorf("errored(a)=%d want 1 (active load failed)", n)
	}
	if len(fx.stops) != 1 || !sameSet(fx.stops[0].ids, []string{"a"}) {
		t.Errorf("StopProcesses=%+v want one call stopping [a]", fx.stops)
	}
	// b was only queued, not unloaded, so it starts now that a's load is gone.
	if n := fx.startsFor("b"); n != 1 {
		t.Errorf("StartSwap(b)=%d want 1 after unload advanced the queue", n)
	}
}