Files
llama-swap/internal/router/scheduler/serial.go
T
steve 542b79dacf
Validate JSON Schema / validate-schema (push) Successful in 9m53s
Linux CI / run-tests (push) Failing after 15m57s
Windows CI / run-tests (push) Has been cancelled
internal/router/scheduler: add serial scheduler, default on this fork
Add a strict one-model-at-a-time scheduler. Requests run in exact
arrival order; at most one runs at a time; switching to a different
model evicts every other running model first so a single model occupies
memory at a time. Unlike fifo it never reorders or batches same-model
requests, and it ignores group/matrix co-residency entirely, making the
single-model guarantee a property of the scheduler rather than the config.

- new Serial scheduler implementing the Scheduler interface
- register "serial" in scheduler.New; default routing.scheduler.use to
  "serial" at config load (fifo still selectable for upstream behavior)
- update config schema, example config, and config defaults tests

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 12:17:32 -04:00

254 lines
8.0 KiB
Go

package scheduler
import (
"fmt"
"sort"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/process"
)
// Serial is a strict one-model-at-a-time scheduler. Unlike FIFO it never reorders
// or batches: requests run in exact arrival order and at most one request runs at
// any instant. When the next request targets a model other than the one loaded,
// every other running model is evicted and the target is loaded before it runs,
// so a single model occupies memory at a time — at the cost of throughput.
//
// Example: A B C A is served as A B C A. The final A reloads its model even
// though it ran first, because B and C displaced it in between. (FIFO, by
// contrast, would batch the two A requests: A A B C.)
//
// Serial ignores group/eviction policy entirely: it always evicts every other
// running model, regardless of how groups are configured. That is what makes the
// single-model guarantee a property of the scheduler rather than of the config.
//
// Like FIFO, every method runs on the router's single run-loop goroutine, so no
// internal locking is needed.
type Serial struct {
// name identifies this scheduler instance. It prefixes every debug log line
// and the error text handed to requests that OnUnload fails.
name string
// logger receives the scheduler's debug output.
logger *logmon.Monitor
// effects is how the scheduler acts on the outside world: model state
// lookups, serve/error grants, swap launches, running-model queries and
// process stops all go through it.
effects Effects
// queued holds requests in strict arrival order. It is never reordered;
// entries only leave from the head (startNext) or by being pruned
// (OnCancel, OnUnload, OnShutdown).
queued []HandlerReq
// active is the one request currently being processed (loading or serving),
// or nil when idle. phase is meaningful only while active != nil.
active *HandlerReq
phase serialPhase
}
// serialPhase is the lifecycle stage of the active request. Its zero value is
// phaseIdle, so a Serial whose phase was never assigned starts out idle.
type serialPhase int
const (
// phaseIdle means no request is active.
phaseIdle serialPhase = iota
phaseSwapping // a swap was started for active.Model; waiting for OnSwapDone
phaseServing // the active request was granted its handler; waiting for OnServeDone
)
// NewSerial returns a Serial scheduler called name that logs to logger and acts
// through eff. No Swapper is accepted: Serial always stops every other running
// model before a load, so there is no group planner to consult. The returned
// scheduler has an empty queue and no active request.
func NewSerial(name string, logger *logmon.Monitor, eff Effects) *Serial {
	s := new(Serial)
	s.name = name
	s.logger = logger
	s.effects = eff
	return s
}
// OnRequest accepts a new request. A request for a model this router knows is
// appended to the tail of the queue, the queue positions are re-broadcast, and
// the scheduler tries to start the next job. A request for an unknown model is
// never queued: it is failed on the spot with ErrModelNotFound.
func (s *Serial) OnRequest(req HandlerReq) {
	if _, known := s.effects.ModelState(req.Model); known {
		s.queued = append(s.queued, req)
		broadcastQueuePositions(s.queued)
		s.startNext()
		return
	}

	s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model)
	s.effects.GrantError(req, ErrModelNotFound)
}
// startNext begins processing the head of the queue when nothing is active. It
// fast-paths a request whose model is already the sole loaded-and-ready process;
// otherwise it launches a swap that evicts every other running model first. The
// loop skips over requests for models that vanished (e.g. a config reload) and
// requests whose caller disconnected before they could be served.
//
// On return either a request is active (phaseSwapping or phaseServing) or the
// queue has been drained and the scheduler is idle.
func (s *Serial) startNext() {
	if s.active != nil {
		return // a job is already loading or serving
	}
	for len(s.queued) > 0 {
		req := s.queued[0]
		// Clear the vacated slot before re-slicing. Otherwise the backing
		// array keeps the dequeued request (and whatever it references)
		// reachable until the queue is next reallocated.
		s.queued[0] = HandlerReq{}
		s.queued = s.queued[1:]
		broadcastQueuePositions(s.queued)

		state, ok := s.effects.ModelState(req.Model)
		if !ok {
			// The model was valid when queued but is no longer known.
			s.effects.GrantError(req, ErrModelNotFound)
			continue
		}

		// req is declared inside the loop body, so each iteration has its own
		// variable and taking its address is safe.
		s.active = &req
		evict := s.otherRunning(req.Model)
		if state == process.StateReady && len(evict) == 0 {
			// Already loaded and the only model running — serve immediately.
			s.logger.Debugf("%s: serving model %s (already loaded)", s.name, req.Model)
			if s.serve() {
				return
			}
			continue // caller gone; serve() cleared active, pick the next request
		}

		s.logger.Debugf("%s: swapping to model %s, evicting %v", s.name, req.Model, evict)
		s.phase = phaseSwapping
		s.effects.StartSwap(req.Model, evict)
		return
	}
}
// serve tries to hand the active request its tracked handler via GrantServe.
// It reports true when the grant succeeded, leaving the scheduler in
// phaseServing until OnServeDone. It reports false when the caller had already
// disconnected; in that case active is cleared and the phase reset to idle so
// the caller of serve can move on to the next job. serve must only be called
// while s.active is non-nil.
func (s *Serial) serve() bool {
	granted := s.effects.GrantServe(*s.active, s.active.Model)
	if !granted {
		s.logger.Debugf("%s: caller for model %s gone before serve", s.name, s.active.Model)
		s.active, s.phase = nil, phaseIdle
		return false
	}

	s.phase = phaseServing
	return true
}
// OnSwapDone handles completion of a model load. It acts only when a request is
// active, the scheduler is in phaseSwapping, and the event's model matches the
// active request's model; any other SwapDone (e.g. for a request that was
// unloaded or cancelled mid-load) is dropped. A successful load leads to the
// request being served; a failed load hands ev.Err to the caller and the queue
// advances.
//
// NOTE(review): matching is by model ID only, so a late SwapDone from an
// abandoned load of the same model would be indistinguishable from the current
// one — confirm the router cannot deliver such an event.
func (s *Serial) OnSwapDone(ev SwapDone) {
	if s.active == nil {
		return
	}
	if s.phase != phaseSwapping || s.active.Model != ev.ModelID {
		return
	}

	if ev.Err == nil {
		if served := s.serve(); !served {
			s.startNext() // caller vanished while the model loaded; move on
		}
		return
	}

	s.logger.Debugf("%s: swap for model %s failed: %v", s.name, ev.ModelID, ev.Err)
	s.effects.GrantError(*s.active, ev.Err)
	s.active, s.phase = nil, phaseIdle
	s.startNext()
}
// OnServeDone handles the active request's handler returning. When a request is
// in phaseServing the slot is released and the next queued request is started;
// in any other state the event is ignored. The event payload itself is not
// inspected.
func (s *Serial) OnServeDone(ev ServeDoneEvent) {
	serving := s.active != nil && s.phase == phaseServing
	if !serving {
		return
	}

	s.active, s.phase = nil, phaseIdle
	s.startNext()
}
// OnCancel removes a disconnected client's request from the queue. Entries are
// matched by their Respond value, and the relative order of the remaining
// requests is preserved. Queue positions are re-broadcast only when something
// was actually removed.
//
// A request that is already active is left to finish: if it was loading,
// OnSwapDone's serve() will find the caller gone (GrantServe false) and advance;
// if it was serving, its handler returns normally and reaches OnServeDone.
func (s *Serial) OnCancel(req HandlerReq) {
	if len(s.queued) == 0 {
		return
	}

	// Filter in place, reusing the queue's backing array.
	kept := s.queued[:0]
	for _, q := range s.queued {
		if q.Respond == req.Respond {
			continue
		}
		kept = append(kept, q)
	}
	if len(kept) == len(s.queued) {
		return // nothing matched; the queue is untouched
	}

	// Clear the slots past the new length. They still hold stale copies of
	// shifted or pruned requests, which would otherwise stay reachable through
	// the backing array.
	for i := len(kept); i < len(s.queued); i++ {
		s.queued[i] = HandlerReq{}
	}
	s.queued = kept

	s.logger.Debugf("%s: cancelled request for model %s pruned from queue", s.name, req.Model)
	broadcastQueuePositions(s.queued)
}
// OnUnload reconciles state for an unload, stops the targeted processes, and
// advances the queue. It mirrors the FIFO contract: queued requests for unloaded
// models are failed; an active *loading* request for an unloaded model is failed
// (its swap goroutine is left to finish and its SwapDone is then ignored); an
// active *serving* request is left for its handler to end when StopProcesses
// kills the upstream. The Stop is synchronous so callers of Unload can rely on
// the processes being stopped on return.
//
// targets lists the model IDs to unload; timeout is passed through to
// StopProcesses. Failed requests receive a "<name>: model unloaded" error.
func (s *Serial) OnUnload(targets []string, timeout time.Duration) {
	unloadErr := fmt.Errorf("%s: model unloaded", s.name)

	targetSet := make(map[string]bool, len(targets))
	for _, id := range targets {
		targetSet[id] = true
	}

	// Fail a load in flight for a targeted model and free the slot.
	if s.active != nil && s.phase == phaseSwapping && targetSet[s.active.Model] {
		s.effects.GrantError(*s.active, unloadErr)
		s.active = nil
		s.phase = phaseIdle
	}

	if len(s.queued) > 0 {
		// Filter in place, failing every queued request for a targeted model.
		kept := s.queued[:0]
		for _, q := range s.queued {
			if targetSet[q.Model] {
				s.effects.GrantError(q, unloadErr)
				continue
			}
			kept = append(kept, q)
		}
		// Clear the slots past the new length so the backing array does not
		// keep stale copies of failed or shifted requests reachable.
		for i := len(kept); i < len(s.queued); i++ {
			s.queued[i] = HandlerReq{}
		}
		s.queued = kept
		broadcastQueuePositions(s.queued)
	}

	s.effects.StopProcesses(timeout, targets)

	// A still-serving active request advances via OnServeDone when its killed
	// handler returns; only start the next job when nothing is active now.
	if s.active == nil {
		s.startNext()
	}
}
// OnShutdown fails, with err, every request the scheduler is still responsible
// for: the active request if it is mid-load, followed by each queued request in
// order. The queue is then dropped. A request that is already serving is not
// touched here; it is torn down with its process by the baseRouter.
func (s *Serial) OnShutdown(err error) {
	if loading := s.active != nil && s.phase == phaseSwapping; loading {
		s.effects.GrantError(*s.active, err)
		s.active, s.phase = nil, phaseIdle
	}

	for i := range s.queued {
		s.effects.GrantError(s.queued[i], err)
	}
	s.queued = nil
}
// otherRunning lists the IDs of all running models other than target, in
// ascending string order so that eviction is deterministic. The result is nil
// when no other model is running.
func (s *Serial) otherRunning(target string) []string {
	var others []string
	for id := range s.effects.RunningModels() {
		if id == target {
			continue
		}
		others = append(others, id)
	}
	sort.Strings(others)
	return others
}