internal/router/scheduler: add serial scheduler, default on this fork
Add a strict one-model-at-a-time scheduler. Requests run in exact arrival order; at most one runs at a time; switching to a different model evicts every other running model first so a single model occupies memory at a time. Unlike fifo it never reorders or batches same-model requests, and it ignores group/matrix co-residency entirely, making the single-model guarantee a property of the scheduler rather than the config. - new Serial scheduler implementing the Scheduler interface - register "serial" in scheduler.New; default routing.scheduler.use to "serial" at config load (fifo still selectable for upstream behavior) - update config schema, example config, and config defaults tests Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,253 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
)
|
||||
|
||||
// Serial is a strict one-model-at-a-time scheduler. Unlike FIFO it never reorders
|
||||
// or batches: requests run in exact arrival order and at most one request runs at
|
||||
// any instant. When the next request targets a model other than the one loaded,
|
||||
// every other running model is evicted and the target is loaded before it runs,
|
||||
// so a single model occupies memory at a time — at the cost of throughput.
|
||||
//
|
||||
// Example: A B C A is served as A B C A. The final A reloads its model even
|
||||
// though it ran first, because B and C displaced it in between. (FIFO, by
|
||||
// contrast, would batch the two A requests: A A B C.)
|
||||
//
|
||||
// Serial ignores group/eviction policy entirely: it always evicts every other
|
||||
// running model, regardless of how groups are configured. That is what makes the
|
||||
// single-model guarantee a property of the scheduler rather than of the config.
|
||||
//
|
||||
// Like FIFO, every method runs on the router's single run-loop goroutine, so no
|
||||
// internal locking is needed.
|
||||
type Serial struct {
|
||||
name string
|
||||
logger *logmon.Monitor
|
||||
effects Effects
|
||||
|
||||
// queued holds requests in strict arrival order. It is never reordered.
|
||||
queued []HandlerReq
|
||||
|
||||
// active is the one request currently being processed (loading or serving),
|
||||
// or nil when idle. phase is meaningful only while active != nil.
|
||||
active *HandlerReq
|
||||
phase serialPhase
|
||||
}
|
||||
|
||||
// serialPhase is the lifecycle stage of the active request.
|
||||
type serialPhase int
|
||||
|
||||
const (
|
||||
phaseIdle serialPhase = iota
|
||||
phaseSwapping // waiting for OnSwapDone for active.Model
|
||||
phaseServing // waiting for OnServeDone for active.Model
|
||||
)
|
||||
|
||||
// NewSerial builds a Serial scheduler. It takes no Swapper: eviction is always
|
||||
// "stop every other running model", so the group planner is not consulted.
|
||||
func NewSerial(name string, logger *logmon.Monitor, eff Effects) *Serial {
|
||||
return &Serial{
|
||||
name: name,
|
||||
logger: logger,
|
||||
effects: eff,
|
||||
}
|
||||
}
|
||||
|
||||
// OnRequest validates the model and appends the request to the tail of the queue,
|
||||
// then tries to start the next job. Unknown models fail immediately.
|
||||
func (s *Serial) OnRequest(req HandlerReq) {
|
||||
if _, ok := s.effects.ModelState(req.Model); !ok {
|
||||
s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model)
|
||||
s.effects.GrantError(req, ErrModelNotFound)
|
||||
return
|
||||
}
|
||||
s.queued = append(s.queued, req)
|
||||
broadcastQueuePositions(s.queued)
|
||||
s.startNext()
|
||||
}
|
||||
|
||||
// startNext begins processing the head of the queue when nothing is active. It
|
||||
// fast-paths a request whose model is already the sole loaded-and-ready process;
|
||||
// otherwise it launches a swap that evicts every other running model first. The
|
||||
// loop skips over requests for models that vanished (e.g. a config reload) and
|
||||
// requests whose caller disconnected before they could be served.
|
||||
func (s *Serial) startNext() {
|
||||
if s.active != nil {
|
||||
return // a job is already loading or serving
|
||||
}
|
||||
for len(s.queued) > 0 {
|
||||
req := s.queued[0]
|
||||
s.queued = s.queued[1:]
|
||||
broadcastQueuePositions(s.queued)
|
||||
|
||||
state, ok := s.effects.ModelState(req.Model)
|
||||
if !ok {
|
||||
s.effects.GrantError(req, ErrModelNotFound)
|
||||
continue
|
||||
}
|
||||
|
||||
r := req
|
||||
s.active = &r
|
||||
|
||||
evict := s.otherRunning(req.Model)
|
||||
if state == process.StateReady && len(evict) == 0 {
|
||||
// Already loaded and the only model running — serve immediately.
|
||||
s.logger.Debugf("%s: serving model %s (already loaded)", s.name, req.Model)
|
||||
if s.serve() {
|
||||
return
|
||||
}
|
||||
continue // caller gone; pick the next request
|
||||
}
|
||||
|
||||
s.logger.Debugf("%s: swapping to model %s, evicting %v", s.name, req.Model, evict)
|
||||
s.phase = phaseSwapping
|
||||
s.effects.StartSwap(req.Model, evict)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// serve hands the active request its tracked handler. It returns true when the
|
||||
// request is now serving (await OnServeDone); false when the caller had already
|
||||
// disconnected, in which case active is cleared so the next job can start.
|
||||
func (s *Serial) serve() bool {
|
||||
if s.effects.GrantServe(*s.active, s.active.Model) {
|
||||
s.phase = phaseServing
|
||||
return true
|
||||
}
|
||||
s.logger.Debugf("%s: caller for model %s gone before serve", s.name, s.active.Model)
|
||||
s.active = nil
|
||||
s.phase = phaseIdle
|
||||
return false
|
||||
}
|
||||
|
||||
// OnSwapDone fires when the load for the active request completes. On success the
|
||||
// request is served; on failure its caller receives the error and the queue
|
||||
// advances. A SwapDone that does not match the active load (e.g. its request was
|
||||
// unloaded or cancelled mid-load) is ignored.
|
||||
func (s *Serial) OnSwapDone(ev SwapDone) {
|
||||
if s.active == nil || s.phase != phaseSwapping || s.active.Model != ev.ModelID {
|
||||
return
|
||||
}
|
||||
if ev.Err != nil {
|
||||
s.logger.Debugf("%s: swap for model %s failed: %v", s.name, ev.ModelID, ev.Err)
|
||||
s.effects.GrantError(*s.active, ev.Err)
|
||||
s.active = nil
|
||||
s.phase = phaseIdle
|
||||
s.startNext()
|
||||
return
|
||||
}
|
||||
if !s.serve() {
|
||||
s.startNext() // caller vanished while the model loaded; move on
|
||||
}
|
||||
}
|
||||
|
||||
// OnServeDone fires when the active request's handler returns. The slot is freed
|
||||
// and the next queued request begins.
|
||||
func (s *Serial) OnServeDone(ev ServeDoneEvent) {
|
||||
if s.active == nil || s.phase != phaseServing {
|
||||
return
|
||||
}
|
||||
s.active = nil
|
||||
s.phase = phaseIdle
|
||||
s.startNext()
|
||||
}
|
||||
|
||||
// OnCancel removes a disconnected client's request from the queue. A request that
|
||||
// is already active is left to finish: if it was loading, OnSwapDone's serve()
|
||||
// will find the caller gone (GrantServe false) and advance; if it was serving,
|
||||
// its handler returns normally and reaches OnServeDone.
|
||||
func (s *Serial) OnCancel(req HandlerReq) {
|
||||
if len(s.queued) == 0 {
|
||||
return
|
||||
}
|
||||
kept := s.queued[:0]
|
||||
removed := false
|
||||
for _, q := range s.queued {
|
||||
if q.Respond == req.Respond {
|
||||
removed = true
|
||||
continue
|
||||
}
|
||||
kept = append(kept, q)
|
||||
}
|
||||
s.queued = kept
|
||||
if removed {
|
||||
s.logger.Debugf("%s: cancelled request for model %s pruned from queue", s.name, req.Model)
|
||||
broadcastQueuePositions(s.queued)
|
||||
}
|
||||
}
|
||||
|
||||
// OnUnload reconciles state for an unload, stops the targeted processes, and
|
||||
// advances the queue. It mirrors the FIFO contract: queued requests for unloaded
|
||||
// models are failed; an active *loading* request for an unloaded model is failed
|
||||
// (its swap goroutine is left to finish and its SwapDone is then ignored); an
|
||||
// active *serving* request is left for its handler to end when StopProcesses
|
||||
// kills the upstream. The Stop is synchronous so callers of Unload can rely on
|
||||
// the processes being stopped on return.
|
||||
func (s *Serial) OnUnload(targets []string, timeout time.Duration) {
|
||||
unloadErr := fmt.Errorf("%s: model unloaded", s.name)
|
||||
|
||||
targetSet := make(map[string]bool, len(targets))
|
||||
for _, id := range targets {
|
||||
targetSet[id] = true
|
||||
}
|
||||
|
||||
if s.active != nil && s.phase == phaseSwapping && targetSet[s.active.Model] {
|
||||
s.effects.GrantError(*s.active, unloadErr)
|
||||
s.active = nil
|
||||
s.phase = phaseIdle
|
||||
}
|
||||
|
||||
if len(s.queued) > 0 {
|
||||
kept := s.queued[:0]
|
||||
for _, q := range s.queued {
|
||||
if targetSet[q.Model] {
|
||||
s.effects.GrantError(q, unloadErr)
|
||||
continue
|
||||
}
|
||||
kept = append(kept, q)
|
||||
}
|
||||
s.queued = kept
|
||||
broadcastQueuePositions(s.queued)
|
||||
}
|
||||
|
||||
s.effects.StopProcesses(timeout, targets)
|
||||
|
||||
// A still-serving active request advances via OnServeDone when its killed
|
||||
// handler returns; only start the next job when nothing is active now.
|
||||
if s.active == nil {
|
||||
s.startNext()
|
||||
}
|
||||
}
|
||||
|
||||
// OnShutdown grants err to every request the scheduler still holds: an active
|
||||
// loading request and all queued requests. A serving request is torn down with
|
||||
// its process by the baseRouter.
|
||||
func (s *Serial) OnShutdown(err error) {
|
||||
if s.active != nil && s.phase == phaseSwapping {
|
||||
s.effects.GrantError(*s.active, err)
|
||||
s.active = nil
|
||||
s.phase = phaseIdle
|
||||
}
|
||||
for _, q := range s.queued {
|
||||
s.effects.GrantError(q, err)
|
||||
}
|
||||
s.queued = nil
|
||||
}
|
||||
|
||||
// otherRunning returns every running model except target, sorted for
|
||||
// deterministic eviction.
|
||||
func (s *Serial) otherRunning(target string) []string {
|
||||
var out []string
|
||||
for id := range s.effects.RunningModels() {
|
||||
if id != target {
|
||||
out = append(out, id)
|
||||
}
|
||||
}
|
||||
sort.Strings(out)
|
||||
return out
|
||||
}
|
||||
Reference in New Issue
Block a user