Files
llama-swap/internal/router/scheduler/fifo.go
T
Benson Wong 6cf1317341 schedule,shared: move concurrency 429 limits into scheduler code (#849)
- make concurrency limiting the scheduler.Scheduler's responsibility
- eliminate the separate concurrency limit middleware 
- move concurrencyLimit logic into scheduler.FIFO to maintain backwards compatibility
- add HTTPError from #834 

Updates #834
2026-06-15 22:35:12 -07:00

484 lines
15 KiB
Go

package scheduler
import (
"fmt"
"sort"
"time"
"github.com/mostlygeek/llama-swap/internal/config"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/process"
"github.com/mostlygeek/llama-swap/internal/shared"
)
// defaultConcurrencyLimit caps simultaneous in-flight requests per model. It
// applies when a model's ConcurrencyLimit is unset or not greater than zero
// (see NewFIFO), and to any model ID that has no entry in FIFO.limits.
const defaultConcurrencyLimit = 10
// activeSwap tracks one in-flight swap and the callers waiting on it.
type activeSwap struct {
// modelID is the model this swap is bringing up; it is also the key under
// which the swap is stored in FIFO.active.
modelID string
// evict is the eviction set the planner returned when the swap was started.
// It is passed to Effects.StartSwap and consulted by collidesWith.
evict []string
// waiters are the requests answered when the swap completes, is unloaded,
// or the scheduler shuts down. OnCancel may remove entries.
waiters []HandlerReq
}
// FIFO is the default scheduler. Requests are handled in first-in, first-out
// order. To reduce swapping, a request for a model that is already running is
// handled immediately by the running process.
//
// Requests arriving at this scheduler are reordered like this:
//
// A B C A B C --> A A B B C C
//
// The strategy is simple and reduces the number of swaps required.
//
// NOTE(review): no locking is visible in this file; presumably every method is
// invoked from a single goroutine — confirm against the caller.
type FIFO struct {
// name prefixes log messages and the "model unloaded" error.
name string
// logger receives debug output describing each scheduling decision.
logger *logmon.Monitor
// planner picks the eviction set for a target model (EvictionFor) and is
// told when a swap starts (OnSwapStart).
planner Swapper
// cfg supplies per-model queue priorities (cfg.Priority).
cfg config.FifoConfig
// effects is the scheduler's interface to the outside: querying model
// state, answering requests, starting swaps and stopping processes.
effects Effects
// limits maps model ID to its concurrency cap, built once in NewFIFO.
limits map[string]int
// active maps target model ID to its in-flight swap.
active map[string]*activeSwap
// inFlight counts, per model, handlers granted via grantHandler that have
// not yet been reported through OnServeDone.
inFlight map[string]int
// queued holds parked requests in priority order, then arrival order.
queued []HandlerReq
}
// NewFIFO constructs a FIFO scheduler with empty swap, in-flight and queue
// state. The per-model concurrency caps are computed up front from models: a
// ConcurrencyLimit greater than zero is used as-is, anything else falls back
// to defaultConcurrencyLimit.
func NewFIFO(name string, logger *logmon.Monitor, planner Swapper, cfg config.FifoConfig, models map[string]config.ModelConfig, eff Effects) *FIFO {
	sched := &FIFO{
		name:     name,
		logger:   logger,
		planner:  planner,
		cfg:      cfg,
		effects:  eff,
		limits:   make(map[string]int, len(models)),
		active:   make(map[string]*activeSwap),
		inFlight: make(map[string]int),
	}
	for modelID, modelCfg := range models {
		sched.limits[modelID] = defaultConcurrencyLimit
		if modelCfg.ConcurrencyLimit > 0 {
			sched.limits[modelID] = modelCfg.ConcurrencyLimit
		}
	}
	return sched
}
// OnRequest routes a single incoming ServeHTTP request. Work that has to wait
// (starting a process, stopping siblings, waiting for ready) is handed off as
// a swap through Effects and reported back later via OnSwapDone, so this
// method is not meant to block on it.
//
// The checks run in this order and the first one that applies wins:
//
//  1. ModelState does not know the model: answer with ErrModelNotFound.
//  2. A swap for the same model is already active: append the request to that
//     swap's waiters so a single swap serves every caller.
//  3. Fast path: the model is StateReady, the planner evicts nothing, and the
//     request does not collide with any active swap: grant a handler now.
//  4. The request collides with an active swap (it would evict that swap's
//     target, that swap evicts this model, or both evict a common model):
//     park it in the queue for a later drainQueue.
//  5. The eviction set contains a model that still has in-flight requests:
//     park it in the queue; OnServeDone retries once that model drains.
//  6. Otherwise start a new swap, possibly alongside other active swaps.
func (s *FIFO) OnRequest(req HandlerReq) {
	model := req.Model

	// (1) Unknown model.
	state, known := s.effects.ModelState(model)
	if !known {
		s.logger.Debugf("%s: model %s not handled by this router", s.name, model)
		s.effects.GrantError(req, ErrModelNotFound)
		return
	}

	// (2) Piggyback on a swap that is already loading this model.
	if existing, joining := s.active[model]; joining {
		s.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", s.name, model, len(existing.waiters)+1)
		existing.waiters = append(existing.waiters, req)
		return
	}

	running := s.runningSet(model)
	evict := s.planner.EvictionFor(model, running)
	// collidesWith only reads its arguments, so one evaluation serves both
	// the fast-path test and the collision test.
	collides := collidesWith(model, evict, s.active)

	switch {
	case state == process.StateReady && len(evict) == 0 && !collides:
		// (3) Fast path.
		s.logger.Debugf("%s: fast-path serving model %s (already ready)", s.name, model)
		s.grantHandler(req, model)
	case collides:
		// (4) Wait for the colliding swap to finish.
		s.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", s.name, model)
		s.enqueue(req)
	case conflictsWithInFlight(evict, s.inFlight):
		// (5) Wait for the busy process to drain.
		s.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", s.name, model)
		s.enqueue(req)
	default:
		// (6) Nothing in the way: start the swap.
		s.logger.Debugf("%s: starting swap for model %s, evicting %v", s.name, model, evict)
		s.startSwap(req, evict, running)
	}
}
// OnCancel forgets a request whose client has gone away. The request is
// identified by its Respond value and is removed from the queue and from the
// waiter list of every active swap. A swap left with no waiters keeps running;
// OnSwapDone later finds nothing to answer and just cleans up. Pruning the
// queue here also keeps drainQueue from starting a model load for a caller
// that is no longer there.
//
// When anything was removed, the remaining queued requests are sent their
// current positions.
func (s *FIFO) OnCancel(req HandlerReq) {
	pruned := false

	// dropCancelled filters reqs in place, keeping the backing array, and
	// records whether the cancelled request was found.
	dropCancelled := func(reqs []HandlerReq) []HandlerReq {
		survivors := reqs[:0]
		for i := range reqs {
			if reqs[i].Respond == req.Respond {
				pruned = true
				continue
			}
			survivors = append(survivors, reqs[i])
		}
		return survivors
	}

	s.queued = dropCancelled(s.queued)
	for _, swap := range s.active {
		swap.waiters = dropCancelled(swap.waiters)
	}

	if !pruned {
		return
	}
	s.logger.Debugf("%s: cancelled request for model %s pruned from scheduler", s.name, req.Model)
	broadcastQueuePositions(s.queued)
}
// OnSwapDone handles completion of the swap for ev.ModelID. An event for a
// model with no entry in active is ignored (OnUnload removes entries early).
// Otherwise the swap is removed from active and every waiter is answered:
// with ev.Err when the swap failed, with a handler when it succeeded. The
// queue is then drained once so requests that no longer collide with the
// remaining active swaps can proceed; requests still blocked keep their place.
func (s *FIFO) OnSwapDone(ev SwapDone) {
	finished, tracked := s.active[ev.ModelID]
	if !tracked {
		return
	}
	delete(s.active, ev.ModelID)

	failed := ev.Err != nil
	for i := range finished.waiters {
		waiter := finished.waiters[i]
		if failed {
			s.effects.GrantError(waiter, ev.Err)
			continue
		}
		s.grantHandler(waiter, ev.ModelID)
	}

	s.drainQueue()
}
// OnServeDone records that one handler for ev.ModelID has finished. While the
// model still has other requests in flight only the counter changes. Once the
// count reaches zero (or below) the entry is removed and the queue is drained,
// because requests that were parked to avoid evicting this process may now be
// able to start their swap.
func (s *FIFO) OnServeDone(ev ServeDoneEvent) {
	left := s.inFlight[ev.ModelID] - 1
	if left > 0 {
		s.inFlight[ev.ModelID] = left
		return
	}
	delete(s.inFlight, ev.ModelID)
	s.drainQueue()
}
// OnUnload brings scheduler state in line with an explicit unload of targets
// and then performs it. In order:
//
//   - waiters of any active swap whose target is being unloaded receive a
//     "model unloaded" error and the swap is removed from active;
//   - queued requests addressed to an unloaded model receive the same error
//     and leave the queue; other queued requests keep their order;
//   - the target processes are stopped through Effects before this method
//     returns, so Unload's caller can rely on them being stopped;
//   - the queue is drained, since removing swaps from active may have
//     unblocked requests that collided with them.
//
// The swap goroutine of a removed swap is left to finish; its SwapDone finds
// no entry in active and is dropped by OnSwapDone. inFlight is deliberately
// left alone: each dying handler reports through OnServeDone as usual.
func (s *FIFO) OnUnload(targets []string, timeout time.Duration) {
	unloadErr := fmt.Errorf("%s: model unloaded", s.name)

	unloading := make(map[string]bool, len(targets))
	for _, id := range targets {
		unloading[id] = true
	}

	// Release and forget active swaps that target an unloaded model.
	for id := range unloading {
		if swap, found := s.active[id]; found {
			for _, waiter := range swap.waiters {
				s.effects.GrantError(waiter, unloadErr)
			}
			delete(s.active, id)
		}
	}

	// Compact the queue in place, rejecting requests for unloaded models.
	queue := s.queued
	keep := 0
	for _, parked := range queue {
		if unloading[parked.Model] {
			s.effects.GrantError(parked, unloadErr)
			continue
		}
		queue[keep] = parked
		keep++
	}
	s.queued = queue[:keep]

	s.effects.StopProcesses(timeout, targets)

	s.drainQueue()
}
// OnShutdown answers every request still held by the scheduler with err: the
// waiters of each active swap and everything parked in the queue.
//
// Each request is forgotten once answered, mirroring what OnUnload does for
// the swaps it cancels. Without that, a SwapDone or ServeDone event arriving
// after shutdown would find the same requests still registered and could
// answer them a second time, bump inFlight for them, or start a new swap on
// their behalf. With the state cleared, a late SwapDone finds no entry in
// active and is dropped, and a late drainQueue finds an empty queue.
//
// inFlight is left untouched: handlers that are still serving report through
// OnServeDone in the normal way.
func (s *FIFO) OnShutdown(err error) {
	for id, sw := range s.active {
		for _, w := range sw.waiters {
			s.effects.GrantError(w, err)
		}
		// Deleting the current key while ranging over a map is safe in Go.
		delete(s.active, id)
	}
	for _, w := range s.queued {
		s.effects.GrantError(w, err)
	}
	s.queued = nil
}
// grantHandler answers req with a tracked handler for modelID, subject to the
// model's concurrency cap. When the model already has as many requests in
// flight as its limit allows, the request is rejected with a
// shared.ConcurrencyLimitError (intended to surface as HTTP 429 with
// Retry-After). Otherwise the handler is offered through Effects.GrantServe,
// and the in-flight count is bumped only when GrantServe reports success;
// counting a grant that never reached its caller would strand the counter and
// block future evictions.
func (s *FIFO) grantHandler(req HandlerReq, modelID string) {
	if atCapacity := s.inFlight[modelID] >= s.limit(modelID); atCapacity {
		s.effects.GrantError(req, shared.ConcurrencyLimitError{})
		return
	}
	if !s.effects.GrantServe(req, modelID) {
		return
	}
	s.inFlight[modelID]++
}
// limit reports the concurrency cap for modelID: the value recorded in
// s.limits, or defaultConcurrencyLimit for a model with no entry there.
func (s *FIFO) limit(modelID string) int {
	configured, found := s.limits[modelID]
	if !found {
		return defaultConcurrencyLimit
	}
	return configured
}
// startSwap registers a new active swap for initial.Model, with initial as its
// first waiter, and then launches it. The planner is notified first, with the
// same running set that EvictionFor was given, so it logs against the picture
// the decision was based on. Effects.StartSwap is then asked to carry out the
// swap with the chosen eviction set.
func (s *FIFO) startSwap(initial HandlerReq, evict, running []string) {
	target := initial.Model
	swap := &activeSwap{
		modelID: target,
		evict:   evict,
		waiters: []HandlerReq{initial},
	}
	s.active[target] = swap

	s.planner.OnSwapStart(target, running)
	s.effects.StartSwap(target, evict)
}
// enqueue parks req in the queue by priority. The request is placed directly
// in front of the first queued request whose model has a strictly lower
// priority, or at the end when there is none. Higher-priority models are
// therefore served first, and requests of equal priority stay in arrival
// order. Priorities are read from cfg.Priority; a model that is not listed
// gets the map's zero value. Every queued request is then sent its position.
func (s *FIFO) enqueue(req HandlerReq) {
	prio := s.cfg.Priority[req.Model]

	// Skip past every request that is not strictly lower in priority.
	slot := 0
	for slot < len(s.queued) && !(s.cfg.Priority[s.queued[slot].Model] < prio) {
		slot++
	}

	// Grow by one, shift the tail right, and drop req into the gap.
	s.queued = append(s.queued, HandlerReq{})
	copy(s.queued[slot+1:], s.queued[slot:])
	s.queued[slot] = req

	broadcastQueuePositions(s.queued)
}
// drainQueue re-evaluates every parked request, in queue order, with the same
// decision tree OnRequest uses, against the current active and in-flight
// state. A request leaves the queue when its model is unknown (answered with
// ErrModelNotFound), when it joins an active swap, when it is served on the
// fast path, or when it starts a swap of its own. A request that still
// collides with an active swap, or would evict a busy process, is kept in its
// original relative order for the next attempt. The surviving requests are
// then sent their updated positions. An empty queue is a no-op.
func (s *FIFO) drainQueue() {
	if len(s.queued) == 0 {
		return
	}

	var stillBlocked []HandlerReq
	for _, parked := range s.queued {
		model := parked.Model

		state, known := s.effects.ModelState(model)
		if !known {
			s.effects.GrantError(parked, ErrModelNotFound)
			continue
		}

		if existing, joining := s.active[model]; joining {
			s.logger.Debugf("%s: queued request for model %s now joining in-flight swap", s.name, model)
			existing.waiters = append(existing.waiters, parked)
			continue
		}

		running := s.runningSet(model)
		evict := s.planner.EvictionFor(model, running)
		collides := collidesWith(model, evict, s.active)

		switch {
		case state == process.StateReady && len(evict) == 0 && !collides:
			s.logger.Debugf("%s: queued request for model %s now served fast-path", s.name, model)
			s.grantHandler(parked, model)
		case collides || conflictsWithInFlight(evict, s.inFlight):
			// Still blocked; try again on the next drain.
			stillBlocked = append(stillBlocked, parked)
		default:
			s.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", s.name, model, evict)
			s.startSwap(parked, evict, running)
		}
	}

	s.queued = stillBlocked
	broadcastQueuePositions(s.queued)
}
// runningSet builds the model set handed to the Swapper: every model that
// Effects.RunningModels reports, plus the target of every active swap other
// than excludeActive. The excluded model is the one whose own swap is being
// decided, so its in-flight entry must not count as "already running".
// Duplicates are collapsed and the result is sorted, which keeps eviction
// decisions derived from it deterministic. An empty set yields a nil slice.
func (s *FIFO) runningSet(excludeActive string) []string {
	unique := make(map[string]struct{})
	for id := range s.effects.RunningModels() {
		unique[id] = struct{}{}
	}
	for _, id := range activeTargets(s.active, excludeActive) {
		unique[id] = struct{}{}
	}

	var ids []string
	for id := range unique {
		ids = append(ids, id)
	}
	sort.Strings(ids)
	return ids
}
// activeTargets lists the target model ID of every active swap except
// exclude, in no particular order. These are models the scheduler has
// committed to but that process state may not reflect yet. It returns nil
// when there are no active swaps at all.
func activeTargets(active map[string]*activeSwap, exclude string) []string {
	n := len(active)
	if n == 0 {
		return nil
	}
	targets := make([]string, 0, n)
	for modelID := range active {
		if modelID != exclude {
			targets = append(targets, modelID)
		}
	}
	return targets
}
// collidesWith reports whether a new swap for target, evicting evict, would
// interfere with any currently active swap, i.e. it returns true when the new
// swap can NOT safely run alongside them. Against each other active swap, a
// collision is any of:
//
//   - the new swap would evict that swap's target;
//   - that swap is evicting the new swap's target;
//   - both swaps evict at least one common model.
//
// An active swap for target itself is skipped: same-target callers join that
// swap (handled before this is called) and do not collide with themselves.
func collidesWith(target string, evict []string, active map[string]*activeSwap) bool {
	for otherTarget, other := range active {
		if otherTarget == target {
			continue
		}
		switch {
		case containsString(evict, otherTarget),
			containsString(other.evict, target),
			slicesOverlap(evict, other.evict):
			return true
		}
	}
	return false
}
// slicesOverlap reports whether at least one string appears in both xs and
// ys. It is false when either slice is empty.
func slicesOverlap(xs, ys []string) bool {
	for i := 0; i < len(xs); i++ {
		if containsString(ys, xs[i]) {
			return true
		}
	}
	return false
}
// conflictsWithInFlight reports whether any model named in evict has a
// positive in-flight count. Stopping a process that is still serving would
// cut off its callers, so the scheduler holds the swap back until they finish.
// Models missing from inFlight count as idle.
func conflictsWithInFlight(evict []string, inFlight map[string]int) bool {
	for i := range evict {
		if busy := inFlight[evict[i]]; busy > 0 {
			return true
		}
	}
	return false
}
// containsString reports whether s is an element of xs. It compares whole
// strings exactly and is false for an empty or nil slice.
func containsString(xs []string, s string) bool {
	for i := 0; i < len(xs); i++ {
		if xs[i] == s {
			return true
		}
	}
	return false
}
// broadcastQueuePositions tells each queued request its current 1-indexed
// position through its PositionCh. Nothing here blocks: the position is first
// offered directly; if the channel will not take it, one pending value is
// discarded (if there is one) and the send is attempted once more, so a
// consumer that is behind sees the latest position rather than a stale one.
// If that second attempt also fails the update is dropped.
func broadcastQueuePositions(queued []HandlerReq) {
	for idx := range queued {
		ch := queued[idx].PositionCh
		pos := idx + 1

		// Common case: there is room, deliver and move on.
		select {
		case ch <- pos:
			continue
		default:
		}

		// Make room by discarding a stale position, if one is buffered.
		select {
		case <-ch:
		default:
		}

		// Retry once; give up silently if it still cannot be sent.
		select {
		case ch <- pos:
		default:
		}
	}
}