6cf1317341
- make concurrency limiting the scheduler.Scheduler's responsibility - eliminate the separate concurrency limit middleware - move concurrencyLimit logic into scheduler.FIFO to maintain backwards compatibility - add HTTPError from #834 Updates #834
484 lines
15 KiB
Go
484 lines
15 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"time"
|
|
|
|
"github.com/mostlygeek/llama-swap/internal/config"
|
|
"github.com/mostlygeek/llama-swap/internal/logmon"
|
|
"github.com/mostlygeek/llama-swap/internal/process"
|
|
"github.com/mostlygeek/llama-swap/internal/shared"
|
|
)
|
|
|
|
// defaultConcurrencyLimit is the per-model cap on simultaneous in-flight
// requests. It applies whenever a model's config does not set a positive
// concurrencyLimit of its own.
const (
	defaultConcurrencyLimit = 10
)
|
|
|
|
// activeSwap tracks one in-flight swap and the callers waiting on it.
|
|
type activeSwap struct {
|
|
modelID string
|
|
evict []string
|
|
waiters []HandlerReq
|
|
}
|
|
|
|
// FIFO is the default scheduler. Requests are handled in a first-in, first-out order.
|
|
// To reduce swapping requests for a model that is already running will be handled
|
|
// immediately by the running process.
|
|
//
|
|
// Requests into this schedule are handled like this:
|
|
//
|
|
// A B C A B C --> A A B B C C
|
|
//
|
|
// The strategy is simple and reduces the number of swaps required.
|
|
type FIFO struct {
|
|
name string
|
|
logger *logmon.Monitor
|
|
planner Swapper
|
|
cfg config.FifoConfig
|
|
effects Effects
|
|
|
|
limits map[string]int
|
|
active map[string]*activeSwap
|
|
inFlight map[string]int
|
|
queued []HandlerReq
|
|
}
|
|
|
|
// NewFIFO builds a FIFO scheduler. Per-model concurrency limits are derived
|
|
// from models: each model's ConcurrencyLimit overrides defaultConcurrencyLimit
|
|
// when set to a value greater than zero.
|
|
func NewFIFO(name string, logger *logmon.Monitor, planner Swapper, cfg config.FifoConfig, models map[string]config.ModelConfig, eff Effects) *FIFO {
|
|
limits := make(map[string]int, len(models))
|
|
for id, mc := range models {
|
|
limit := defaultConcurrencyLimit
|
|
if mc.ConcurrencyLimit > 0 {
|
|
limit = mc.ConcurrencyLimit
|
|
}
|
|
limits[id] = limit
|
|
}
|
|
|
|
return &FIFO{
|
|
name: name,
|
|
logger: logger,
|
|
planner: planner,
|
|
cfg: cfg,
|
|
effects: eff,
|
|
limits: limits,
|
|
active: make(map[string]*activeSwap),
|
|
inFlight: make(map[string]int),
|
|
}
|
|
}
|
|
|
|
// OnRequest decides what to do with one incoming ServeHTTP request. It never
|
|
// blocks indefinitely: any work that has to wait (starting a process, stopping
|
|
// siblings, waiting for ready) is deferred to a swap goroutine and reported back
|
|
// via OnSwapDone.
|
|
//
|
|
// The decision tree, in order:
|
|
//
|
|
// 1. Unknown model — respond with ErrModelNotFound and move on.
|
|
// 2. A swap to the same model is already in flight — attach this waiter so
|
|
// one swap serves all callers that asked for the same model.
|
|
// 3. Fast path — the target process is already ready, the planner sees
|
|
// nothing to evict, and no in-flight swap is evicting it. Hand back its
|
|
// ServeHTTP immediately.
|
|
// 4. Would collide with an in-flight swap (we'd stop their target, or they're
|
|
// stopping us) — park in the queue for OnSwapDone to drain.
|
|
// 5. Would evict a process that is still handling requests — park in the
|
|
// queue. OnServeDone will retry when the busy process drains.
|
|
// 6. Otherwise — start a new swap. This may run in parallel with other active
|
|
// swaps when their evict sets don't intersect.
|
|
func (s *FIFO) OnRequest(req HandlerReq) {
|
|
// (1) Unknown model.
|
|
state, ok := s.effects.ModelState(req.Model)
|
|
if !ok {
|
|
s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model)
|
|
s.effects.GrantError(req, ErrModelNotFound)
|
|
return
|
|
}
|
|
|
|
// (2) Join an in-flight swap for the same model.
|
|
if sw, ok := s.active[req.Model]; ok {
|
|
s.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", s.name, req.Model, len(sw.waiters)+1)
|
|
sw.waiters = append(sw.waiters, req)
|
|
return
|
|
}
|
|
|
|
running := s.runningSet(req.Model)
|
|
evict := s.planner.EvictionFor(req.Model, running)
|
|
|
|
// (3) Fast path: ready, nothing to evict, and nobody is evicting us.
|
|
if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) {
|
|
s.logger.Debugf("%s: fast-path serving model %s (already ready)", s.name, req.Model)
|
|
s.grantHandler(req, req.Model)
|
|
return
|
|
}
|
|
|
|
// (4) Collision with an in-flight swap — queue.
|
|
if collidesWith(req.Model, evict, s.active) {
|
|
s.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", s.name, req.Model)
|
|
s.enqueue(req)
|
|
return
|
|
}
|
|
|
|
// (5) Would evict a busy process — queue until it drains.
|
|
if conflictsWithInFlight(evict, s.inFlight) {
|
|
s.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", s.name, req.Model)
|
|
s.enqueue(req)
|
|
return
|
|
}
|
|
|
|
// (6) Start a new (possibly parallel) swap.
|
|
s.logger.Debugf("%s: starting swap for model %s, evicting %v", s.name, req.Model, evict)
|
|
s.startSwap(req, evict, running)
|
|
}
|
|
|
|
// OnCancel removes a request whose client has disconnected from the queue and
|
|
// from every in-flight swap's waiters. If the request was the sole waiter of an
|
|
// active swap, the swap goroutine is left to complete on its own — OnSwapDone
|
|
// will find no waiters and simply clean up. This prevents drainQueue from ever
|
|
// starting a model load for a caller that is no longer there.
|
|
func (s *FIFO) OnCancel(req HandlerReq) {
|
|
removed := false
|
|
|
|
// Prune from the queue.
|
|
if len(s.queued) > 0 {
|
|
kept := s.queued[:0]
|
|
for _, q := range s.queued {
|
|
if q.Respond == req.Respond {
|
|
removed = true
|
|
continue
|
|
}
|
|
kept = append(kept, q)
|
|
}
|
|
s.queued = kept
|
|
}
|
|
|
|
// Prune from any active swap's waiters.
|
|
for _, sw := range s.active {
|
|
filtered := sw.waiters[:0]
|
|
for _, w := range sw.waiters {
|
|
if w.Respond == req.Respond {
|
|
removed = true
|
|
continue
|
|
}
|
|
filtered = append(filtered, w)
|
|
}
|
|
sw.waiters = filtered
|
|
}
|
|
|
|
if removed {
|
|
s.logger.Debugf("%s: cancelled request for model %s pruned from scheduler", s.name, req.Model)
|
|
broadcastQueuePositions(s.queued)
|
|
}
|
|
}
|
|
|
|
// OnSwapDone fans the result out to every waiter that joined this swap, removes
|
|
// the swap from the active map, then walks the queue once, promoting any items
|
|
// that no longer collide with the remaining active set. FIFO order is preserved:
|
|
// items still blocked stay in place.
|
|
func (s *FIFO) OnSwapDone(ev SwapDone) {
|
|
sw, ok := s.active[ev.ModelID]
|
|
if !ok {
|
|
return
|
|
}
|
|
delete(s.active, ev.ModelID)
|
|
|
|
for _, w := range sw.waiters {
|
|
if ev.Err != nil {
|
|
s.effects.GrantError(w, ev.Err)
|
|
} else {
|
|
s.grantHandler(w, ev.ModelID)
|
|
}
|
|
}
|
|
|
|
s.drainQueue()
|
|
}
|
|
|
|
// OnServeDone decrements the per-model in-flight count and, when that drops to
|
|
// zero, retries the queue: requests whose swap was deferred because they would
|
|
// have evicted this (now-idle) process can now proceed.
|
|
func (s *FIFO) OnServeDone(ev ServeDoneEvent) {
|
|
s.inFlight[ev.ModelID]--
|
|
if s.inFlight[ev.ModelID] <= 0 {
|
|
delete(s.inFlight, ev.ModelID)
|
|
s.drainQueue()
|
|
}
|
|
}
|
|
|
|
// OnUnload reconciles router-owned state with the impending Stop, performs the
|
|
// Stop (synchronously, via Effects) so callers of Unload remain blocked until
|
|
// each targeted process has exited, then drains the queue.
|
|
func (s *FIFO) OnUnload(targets []string, timeout time.Duration) {
|
|
unloadErr := fmt.Errorf("%s: model unloaded", s.name)
|
|
|
|
targetSet := make(map[string]bool, len(targets))
|
|
for _, id := range targets {
|
|
targetSet[id] = true
|
|
}
|
|
|
|
// Release waiters of any in-flight swap whose target is being unloaded.
|
|
// The swap goroutine itself is left to finish on its own; when its
|
|
// SwapDone arrives, OnSwapDone will find no entry in active and drop it.
|
|
for id := range targetSet {
|
|
sw, ok := s.active[id]
|
|
if !ok {
|
|
continue
|
|
}
|
|
for _, w := range sw.waiters {
|
|
s.effects.GrantError(w, unloadErr)
|
|
}
|
|
delete(s.active, id)
|
|
}
|
|
|
|
// Drop queued requests addressed to unloaded models. Requests for other
|
|
// models stay queued and may benefit from drainQueue at the end.
|
|
if len(s.queued) > 0 {
|
|
kept := s.queued[:0]
|
|
for _, w := range s.queued {
|
|
if targetSet[w.Model] {
|
|
s.effects.GrantError(w, unloadErr)
|
|
continue
|
|
}
|
|
kept = append(kept, w)
|
|
}
|
|
s.queued = kept
|
|
}
|
|
|
|
// Stop the targeted processes. Done synchronously so Unload's caller can
|
|
// rely on "after Unload returns, the process is stopped". inFlight is
|
|
// intentionally NOT cleared here: each dying handler will fire its tracked
|
|
// serve and reach OnServeDone in the normal way.
|
|
s.effects.StopProcesses(timeout, targets)
|
|
|
|
// Removing entries from active above may have unblocked queued requests
|
|
// that previously collided with the now-cancelled swaps.
|
|
s.drainQueue()
|
|
}
|
|
|
|
// OnShutdown grants err to every waiter still held by the scheduler.
|
|
func (s *FIFO) OnShutdown(err error) {
|
|
for _, sw := range s.active {
|
|
for _, w := range sw.waiters {
|
|
s.effects.GrantError(w, err)
|
|
}
|
|
}
|
|
for _, w := range s.queued {
|
|
s.effects.GrantError(w, err)
|
|
}
|
|
}
|
|
|
|
// grantHandler hands the caller a tracked handler for modelID and, only if the
|
|
// caller was still there to receive it, bumps the in-flight count. Incrementing
|
|
// when the grant failed would strand the counter and block future evictions.
|
|
// Requests that would exceed the model's concurrency limit are rejected with a
|
|
// shared.NewConcurrencyLimitError (HTTP 429 with Retry-After).
|
|
func (s *FIFO) grantHandler(req HandlerReq, modelID string) {
|
|
if s.inFlight[modelID] >= s.limit(modelID) {
|
|
s.effects.GrantError(req, shared.ConcurrencyLimitError{})
|
|
return
|
|
}
|
|
if s.effects.GrantServe(req, modelID) {
|
|
s.inFlight[modelID]++
|
|
}
|
|
}
|
|
|
|
// limit returns the per-model concurrency cap, defaulting to
|
|
// defaultConcurrencyLimit when the model has no explicit entry.
|
|
func (s *FIFO) limit(modelID string) int {
|
|
if l, ok := s.limits[modelID]; ok {
|
|
return l
|
|
}
|
|
return defaultConcurrencyLimit
|
|
}
|
|
|
|
// startSwap records the swap as active and launches it via Effects. running is
|
|
// the set EvictionFor saw, forwarded to OnSwapStart so the planner logs against
|
|
// the same picture it decided on.
|
|
func (s *FIFO) startSwap(initial HandlerReq, evict, running []string) {
|
|
s.active[initial.Model] = &activeSwap{
|
|
modelID: initial.Model,
|
|
evict: evict,
|
|
waiters: []HandlerReq{initial},
|
|
}
|
|
s.planner.OnSwapStart(initial.Model, running)
|
|
s.effects.StartSwap(initial.Model, evict)
|
|
}
|
|
|
|
// enqueue inserts req into the queue in priority order: it goes just before the
|
|
// first queued item whose priority is strictly lower, so higher-priority models
|
|
// are serviced first while equal-priority requests keep their arrival (FIFO)
|
|
// order. Priorities come from the FifoConfig; unlisted models default to 0.
|
|
func (s *FIFO) enqueue(req HandlerReq) {
|
|
p := s.cfg.Priority[req.Model]
|
|
i := len(s.queued)
|
|
for j, q := range s.queued {
|
|
if s.cfg.Priority[q.Model] < p {
|
|
i = j
|
|
break
|
|
}
|
|
}
|
|
s.queued = append(s.queued, HandlerReq{})
|
|
copy(s.queued[i+1:], s.queued[i:])
|
|
s.queued[i] = req
|
|
broadcastQueuePositions(s.queued)
|
|
}
|
|
|
|
// drainQueue walks the queued requests in order, re-running the OnRequest
|
|
// decision tree against the (now smaller) active set. Items that can now start
|
|
// or join become satisfied; items still blocked remain queued in original order
|
|
// so they get another chance on the next swap completion.
|
|
func (s *FIFO) drainQueue() {
|
|
if len(s.queued) == 0 {
|
|
return
|
|
}
|
|
pending := s.queued
|
|
var remaining []HandlerReq
|
|
for _, req := range pending {
|
|
state, ok := s.effects.ModelState(req.Model)
|
|
if !ok {
|
|
s.effects.GrantError(req, ErrModelNotFound)
|
|
continue
|
|
}
|
|
if sw, ok := s.active[req.Model]; ok {
|
|
s.logger.Debugf("%s: queued request for model %s now joining in-flight swap", s.name, req.Model)
|
|
sw.waiters = append(sw.waiters, req)
|
|
continue
|
|
}
|
|
running := s.runningSet(req.Model)
|
|
evict := s.planner.EvictionFor(req.Model, running)
|
|
if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) {
|
|
s.logger.Debugf("%s: queued request for model %s now served fast-path", s.name, req.Model)
|
|
s.grantHandler(req, req.Model)
|
|
continue
|
|
}
|
|
if collidesWith(req.Model, evict, s.active) {
|
|
remaining = append(remaining, req)
|
|
continue
|
|
}
|
|
if conflictsWithInFlight(evict, s.inFlight) {
|
|
remaining = append(remaining, req)
|
|
continue
|
|
}
|
|
s.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", s.name, req.Model, evict)
|
|
s.startSwap(req, evict, running)
|
|
}
|
|
s.queued = remaining
|
|
broadcastQueuePositions(s.queued)
|
|
}
|
|
|
|
// runningSet is the live model set handed to the Swapper: every process the
|
|
// baseRouter reports as running, unioned with the targets of in-flight swaps
|
|
// (excluding excludeActive, the model whose own swap is being decided — its
|
|
// in-flight entry must not count as "already running"). The result is sorted so
|
|
// eviction decisions derived from it are deterministic.
|
|
func (s *FIFO) runningSet(excludeActive string) []string {
|
|
seen := make(map[string]struct{})
|
|
var out []string
|
|
add := func(id string) {
|
|
if _, dup := seen[id]; dup {
|
|
return
|
|
}
|
|
seen[id] = struct{}{}
|
|
out = append(out, id)
|
|
}
|
|
for id := range s.effects.RunningModels() {
|
|
add(id)
|
|
}
|
|
for _, id := range activeTargets(s.active, excludeActive) {
|
|
add(id)
|
|
}
|
|
sort.Strings(out)
|
|
return out
|
|
}
|
|
|
|
// activeTargets returns the IDs of every in-flight swap target except exclude.
|
|
// The planner uses this to account for models committed to but not yet reflected
|
|
// in process state.
|
|
func activeTargets(active map[string]*activeSwap, exclude string) []string {
|
|
if len(active) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]string, 0, len(active))
|
|
for id := range active {
|
|
if id == exclude {
|
|
continue
|
|
}
|
|
out = append(out, id)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// collidesWith reports whether a new swap with this target and evict set can
|
|
// safely run alongside the currently active swaps. Same-target callers should
|
|
// JOIN (handled before this) — they do not collide with themselves.
|
|
func collidesWith(target string, evict []string, active map[string]*activeSwap) bool {
|
|
for id, sw := range active {
|
|
if id == target {
|
|
continue
|
|
}
|
|
if containsString(evict, id) {
|
|
return true
|
|
}
|
|
if containsString(sw.evict, target) {
|
|
return true
|
|
}
|
|
if slicesOverlap(evict, sw.evict) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// slicesOverlap reports whether xs and ys share any common element.
|
|
func slicesOverlap(xs, ys []string) bool {
|
|
for _, x := range xs {
|
|
if containsString(ys, x) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// conflictsWithInFlight reports whether any model named in evict has a
// positive count in inFlight, i.e. is still handling requests.
//
// The scheduler uses this to hold back a swap rather than stop a process that
// is busy; the swap is retried once those requests have finished.
func conflictsWithInFlight(evict []string, inFlight map[string]int) bool {
	for i := 0; i < len(evict); i++ {
		if busy := inFlight[evict[i]]; busy > 0 {
			return true
		}
	}
	return false
}
|
|
|
|
// containsString reports whether s is an element of xs. It is false for a nil
// or empty slice.
func containsString(xs []string, s string) bool {
	for i := range xs {
		if xs[i] == s {
			return true
		}
	}
	return false
}
|
|
|
|
// broadcastQueuePositions sends each queued request its current 1-indexed
|
|
// position. Sends are non-blocking: if the channel is full, the old value is
|
|
// drained first so the consumer always sees the latest position.
|
|
func broadcastQueuePositions(queued []HandlerReq) {
|
|
for i, req := range queued {
|
|
pos := i + 1
|
|
select {
|
|
case req.PositionCh <- pos:
|
|
default:
|
|
select {
|
|
case <-req.PositionCh:
|
|
default:
|
|
}
|
|
select {
|
|
case req.PositionCh <- pos:
|
|
default:
|
|
}
|
|
}
|
|
}
|
|
}
|