package scheduler import ( "fmt" "sort" "time" "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" ) // activeSwap tracks one in-flight swap and the callers waiting on it. type activeSwap struct { modelID string evict []string waiters []HandlerReq } // FIFO is the default scheduler. Requests are handled in a first-in, first-out order. // To reduce swapping requests for a model that is already running will be handled // immediately by the running process. // // Requests into this schedule are handled like this: // // A B C A B C --> A A B B C C // // The strategy is simple and reduces the number of swaps required. type FIFO struct { name string logger *logmon.Monitor planner Swapper cfg config.FifoConfig effects Effects active map[string]*activeSwap inFlight map[string]int queued []HandlerReq } // NewFIFO builds a FIFO scheduler. It matches scheduler.Factory once a planner // is captured in a closure. func NewFIFO(name string, logger *logmon.Monitor, planner Swapper, cfg config.FifoConfig, eff Effects) *FIFO { return &FIFO{ name: name, logger: logger, planner: planner, cfg: cfg, effects: eff, active: make(map[string]*activeSwap), inFlight: make(map[string]int), } } // OnRequest decides what to do with one incoming ServeHTTP request. It never // blocks indefinitely: any work that has to wait (starting a process, stopping // siblings, waiting for ready) is deferred to a swap goroutine and reported back // via OnSwapDone. // // The decision tree, in order: // // 1. Unknown model — respond with ErrModelNotFound and move on. // 2. A swap to the same model is already in flight — attach this waiter so // one swap serves all callers that asked for the same model. // 3. Fast path — the target process is already ready, the planner sees // nothing to evict, and no in-flight swap is evicting it. Hand back its // ServeHTTP immediately. // 4. Would collide with an in-flight swap (we'd stop their target, or they're // stopping us) — park in the queue for OnSwapDone to drain. // 5. Would evict a process that is still handling requests — park in the // queue. OnServeDone will retry when the busy process drains. // 6. Otherwise — start a new swap. This may run in parallel with other active // swaps when their evict sets don't intersect. func (s *FIFO) OnRequest(req HandlerReq) { // (1) Unknown model. state, ok := s.effects.ModelState(req.Model) if !ok { s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model) s.effects.GrantError(req, ErrModelNotFound) return } // (2) Join an in-flight swap for the same model. if sw, ok := s.active[req.Model]; ok { s.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", s.name, req.Model, len(sw.waiters)+1) sw.waiters = append(sw.waiters, req) return } running := s.runningSet(req.Model) evict := s.planner.EvictionFor(req.Model, running) // (3) Fast path: ready, nothing to evict, and nobody is evicting us. if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) { s.logger.Debugf("%s: fast-path serving model %s (already ready)", s.name, req.Model) s.grantHandler(req, req.Model) return } // (4) Collision with an in-flight swap — queue. if collidesWith(req.Model, evict, s.active) { s.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", s.name, req.Model) s.enqueue(req) return } // (5) Would evict a busy process — queue until it drains. if conflictsWithInFlight(evict, s.inFlight) { s.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", s.name, req.Model) s.enqueue(req) return } // (6) Start a new (possibly parallel) swap. s.logger.Debugf("%s: starting swap for model %s, evicting %v", s.name, req.Model, evict) s.startSwap(req, evict, running) } // OnCancel removes a request whose client has disconnected from the queue and // from every in-flight swap's waiters. If the request was the sole waiter of an // active swap, the swap goroutine is left to complete on its own — OnSwapDone // will find no waiters and simply clean up. This prevents drainQueue from ever // starting a model load for a caller that is no longer there. func (s *FIFO) OnCancel(req HandlerReq) { removed := false // Prune from the queue. if len(s.queued) > 0 { kept := s.queued[:0] for _, q := range s.queued { if q.Respond == req.Respond { removed = true continue } kept = append(kept, q) } s.queued = kept } // Prune from any active swap's waiters. for _, sw := range s.active { filtered := sw.waiters[:0] for _, w := range sw.waiters { if w.Respond == req.Respond { removed = true continue } filtered = append(filtered, w) } sw.waiters = filtered } if removed { s.logger.Debugf("%s: cancelled request for model %s pruned from scheduler", s.name, req.Model) broadcastQueuePositions(s.queued) } } // OnSwapDone fans the result out to every waiter that joined this swap, removes // the swap from the active map, then walks the queue once, promoting any items // that no longer collide with the remaining active set. FIFO order is preserved: // items still blocked stay in place. func (s *FIFO) OnSwapDone(ev SwapDone) { sw, ok := s.active[ev.ModelID] if !ok { return } delete(s.active, ev.ModelID) for _, w := range sw.waiters { if ev.Err != nil { s.effects.GrantError(w, ev.Err) } else { s.grantHandler(w, ev.ModelID) } } s.drainQueue() } // OnServeDone decrements the per-model in-flight count and, when that drops to // zero, retries the queue: requests whose swap was deferred because they would // have evicted this (now-idle) process can now proceed. func (s *FIFO) OnServeDone(ev ServeDoneEvent) { s.inFlight[ev.ModelID]-- if s.inFlight[ev.ModelID] <= 0 { delete(s.inFlight, ev.ModelID) s.drainQueue() } } // OnUnload reconciles router-owned state with the impending Stop, performs the // Stop (synchronously, via Effects) so callers of Unload remain blocked until // each targeted process has exited, then drains the queue. func (s *FIFO) OnUnload(targets []string, timeout time.Duration) { unloadErr := fmt.Errorf("%s: model unloaded", s.name) targetSet := make(map[string]bool, len(targets)) for _, id := range targets { targetSet[id] = true } // Release waiters of any in-flight swap whose target is being unloaded. // The swap goroutine itself is left to finish on its own; when its // SwapDone arrives, OnSwapDone will find no entry in active and drop it. for id := range targetSet { sw, ok := s.active[id] if !ok { continue } for _, w := range sw.waiters { s.effects.GrantError(w, unloadErr) } delete(s.active, id) } // Drop queued requests addressed to unloaded models. Requests for other // models stay queued and may benefit from drainQueue at the end. if len(s.queued) > 0 { kept := s.queued[:0] for _, w := range s.queued { if targetSet[w.Model] { s.effects.GrantError(w, unloadErr) continue } kept = append(kept, w) } s.queued = kept } // Stop the targeted processes. Done synchronously so Unload's caller can // rely on "after Unload returns, the process is stopped". inFlight is // intentionally NOT cleared here: each dying handler will fire its tracked // serve and reach OnServeDone in the normal way. s.effects.StopProcesses(timeout, targets) // Removing entries from active above may have unblocked queued requests // that previously collided with the now-cancelled swaps. s.drainQueue() } // OnShutdown grants err to every waiter still held by the scheduler. func (s *FIFO) OnShutdown(err error) { for _, sw := range s.active { for _, w := range sw.waiters { s.effects.GrantError(w, err) } } for _, w := range s.queued { s.effects.GrantError(w, err) } } // grantHandler hands the caller a tracked handler for modelID and, only if the // caller was still there to receive it, bumps the in-flight count. Incrementing // when the grant failed would strand the counter and block future evictions. func (s *FIFO) grantHandler(req HandlerReq, modelID string) { if s.effects.GrantServe(req, modelID) { s.inFlight[modelID]++ } } // startSwap records the swap as active and launches it via Effects. running is // the set EvictionFor saw, forwarded to OnSwapStart so the planner logs against // the same picture it decided on. func (s *FIFO) startSwap(initial HandlerReq, evict, running []string) { s.active[initial.Model] = &activeSwap{ modelID: initial.Model, evict: evict, waiters: []HandlerReq{initial}, } s.planner.OnSwapStart(initial.Model, running) s.effects.StartSwap(initial.Model, evict) } // enqueue inserts req into the queue in priority order: it goes just before the // first queued item whose priority is strictly lower, so higher-priority models // are serviced first while equal-priority requests keep their arrival (FIFO) // order. Priorities come from the FifoConfig; unlisted models default to 0. func (s *FIFO) enqueue(req HandlerReq) { p := s.cfg.Priority[req.Model] i := len(s.queued) for j, q := range s.queued { if s.cfg.Priority[q.Model] < p { i = j break } } s.queued = append(s.queued, HandlerReq{}) copy(s.queued[i+1:], s.queued[i:]) s.queued[i] = req broadcastQueuePositions(s.queued) } // drainQueue walks the queued requests in order, re-running the OnRequest // decision tree against the (now smaller) active set. Items that can now start // or join become satisfied; items still blocked remain queued in original order // so they get another chance on the next swap completion. func (s *FIFO) drainQueue() { if len(s.queued) == 0 { return } pending := s.queued var remaining []HandlerReq for _, req := range pending { state, ok := s.effects.ModelState(req.Model) if !ok { s.effects.GrantError(req, ErrModelNotFound) continue } if sw, ok := s.active[req.Model]; ok { s.logger.Debugf("%s: queued request for model %s now joining in-flight swap", s.name, req.Model) sw.waiters = append(sw.waiters, req) continue } running := s.runningSet(req.Model) evict := s.planner.EvictionFor(req.Model, running) if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) { s.logger.Debugf("%s: queued request for model %s now served fast-path", s.name, req.Model) s.grantHandler(req, req.Model) continue } if collidesWith(req.Model, evict, s.active) { remaining = append(remaining, req) continue } if conflictsWithInFlight(evict, s.inFlight) { remaining = append(remaining, req) continue } s.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", s.name, req.Model, evict) s.startSwap(req, evict, running) } s.queued = remaining broadcastQueuePositions(s.queued) } // runningSet is the live model set handed to the Swapper: every process the // baseRouter reports as running, unioned with the targets of in-flight swaps // (excluding excludeActive, the model whose own swap is being decided — its // in-flight entry must not count as "already running"). The result is sorted so // eviction decisions derived from it are deterministic. func (s *FIFO) runningSet(excludeActive string) []string { seen := make(map[string]struct{}) var out []string add := func(id string) { if _, dup := seen[id]; dup { return } seen[id] = struct{}{} out = append(out, id) } for id := range s.effects.RunningModels() { add(id) } for _, id := range activeTargets(s.active, excludeActive) { add(id) } sort.Strings(out) return out } // activeTargets returns the IDs of every in-flight swap target except exclude. // The planner uses this to account for models committed to but not yet reflected // in process state. func activeTargets(active map[string]*activeSwap, exclude string) []string { if len(active) == 0 { return nil } out := make([]string, 0, len(active)) for id := range active { if id == exclude { continue } out = append(out, id) } return out } // collidesWith reports whether a new swap with this target and evict set can // safely run alongside the currently active swaps. Same-target callers should // JOIN (handled before this) — they do not collide with themselves. func collidesWith(target string, evict []string, active map[string]*activeSwap) bool { for id, sw := range active { if id == target { continue } if containsString(evict, id) { return true } if containsString(sw.evict, target) { return true } if slicesOverlap(evict, sw.evict) { return true } } return false } // slicesOverlap reports whether xs and ys share any common element. func slicesOverlap(xs, ys []string) bool { for _, x := range xs { if containsString(ys, x) { return true } } return false } // conflictsWithInFlight reports whether any model in evict is still handling // requests. Stopping a busy process would cancel its callers' connections, so // the scheduler defers the swap until those callers finish. func conflictsWithInFlight(evict []string, inFlight map[string]int) bool { for _, m := range evict { if inFlight[m] > 0 { return true } } return false } func containsString(xs []string, s string) bool { for _, x := range xs { if x == s { return true } } return false } // broadcastQueuePositions sends each queued request its current 1-indexed // position. Sends are non-blocking: if the channel is full, the old value is // drained first so the consumer always sees the latest position. func broadcastQueuePositions(queued []HandlerReq) { for i, req := range queued { pos := i + 1 select { case req.PositionCh <- pos: default: select { case <-req.PositionCh: default: } select { case req.PositionCh <- pos: default: } } } }