// Package scheduler contains the request-scheduling strategies used by the // router's baseRouter. A Scheduler owns the queue, in-flight tracking, and the // decision tree for when to start a swap versus queue a request. The baseRouter // owns the channels, run loop, and process machinery, and exposes the // side-effects a scheduler needs through the Effects interface. // // Splitting these apart lets the scheduling strategy be swapped out // independently of both the process machinery (baseRouter) and the eviction // policy (Swapper). FIFO is the first and currently only implementation. package scheduler import ( "context" "fmt" "net/http" "time" "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" "github.com/mostlygeek/llama-swap/internal/shared" ) // ErrModelNotFound is granted to callers whose model is not handled by this // router. It is an alias for shared.ErrNoLocalModelFound. var ErrModelNotFound = shared.ErrNoLocalModelFound // Swapper is the eviction policy: it decides which running models must be // stopped before a target can serve. It is orthogonal to the scheduling // strategy — any Scheduler works with any Swapper. type Swapper interface { // EvictionFor returns running model IDs that must be stopped before // target can serve. running is the complete set the scheduler considers // live: every process that is not stopped, unioned with the targets of // in-flight swaps the scheduler has already committed to (which are not yet // visible in process state). The planner does not inspect process state // itself. Pure decision; must not log. EvictionFor(target string, running []string) []string // OnSwapStart runs once at the start of every swap, with the same running // set EvictionFor was given for this decision. Planners may log their // decision here at whatever verbosity they choose. 
OnSwapStart(target string, running []string) } // Scheduler decides what happens to each event the router's run loop receives. // All methods run on that single run-loop goroutine, so implementations need no // internal locking for their own state. type Scheduler interface { // OnRequest handles one incoming ServeHTTP request. OnRequest(req HandlerReq) // OnCancel handles a request whose client has disconnected before it was // granted. The scheduler must remove the request from its queue and from // any in-flight swap's waiters so it never triggers a model load or grant // for a caller that is no longer there. OnCancel(req HandlerReq) // OnSwapDone handles a swap goroutine reporting completion. OnSwapDone(ev SwapDone) // OnServeDone handles a tracked ServeHTTP finishing (in-flight decrement). OnServeDone(ev ServeDoneEvent) // OnUnload reconciles scheduler state for an unload, stops the targeted // processes via Effects, and drains the queue. It must block until the // targeted processes have stopped. OnUnload(targets []string, timeout time.Duration) // OnShutdown grants err to every waiter the scheduler still holds (active // swap waiters and queued requests). Process teardown is the baseRouter's // responsibility. OnShutdown(err error) } // Effects is implemented by the baseRouter. The scheduler calls back through it // for every side-effect: inspecting process state, launching swaps, responding // to callers, and stopping processes. type Effects interface { // ModelState returns the current state of a model's process. ok is false // when the model is not handled by this router. ModelState(modelID string) (process.ProcessState, bool) // RunningModels returns the state of every process that is not stopped or // shut down, keyed by model ID. The scheduler uses it to build the running // set it hands the Swapper. RunningModels() map[string]process.ProcessState // StartSwap launches the swap goroutine for modelID, stopping evict first. 
StartSwap(modelID string, evict []string) // GrantError responds to a caller with an error. GrantError(req HandlerReq, err error) // GrantServe hands a caller the wrapped handler for modelID and reports // whether the caller was still there to receive it. The scheduler bumps // its in-flight count only when this returns true. GrantServe(req HandlerReq, modelID string) bool // StopProcesses stops the named processes in parallel and blocks until all // have stopped. Unknown IDs are skipped. StopProcesses(timeout time.Duration, ids []string) } // New returns a Scheduler selected by conf.Routing.Scheduler.Use, configured from // conf and bound to the given planner and effects. Supported values are "fifo" // (throughput-oriented, batches same-model requests) and "serial" (strict // one-model-at-a-time, exact arrival order). // // The deployment default is applied by config loading (LoadConfig sets Use to // "serial" when unset). The "" fallback here is the library default and remains // "fifo" so callers that build a Config directly keep the original behavior. func New(conf config.Config, name string, logger *logmon.Monitor, planner Swapper, eff Effects) (Scheduler, error) { use := conf.Routing.Scheduler.Use if use == "" { use = "fifo" } switch use { case "fifo": return NewFIFO(name, logger, planner, conf.Routing.Scheduler.Settings.Fifo, conf.Models, eff), nil case "serial": // Serial ignores the group planner: it always evicts every other model. return NewSerial(name, logger, eff), nil default: return nil, fmt.Errorf("unsupported scheduler type: %q", use) } } // HandlerReq is one in-flight ServeHTTP request waiting for a routing decision. type HandlerReq struct { Model string Ctx context.Context Respond chan HandlerResp PositionCh chan int } // HandlerResp is the routing decision returned to a HandlerReq's caller: either // a handler to serve with, or an error. 
type HandlerResp struct {
	// HandleFunc is the handler the caller should serve the request with.
	HandleFunc http.HandlerFunc
	// Err is the error handed to the caller in place of a handler.
	Err error
}

// SwapDone is reported by a swap goroutine when its target is ready (or failed).
type SwapDone struct {
	// ModelID identifies the model the swap was targeting.
	ModelID string
	// Err describes the failure; presumably nil when the target became ready —
	// confirm against the swap goroutine.
	Err error
}

// ServeDoneEvent is reported when a tracked ServeHTTP handler returns.
type ServeDoneEvent struct {
	// ModelID presumably identifies the model whose handler finished — confirm
	// against the code that emits the event.
	ModelID string
}