From 9b3a33d7b9c3bd742d7c3dd4755e84f637ffdc77 Mon Sep 17 00:00:00 2001 From: Benson Wong <83972+mostlygeek@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:34:25 -0700 Subject: [PATCH] Implement new scheduler (#823) - introduce internal/router/scheduler to decouple routing, swapping and queuing into interface contracts. - introduce a new `routing` configuration section that supersedes `matrix` and `group` while maintaining backwards compatibility - add FIFO scheduler with prioritized queuing - add internal/router/design.md as developer documentation on implementing new schedulers and routers Fixes #797 --- .github/workflows/config-schema.yml | 11 +- config-schema.json | 227 ++++++--- config.example.yaml | 257 ++++++---- go.mod | 1 + go.sum | 2 + internal/config/config.go | 95 +++- internal/config/config_posix_test.go | 43 +- internal/config/config_schema_test.go | 60 +++ internal/config/config_test.go | 171 +++++++ internal/config/config_windows_test.go | 43 +- internal/config/matrix.go | 3 + internal/config/matrix_test.go | 4 +- internal/router/base.go | 511 ++++---------------- internal/router/base_test.go | 629 +------------------------ internal/router/design.md | 404 ++++++++++++++++ internal/router/group.go | 33 +- internal/router/group_test.go | 51 +- internal/router/helpers_test.go | 13 + internal/router/matrix.go | 62 +-- internal/router/matrix_test.go | 22 +- internal/router/router.go | 3 +- internal/router/scheduler/fifo.go | 411 ++++++++++++++++ internal/router/scheduler/fifo_test.go | 537 +++++++++++++++++++++ internal/router/scheduler/scheduler.go | 116 +++++ internal/server/server.go | 5 +- internal/server/server_test.go | 14 +- 26 files changed, 2398 insertions(+), 1330 deletions(-) create mode 100644 internal/config/config_schema_test.go create mode 100644 internal/router/design.md create mode 100644 internal/router/scheduler/fifo.go create mode 100644 internal/router/scheduler/fifo_test.go create mode 100644 internal/router/scheduler/scheduler.go diff --git a/.github/workflows/config-schema.yml b/.github/workflows/config-schema.yml index 342ce182..bfae5de2 100644 --- a/.github/workflows/config-schema.yml +++ b/.github/workflows/config-schema.yml @@ -44,13 +44,10 @@ jobs: echo "✓ config-schema.json is valid" - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 #v6.2.0 + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0 with: - python-version: "3.x" - - - name: Install check-jsonschema - run: pip install check-jsonschema + go-version-file: go.mod - name: Validate config.example.yaml against schema - run: check-jsonschema --schemafile config-schema.json config.example.yaml + run: go test ./internal/config/ -run TestConfig_ExampleMatchesSchema -v diff --git a/config-schema.json b/config-schema.json index 8fa0d04c..52a7229c 100644 --- a/config-schema.json +++ b/config-schema.json @@ -82,6 +82,78 @@ }, "additionalProperties": false, "description": "Timeout settings for proxy connections." + }, + "groupsConfig": { + "type": "object", + "additionalProperties": { + "type": "object", + "required": [ + "members" + ], + "properties": { + "swap": { + "type": "boolean", + "default": true, + "description": "Controls model swapping behaviour within the group. True: only one model runs at a time. False: all models can run together." + }, + "exclusive": { + "type": "boolean", + "default": true, + "description": "Controls how the group affects other groups. True: causes all other groups to unload when this group runs a model. False: does not affect other groups." + }, + "persistent": { + "type": "boolean", + "default": false, + "description": "Prevents other groups from unloading the models in this group. Does not affect individual model behaviour." + }, + "members": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of model IDs that are members of this group. Model IDs must be defined in models." + } + } + }, + "description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent." + }, + "matrixConfig": { + "type": "object", + "description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.", + "required": [ + "vars", + "sets" + ], + "properties": { + "vars": { + "type": "object", + "description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.", + "minProperties": 1, + "additionalProperties": { + "type": "string" + }, + "propertyNames": { + "pattern": "^[a-zA-Z0-9]{1,8}$" + } + }, + "evict_costs": { + "type": "object", + "description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.", + "additionalProperties": { + "type": "integer", + "minimum": 1 + } + }, + "sets": { + "type": "object", + "description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.", + "minProperties": 1, + "additionalProperties": { + "type": "string" + } + } + }, + "additionalProperties": false } }, "properties": { @@ -311,76 +383,10 @@ } }, "groups": { - "type": "object", - "additionalProperties": { - "type": "object", - "required": [ - "members" - ], - "properties": { - "swap": { - "type": "boolean", - "default": true, - "description": "Controls model swapping behaviour within the group. True: only one model runs at a time. False: all models can run together." - }, - "exclusive": { - "type": "boolean", - "default": true, - "description": "Controls how the group affects other groups. True: causes all other groups to unload when this group runs a model. False: does not affect other groups." - }, - "persistent": { - "type": "boolean", - "default": false, - "description": "Prevents other groups from unloading the models in this group. Does not affect individual model behaviour." - }, - "members": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of model IDs that are members of this group. Model IDs must be defined in models." - } - } - }, - "description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent." + "$ref": "#/definitions/groupsConfig" }, "matrix": { - "type": "object", - "description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.", - "required": [ - "vars", - "sets" - ], - "properties": { - "vars": { - "type": "object", - "description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.", - "minProperties": 1, - "additionalProperties": { - "type": "string" - }, - "propertyNames": { - "pattern": "^[a-zA-Z0-9]{1,8}$" - } - }, - "evict_costs": { - "type": "object", - "description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.", - "additionalProperties": { - "type": "integer", - "minimum": 1 - } - }, - "sets": { - "type": "object", - "description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.", - "minProperties": 1, - "additionalProperties": { - "type": "string" - } - } - }, - "additionalProperties": false + "$ref": "#/definitions/matrixConfig" }, "hooks": { "type": "object", @@ -512,26 +518,103 @@ }, "default": {}, "description": "A dictionary of remote peers and models they provide. Peers can be another llama-swap or any server that provides the /v1/ generative API endpoints supported by llama-swap." + }, + "routing": { + "type": "object", + "description": "Canonical routing/scheduling configuration. Alternative to the legacy top-level 'groups'/'matrix' keys; a config must not use both styles.", + "properties": { + "scheduler": { + "type": "object", + "description": "Scheduler configuration. Decides the order in which queued requests are serviced.", + "properties": { + "use": { + "type": "string", + "enum": [ + "fifo" + ], + "default": "fifo", + "description": "Scheduler to use. Only 'fifo' is currently supported." + }, + "settings": { + "type": "object", + "properties": { + "fifo": { + "type": "object", + "properties": { + "priority": { + "type": "object", + "description": "Per-model priority. Keys are model IDs, values are integers (default 0). Higher values are serviced first.", + "additionalProperties": { + "type": "integer" + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "router": { + "type": "object", + "description": "Router configuration. Selects between the group and matrix swapping strategies.", + "properties": { + "use": { + "type": "string", + "enum": [ + "group", + "matrix" + ], + "default": "group", + "description": "Router to use. 'group' uses static groups, 'matrix' uses the solver-based swap matrix." + }, + "settings": { + "type": "object", + "properties": { + "groups": { + "$ref": "#/definitions/groupsConfig" + }, + "matrix": { + "$ref": "#/definitions/matrixConfig" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false } }, "allOf": [ { "if": { - "required": ["groups"] + "required": [ + "groups" + ] }, "then": { "not": { - "required": ["matrix"] + "required": [ + "matrix" + ] } } }, { "if": { - "required": ["matrix"] + "required": [ + "matrix" + ] }, "then": { "not": { - "required": ["groups"] + "required": [ + "groups" + ] } } } diff --git a/config.example.yaml b/config.example.yaml index f33f5478..91c3581e 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -343,93 +343,6 @@ models: # - processes have 5 seconds to shutdown until forceful termination is attempted cmdStop: docker stop ${MODEL_ID} -# ============================================================================= -# matrix: run concurrent models with a solver-based swap DSL -# ============================================================================= -# -# Matrix or Groups? -# -# Groups are available and fully supported. The syntax may be easier to use -# for simple use cases. -# -# Documentation can be found here: -# https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396 -# -# A config can only use a matrix (recommended) or groups. A configuration error -# will occur if both are defined. Groups is legacy but is fully supported with -# no plans to deprecate it. -# -# ~~~~~ -# -# The matrix declares valid combinations of models that can run concurrently. -# When a model is requested, the solver finds the cheapest way to make it -# available by evicting as few (and least costly) running models as possible. -# -# Solver behavior: -# 1. Request arrives for model X -# 2. If X is already running, forward immediately. Done. -# 3. Find all sets containing X -# 4. For each candidate set, compute cost: sum of evict_costs for -# every running model NOT in that set -# 5. Pick lowest cost candidate. Ties broken by definition order. -# 6. Evict what needs to stop. Start X. Forward request. -# -# Subset semantics: a set [a, b, c] means any subset is valid. -# Only the requested model is started — others are not preloaded. -# -# A model not appearing in any set can only run alone. -# -matrix: - # vars: short names for models (alphanumeric, 1-8 chars) - # - required for sets and evict_costs settings - # - each entry is a short name to a real model ID. Do not use an alias - # - used to keep set DSL logic short and easier to read - # - sets and evict_costs only use identifiers defined in vars - vars: - g: gemma-model - q: qwen-model - m: mistral-model - v: voxtral-model - e: reranker-model - L: llama-70B - sd: stable-diffusion - - # evict_costs: relative cost of losing a running model (default: 1) - evict_costs: - v: 50 # vllm backend, slow cold start - L: 30 # 70B weights, slow to load - - # sets: named sets of concurrent model combinations - # Values are DSL strings with operators: - # & AND (models run together) - # | OR (alternatives) - # () grouping - # +ref inline another set's expression - # - # Expansion examples: - # "L" → [L] - # "a & b" → [a, b] - # "a | b" → [a], [b] - # "(a | b) & c" → [a, c], [b, c] - # "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d] - # "+llms & v" → expands llms inline, then applies & v - sets: - # LLM + TTS: switching between g/q/m won't evict v - # expands to: [g,v], [q,v], [m,v] - standard: "(g | q | m) & v" - - # LLM + TTS + reranker - # expands to: [g,v,e], [q,v,e] - with_rerank: "(g | q) & v & e" - - # LLM + image generation, no TTS - # expands to: [g,sd], [q,sd] - creative: "(g | q) & sd" - - # 70B model uses all GPUs, can only run alone - # expands to: [L] - full: "L" - # hooks: a dictionary of event triggers and actions # - optional, default: empty dictionary # - the only supported hook is on_startup @@ -446,6 +359,176 @@ hooks: preload: - "llama" +# routing: +# Controls how llama-swap decides which models can run at the same time and +# which get swapped out. Choose one of two swap engines: +# +# - group: the default engine. Simpler to configure. You define groups of +# models that run together, and loading one group typically unloads +# the others. +# +# - matrix: the newer engine. More involved to configure, but far more +# flexible. It uses a small expression language to describe which +# model combinations are allowed to run concurrently, enabling +# setups that groups cannot express. +# +# The routing section is optional. +routing: + router: + # use: a string defining which engine to use + # - optional, default: "group" + # - valid values: group, matrix + use: group + + # settings: a dictionary of settings for the specific engines + settings: + # groups: a dictionary of named groups + # - optional, default: empty dictionary + # - lets you keep some models loaded while others swap out + # - every member must be a model ID defined in the models section + # - a model can belong to only one group + # - behaviour is set per group with the `swap`, `exclusive` and + # `persistent` fields + # - see issue #109 for details + # + # NOTE: the model names below are illustrative and are not defined above. + groups: + # group1 reproduces llama-swap's default behaviour: only one model + # runs at a time across the entire instance. + "group1": + # swap: how members of this group swap among themselves + # - optional, default: true + # - true: only one member runs at a time + # - false: all members can run together, no swapping + swap: true + + # exclusive: how this group affects other groups + # - optional, default: true + # - true: running a member unloads every other group + # - false: running a member leaves other groups untouched + exclusive: true + + # members: the model IDs in this group + # required + members: + - "llama" + - "qwen-unlisted" + + # group2: members all run together, but loading any other group + # unloads them. + "group2": + # swap: false lets all members stay loaded at once + swap: false + + # exclusive: false means requesting a member loads it without + # unloading any other group + exclusive: false + members: + - "docker-llama" + - "modelA" + - "modelB" + + # forever: a persistent group that other groups can never unload. + "forever": + # persistent: other groups cannot unload this group's members + # - optional, default: false + # - has no effect on swapping within the group + persistent: true + + # swap/exclusive: false keeps all members loaded and avoids + # unloading other groups + swap: false + exclusive: false + members: + - "forever-modelA" + - "forever-modelB" + - "forever-modelc" + + # The matrix lists the model combinations that are allowed to run + # concurrently. When a model is requested, the solver makes room for it + # by evicting as few running models as possible, preferring to keep the + # costliest ones loaded. + # + # Solver behaviour: + # 1. A request arrives for model X. + # 2. If X is already running, forward the request. Done. + # 3. Collect every set that contains X. + # 4. For each set, add up the evict_costs of the running models that + # are NOT in that set — that is the set's cost. + # 5. Choose the lowest-cost set. Break ties by definition order. + # 6. Evict the models outside that set, start X, forward the request. + # + # Subset semantics: a set [a, b, c] also permits any subset of itself. + # Only the requested model is started; the others are not preloaded. + # + # A model that appears in no set can only run on its own. + # + matrix: + # vars: short aliases for model IDs (alphanumeric, 1-8 chars) + # - required: sets and evict_costs reference these names, not model IDs + # - map each short name to a real model ID (not a model alias) + # - keeps the set expressions short and readable + vars: + g: gemma-model + q: qwen-model + m: mistral-model + v: voxtral-model + e: reranker-model + L: llama-70B + sd: stable-diffusion + + # evict_costs: relative cost of losing a running model (default: 1) + evict_costs: + v: 50 # vllm backend, slow cold start + L: 30 # 70B weights, slow to load + + # sets: named combinations of models that may run together. + # Each value is an expression built from these operators: + # & AND (models run together) + # | OR (alternatives) + # () grouping + # +ref inline the expression of another set + # + # Each expression expands into one or more concrete sets: + # "L" → [L] + # "a & b" → [a, b] + # "a | b" → [a], [b] + # "(a | b) & c" → [a, c], [b, c] + # "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d] + # "+llms & v" → inline the llms set, then AND with v + sets: + # An LLM plus TTS. Switching between g/q/m keeps v loaded. + # expands to: [g,v], [q,v], [m,v] + standard: "(g | q | m) & v" + + # An LLM plus TTS plus reranker. + # expands to: [g,v,e], [q,v,e] + with_rerank: "(g | q) & v & e" + + # An LLM plus image generation, no TTS. + # expands to: [g,sd], [q,sd] + creative: "(g | q) & sd" + + # The 70B model uses every GPU, so it can only run alone. + # expands to: [L] + full: "L" + + # scheduler: how queued requests are ordered. + # The default and only valid scheduler is "fifo" + scheduler: + use: fifo + settings: + fifo: + # priority: a dictionary of model ID -> priority + # - optional, default: empty dictionary + # - models default to priority 0 + # - higher priority requests are serviced first in the queue + priority: + A: 10 + B: 5 + C: 5 + D: 1 + # peers: a dictionary of remote peers and models they provide # - optional, default empty dictionary # - peers can be another llama-swap diff --git a/go.mod b/go.mod index 08232ead..ee1ce145 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/charmbracelet/lipgloss v1.1.0 github.com/fxamacker/cbor/v2 v2.9.1 github.com/gin-gonic/gin v1.10.0 + github.com/google/jsonschema-go v0.4.3 github.com/klauspost/compress v1.18.5 github.com/shirou/gopsutil/v4 v4.26.4 github.com/stretchr/testify v1.11.1 diff --git a/go.sum b/go.sum index 843b8bab..4e98ce37 100644 --- a/go.sum +++ b/go.sum @@ -61,6 +61,8 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/jsonschema-go v0.4.3 h1:/DBOLZTfDow7pe2GmaJNhltueGTtDKICi8V8p+DQPd0= +github.com/google/jsonschema-go v0.4.3/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= diff --git a/internal/config/config.go b/internal/config/config.go index 103daffe..29fc1535 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -129,13 +129,16 @@ type Config struct { GlobalTTL int `yaml:"globalTTL"` Models map[string]ModelConfig `yaml:"models"` /* key is model ID */ Profiles map[string][]string `yaml:"profiles"` - Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */ - // swap matrix: solver-based alternative to groups - Matrix *MatrixConfig `yaml:"matrix"` + // routing is the canonical source for swap/scheduling configuration. + // New code must read Routing, never the backwards-compat fields below. + Routing RoutingConfig `yaml:"routing"` - // populated during validation when matrix is configured - ExpandedSets []ExpandedSet `yaml:"-"` + // Groups and Matrix are permanent backwards-compat input fields for the + // legacy top-level `groups:`/`matrix:` keys. They are normalized into + // Routing by LoadConfigFromReader. New code must not read them directly. + Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */ + Matrix *MatrixConfig `yaml:"matrix"` // for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint Macros MacroList `yaml:"macros"` @@ -162,6 +165,35 @@ type Config struct { Peers PeerDictionaryConfig `yaml:"peers"` } +// RoutingConfig is the canonical, normalized routing/scheduling configuration. +type RoutingConfig struct { + Scheduler SchedulerConfig `yaml:"scheduler"` + Router RouterConfig `yaml:"router"` +} + +type SchedulerConfig struct { + Use string `yaml:"use"` // default "fifo" + Settings SchedulerSettings `yaml:"settings"` +} + +type SchedulerSettings struct { + Fifo FifoConfig `yaml:"fifo"` +} + +type FifoConfig struct { + Priority map[string]int `yaml:"priority"` // model ID -> priority, default 0 +} + +type RouterConfig struct { + Use string `yaml:"use"` // "group" (default) | "matrix" + Settings RouterSettings `yaml:"settings"` +} + +type RouterSettings struct { + Groups map[string]GroupConfig `yaml:"groups"` + Matrix *MatrixConfig `yaml:"matrix"` +} + func (c *Config) RealModelName(search string) (string, bool) { if _, found := c.Models[search]; found { return search, true @@ -455,6 +487,34 @@ func LoadConfigFromReader(r io.Reader) (Config, error) { config.Models[modelId] = modelConfig } + // Normalize routing config. The legacy top-level `matrix`/`groups` keys and + // the new `routing.router` block are mutually exclusive: a config may use + // either style, never both. + hasTopLevel := config.Matrix != nil || len(config.Groups) > 0 + rtr := config.Routing.Router + hasRouting := rtr.Use != "" || rtr.Settings.Matrix != nil || len(rtr.Settings.Groups) > 0 + + if hasTopLevel && hasRouting { + return Config{}, fmt.Errorf("config uses both the legacy top-level 'matrix'/'groups' keys and the new 'routing.router' block; please migrate the top-level keys into 'routing.router' and remove them") + } + + if !hasTopLevel { + // Both groups and matrix may be defined under routing.router.settings; + // routing.router.use selects which one is active, so there is no conflict. + rs := config.Routing.Router.Settings + switch config.Routing.Router.Use { + case "matrix": + if rs.Matrix == nil { + return Config{}, fmt.Errorf("routing.router.use is 'matrix' but routing.router.settings.matrix is not set") + } + config.Matrix = rs.Matrix + case "group", "": + config.Groups = rs.Groups + default: + return Config{}, fmt.Errorf("routing.router.use: unknown router %q (valid: group, matrix)", config.Routing.Router.Use) + } + } + // groups XOR matrix if config.Matrix != nil && len(config.Groups) > 0 { return Config{}, fmt.Errorf("config cannot use both 'groups' and 'matrix'") @@ -465,7 +525,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) { if err != nil { return Config{}, fmt.Errorf("matrix: %w", err) } - config.ExpandedSets = expandedSets + config.Matrix.ExpandedSets = expandedSets } else { config = AddDefaultGroupToConfig(config) @@ -487,6 +547,29 @@ func LoadConfigFromReader(r io.Reader) (Config, error) { } } + // Build the canonical Config.Routing from the effective result. Both legacy + // and new-style configs converge here. The Matrix pointer is shared so + // ExpandedSets stays in one place. + if config.Matrix != nil { + config.Routing.Router.Use = "matrix" + } else { + config.Routing.Router.Use = "group" + } + config.Routing.Router.Settings.Matrix = config.Matrix + config.Routing.Router.Settings.Groups = config.Groups + + if config.Routing.Scheduler.Use == "" { + config.Routing.Scheduler.Use = "fifo" + } + if config.Routing.Scheduler.Use != "fifo" { + return Config{}, fmt.Errorf("routing.scheduler.use: unknown scheduler %q (valid: fifo)", config.Routing.Scheduler.Use) + } + for modelID := range config.Routing.Scheduler.Settings.Fifo.Priority { + if _, found := config.RealModelName(modelID); !found { + return Config{}, fmt.Errorf("routing.scheduler.settings.fifo.priority references unknown model %q", modelID) + } + } + // Clean up hooks preload if len(config.Hooks.OnStartup.Preload) > 0 { var toPreload []string diff --git a/internal/config/config_posix_test.go b/internal/config/config_posix_test.go index 5ac9b0a9..64124b5c 100644 --- a/internal/config/config_posix_test.go +++ b/internal/config/config_posix_test.go @@ -173,6 +173,25 @@ groups: IdleConn: 90, } + expectedGroups := map[string]GroupConfig{ + DEFAULT_GROUP_ID: { + Swap: true, + Exclusive: true, + Members: []string{"model1", "model3"}, + }, + "group1": { + Swap: true, + Exclusive: false, + Members: []string{"model2"}, + }, + "forever": { + Swap: true, + Exclusive: false, + Persistent: true, + Members: []string{"model4"}, + }, + } + expected := Config{ LogLevel: "info", LogTimeFormat: "", @@ -246,22 +265,16 @@ groups: "m2": "model2", "mthree": "model3", }, - Groups: map[string]GroupConfig{ - DEFAULT_GROUP_ID: { - Swap: true, - Exclusive: true, - Members: []string{"model1", "model3"}, + Groups: expectedGroups, + Routing: RoutingConfig{ + Router: RouterConfig{ + Use: "group", + Settings: RouterSettings{ + Groups: expectedGroups, + }, }, - "group1": { - Swap: true, - Exclusive: false, - Members: []string{"model2"}, - }, - "forever": { - Swap: true, - Exclusive: false, - Persistent: true, - Members: []string{"model4"}, + Scheduler: SchedulerConfig{ + Use: "fifo", }, }, } diff --git a/internal/config/config_schema_test.go b/internal/config/config_schema_test.go new file mode 100644 index 00000000..a47ee898 --- /dev/null +++ b/internal/config/config_schema_test.go @@ -0,0 +1,60 @@ +package config + +import ( + "encoding/json" + "os" + "testing" + + "github.com/google/jsonschema-go/jsonschema" + "gopkg.in/yaml.v3" +) + +// TestConfig_ExampleMatchesSchema validates that config.example.yaml conforms to +// config-schema.json. Both files live at the repository root. +func TestConfig_ExampleMatchesSchema(t *testing.T) { + const ( + schemaPath = "../../config-schema.json" + examplePath = "../../config.example.yaml" + ) + + schemaBytes, err := os.ReadFile(schemaPath) + if err != nil { + t.Fatalf("reading %s: %v", schemaPath, err) + } + + var schema jsonschema.Schema + if err := json.Unmarshal(schemaBytes, &schema); err != nil { + t.Fatalf("unmarshalling schema: %v", err) + } + + resolved, err := schema.Resolve(&jsonschema.ResolveOptions{ + BaseURI: "https://github.com/mostlygeek/llama-swap/", + }) + if err != nil { + t.Fatalf("resolving schema: %v", err) + } + + exampleBytes, err := os.ReadFile(examplePath) + if err != nil { + t.Fatalf("reading %s: %v", examplePath, err) + } + + // Convert YAML to a JSON-like value so numbers and keys match what the + // validator expects. + var yamlValue any + if err := yaml.Unmarshal(exampleBytes, &yamlValue); err != nil { + t.Fatalf("unmarshalling example yaml: %v", err) + } + jsonBytes, err := json.Marshal(yamlValue) + if err != nil { + t.Fatalf("converting example to json: %v", err) + } + var instance any + if err := json.Unmarshal(jsonBytes, &instance); err != nil { + t.Fatalf("unmarshalling example json: %v", err) + } + + if err := resolved.Validate(instance); err != nil { + t.Fatalf("config.example.yaml does not match config-schema.json:\n%v", err) + } +} diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 1382fe3b..da63db73 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1544,3 +1544,174 @@ peers: assert.Equal(t, 1, peerConfig.Timeouts.ExpectContinue) assert.Equal(t, 90, peerConfig.Timeouts.IdleConn) } + +// twoModels is a minimal models block reused by the routing tests below. +const twoModels = ` +models: + gemma: + cmd: echo gemma + proxy: http://localhost:8080 + qwen: + cmd: echo qwen + proxy: http://localhost:8081 +` + +func TestConfig_Routing_LegacyTopLevelGroups(t *testing.T) { + yaml := twoModels + ` +groups: + g1: + members: [gemma, qwen] +` + cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + assert.Equal(t, "group", cfg.Routing.Router.Use) + // default group injected for orphaned models (none here) still leaves g1 + assert.Contains(t, cfg.Routing.Router.Settings.Groups, "g1") + assert.Equal(t, "fifo", cfg.Routing.Scheduler.Use) +} + +func TestConfig_Routing_LegacyTopLevelMatrix(t *testing.T) { + yaml := twoModels + ` +matrix: + vars: + g: gemma + q: qwen + sets: + combo: "g | q" +` + cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + assert.Equal(t, "matrix", cfg.Routing.Router.Use) + require.NotNil(t, cfg.Routing.Router.Settings.Matrix) + assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2) +} + +func TestConfig_Routing_RouterUseMatrix(t *testing.T) { + yaml := twoModels + ` +routing: + router: + use: matrix + settings: + matrix: + vars: + g: gemma + q: qwen + sets: + combo: "g | q" +` + cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + assert.Equal(t, "matrix", cfg.Routing.Router.Use) + require.NotNil(t, cfg.Routing.Router.Settings.Matrix) + assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2) +} + +func TestConfig_Routing_RouterUseGroup(t *testing.T) { + yaml := twoModels + ` +routing: + router: + use: group + settings: + groups: + g1: + members: [gemma, qwen] +` + cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + assert.Equal(t, "group", cfg.Routing.Router.Use) + assert.Contains(t, cfg.Routing.Router.Settings.Groups, "g1") +} + +func TestConfig_Routing_DefaultsToGroup(t *testing.T) { + cfg, err := LoadConfigFromReader(strings.NewReader(twoModels)) + require.NoError(t, err) + assert.Equal(t, "group", cfg.Routing.Router.Use) + assert.Equal(t, "fifo", cfg.Routing.Scheduler.Use) +} + +func TestConfig_Routing_LegacyAndRoutingConflict(t *testing.T) { + yaml := twoModels + ` +groups: + g1: + members: [gemma, qwen] +routing: + router: + use: group +` + _, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.Error(t, err) + assert.Contains(t, err.Error(), "migrate") +} + +func TestConfig_Routing_RouterUseMatrixWithoutSettings(t *testing.T) { + yaml := twoModels + ` +routing: + router: + use: matrix +` + _, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.Error(t, err) + assert.Contains(t, err.Error(), "routing.router.settings.matrix is not set") +} + +// Both groups and matrix may be defined under routing.router.settings; +// routing.router.use selects which one is active. +func TestConfig_Routing_RouterSettingsBothGroupsAndMatrix(t *testing.T) { + yaml := twoModels + ` +routing: + router: + use: group + settings: + groups: + g1: + members: [gemma, qwen] + matrix: + sets: + s: "gemma" +` + config, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + // use: group means groups are active and matrix is ignored + assert.Equal(t, "group", config.Routing.Router.Use) + assert.Nil(t, config.Matrix) + assert.Contains(t, config.Groups, "g1") +} + +func TestConfig_Routing_UnknownRouter(t *testing.T) { + yaml := twoModels + ` +routing: + router: + use: bogus +` + _, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown router") +} + +func TestConfig_Routing_FifoPriorityUnknownModel(t *testing.T) { + yaml := twoModels + ` +routing: + scheduler: + settings: + fifo: + priority: + nope: 5 +` + _, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown model") +} + +func TestConfig_Routing_FifoPriorityKnownModel(t *testing.T) { + yaml := twoModels + ` +routing: + scheduler: + settings: + fifo: + priority: + gemma: 5 +` + cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) + require.NoError(t, err) + assert.Equal(t, 5, cfg.Routing.Scheduler.Settings.Fifo.Priority["gemma"]) +} diff --git a/internal/config/config_windows_test.go b/internal/config/config_windows_test.go index dcba2d7d..4992be3b 100644 --- a/internal/config/config_windows_test.go +++ b/internal/config/config_windows_test.go @@ -165,6 +165,25 @@ groups: IdleConn: 90, } + expectedGroups := map[string]GroupConfig{ + DEFAULT_GROUP_ID: { + Swap: true, + Exclusive: true, + Members: []string{"model1", "model3"}, + }, + "group1": { + Swap: true, + Exclusive: false, + Members: []string{"model2"}, + }, + "forever": { + Swap: true, + Exclusive: false, + Persistent: true, + Members: []string{"model4"}, + }, + } + expected := Config{ LogLevel: "info", LogTimeFormat: "", @@ -235,22 +254,16 @@ groups: "m2": "model2", "mthree": "model3", }, - Groups: map[string]GroupConfig{ - DEFAULT_GROUP_ID: { - Swap: true, - Exclusive: true, - Members: []string{"model1", "model3"}, + Groups: expectedGroups, + Routing: RoutingConfig{ + Router: RouterConfig{ + Use: "group", + Settings: RouterSettings{ + Groups: expectedGroups, + }, }, - "group1": { - Swap: true, - Exclusive: false, - Members: []string{"model2"}, - }, - "forever": { - Swap: true, - Exclusive: false, - Persistent: true, - Members: []string{"model4"}, + Scheduler: SchedulerConfig{ + Use: "fifo", }, }, } diff --git a/internal/config/matrix.go b/internal/config/matrix.go index 7ac9e046..0c3000ec 100644 --- a/internal/config/matrix.go +++ b/internal/config/matrix.go @@ -15,6 +15,9 @@ type MatrixConfig struct { Var map[string]string `yaml:"vars"` EvictCosts map[string]int `yaml:"evict_costs"` Sets OrderedSets `yaml:"sets"` + + // populated by ValidateMatrix; not settable from yaml + ExpandedSets []ExpandedSet `yaml:"-"` } // SetEntry is a single named set with its DSL expression. diff --git a/internal/config/matrix_test.go b/internal/config/matrix_test.go index 6d497b7d..ebf07dc5 100644 --- a/internal/config/matrix_test.go +++ b/internal/config/matrix_test.go @@ -289,7 +289,9 @@ matrix: cfg, err := LoadConfigFromReader(strings.NewReader(yaml)) require.NoError(t, err) assert.NotNil(t, cfg.Matrix) - assert.Len(t, cfg.ExpandedSets, 2) + assert.Len(t, cfg.Matrix.ExpandedSets, 2) + assert.Equal(t, "matrix", cfg.Routing.Router.Use) + assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2) // Groups should be empty when matrix is used assert.Empty(t, cfg.Groups) } diff --git a/internal/router/base.go b/internal/router/base.go index 1002efcf..ab6a3c4d 100644 --- a/internal/router/base.go +++ b/internal/router/base.go @@ -11,6 +11,7 @@ import ( "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) type shutdownReq struct { @@ -24,56 +25,17 @@ type unloadReq struct { respond chan struct{} } -type handlerReq struct { - model string - ctx context.Context - respond chan handlerResp - positionCh chan int -} - -type handlerResp struct { - handleFunc http.HandlerFunc - err error -} - -type swapDone struct { - modelID string - err error -} - -type serveDoneEvent struct { - modelID string -} - -type activeSwap struct { - modelID string - evict []string - waiters []handlerReq -} - -// swapPlanner is the only piece of behaviour that differs between concrete -// routers. baseRouter never inspects its internals. -type swapPlanner interface { - // EvictionFor returns running model IDs that must be stopped before - // target can serve. alsoRunning lists models the baseRouter has already - // committed to loading (in-flight swaps) which the planner cannot see - // via process.State() yet. Pure decision; must not log. - EvictionFor(target string, alsoRunning []string) []string - - // OnSwapStart runs once at the start of every swap. Planners may log - // their decision here at whatever verbosity they choose. - OnSwapStart(target string) -} - -// baseRouter owns the channels, run-loop, and orchestration code shared by -// every concrete router. Concrete routers embed *baseRouter and supply a -// swapPlanner that captures how their eviction set is decided. +// baseRouter owns the channels, run-loop, and process machinery shared by every +// concrete router. Concrete routers embed *baseRouter and supply a +// scheduler.Factory (which captures their scheduler.Swapper) describing how +// requests are scheduled and how their eviction set is decided. baseRouter +// implements scheduler.Effects so the scheduler can call back for side-effects. type baseRouter struct { name string config config.Config processes map[string]process.Process logger *logmon.Monitor - planner swapPlanner + schedule scheduler.Scheduler // shutdownCtx governs the request machinery: cancelling it tells grant() // and ServeHTTP to stop granting and reject callers. It is deliberately @@ -90,11 +52,11 @@ type baseRouter struct { procCtx context.Context procCancel context.CancelFunc - handlerCh chan handlerReq + handlerCh chan scheduler.HandlerReq shutdownCh chan shutdownReq unloadCh chan unloadReq - swapDoneCh chan swapDone - serveDoneCh chan serveDoneEvent + swapDoneCh chan scheduler.SwapDone + serveDoneCh chan scheduler.ServeDoneEvent runDone chan struct{} @@ -106,26 +68,33 @@ type baseRouter struct { testProcessed chan struct{} } -func newBaseRouter(name string, conf config.Config, processes map[string]process.Process, planner swapPlanner, logger *logmon.Monitor) *baseRouter { +func newBaseRouter( + name string, + conf config.Config, + processes map[string]process.Process, + logger *logmon.Monitor, + newSched scheduler.Factory, +) *baseRouter { shutdownCtx, shutdownFn := context.WithCancel(context.Background()) procCtx, procCancel := context.WithCancel(context.Background()) - return &baseRouter{ + b := &baseRouter{ name: name, config: conf, processes: processes, logger: logger, - planner: planner, shutdownCtx: shutdownCtx, shutdownFn: shutdownFn, procCtx: procCtx, procCancel: procCancel, - handlerCh: make(chan handlerReq), + handlerCh: make(chan scheduler.HandlerReq), shutdownCh: make(chan shutdownReq), unloadCh: make(chan unloadReq), - swapDoneCh: make(chan swapDone), - serveDoneCh: make(chan serveDoneEvent), + swapDoneCh: make(chan scheduler.SwapDone), + serveDoneCh: make(chan scheduler.ServeDoneEvent), runDone: make(chan struct{}), } + b.schedule = newSched(name, logger, b) + return b } func (b *baseRouter) notifyProcessed() { @@ -137,30 +106,27 @@ func (b *baseRouter) notifyProcessed() { func (b *baseRouter) run() { defer close(b.runDone) - active := make(map[string]*activeSwap) - inFlight := make(map[string]int) - var queued []handlerReq - for { select { case req := <-b.shutdownCh: - b.handleShutdown(req, active, queued) + b.handleShutdown(req) return case req := <-b.handlerCh: - b.handleRequest(req, active, inFlight, &queued) + b.schedule.OnRequest(req) b.notifyProcessed() case req := <-b.unloadCh: - b.handleUnload(req, active, inFlight, &queued) + b.schedule.OnUnload(req.targets, req.timeout) + close(req.respond) b.notifyProcessed() case ev := <-b.swapDoneCh: - b.handleSwapDone(ev, active, inFlight, &queued) + b.schedule.OnSwapDone(ev) b.notifyProcessed() case ev := <-b.serveDoneCh: - b.handleServeDone(ev, active, inFlight, &queued) + b.schedule.OnServeDone(ev) } } } @@ -177,37 +143,68 @@ func (b *baseRouter) run() { // down, the send never lands, one of the other select cases fires, and we // report back that the grant did NOT happen. // -// That distinction matters for in-flight bookkeeping — see grantHandler. -func (b *baseRouter) grant(req handlerReq, resp handlerResp) bool { +// That distinction matters for in-flight bookkeeping — see GrantServe. +func (b *baseRouter) grant(req scheduler.HandlerReq, resp scheduler.HandlerResp) bool { select { - case req.respond <- resp: + case req.Respond <- resp: return true - case <-req.ctx.Done(): + case <-req.Ctx.Done(): return false case <-b.shutdownCtx.Done(): return false } } -// grantHandler is the "this caller can now use process p" path. It does -// two things that must stay locked together: -// -// 1. Hand the caller a wrapped p.ServeHTTP (via trackedServe) so when the -// HTTP request finishes, the run loop hears about it. -// 2. Bump inFlight[modelID] so the router knows this process is busy and -// refuses to evict it until the count comes back down. -// -// The increment is gated on grant() returning true. If grant() returns -// false, the caller already walked away and trackedServe will never run — -// which means no matching decrement will ever arrive on serveDoneCh. -// Incrementing in that case would strand the counter at >0 forever and -// the router would never again be willing to swap this model out. -// -// In short: increment if and only if we know a decrement is coming. -func (b *baseRouter) grantHandler(req handlerReq, modelID string, p process.Process, inFlight map[string]int) { - if b.grant(req, handlerResp{handleFunc: b.trackedServe(modelID, p)}) { - inFlight[modelID]++ +// ModelState implements scheduler.Effects. +func (b *baseRouter) ModelState(modelID string) (process.ProcessState, bool) { + p, ok := b.processes[modelID] + if !ok { + var zero process.ProcessState + return zero, false } + return p.State(), true +} + +// StartSwap implements scheduler.Effects, launching the swap goroutine. +func (b *baseRouter) StartSwap(modelID string, evict []string) { + go b.doSwap(modelID, evict) +} + +// GrantError implements scheduler.Effects. +func (b *baseRouter) GrantError(req scheduler.HandlerReq, err error) { + b.grant(req, scheduler.HandlerResp{Err: err}) +} + +// GrantServe implements scheduler.Effects. It hands the caller a wrapped +// p.ServeHTTP (via trackedServe) so the run loop hears about the request +// finishing, and reports whether the caller received it. The scheduler bumps +// its in-flight count only on a true return: if grant() returns false the +// caller already walked away and trackedServe will never run, so no matching +// decrement will ever arrive — incrementing would strand the counter at >0 and +// the router would never again be willing to evict this model. +func (b *baseRouter) GrantServe(req scheduler.HandlerReq, modelID string) bool { + p := b.processes[modelID] + return b.grant(req, scheduler.HandlerResp{HandleFunc: b.trackedServe(modelID, p)}) +} + +// StopProcesses implements scheduler.Effects, stopping the named processes in +// parallel and blocking until all have stopped. +func (b *baseRouter) StopProcesses(timeout time.Duration, ids []string) { + var wg sync.WaitGroup + for _, id := range ids { + p, ok := b.processes[id] + if !ok { + continue + } + wg.Add(1) + go func(id string, p process.Process) { + defer wg.Done() + if err := p.Stop(timeout); err != nil { + b.logger.Warnf("%s: stopping %s failed: %v", b.name, id, err) + } + }(id, p) + } + wg.Wait() } // trackedServe is the wrapper that closes the loop on in-flight tracking. @@ -224,7 +221,7 @@ func (b *baseRouter) trackedServe(modelID string, p process.Process) http.Handle return func(w http.ResponseWriter, r *http.Request) { defer func() { select { - case b.serveDoneCh <- serveDoneEvent{modelID: modelID}: + case b.serveDoneCh <- scheduler.ServeDoneEvent{ModelID: modelID}: case <-b.shutdownCtx.Done(): } }() @@ -232,240 +229,6 @@ func (b *baseRouter) trackedServe(modelID string, p process.Process) http.Handle } } -// handleRequest decides what to do with one incoming ServeHTTP request. It is -// called from run() and never blocks indefinitely: any work that has to wait -// (starting a process, stopping siblings, waiting for ready) is deferred to -// a swap goroutine and reported back via swapDoneCh. -// -// The decision tree, in order: -// -// 1. Unknown model — respond with ErrNoLocalModelFound and move on. -// 2. A swap to the same model is already in flight — attach this waiter so -// one swap serves all callers that asked for the same model. -// 3. Fast path — the target process is already ready, the planner sees -// nothing to evict, and no in-flight swap is evicting it. Hand back its -// ServeHTTP immediately (wrapped so the run loop knows when it ends). -// 4. Would collide with an in-flight swap (we'd stop their target, or -// they're stopping us) — park in the queue for handleSwapDone to drain. -// 5. Would evict a process that is still handling requests — park in the -// queue. handleServeDone will retry when the busy process drains. -// 6. Otherwise — start a new swap. This may run in parallel with other -// active swaps when their evict sets don't intersect. -func (b *baseRouter) handleRequest(req handlerReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) { - // (1) Unknown model. - p, ok := b.processes[req.model] - if !ok { - b.logger.Debugf("%s: model %s not handled by this router", b.name, req.model) - b.grant(req, handlerResp{err: ErrNoLocalModelFound}) - return - } - - // (2) Join an in-flight swap for the same model. - if s, ok := active[req.model]; ok { - b.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", b.name, req.model, len(s.waiters)+1) - s.waiters = append(s.waiters, req) - return - } - - evict := b.planner.EvictionFor(req.model, activeTargets(active, req.model)) - - // (3) Fast path: ready, nothing to evict, and nobody is evicting us. - if p.State() == process.StateReady && len(evict) == 0 && !collidesWith(req.model, evict, active) { - b.logger.Debugf("%s: fast-path serving model %s (already ready)", b.name, req.model) - b.grantHandler(req, req.model, p, inFlight) - return - } - - // (4) Collision with an in-flight swap — queue. - if collidesWith(req.model, evict, active) { - b.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", b.name, req.model) - *queued = append(*queued, req) - b.broadcastQueuePositions(*queued) - return - } - - // (5) Would evict a busy process — queue until it drains. - if conflictsWithInFlight(evict, inFlight) { - b.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", b.name, req.model) - *queued = append(*queued, req) - b.broadcastQueuePositions(*queued) - return - } - - // (6) Start a new (possibly parallel) swap. - b.logger.Debugf("%s: starting swap for model %s, evicting %v", b.name, req.model, evict) - s := b.startSwap(req, evict) - active[s.modelID] = s -} - -// handleSwapDone is called from run() when a swap goroutine reports that it -// has finished. It fans out the result to every waiter that joined this swap, -// removes the swap from the active map, and then walks the queue once, -// promoting any items that no longer collide with the remaining active set. -// FIFO order is preserved: items still blocked stay in place. -func (b *baseRouter) handleSwapDone(ev swapDone, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) { - s, ok := active[ev.modelID] - if !ok { - return - } - delete(active, ev.modelID) - - for _, w := range s.waiters { - if ev.err != nil { - b.grant(w, handlerResp{err: ev.err}) - } else { - p := b.processes[ev.modelID] - b.grantHandler(w, ev.modelID, p, inFlight) - } - } - - b.drainQueue(active, inFlight, queued) -} - -// handleServeDone is called from run() each time a tracked ServeHTTP -// finishes. It decrements the per-model in-flight count and, when that -// drops to zero, retries the queue: requests whose swap was deferred -// because they would have evicted this (now-idle) process can now proceed. -func (b *baseRouter) handleServeDone(ev serveDoneEvent, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) { - inFlight[ev.modelID]-- - if inFlight[ev.modelID] <= 0 { - delete(inFlight, ev.modelID) - b.drainQueue(active, inFlight, queued) - } -} - -// drainQueue walks the queued requests in order, re-running the handleRequest -// decision tree against the (now smaller) active set. Items that can now start -// or join become satisfied; items still blocked remain queued in original -// order so they get another chance on the next swap completion. -func (b *baseRouter) drainQueue(active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) { - if len(*queued) == 0 { - return - } - pending := *queued - var remaining []handlerReq - for _, req := range pending { - p, ok := b.processes[req.model] - if !ok { - b.grant(req, handlerResp{err: ErrNoLocalModelFound}) - continue - } - if s, ok := active[req.model]; ok { - b.logger.Debugf("%s: queued request for model %s now joining in-flight swap", b.name, req.model) - s.waiters = append(s.waiters, req) - continue - } - evict := b.planner.EvictionFor(req.model, activeTargets(active, req.model)) - if p.State() == process.StateReady && len(evict) == 0 && !collidesWith(req.model, evict, active) { - b.logger.Debugf("%s: queued request for model %s now served fast-path", b.name, req.model) - b.grantHandler(req, req.model, p, inFlight) - continue - } - if collidesWith(req.model, evict, active) { - remaining = append(remaining, req) - continue - } - if conflictsWithInFlight(evict, inFlight) { - remaining = append(remaining, req) - continue - } - b.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", b.name, req.model, evict) - s := b.startSwap(req, evict) - active[s.modelID] = s - } - *queued = remaining - b.broadcastQueuePositions(*queued) -} - -// broadcastQueuePositions sends each queued request its current 1-indexed -// position. Sends are non-blocking: if the channel is full, the old value is -// drained first so the consumer always sees the latest position. -func (b *baseRouter) broadcastQueuePositions(queued []handlerReq) { - for i, req := range queued { - pos := i + 1 - select { - case req.positionCh <- pos: - default: - select { - case <-req.positionCh: - default: - } - select { - case req.positionCh <- pos: - default: - } - } - } -} - -func (b *baseRouter) startSwap(initial handlerReq, evict []string) *activeSwap { - swap := &activeSwap{ - modelID: initial.model, - evict: evict, - waiters: []handlerReq{initial}, - } - b.planner.OnSwapStart(initial.model) - go b.doSwap(initial.model, evict) - return swap -} - -// activeTargets returns the IDs of every in-flight swap target except exclude. -// baseRouter passes this to the planner so eviction decisions account for -// models that have been committed to but have not yet transitioned to -// StateStarting in their process state machine. -func activeTargets(active map[string]*activeSwap, exclude string) []string { - if len(active) == 0 { - return nil - } - out := make([]string, 0, len(active)) - for id := range active { - if id == exclude { - continue - } - out = append(out, id) - } - return out -} - -// collidesWith reports whether a new swap with this target and evict set can -// safely run alongside the currently active swaps. Same-target callers should -// JOIN (handled before this) — they do not collide with themselves. -func collidesWith(target string, evict []string, active map[string]*activeSwap) bool { - for id, s := range active { - if id == target { - continue - } - if containsString(evict, id) { - return true - } - if containsString(s.evict, target) { - return true - } - } - return false -} - -// conflictsWithInFlight reports whether any model in evict is still handling -// requests. Stopping a busy process would cancel its callers' connections, -// so the router defers the swap until those callers finish. -func conflictsWithInFlight(evict []string, inFlight map[string]int) bool { - for _, m := range evict { - if inFlight[m] > 0 { - return true - } - } - return false -} - -func containsString(xs []string, s string) bool { - for _, x := range xs { - if x == s { - return true - } - } - return false -} - func (b *baseRouter) doSwap(modelID string, toStop []string) { timeout := b.healthCheckTimeout() @@ -493,31 +256,24 @@ func (b *baseRouter) doSwap(modelID string, toStop []string) { err := target.WaitReady(b.shutdownCtx) select { - case b.swapDoneCh <- swapDone{modelID: modelID, err: err}: + case b.swapDoneCh <- scheduler.SwapDone{ModelID: modelID, Err: err}: case <-b.shutdownCtx.Done(): } } -func (b *baseRouter) handleShutdown(req shutdownReq, active map[string]*activeSwap, queued []handlerReq) { +func (b *baseRouter) handleShutdown(req shutdownReq) { shutdownErr := fmt.Errorf("%s is shutting down", b.name) // Cancel shutdownCtx first so any waiter that is currently parked on // its respond channel can exit via its own shutdownCtx.Done() branch. - // The grant calls below then either land (waiter happened to receive + // The OnShutdown grants below then either land (waiter happened to receive // before noticing shutdown) or fall through immediately via grant's // shutdownCtx case — either way the waiter sees a non-OK response. // This does NOT touch processes: their lifetime is procCtx, cancelled // only after the graceful Stop() calls below have reaped them. b.shutdownFn() - for _, s := range active { - for _, w := range s.waiters { - b.grant(w, handlerResp{err: shutdownErr}) - } - } - for _, w := range queued { - b.grant(w, handlerResp{err: shutdownErr}) - } + b.schedule.OnShutdown(shutdownErr) stopTimeout := req.timeout if stopTimeout <= 0 { @@ -628,75 +384,6 @@ func (b *baseRouter) Unload(timeout time.Duration, models ...string) { <-req.respond } -// handleUnload runs on the run loop in response to an Unload call. It -// reconciles router-owned state with the impending Stop, then performs -// the Stop synchronously so callers of Unload remain blocked until each -// targeted process has actually exited. -func (b *baseRouter) handleUnload(req unloadReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) { - unloadErr := fmt.Errorf("%s: model unloaded", b.name) - - targetSet := make(map[string]bool, len(req.targets)) - for _, id := range req.targets { - targetSet[id] = true - } - - // Release waiters of any in-flight swap whose target is being - // unloaded. The swap goroutine itself is left to finish on its own; - // when its swapDone arrives, handleSwapDone will find no entry in - // active and silently drop it. - for id := range targetSet { - s, ok := active[id] - if !ok { - continue - } - for _, w := range s.waiters { - b.grant(w, handlerResp{err: unloadErr}) - } - delete(active, id) - } - - // Drop queued requests addressed to unloaded models. Requests for - // other models stay queued and may benefit from drainQueue at the end. - if len(*queued) > 0 { - kept := (*queued)[:0] - for _, w := range *queued { - if targetSet[w.model] { - b.grant(w, handlerResp{err: unloadErr}) - continue - } - kept = append(kept, w) - } - *queued = kept - } - - // Stop the targeted processes. Done synchronously so Unload's caller - // can rely on "after Unload returns, the process is stopped". inFlight - // is intentionally NOT cleared here: each dying handler will fire its - // trackedServe defer and reach handleServeDone in the normal way once - // the run loop is free again. - var wg sync.WaitGroup - for id := range targetSet { - p, ok := b.processes[id] - if !ok { - continue - } - wg.Add(1) - go func(id string, p process.Process) { - defer wg.Done() - if err := p.Stop(req.timeout); err != nil { - b.logger.Warnf("%s: unloading %s failed: %v", b.name, id, err) - } - }(id, p) - } - wg.Wait() - - // Removing entries from active above may have unblocked queued - // requests that previously collided with the now-cancelled swaps. - b.drainQueue(active, inFlight, queued) - - close(req.respond) -} - func (b *baseRouter) Shutdown(timeout time.Duration) error { if !b.shuttingDown.CompareAndSwap(false, true) { return fmt.Errorf("%s shutdown already in progress", b.name) @@ -722,14 +409,14 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) { return } - hr := handlerReq{ - model: data.ModelID, - ctx: req.Context(), - // Unbuffered: a successful send on respond proves the waiter is + hr := scheduler.HandlerReq{ + Model: data.ModelID, + Ctx: req.Context(), + // Unbuffered: a successful send on Respond proves the waiter is // alive and consuming. grant() relies on this to avoid handing a // handleFunc to a cancelled waiter and leaking the inFlight count. - respond: make(chan handlerResp), - positionCh: make(chan int, 1), + Respond: make(chan scheduler.HandlerResp), + PositionCh: make(chan int, 1), } select { @@ -757,7 +444,7 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) { go func() { for { select { - case pos := <-hr.positionCh: + case pos := <-hr.PositionCh: lw.setUpdate(fmt.Sprintf("Queue position: #%d", pos)) case <-swapCtx.Done(): return @@ -779,9 +466,9 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) { } } - var resp handlerResp + var resp scheduler.HandlerResp select { - case resp = <-hr.respond: + case resp = <-hr.Respond: finishLoading() case <-req.Context().Done(): finishLoading() @@ -792,9 +479,9 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) { return } - if resp.err != nil { - SendError(w, req, resp.err) + if resp.Err != nil { + SendError(w, req, resp.Err) return } - resp.handleFunc(w, req) + resp.HandleFunc(w, req) } diff --git a/internal/router/base_test.go b/internal/router/base_test.go index 3f63f939..82e9e0ff 100644 --- a/internal/router/base_test.go +++ b/internal/router/base_test.go @@ -5,35 +5,34 @@ import ( "io" "net/http" "net/http/httptest" - "sync" "testing" "time" "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) -// stubPlanner is a swapPlanner that returns a fixed eviction list per target -// and never logs. It lets the base-router tests cover shared run-loop -// behaviour without dragging in either real router's eviction rules. -type stubPlanner struct { - evict map[string][]string -} +// These tests cover baseRouter's own machinery — the run loop, process +// lifecycle (doSwap), grant/ServeHTTP plumbing, Unload, and Shutdown. The +// scheduling decision logic (queueing, collation, eviction collisions) lives in +// the scheduler package and is tested directly there; see fifo_test.go. -func (s *stubPlanner) EvictionFor(target string, _ []string) []string { - if s.evict == nil { - return nil - } - return s.evict[target] -} +// stubPlanner evicts nothing. baseRouter tests drive the run loop through the +// default FIFO scheduler without exercising any particular eviction policy. +type stubPlanner struct{} -func (s *stubPlanner) OnSwapStart(string) {} +func (s *stubPlanner) EvictionFor(string, []string) []string { return nil } +func (s *stubPlanner) OnSwapStart(string, []string) {} -func newTestBase(t *testing.T, processes map[string]process.Process, planner swapPlanner) *baseRouter { +func newTestBase(t *testing.T, processes map[string]process.Process, planner scheduler.Swapper) *baseRouter { t.Helper() conf := config.Config{HealthCheckTimeout: 5} - b := newBaseRouter("test", conf, processes, planner, logmon.NewWriter(io.Discard)) + b := newBaseRouter("test", conf, processes, logmon.NewWriter(io.Discard), + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, logger, planner, conf.Routing.Scheduler.Settings.Fifo, eff) + }) b.testProcessed = make(chan struct{}, 64) go b.run() t.Cleanup(func() { @@ -157,114 +156,6 @@ func TestBaseRouter_Unload_StopsInParallel(t *testing.T) { } } -// TestBaseRouter_Unload_ReleasesActiveSwapWaiters verifies that Unload -// rejoins router state: a request whose swap to the unloaded model is -// still in progress receives an error, instead of being abandoned -// against a process that's about to vanish. -func TestBaseRouter_Unload_ReleasesActiveSwapWaiters(t *testing.T) { - a := newFakeProcess("a") - // autoReady=false: the swap parks on WaitReady so we can interrupt - // it with Unload before it completes. - - b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{}) - - w := httptest.NewRecorder() - done := make(chan struct{}) - go func() { - b.ServeHTTP(w, newRequest("a")) - close(done) - }() - waitProcessed(t, b.testProcessed, 1) // handlerReq absorbed; swap started - <-a.runStarted - - b.Unload(time.Second, "a") - - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("ServeHTTP did not return after Unload") - } - if w.Code == http.StatusOK { - t.Errorf("expected non-OK status after Unload, got %d body=%q", w.Code, w.Body.String()) - } - if a.State() != process.StateStopped { - t.Errorf("a state=%q want stopped", a.State()) - } -} - -// TestBaseRouter_Unload_DropsQueuedRequests verifies that queued requests -// for an unloaded model receive an error rather than sitting forever in -// the queue against state the router no longer maintains. -func TestBaseRouter_Unload_DropsQueuedRequests(t *testing.T) { - a := newFakeProcess("a") - pb := newFakeProcess("b") - // Loading B evicts A — so a request for B while A is loading queues. - planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, planner) - - // r1 starts the swap to A and parks on WaitReady (autoReady=false). - w1 := httptest.NewRecorder() - done1 := make(chan struct{}) - go func() { - b.ServeHTTP(w1, newRequest("a")) - close(done1) - }() - waitProcessed(t, b.testProcessed, 1) - <-a.runStarted - - // r2 for B collides with A's in-flight swap and queues. - w2 := httptest.NewRecorder() - done2 := make(chan struct{}) - go func() { - b.ServeHTTP(w2, newRequest("b")) - close(done2) - }() - waitProcessed(t, b.testProcessed, 1) - - // Unload B — r2 (queued, targeting B) must be released with an error. - b.Unload(time.Second, "b") - - select { - case <-done2: - case <-time.After(2 * time.Second): - t.Fatal("queued B request did not return after Unload(b)") - } - if w2.Code == http.StatusOK { - t.Errorf("queued B request: expected non-OK status, got %d", w2.Code) - } - if got := pb.runCalls.Load(); got != 0 { - t.Errorf("b.runCalls=%d want 0 (B should never have been started)", got) - } - - // Release r1 so the test cleans up cleanly. - a.markReady() - select { - case <-done1: - case <-time.After(2 * time.Second): - t.Fatal("r1 did not complete after a.markReady") - } -} - -func TestBaseRouter_FastPath(t *testing.T) { - a := newFakeProcess("a") - a.markReady() - - b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{}) - - w := httptest.NewRecorder() - b.ServeHTTP(w, newRequest("a")) - - if w.Code != http.StatusOK { - t.Fatalf("status=%d body=%q", w.Code, w.Body.String()) - } - if got := a.serveCalls.Load(); got != 1 { - t.Errorf("serveCalls=%d want 1", got) - } - if got := a.runCalls.Load(); got != 0 { - t.Errorf("runCalls=%d want 0 (fast path should not start)", got) - } -} - func TestBaseRouter_OnDemandStart(t *testing.T) { a := newFakeProcess("a") a.autoReady = true @@ -285,43 +176,6 @@ func TestBaseRouter_OnDemandStart(t *testing.T) { } } -func TestBaseRouter_ConcurrentSameModel(t *testing.T) { - a := newFakeProcess("a") - // autoReady=false so the swap parks on WaitReady until we release it. - - b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{}) - - const N = 5 - var wg sync.WaitGroup - codes := make([]int, N) - for i := 0; i < N; i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - w := httptest.NewRecorder() - b.ServeHTTP(w, newRequest("a")) - codes[i] = w.Code - }(i) - } - - waitProcessed(t, b.testProcessed, N) // all N handlerReqs absorbed by run() - <-a.runStarted // swap goroutine reached Run() - a.markReady() - wg.Wait() - - for i, c := range codes { - if c != http.StatusOK { - t.Errorf("request %d: status=%d", i, c) - } - } - if got := a.runCalls.Load(); got != 1 { - t.Errorf("runCalls=%d want 1 (single swap should issue one Run)", got) - } - if got := a.serveCalls.Load(); got != N { - t.Errorf("serveCalls=%d want %d", got, N) - } -} - func TestBaseRouter_ContextCancel(t *testing.T) { a := newFakeProcess("a") // autoReady=false so swap parks forever until we mark ready. @@ -364,459 +218,6 @@ func TestBaseRouter_ContextCancel(t *testing.T) { } } -func TestBaseRouter_QueuedDifferentModel(t *testing.T) { - a := newFakeProcess("a") - pa := newFakeProcess("b") - - // Loading b must stop a. - planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pa}, planner) - - // First request starts a swap to A; A's autoReady=false so it parks. - w1 := httptest.NewRecorder() - done1 := make(chan struct{}) - go func() { - b.ServeHTTP(w1, newRequest("a")) - close(done1) - }() - waitProcessed(t, b.testProcessed, 1) - <-a.runStarted - - // Second request for B should queue while A's swap is in flight. - w2 := httptest.NewRecorder() - done2 := make(chan struct{}) - go func() { - b.ServeHTTP(w2, newRequest("b")) - close(done2) - }() - waitProcessed(t, b.testProcessed, 1) - - if got := pa.runCalls.Load(); got != 0 { - t.Errorf("b started early: runCalls=%d want 0 while A's swap is pending", got) - } - - // Release A's swap. B's swap should then run. - a.markReady() - waitProcessed(t, b.testProcessed, 1) // swapDone for A → B's swap kicked off - <-pa.runStarted - - select { - case <-done1: - case <-time.After(time.Second): - t.Fatal("A request did not complete") - } - pa.markReady() - select { - case <-done2: - case <-time.After(time.Second): - t.Fatal("queued B request did not complete after A's swap") - } - if w2.Code != http.StatusOK { - t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String()) - } - if got := a.stopCalls.Load(); got != 1 { - t.Errorf("a.stopCalls=%d want 1 (B's swap must stop A)", got) - } -} - -// TestBaseRouter_QueueCollation verifies that incoming requests of the form -// a, b, c, a, b, c collapse into three swaps (one per model) and that the -// second request for each model rides the fast path — either joining the -// active swap, or being pulled out of the queue when handleSwapDone promotes -// the next model. -func TestBaseRouter_QueueCollation(t *testing.T) { - a := newFakeProcess("a") - pb := newFakeProcess("b") - pc := newFakeProcess("c") - - // Each model evicts the other two so all swaps are mutually exclusive. - planner := &stubPlanner{evict: map[string][]string{ - "a": {"b", "c"}, - "b": {"a", "c"}, - "c": {"a", "b"}, - }} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner) - - var ( - completedMu sync.Mutex - completed []string - ) - record := func(id string) { - completedMu.Lock() - defer completedMu.Unlock() - completed = append(completed, id) - } - - ids := []string{"a", "b", "c", "a", "b", "c"} - var wg sync.WaitGroup - for _, id := range ids { - id := id - wg.Add(1) - go func() { - defer wg.Done() - w := httptest.NewRecorder() - b.ServeHTTP(w, newRequest(id)) - if w.Code != http.StatusOK { - t.Errorf("%s: status=%d body=%q", id, w.Code, w.Body.String()) - return - } - record(id) - }() - // Wait for run() to absorb this request before launching the next, - // so handlerCh receives them in launch order. - waitProcessed(t, b.testProcessed, 1) - } - - // All 6 are now parked in run()'s waiters/queue. Release each swap in - // sequence, waiting deterministically for each promotion to fire. - <-a.runStarted - a.markReady() - waitProcessed(t, b.testProcessed, 1) // swapDone(a) → b swap kicked off - - <-pb.runStarted - pb.markReady() - waitProcessed(t, b.testProcessed, 1) // swapDone(b) → c swap kicked off - - <-pc.runStarted - pc.markReady() - wg.Wait() - - if got := len(completed); got != 6 { - t.Fatalf("completed=%v want 6", completed) - } - - // run() fans out responses in model-grouped order (a1,a2 → b1,b2 → c1,c2) - // but waiter goroutines may be scheduled in any order after their respond - // channel fires, so completion order isn't deterministic. Per-model counts - // (combined with the runCalls checks below) are sufficient to prove queue - // collation collapsed each pair into a single swap. - aDone, bDone, cDone := 0, 0, 0 - for _, id := range completed { - switch id { - case "a": - aDone++ - case "b": - bDone++ - case "c": - cDone++ - } - } - if aDone != 2 || bDone != 2 || cDone != 2 { - t.Errorf("per-model counts: a=%d b=%d c=%d, want 2 each (order=%v)", aDone, bDone, cDone, completed) - } - - // Single swap per model — the second request for each must have ridden - // the fast path (joined active swap or joined a queued sibling), not - // triggered an extra Run. - if got := a.runCalls.Load(); got != 1 { - t.Errorf("a.runCalls=%d want 1", got) - } - if got := pb.runCalls.Load(); got != 1 { - t.Errorf("b.runCalls=%d want 1", got) - } - if got := pc.runCalls.Load(); got != 1 { - t.Errorf("c.runCalls=%d want 1", got) - } -} - -// TestBaseRouter_ConcurrentDisjointSwaps verifies that two requests with -// non-conflicting evict sets are loaded in parallel: both Run() calls happen -// before either process is marked ready. -func TestBaseRouter_ConcurrentDisjointSwaps(t *testing.T) { - a := newFakeProcess("a") - pb := newFakeProcess("b") - - // Empty evict sets for both: they can load in parallel. - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, &stubPlanner{}) - - w1 := httptest.NewRecorder() - done1 := make(chan struct{}) - go func() { - b.ServeHTTP(w1, newRequest("a")) - close(done1) - }() - waitProcessed(t, b.testProcessed, 1) - - w2 := httptest.NewRecorder() - done2 := make(chan struct{}) - go func() { - b.ServeHTTP(w2, newRequest("b")) - close(done2) - }() - waitProcessed(t, b.testProcessed, 1) - - // Both swaps must have reached Run() before either is marked ready — - // proves they ran in parallel rather than serializing. - <-a.runStarted - <-pb.runStarted - - a.markReady() - pb.markReady() - - select { - case <-done1: - case <-time.After(time.Second): - t.Fatal("request A did not complete") - } - select { - case <-done2: - case <-time.After(time.Second): - t.Fatal("request B did not complete") - } - - if w1.Code != http.StatusOK { - t.Errorf("A status=%d body=%q", w1.Code, w1.Body.String()) - } - if w2.Code != http.StatusOK { - t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String()) - } - if got := a.stopCalls.Load(); got != 0 { - t.Errorf("a.stopCalls=%d want 0 (parallel swap, no eviction)", got) - } - if got := pb.stopCalls.Load(); got != 0 { - t.Errorf("b.stopCalls=%d want 0 (parallel swap, no eviction)", got) - } -} - -// TestBaseRouter_QueueDrainPromotesMultiple verifies that completing one swap -// unblocks every queued request that no longer collides — they all start in -// parallel rather than one-per-completion. -func TestBaseRouter_QueueDrainPromotesMultiple(t *testing.T) { - a := newFakeProcess("a") - pb := newFakeProcess("b") - pc := newFakeProcess("c") - - // A's swap evicts both B and C, so B and C must queue. Once A finishes - // B and C themselves have empty evict sets, so they can start together. - planner := &stubPlanner{evict: map[string][]string{ - "a": {"b", "c"}, - }} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner) - - w1 := httptest.NewRecorder() - done1 := make(chan struct{}) - go func() { - b.ServeHTTP(w1, newRequest("a")) - close(done1) - }() - waitProcessed(t, b.testProcessed, 1) - <-a.runStarted - - // B and C arrive while A is loading. evict_b and evict_c are empty, - // but collidesWith returns true because they appear in A's evict set. - w2 := httptest.NewRecorder() - done2 := make(chan struct{}) - go func() { - b.ServeHTTP(w2, newRequest("b")) - close(done2) - }() - waitProcessed(t, b.testProcessed, 1) - - w3 := httptest.NewRecorder() - done3 := make(chan struct{}) - go func() { - b.ServeHTTP(w3, newRequest("c")) - close(done3) - }() - waitProcessed(t, b.testProcessed, 1) - - if got := pb.runCalls.Load(); got != 0 { - t.Errorf("b started early: runCalls=%d", got) - } - if got := pc.runCalls.Load(); got != 0 { - t.Errorf("c started early: runCalls=%d", got) - } - - // Release A. The swapDone handler should drain the queue and start - // both B and C in parallel. - a.markReady() - waitProcessed(t, b.testProcessed, 1) // swapDone(A) → drainQueue starts B and C - <-pb.runStarted - <-pc.runStarted - - pb.markReady() - pc.markReady() - - for i, ch := range []chan struct{}{done1, done2, done3} { - select { - case <-ch: - case <-time.After(time.Second): - t.Fatalf("request %d did not complete", i) - } - } -} - -// TestBaseRouter_Shutdown_FailsAllInFlight verifies that shutdown returns -// the shutdown error to every waiter on every active swap AND to every -// queued request. -func TestBaseRouter_Shutdown_FailsAllInFlight(t *testing.T) { - a := newFakeProcess("a") - pb := newFakeProcess("b") - pc := newFakeProcess("c") - - // a and b load in parallel (empty evicts). c collides with both. - planner := &stubPlanner{evict: map[string][]string{ - "c": {"a", "b"}, - }} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner) - - const waitersPer = 2 - var wg sync.WaitGroup - codes := make([]int, 0, 2*waitersPer+1) - var codesMu sync.Mutex - record := func(code int) { - codesMu.Lock() - codes = append(codes, code) - codesMu.Unlock() - } - - launch := func(model string) { - wg.Add(1) - go func() { - defer wg.Done() - w := httptest.NewRecorder() - b.ServeHTTP(w, newRequest(model)) - record(w.Code) - }() - } - - // Active swaps for a and b, each with 2 waiters. - for i := 0; i < waitersPer; i++ { - launch("a") - waitProcessed(t, b.testProcessed, 1) - } - for i := 0; i < waitersPer; i++ { - launch("b") - waitProcessed(t, b.testProcessed, 1) - } - // c collides with both → queues. - launch("c") - waitProcessed(t, b.testProcessed, 1) - - <-a.runStarted - <-pb.runStarted - - if err := b.Shutdown(time.Second); err != nil { - t.Fatalf("Shutdown: %v", err) - } - wg.Wait() - - codesMu.Lock() - defer codesMu.Unlock() - if len(codes) != 2*waitersPer+1 { - t.Fatalf("got %d responses, want %d", len(codes), 2*waitersPer+1) - } - for i, c := range codes { - if c == http.StatusOK { - t.Errorf("response %d: status=%d, want non-200 (shutdown)", i, c) - } - } -} - -// TestBaseRouter_NoSwapWhileServing verifies that an already-loaded model -// is not stopped to satisfy another model's swap while it is still handling -// a request. -// -// Sequence: -// 1. r1 (A) — A loads; ServeHTTP enters and is pinned via serveBlock. -// 2. r2 (B, planner: B evicts A) — must NOT cause A.Stop while r1 is live. -// 3. r3 (A) — arrives next; the existing code queues it because B's swap -// intent collides with A. -// 4. r1 released — A finishes r1, then r3 is served by A. -// 5. B's swap then proceeds; r2 is served by B. -// -// fakeProcess.stoppedWhileServing flips true if Stop is ever called while -// a ServeHTTP is in flight — a direct, race-free signal of the violation. -func TestBaseRouter_NoSwapWhileServing(t *testing.T) { - a := newFakeProcess("a") - // autoReady left false: we markReady manually after observing runStarted, - // so autoReady's setState(Ready) cannot race with a later Stop and leave - // A in Ready, masking the bug. - a.serveBlock = make(chan struct{}) - pb := newFakeProcess("b") - // Same reasoning for B: park its swap on WaitReady until we choose. - - planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}} - b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, planner) - - // r1 — load A and enter its ServeHTTP (which blocks on serveBlock). - w1 := httptest.NewRecorder() - done1 := make(chan struct{}) - go func() { - b.ServeHTTP(w1, newRequest("a")) - close(done1) - }() - waitProcessed(t, b.testProcessed, 1) // handlerReq for r1 - <-a.runStarted - a.markReady() - waitProcessed(t, b.testProcessed, 1) // swapDone for A - <-a.serveStarted - - // r2 — would evict A. A must not be stopped while r1 is in flight. - w2 := httptest.NewRecorder() - done2 := make(chan struct{}) - go func() { - b.ServeHTTP(w2, newRequest("b")) - close(done2) - }() - waitProcessed(t, b.testProcessed, 1) - - // r3 — another request for A, arrives behind r2 and queues because - // B's swap intent (which evicts A) is recorded as active. - w3 := httptest.NewRecorder() - done3 := make(chan struct{}) - go func() { - b.ServeHTTP(w3, newRequest("a")) - close(done3) - }() - waitProcessed(t, b.testProcessed, 1) - - // Release r1 (and r3 if it is fast-pathed onto the still-loaded A). - // The router must hold off B's swap until A has drained. - close(a.serveBlock) - - select { - case <-done1: - case <-time.After(2 * time.Second): - t.Fatal("r1 did not complete after serveBlock release") - } - - // Wait for B.Run before marking it ready: markReady before Run would - // skip the Run path entirely and leave pb.runCalls at 0. In a correct - // implementation B's swap only starts after A has drained; in the - // current implementation it has already started — either way runStarted - // fires. - <-pb.runStarted - pb.markReady() - - select { - case <-done2: - case <-time.After(2 * time.Second): - t.Fatal("r2 did not complete after B marked ready") - } - select { - case <-done3: - case <-time.After(2 * time.Second): - t.Fatal("r3 did not complete") - } - - if w1.Code != http.StatusOK || w2.Code != http.StatusOK || w3.Code != http.StatusOK { - t.Fatalf("statuses: w1=%d w2=%d w3=%d", w1.Code, w2.Code, w3.Code) - } - if w1.Body.String() != "ok:a" { - t.Errorf("r1 body=%q want ok:a", w1.Body.String()) - } - if w3.Body.String() != "ok:a" { - t.Errorf("r3 body=%q want ok:a (r3 must be served by A)", w3.Body.String()) - } - if w2.Body.String() != "ok:b" { - t.Errorf("r2 body=%q want ok:b", w2.Body.String()) - } - if a.stoppedWhileServing.Load() { - t.Errorf("A.Stop was called while A was still handling a request — the router swapped out a busy process") - } -} - func TestBaseRouter_ModelNotFound(t *testing.T) { a := newFakeProcess("a") b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{}) diff --git a/internal/router/design.md b/internal/router/design.md new file mode 100644 index 00000000..62be1c6e --- /dev/null +++ b/internal/router/design.md @@ -0,0 +1,404 @@ +# Router design + +A developer tutorial for the `internal/router` package and its `scheduler` +sub-package. + +## Intro + +A llama-swap router is the component that sits behind the proxy and answers one +question for every incoming request: _can this model serve right now, and if +not, what has to happen first?_ Answering it means juggling three concerns that +used to live tangled together in one type: + +1. **Process machinery** — owning the OS processes, starting and stopping them, + running health checks, and shuttling HTTP requests onto the right upstream. +2. **Scheduling strategy** — the queue, in-flight bookkeeping, and the decision + tree that turns one request into "serve now", "join an existing swap", + "queue", or "start a swap". +3. **Eviction policy** — given a model we want to load, which currently-running + models have to be stopped to make room? + +The design pulls those three apart into separate, independently replaceable +pieces: + +| Concern | Type | Lives in | +| ------------------- | ------------------------------ | ------------------------------- | +| Process machinery | `baseRouter` | `internal/router/base.go` | +| Scheduling strategy | `scheduler.Scheduler` (`FIFO`) | `internal/router/scheduler/` | +| Eviction policy | `scheduler.Swapper` | `groupSwapper`, `matrixSwapper` | + +`baseRouter` keeps the channels, run loop, process lifecycle, and shutdown +teardown, and exposes the side-effects a scheduler needs through the +`scheduler.Effects` interface. The scheduler owns the queue and decision tree +but performs no side-effects directly — it calls back through `Effects`. The +`Swapper` is a pure function from "target model + currently running" to "models +to evict", and knows nothing about queues, channels, or processes. + +Because the seams are interfaces, you can replace the scheduling strategy +without touching process management, or write a new eviction policy without +touching either. `FIFO` is the first and currently only `Scheduler`; +`groupSwapper` and `matrixSwapper` are the two `Swapper`s. + +## Key concepts + +### One run loop, no locks + +`baseRouter.run()` is a single goroutine selecting over a handful of channels: + +```go +for { + select { + case req := <-b.shutdownCh: b.handleShutdown(req); return + case req := <-b.handlerCh: b.schedule.OnRequest(req) + case req := <-b.unloadCh: b.schedule.OnUnload(req.targets, req.timeout); close(req.respond) + case ev := <-b.swapDoneCh: b.schedule.OnSwapDone(ev) + case ev := <-b.serveDoneCh: b.schedule.OnServeDone(ev) + } +} +``` + +Every `Scheduler` method runs on this one goroutine. That is the single most +important fact about the design: **the scheduler never needs a mutex for its own +state**. All scheduler state is touched only from these callbacks, which are +serialized by the run loop. If you write a new scheduler, you get the same +guarantee for free — and you must not break it by spinning up goroutines that +mutate scheduler state. + +### Events flow in, side-effects flow out + +The run loop turns external happenings into method calls on the scheduler: + +- A new HTTP request becomes `OnRequest(HandlerReq)`. +- A swap goroutine finishing becomes `OnSwapDone(SwapDone)`. +- A tracked request handler returning becomes `OnServeDone(ServeDoneEvent)`. +- An admin unload becomes `OnUnload(targets, timeout)`. +- Shutdown becomes `OnShutdown(err)`. + +The scheduler reacts by calling **back out** through `Effects`: inspect a +process state, start a swap, grant a response to a caller, or stop processes. It +never calls `process.Process` directly and never writes to a channel directly. +This keeps the scheduler pure enough to unit-test against a fake `Effects` with +no goroutines or real processes involved (see `scheduler/fifo_test.go`). + +``` + HTTP request admin Unload / Shutdown + │ │ + ▼ ▼ + ServeHTTP ──HandlerReq──▶ baseRouter.run() ◀──unloadCh/shutdownCh + │ (single goroutine) + ▼ + Scheduler.On*(...) + │ calls back through + ▼ + Effects: ModelState / StartSwap / + GrantServe / GrantError / StopProcesses + │ + ▼ + baseRouter side-effects: doSwap goroutine, + grant() to caller, process.Stop() + │ + swap completes ──SwapDone──▶ back into run loop +``` + +### The swap goroutine + +Scheduling decisions must be quick and non-blocking, but loading a model is +slow. The two are reconciled by doing the slow part on a separate goroutine. + +When the scheduler decides to start a swap, inside `OnRequest` it: + +1. records "a swap for X is in flight" in its own state, then +2. calls `Effects.StartSwap(modelID, evict)`. + +`StartSwap` does **not** load the model itself — it just launches a detached +goroutine (`doSwap`) and returns straight away. `doSwap` is what does the slow +work: stop the evicted processes, start the target, wait for it to become ready. +Because `StartSwap` returned immediately, `OnRequest` returns too, and the run +loop is free to pick up the next event — another request, a serve-done, an +unload — while `doSwap` runs in the background. + +The swap's eventual result comes back as just another event: when `doSwap` +finishes it posts a `SwapDone` onto `swapDoneCh`, which the run loop delivers as +`OnSwapDone`. So a slow load never blocks the run loop; it brackets it with two +quick events (`OnRequest` to start, `OnSwapDone` to finish) and everything in +between is handled normally. + +### In-flight tracking and `trackedServe` + +When the scheduler grants a request, the handler it hands back is wrapped by +`baseRouter.trackedServe`. The wrapper runs the real `ServeHTTP` and, on return, +posts a `ServeDoneEvent` so the run loop can decrement the per-model in-flight +count. This is why the scheduler can know whether a process is "busy": it counts +grants out and serve-dones in. A swap that would evict a busy process is +deferred until that process's in-flight count hits zero (`OnServeDone` then +re-drains the queue). + +The subtle contract here is `GrantServe`'s boolean return. The caller's +`Respond` channel is unbuffered, so a successful send proves the HTTP goroutine +is alive and took the handler. If the caller already disconnected, the send +fails, `trackedServe` never runs, and **no** `ServeDoneEvent` will ever arrive — +so the scheduler must only increment `inFlight` when `GrantServe` returns true. +Incrementing on a false return would strand the counter above zero and the model +could never be evicted again. + +## The interfaces + +All three live in `scheduler/scheduler.go`. + +### `Scheduler` + +```go +type Scheduler interface { + OnRequest(req HandlerReq) + OnSwapDone(ev SwapDone) + OnServeDone(ev ServeDoneEvent) + OnUnload(targets []string, timeout time.Duration) + OnShutdown(err error) +} +``` + +Owns the queue, in-flight tracking, and the decision tree. All methods run on +the run-loop goroutine, so no internal locking is needed. + +### `Swapper` + +```go +type Swapper interface { + EvictionFor(target string, running []string) []string + OnSwapStart(target string, running []string) +} +``` + +The eviction policy. `EvictionFor` is a **pure decision** — given the target and +the complete `running` set, return the running model IDs that must stop. It must +not log or mutate anything, and it does **not** inspect process state itself: +the scheduler hands it `running` already assembled (every non-stopped process, +unioned with the targets of in-flight swaps already committed but not yet +visible in process state). That keeps the swapper a pure function of its inputs, +with no reference to processes. + +The reason it must not log is that it is a _speculative_ query — "what would we +evict if we started this swap right now?" — called far more often than swaps +actually happen. The scheduler calls it once per incoming request, and then +**again for every still-queued request on every queue drain** (each `OnSwapDone`, +`OnServeDone`, and `OnUnload`). Most of those calls end in "still queued", +"collides", or "nothing to evict", not a real swap. Logging there would emit +duplicate lines for a request that simply sits in the queue, and lines for +decisions that never happen — the log would stop meaning "a swap occurred". + +`OnSwapStart` is the one place a Swapper may log, because it is called exactly +once, at the moment a swap is committed. One log line there equals one real swap, +with the evict set that is genuinely being applied — which is why `matrixSwapper` +re-solves and logs the full decision (set, DSL, cost) in `OnSwapStart` rather +than in `EvictionFor`. + +### `Effects` + +```go +type Effects interface { + ModelState(modelID string) (process.ProcessState, bool) + RunningModels() map[string]process.ProcessState + StartSwap(modelID string, evict []string) + GrantError(req HandlerReq, err error) + GrantServe(req HandlerReq, modelID string) bool + StopProcesses(timeout time.Duration, ids []string) +} +``` + +Implemented by `baseRouter`. This is the scheduler's entire window onto the +outside world; everything else about the router is hidden from it. See the +deep-dive below. + +### `Factory` — wiring it together + +```go +type Factory func(name string, logger *logmon.Monitor, eff Effects) Scheduler +``` + +`baseRouter` doesn't know which scheduler or swapper it has — it is handed a +`Factory` at construction and calls it once, passing itself as the `Effects`. +The concrete router captures its `Swapper` in the closure. From `group.go`: + +```go +swapper := &groupSwapper{ /* ... */ } +base := newBaseRouter("group", conf, processes, proxylog, + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, logger, swapper, eff) + }) +``` + +This closure is the single point where the three pieces meet: it binds a +specific `Swapper` (`swapper`) and a specific `Scheduler` (`FIFO`) to the +`baseRouter`'s `Effects` (`eff`). + +**The swapper is a separate type from the concrete router.** There are currently two router implementations router.Group and router.Matrix. Each of these has a custom swapper that implements scheduler.Swapper for custom eviction logic. This decoupling of responsibilities makes it easy to implement custom swapping strategies. + +### The events + +A single goroutine in `baseRouter.run()` owns and serializes all state changes in the router. By processing events one at a time it ensures correctness and eliminates complex mutex lock logic. + +These are the events the router currently uses: + +```go +type HandlerReq struct { // one in-flight ServeHTTP awaiting a decision + Model string + Ctx context.Context + Respond chan HandlerResp // UNBUFFERED — see GrantServe contract + PositionCh chan int // queue-position updates for the loading UI +} + +type HandlerResp struct { // the decision handed back to the caller + HandleFunc http.HandlerFunc // serve with this, or... + Err error // ...fail with this +} + +type SwapDone struct{ ModelID string; Err error } // swap goroutine finished +type ServeDoneEvent struct{ ModelID string } // tracked handler returned +``` + +## Deep-dive: the `Effects` interface and why it exists + +`Effects` is the inversion-of-control boundary that makes the split possible. +The scheduler decides and `baseRouter` _acts_. Pulling the side-effects behind this +interface buys three things: + +1. **Purity and testability.** The scheduler performs no I/O, starts no + goroutines of its own, and touches no real processes. Its tests drive the + `On*` methods directly and assert on a `fakeEffects` that just records the + calls — synchronous, deterministic, no sleeps. (`scheduler/fifo_test.go`.) +2. **A single, auditable side-effect surface.** Every externally-visible thing a + scheduler can do is one of six methods. You can reason about the whole + contract by reading one interface. +3. **Decoupling lifetime.** The scheduler never holds a `process.Process`, + never sees a channel, and never learns how shutdown teardown works. It only + knows model IDs and states. + +Method by method, as implemented in `base.go`: + +- **`ModelState(modelID) (state, ok)`** — read-only snapshot of a process's + state, and whether this router handles the model at all. The scheduler uses it + for the "unknown model" check and the "already ready" fast path. Safe to call + any time because the process map is fixed at construction and `State()` is a + snapshot. + +- **`RunningModels()`** — the state of every process that isn't stopped or shut + down. The scheduler unions its keys with its own in-flight swap targets to + build the `running` set it hands the `Swapper`, so the swapper never has to + touch process state itself. + +- **`StartSwap(modelID, evict)`** — fire-and-forget. `baseRouter` launches the + `doSwap` goroutine and returns immediately; the result comes back later as a + `SwapDone`. The scheduler records the swap as active _before_ calling this so + that requests arriving in the meantime can join it. + +- **`GrantError(req, err)`** — hand a caller an error response. Used for unknown + models, failed swaps, unloads, and shutdown. + +- **`GrantServe(req, modelID) bool`** — hand a caller the tracked handler for a + ready model, returning whether the caller was still there to receive it. The + scheduler increments the in-flight count **only on a true return** (see the + in-flight contract above). This is the one `Effects` method whose return value + carries state-machine significance. + +- **`StopProcesses(timeout, ids)`** — stop processes in parallel and **block** + until all have stopped. Used by `OnUnload` so an admin `Unload` call can + guarantee the process is dead by the time it returns. (Note `StartSwap` is + async but `StopProcesses` is sync — the difference is deliberate and tied to + the caller's expectations.) + +A useful way to hold it in your head: `Effects` is the scheduler's syscall +table. The scheduler is a pure state machine; `Effects` is how it touches the +world, and `baseRouter` is the kernel that implements those syscalls with real +goroutines, channels, and processes. + +## How to implement a new `Swapper` + +A `Swapper` is a pure decision function plus a logging hook — the easiest of the three pieces to replace. + +1. **Write the swapper type** and give it whatever config it needs to make a + decision. It does **not** need the process map — the scheduler supplies the + running set as an argument. `groupSwapper` holds only its group config; + `matrixSwapper` holds only its solver and logger: + + ```go + type mySwapper struct { + config config.Config + } + ``` + +2. **Implement `EvictionFor(target, running)`** as a _pure_ decision: + - `running` is the complete live set, already assembled for you: every + non-stopped process unioned with the targets of in-flight swaps the + scheduler has committed to. You don't filter process state or fold in + in-flight targets yourself, that's the scheduler's job. Just decide against the slice you're handed. + - Return the list of model IDs in `running` that must stop for `target` to + run. Return `nil`/empty when nothing needs evicting. + - Do **not** mutate state here. + - Do **not** log here. It can be called multiple times per request. Since it is pure function have tests verify the expected behaviour. + +3. **Implement `OnSwapStart(target, running)`** — called once when a swap + actually begins, with the same `running` set `EvictionFor` saw. This is the + right place to log: one call equals one real swap. `matrixSwapper` re-solves + and logs the chosen set and cost here; `groupSwapper` logs nothing. + +4. **Wire it in** by instantiating the swapper in your router's constructor and + capturing it in the `Factory` closure passed to `newBaseRouter` — exactly as + `NewGroup` and `NewMatrix` do. The router struct itself only ever embeds + `*baseRouter`; the swapper reaches the scheduler solely through that closure. + +Reference implementations: `groupSwapper` (static group config) in `group.go` +and `matrixSwapper` (cost-based set solver) in `matrix.go`. + +## How to implement a new `Scheduler` + +Replacing the scheduler means taking over the queue and the entire decision tree. Read `scheduler/fifo.go` end to end first — it is the reference implementation and the rules below are easiest to understand in context. + +The rules you must honour: + +- **Single goroutine.** Every method runs on the `baseRouter.run()` goroutine. Keep your state in plain maps/slices and never read or write it from another goroutine. If you need slow work done, hand it to `Effects.StartSwap` and react to the resulting `SwapDone` — do not block a method waiting for it. + +- **Never block the run loop.** `OnRequest`, `OnSwapDone`, and `OnServeDone` must make a decision and return. The one method allowed to block is `OnUnload`, and only because it must wait on the synchronous `StopProcesses` so the admin caller's guarantee holds. + +- **Respect the `GrantServe` boolean.** Only count a request as in-flight when `GrantServe` returns true (see the in-flight contract above). A false return means the caller is gone; no `ServeDoneEvent` will ever arrive, so incrementing on false permanently strands the counter. + +- **Account for in-flight swaps in your running set.** When you call `Swapper.EvictionFor`, the running set you pass must include not just live processes (`Effects.RunningModels`) but also the targets of swaps you've already started that aren't yet visible in process state — otherwise the swapper contradicts decisions already in motion. + +What each method must do: + +- **`OnRequest(req)`** — every request must resolve to exactly one of: granted, errored, joined (piggybacks an in-flight swap), queued, or swap-started. No request may be silently dropped. + +- **`OnSwapDone(ev)`** — deliver the result to every waiter that joined this swap (grant on success, error on `ev.Err`), drop the swap from active tracking, then re-examine anything queued — a finished swap may have unblocked it. + +- **`OnServeDone(ev)`** — decrement the model's in-flight count; when it hits zero, re-examine the queue. Do **not** clear in-flight counts by hand; the handlers post their own `ServeDoneEvent`s on return. + +- **`OnUnload(targets, timeout)`** — error out any waiters or queued requests for the unloaded models, call `Effects.StopProcesses` (synchronously — the admin caller relies on the process being dead afterwards), then re-examine the queue. + +- **`OnShutdown(err)`** — error out every waiter you still hold (active swap waiters and queued requests). Don't touch processes; teardown is `baseRouter`'s job. + +Expose a constructor matching the `Factory` shape: + +```go +func NewMyScheduler(name string, logger *logmon.Monitor, swapper Swapper, eff Effects) *MyScheduler { + // ... +} + +// in the concrete router: +base := newBaseRouter(name, conf, processes, proxylog, + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewMyScheduler(name, logger, swapper, eff) + }) +``` + +## Testing + +- **Schedulers** are tested as pure state machines in the `scheduler` package: + drive the `On*` methods directly against a `fakeEffects` and assert on the + recorded grants/starts/stops. No goroutines, no sleeps. See + `scheduler/fifo_test.go` as the reference; follow the `TestSchedulerName_` + naming convention. +- **`baseRouter` mechanism** (run loop, `grant`/`ServeHTTP`, `Unload`, + `Shutdown`) is tested in `base_test.go`. The run loop exposes a + `testProcessed` channel so tests can wait for an event to be fully processed + instead of sleeping. +- Run new tests with `go test -v -run TestMyScheduler_... ./internal/router/scheduler/`, + then `make test-dev` for a quick `go test` + `staticcheck` pass over `proxy/`. diff --git a/internal/router/group.go b/internal/router/group.go index fef2739f..18491701 100644 --- a/internal/router/group.go +++ b/internal/router/group.go @@ -6,6 +6,7 @@ import ( "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) type Group struct { @@ -14,7 +15,7 @@ type Group struct { func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group, error) { modelToGroup := make(map[string]string) - for gid, gcfg := range conf.Groups { + for gid, gcfg := range conf.Routing.Router.Settings.Groups { for _, mid := range gcfg.Members { if existing, dup := modelToGroup[mid]; dup { return nil, fmt.Errorf("model %q is in multiple groups: %q and %q", mid, existing, gid) @@ -23,14 +24,16 @@ func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group } } - planner := &groupPlanner{ + swapper := &groupSwapper{ config: conf, modelToGroup: modelToGroup, } processes := make(map[string]process.Process, len(modelToGroup)) - base := newBaseRouter("group", conf, processes, planner, proxylog) - planner.processes = processes + base := newBaseRouter("group", conf, processes, proxylog, + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff) + }) for mid := range modelToGroup { modelCfg, _, ok := conf.FindConfig(mid) @@ -54,21 +57,20 @@ func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group return g, nil } -// groupPlanner decides evictions from static group configuration. +// groupSwapper decides evictions from static group configuration. // // Same-group siblings are stopped when the group has swap=true. Cross-group // members are stopped only when the target's group is exclusive; loading a // model from a non-exclusive group leaves running exclusive groups alone, // matching the gotcha in the original ProcessGroup behaviour. -type groupPlanner struct { +type groupSwapper struct { config config.Config modelToGroup map[string]string - processes map[string]process.Process } -func (p *groupPlanner) EvictionFor(target string, alsoRunning []string) []string { +func (p *groupSwapper) EvictionFor(target string, running []string) []string { tg := p.modelToGroup[target] - tgCfg := p.config.Groups[tg] + tgCfg := p.config.Routing.Router.Settings.Groups[tg] seen := make(map[string]struct{}) var result []string @@ -89,24 +91,17 @@ func (p *groupPlanner) EvictionFor(target string, alsoRunning []string) []string // for backwards compatibility. The newer swap matrix approach does not // have this issue. case og != tg && tgCfg.Exclusive: - if ogCfg := p.config.Groups[og]; !ogCfg.Persistent { + if ogCfg := p.config.Routing.Router.Settings.Groups[og]; !ogCfg.Persistent { seen[mID] = struct{}{} result = append(result, mID) } } } - for mID, proc := range p.processes { - st := proc.State() - if st == process.StateStopped || st == process.StateShutdown { - continue - } - consider(mID) - } - for _, mID := range alsoRunning { + for _, mID := range running { consider(mID) } return result } -func (p *groupPlanner) OnSwapStart(target string) {} +func (p *groupSwapper) OnSwapStart(target string, running []string) {} diff --git a/internal/router/group_test.go b/internal/router/group_test.go index 5ba2c42c..336cbf28 100644 --- a/internal/router/group_test.go +++ b/internal/router/group_test.go @@ -10,6 +10,7 @@ import ( "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) // newTestGroup builds a Group directly from the supplied processes and config, @@ -17,17 +18,19 @@ import ( func newTestGroup(t *testing.T, conf config.Config, processes map[string]process.Process) *Group { t.Helper() modelToGroup := make(map[string]string) - for gid, gcfg := range conf.Groups { + for gid, gcfg := range conf.Routing.Router.Settings.Groups { for _, mid := range gcfg.Members { modelToGroup[mid] = gid } } - planner := &groupPlanner{ + swapper := &groupSwapper{ config: conf, modelToGroup: modelToGroup, - processes: processes, } - base := newBaseRouter("group", conf, processes, planner, logmon.NewWriter(io.Discard)) + base := newBaseRouter("group", conf, processes, logmon.NewWriter(io.Discard), + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff) + }) base.testProcessed = make(chan struct{}, 64) g := &Group{baseRouter: base} go base.run() @@ -41,10 +44,10 @@ func newTestGroup(t *testing.T, conf config.Config, processes map[string]process func TestGroup_NewGroup_DuplicateMembership(t *testing.T) { conf := config.Config{ - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g1": {Swap: true, Members: []string{"a"}}, "g2": {Swap: true, Members: []string{"a"}}, - }, + }), Models: map[string]config.ModelConfig{ "a": {}, }, @@ -65,9 +68,9 @@ func TestGroup_ServeHTTP_SwapStopsPrevious(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g": {Swap: true, Exclusive: true, Members: []string{"a", "b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b}) @@ -97,9 +100,9 @@ func TestGroup_NonSwapGroup_NoStop(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g": {Swap: false, Exclusive: false, Members: []string{"a", "b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b}) @@ -127,10 +130,10 @@ func TestGroup_CrossGroupExclusive(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g1": {Swap: true, Exclusive: true, Members: []string{"a"}}, "g2": {Swap: true, Exclusive: true, Members: []string{"b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b}) @@ -154,10 +157,10 @@ func TestGroup_CrossGroupNonExclusiveParallel(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g1": {Swap: true, Exclusive: false, Members: []string{"a"}}, "g2": {Swap: true, Exclusive: false, Members: []string{"b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb}) @@ -202,16 +205,17 @@ func TestGroup_CrossGroupNonExclusiveParallel(t *testing.T) { // TestGroup_SameGroupSwapSerialises verifies that two same-group requests // (Swap=true) serialise even when both arrive while neither has reached -// StateStarting yet — the alsoRunning hint to the planner closes that race. +// StateStarting yet — the in-flight swap target the scheduler folds into the +// running set closes that race. func TestGroup_SameGroupSwapSerialises(t *testing.T) { a := newFakeProcess("a") pb := newFakeProcess("b") conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g": {Swap: true, Exclusive: false, Members: []string{"a", "b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb}) @@ -224,8 +228,9 @@ func TestGroup_SameGroupSwapSerialises(t *testing.T) { waitProcessed(t, g.testProcessed, 1) // Request B arrives before A transitions to StateStarting in the process - // state machine. Without the alsoRunning hint, the planner would not see - // A as running, and B would start in parallel, violating Swap=true. + // state machine. Without folding the in-flight swap target into the running + // set, the swapper would not see A as running, and B would start in + // parallel, violating Swap=true. w2 := httptest.NewRecorder() done2 := make(chan struct{}) go func() { @@ -269,10 +274,10 @@ func TestGroup_PersistentNotEvicted(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "persist": {Swap: true, Exclusive: false, Persistent: true, Members: []string{"a"}}, "other": {Swap: true, Exclusive: true, Members: []string{"b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b}) @@ -306,10 +311,10 @@ func TestGroup_NonExclusiveDoesNotUnloadExclusive(t *testing.T) { conf := config.Config{ HealthCheckTimeout: 5, - Groups: map[string]config.GroupConfig{ + Routing: groupRouting(map[string]config.GroupConfig{ "g1": {Swap: true, Exclusive: true, Members: []string{"a"}}, "g2": {Swap: true, Exclusive: false, Members: []string{"b"}}, - }, + }), } g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b}) diff --git a/internal/router/helpers_test.go b/internal/router/helpers_test.go index ce1df6bc..00fbe1a5 100644 --- a/internal/router/helpers_test.go +++ b/internal/router/helpers_test.go @@ -12,10 +12,23 @@ import ( "testing" "time" + "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" ) +// groupRouting builds a normalized RoutingConfig for the group router, mirroring +// what config.LoadConfigFromReader produces. Tests use it to populate +// config.Config.Routing without going through LoadConfig. +func groupRouting(groups map[string]config.GroupConfig) config.RoutingConfig { + return config.RoutingConfig{ + Router: config.RouterConfig{ + Use: "group", + Settings: config.RouterSettings{Groups: groups}, + }, + } +} + // fakeProcess is an in-memory implementation of process.Process used to drive // the routers through their state machine without spawning real upstreams. type fakeProcess struct { diff --git a/internal/router/matrix.go b/internal/router/matrix.go index 6f2d853a..d3812446 100644 --- a/internal/router/matrix.go +++ b/internal/router/matrix.go @@ -2,11 +2,11 @@ package router import ( "fmt" - "sort" "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) type Matrix struct { @@ -14,20 +14,23 @@ type Matrix struct { } func NewMatrix(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Matrix, error) { - if conf.Matrix == nil { + mtx := conf.Routing.Router.Settings.Matrix + if mtx == nil { return nil, fmt.Errorf("matrix router requires a matrix configuration") } - planner := &matrixPlanner{ - solver: newMatrixSolver(conf.ExpandedSets, conf.Matrix.ResolvedEvictCosts()), + swapper := &matrixSwapper{ + solver: newMatrixSolver(mtx.ExpandedSets, mtx.ResolvedEvictCosts()), logger: proxylog, } // Build a process for every model in the config. Any model can run alone // even if it is not part of a set; this mirrors proxy.NewMatrix. processes := make(map[string]process.Process, len(conf.Models)) - base := newBaseRouter("matrix", conf, processes, planner, proxylog) - planner.processes = processes + base := newBaseRouter("matrix", conf, processes, proxylog, + func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff) + }) for mid, modelCfg := range conf.Models { procLog := logmon.NewWriter(upstreamlog) @@ -45,20 +48,18 @@ func NewMatrix(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Matr return r, nil } -// matrixPlanner decides evictions by asking the matrix solver against the -// current running set. -type matrixPlanner struct { - solver *matrixSolver - processes map[string]process.Process - logger *logmon.Monitor +// matrixSwapper decides evictions by asking the matrix solver against the +// running set the scheduler hands it. +type matrixSwapper struct { + solver *matrixSolver + logger *logmon.Monitor } -func (p *matrixPlanner) EvictionFor(target string, alsoRunning []string) []string { - return p.solver.Solve(target, p.runningSet(alsoRunning)).Evict +func (p *matrixSwapper) EvictionFor(target string, running []string) []string { + return p.solver.Solve(target, running).Evict } -func (p *matrixPlanner) OnSwapStart(target string) { - running := p.runningModels() +func (p *matrixSwapper) OnSwapStart(target string, running []string) { result := p.solver.Solve(target, running) switch { case len(result.Evict) > 0: @@ -70,32 +71,3 @@ func (p *matrixPlanner) OnSwapStart(target string) { p.logger.Debugf("matrix: model=%s already running in set=%s dsl=%q", target, result.SetName, result.DSL) } } - -func (p *matrixPlanner) runningModels() []string { - return p.runningSet(nil) -} - -// runningSet returns the union of live processes (State != Stopped/Shutdown) -// and any extra IDs the baseRouter has already committed to loading but which -// the process state machine has not yet reflected. -func (p *matrixPlanner) runningSet(alsoRunning []string) []string { - seen := make(map[string]struct{}, len(p.processes)) - var running []string - for id, proc := range p.processes { - st := proc.State() - if st == process.StateStopped || st == process.StateShutdown { - continue - } - seen[id] = struct{}{} - running = append(running, id) - } - for _, id := range alsoRunning { - if _, dup := seen[id]; dup { - continue - } - seen[id] = struct{}{} - running = append(running, id) - } - sort.Strings(running) - return running -} diff --git a/internal/router/matrix_test.go b/internal/router/matrix_test.go index fb688d46..0d7a985d 100644 --- a/internal/router/matrix_test.go +++ b/internal/router/matrix_test.go @@ -10,6 +10,7 @@ import ( "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" ) // newTestMatrix builds a Matrix router from supplied processes, bypassing @@ -17,12 +18,14 @@ import ( func newTestMatrix(t *testing.T, conf config.Config, expanded []config.ExpandedSet, evictCosts map[string]int, processes map[string]process.Process) *Matrix { t.Helper() logger := logmon.NewWriter(io.Discard) - planner := &matrixPlanner{ - solver: newMatrixSolver(expanded, evictCosts), - processes: processes, - logger: logger, + swapper := &matrixSwapper{ + solver: newMatrixSolver(expanded, evictCosts), + logger: logger, } - base := newBaseRouter("matrix", conf, processes, planner, logger) + base := newBaseRouter("matrix", conf, processes, logger, + func(name string, l *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler { + return scheduler.NewFIFO(name, l, swapper, conf.Routing.Scheduler.Settings.Fifo, eff) + }) base.testProcessed = make(chan struct{}, 64) r := &Matrix{baseRouter: base} go base.run() @@ -153,8 +156,8 @@ func TestMatrix_CoexistingSetParallel(t *testing.T) { // TestMatrix_IncompatibleQueues verifies that the second request for a model // that cannot coexist with the in-flight first model queues until the first -// completes, and then evicts it. This exercises the alsoRunning hint via the -// matrix solver's union into runningSet. +// completes, and then evicts it. This exercises the scheduler folding in-flight +// swap targets into the running set it hands the swapper. func TestMatrix_IncompatibleQueues(t *testing.T) { a := newFakeProcess("a") pb := newFakeProcess("b") @@ -173,8 +176,9 @@ func TestMatrix_IncompatibleQueues(t *testing.T) { }() waitProcessed(t, r.testProcessed, 1) - // B arrives before A transitions to StateStarting. The solver sees A via - // alsoRunning and returns evict=[a], so collidesWith forces B to queue. + // B arrives before A transitions to StateStarting. The running set the + // scheduler builds includes A (an in-flight swap target), so the solver + // returns evict=[a] and collidesWith forces B to queue. w2 := httptest.NewRecorder() done2 := make(chan struct{}) go func() { diff --git a/internal/router/router.go b/internal/router/router.go index 4bc8d8c9..8ebc1f11 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -13,6 +13,7 @@ import ( "github.com/mostlygeek/llama-swap/internal/config" "github.com/mostlygeek/llama-swap/internal/logmon" "github.com/mostlygeek/llama-swap/internal/process" + "github.com/mostlygeek/llama-swap/internal/router/scheduler" "github.com/tidwall/gjson" ) @@ -31,7 +32,7 @@ var ( ErrNoModelInContext = fmt.Errorf("no model in request context") ErrNoRouterFound = fmt.Errorf("no router found for model") ErrNoPeerModelFound = fmt.Errorf("peer model not found") - ErrNoLocalModelFound = fmt.Errorf("local model not found") + ErrNoLocalModelFound = scheduler.ErrModelNotFound ContextKey = &contextkey{"context"} ) diff --git a/internal/router/scheduler/fifo.go b/internal/router/scheduler/fifo.go new file mode 100644 index 00000000..d5159e7f --- /dev/null +++ b/internal/router/scheduler/fifo.go @@ -0,0 +1,411 @@ +package scheduler + +import ( + "fmt" + "sort" + "time" + + "github.com/mostlygeek/llama-swap/internal/config" + "github.com/mostlygeek/llama-swap/internal/logmon" + "github.com/mostlygeek/llama-swap/internal/process" +) + +// activeSwap tracks one in-flight swap and the callers waiting on it. +type activeSwap struct { + modelID string + evict []string + waiters []HandlerReq +} + +// FIFO is the default scheduler. Requests are handled in a first-in, first-out order. +// To reduce swapping requests for a model that is already running will be handled +// immediately by the running process. +// +// Requests into this schedule are handled like this: +// +// A B C A B C --> A A B B C C +// +// The strategy is simple and reduces the number of swaps required. +type FIFO struct { + name string + logger *logmon.Monitor + planner Swapper + cfg config.FifoConfig + effects Effects + + active map[string]*activeSwap + inFlight map[string]int + queued []HandlerReq +} + +// NewFIFO builds a FIFO scheduler. It matches scheduler.Factory once a planner +// is captured in a closure. +func NewFIFO(name string, logger *logmon.Monitor, planner Swapper, cfg config.FifoConfig, eff Effects) *FIFO { + return &FIFO{ + name: name, + logger: logger, + planner: planner, + cfg: cfg, + effects: eff, + active: make(map[string]*activeSwap), + inFlight: make(map[string]int), + } +} + +// OnRequest decides what to do with one incoming ServeHTTP request. It never +// blocks indefinitely: any work that has to wait (starting a process, stopping +// siblings, waiting for ready) is deferred to a swap goroutine and reported back +// via OnSwapDone. +// +// The decision tree, in order: +// +// 1. Unknown model — respond with ErrModelNotFound and move on. +// 2. A swap to the same model is already in flight — attach this waiter so +// one swap serves all callers that asked for the same model. +// 3. Fast path — the target process is already ready, the planner sees +// nothing to evict, and no in-flight swap is evicting it. Hand back its +// ServeHTTP immediately. +// 4. Would collide with an in-flight swap (we'd stop their target, or they're +// stopping us) — park in the queue for OnSwapDone to drain. +// 5. Would evict a process that is still handling requests — park in the +// queue. OnServeDone will retry when the busy process drains. +// 6. Otherwise — start a new swap. This may run in parallel with other active +// swaps when their evict sets don't intersect. +func (s *FIFO) OnRequest(req HandlerReq) { + // (1) Unknown model. + state, ok := s.effects.ModelState(req.Model) + if !ok { + s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model) + s.effects.GrantError(req, ErrModelNotFound) + return + } + + // (2) Join an in-flight swap for the same model. + if sw, ok := s.active[req.Model]; ok { + s.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", s.name, req.Model, len(sw.waiters)+1) + sw.waiters = append(sw.waiters, req) + return + } + + running := s.runningSet(req.Model) + evict := s.planner.EvictionFor(req.Model, running) + + // (3) Fast path: ready, nothing to evict, and nobody is evicting us. + if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) { + s.logger.Debugf("%s: fast-path serving model %s (already ready)", s.name, req.Model) + s.grantHandler(req, req.Model) + return + } + + // (4) Collision with an in-flight swap — queue. + if collidesWith(req.Model, evict, s.active) { + s.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", s.name, req.Model) + s.enqueue(req) + return + } + + // (5) Would evict a busy process — queue until it drains. + if conflictsWithInFlight(evict, s.inFlight) { + s.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", s.name, req.Model) + s.enqueue(req) + return + } + + // (6) Start a new (possibly parallel) swap. + s.logger.Debugf("%s: starting swap for model %s, evicting %v", s.name, req.Model, evict) + s.startSwap(req, evict, running) +} + +// OnSwapDone fans the result out to every waiter that joined this swap, removes +// the swap from the active map, then walks the queue once, promoting any items +// that no longer collide with the remaining active set. FIFO order is preserved: +// items still blocked stay in place. +func (s *FIFO) OnSwapDone(ev SwapDone) { + sw, ok := s.active[ev.ModelID] + if !ok { + return + } + delete(s.active, ev.ModelID) + + for _, w := range sw.waiters { + if ev.Err != nil { + s.effects.GrantError(w, ev.Err) + } else { + s.grantHandler(w, ev.ModelID) + } + } + + s.drainQueue() +} + +// OnServeDone decrements the per-model in-flight count and, when that drops to +// zero, retries the queue: requests whose swap was deferred because they would +// have evicted this (now-idle) process can now proceed. +func (s *FIFO) OnServeDone(ev ServeDoneEvent) { + s.inFlight[ev.ModelID]-- + if s.inFlight[ev.ModelID] <= 0 { + delete(s.inFlight, ev.ModelID) + s.drainQueue() + } +} + +// OnUnload reconciles router-owned state with the impending Stop, performs the +// Stop (synchronously, via Effects) so callers of Unload remain blocked until +// each targeted process has exited, then drains the queue. +func (s *FIFO) OnUnload(targets []string, timeout time.Duration) { + unloadErr := fmt.Errorf("%s: model unloaded", s.name) + + targetSet := make(map[string]bool, len(targets)) + for _, id := range targets { + targetSet[id] = true + } + + // Release waiters of any in-flight swap whose target is being unloaded. + // The swap goroutine itself is left to finish on its own; when its + // SwapDone arrives, OnSwapDone will find no entry in active and drop it. + for id := range targetSet { + sw, ok := s.active[id] + if !ok { + continue + } + for _, w := range sw.waiters { + s.effects.GrantError(w, unloadErr) + } + delete(s.active, id) + } + + // Drop queued requests addressed to unloaded models. Requests for other + // models stay queued and may benefit from drainQueue at the end. + if len(s.queued) > 0 { + kept := s.queued[:0] + for _, w := range s.queued { + if targetSet[w.Model] { + s.effects.GrantError(w, unloadErr) + continue + } + kept = append(kept, w) + } + s.queued = kept + } + + // Stop the targeted processes. Done synchronously so Unload's caller can + // rely on "after Unload returns, the process is stopped". inFlight is + // intentionally NOT cleared here: each dying handler will fire its tracked + // serve and reach OnServeDone in the normal way. + s.effects.StopProcesses(timeout, targets) + + // Removing entries from active above may have unblocked queued requests + // that previously collided with the now-cancelled swaps. + s.drainQueue() +} + +// OnShutdown grants err to every waiter still held by the scheduler. +func (s *FIFO) OnShutdown(err error) { + for _, sw := range s.active { + for _, w := range sw.waiters { + s.effects.GrantError(w, err) + } + } + for _, w := range s.queued { + s.effects.GrantError(w, err) + } +} + +// grantHandler hands the caller a tracked handler for modelID and, only if the +// caller was still there to receive it, bumps the in-flight count. Incrementing +// when the grant failed would strand the counter and block future evictions. +func (s *FIFO) grantHandler(req HandlerReq, modelID string) { + if s.effects.GrantServe(req, modelID) { + s.inFlight[modelID]++ + } +} + +// startSwap records the swap as active and launches it via Effects. running is +// the set EvictionFor saw, forwarded to OnSwapStart so the planner logs against +// the same picture it decided on. +func (s *FIFO) startSwap(initial HandlerReq, evict, running []string) { + s.active[initial.Model] = &activeSwap{ + modelID: initial.Model, + evict: evict, + waiters: []HandlerReq{initial}, + } + s.planner.OnSwapStart(initial.Model, running) + s.effects.StartSwap(initial.Model, evict) +} + +// enqueue inserts req into the queue in priority order: it goes just before the +// first queued item whose priority is strictly lower, so higher-priority models +// are serviced first while equal-priority requests keep their arrival (FIFO) +// order. Priorities come from the FifoConfig; unlisted models default to 0. +func (s *FIFO) enqueue(req HandlerReq) { + p := s.cfg.Priority[req.Model] + i := len(s.queued) + for j, q := range s.queued { + if s.cfg.Priority[q.Model] < p { + i = j + break + } + } + s.queued = append(s.queued, HandlerReq{}) + copy(s.queued[i+1:], s.queued[i:]) + s.queued[i] = req + broadcastQueuePositions(s.queued) +} + +// drainQueue walks the queued requests in order, re-running the OnRequest +// decision tree against the (now smaller) active set. Items that can now start +// or join become satisfied; items still blocked remain queued in original order +// so they get another chance on the next swap completion. +func (s *FIFO) drainQueue() { + if len(s.queued) == 0 { + return + } + pending := s.queued + var remaining []HandlerReq + for _, req := range pending { + state, ok := s.effects.ModelState(req.Model) + if !ok { + s.effects.GrantError(req, ErrModelNotFound) + continue + } + if sw, ok := s.active[req.Model]; ok { + s.logger.Debugf("%s: queued request for model %s now joining in-flight swap", s.name, req.Model) + sw.waiters = append(sw.waiters, req) + continue + } + running := s.runningSet(req.Model) + evict := s.planner.EvictionFor(req.Model, running) + if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) { + s.logger.Debugf("%s: queued request for model %s now served fast-path", s.name, req.Model) + s.grantHandler(req, req.Model) + continue + } + if collidesWith(req.Model, evict, s.active) { + remaining = append(remaining, req) + continue + } + if conflictsWithInFlight(evict, s.inFlight) { + remaining = append(remaining, req) + continue + } + s.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", s.name, req.Model, evict) + s.startSwap(req, evict, running) + } + s.queued = remaining + broadcastQueuePositions(s.queued) +} + +// runningSet is the live model set handed to the Swapper: every process the +// baseRouter reports as running, unioned with the targets of in-flight swaps +// (excluding excludeActive, the model whose own swap is being decided — its +// in-flight entry must not count as "already running"). The result is sorted so +// eviction decisions derived from it are deterministic. +func (s *FIFO) runningSet(excludeActive string) []string { + seen := make(map[string]struct{}) + var out []string + add := func(id string) { + if _, dup := seen[id]; dup { + return + } + seen[id] = struct{}{} + out = append(out, id) + } + for id := range s.effects.RunningModels() { + add(id) + } + for _, id := range activeTargets(s.active, excludeActive) { + add(id) + } + sort.Strings(out) + return out +} + +// activeTargets returns the IDs of every in-flight swap target except exclude. +// The planner uses this to account for models committed to but not yet reflected +// in process state. +func activeTargets(active map[string]*activeSwap, exclude string) []string { + if len(active) == 0 { + return nil + } + out := make([]string, 0, len(active)) + for id := range active { + if id == exclude { + continue + } + out = append(out, id) + } + return out +} + +// collidesWith reports whether a new swap with this target and evict set can +// safely run alongside the currently active swaps. Same-target callers should +// JOIN (handled before this) — they do not collide with themselves. +func collidesWith(target string, evict []string, active map[string]*activeSwap) bool { + for id, sw := range active { + if id == target { + continue + } + if containsString(evict, id) { + return true + } + if containsString(sw.evict, target) { + return true + } + if slicesOverlap(evict, sw.evict) { + return true + } + } + return false +} + +// slicesOverlap reports whether xs and ys share any common element. +func slicesOverlap(xs, ys []string) bool { + for _, x := range xs { + if containsString(ys, x) { + return true + } + } + return false +} + +// conflictsWithInFlight reports whether any model in evict is still handling +// requests. Stopping a busy process would cancel its callers' connections, so +// the scheduler defers the swap until those callers finish. +func conflictsWithInFlight(evict []string, inFlight map[string]int) bool { + for _, m := range evict { + if inFlight[m] > 0 { + return true + } + } + return false +} + +func containsString(xs []string, s string) bool { + for _, x := range xs { + if x == s { + return true + } + } + return false +} + +// broadcastQueuePositions sends each queued request its current 1-indexed +// position. Sends are non-blocking: if the channel is full, the old value is +// drained first so the consumer always sees the latest position. +func broadcastQueuePositions(queued []HandlerReq) { + for i, req := range queued { + pos := i + 1 + select { + case req.PositionCh <- pos: + default: + select { + case <-req.PositionCh: + default: + } + select { + case req.PositionCh <- pos: + default: + } + } + } +} diff --git a/internal/router/scheduler/fifo_test.go b/internal/router/scheduler/fifo_test.go new file mode 100644 index 00000000..54f71987 --- /dev/null +++ b/internal/router/scheduler/fifo_test.go @@ -0,0 +1,537 @@ +package scheduler + +import ( + "errors" + "io" + "testing" + "time" + + "github.com/mostlygeek/llama-swap/internal/config" + "github.com/mostlygeek/llama-swap/internal/logmon" + "github.com/mostlygeek/llama-swap/internal/process" +) + +// FIFO methods all run on the router's single run-loop goroutine, so these +// tests drive them directly and synchronously. A swap is "completed" by calling +// OnSwapDone, a served request "finishes" by calling OnServeDone — exactly the +// events the run loop would deliver. fakeEffects records every side-effect and +// stubPlanner supplies a fixed eviction set per target. + +// stubPlanner returns a fixed eviction list per target. +type stubPlanner struct { + evict map[string][]string +} + +func (s *stubPlanner) EvictionFor(target string, _ []string) []string { + if s.evict == nil { + return nil + } + return s.evict[target] +} + +func (s *stubPlanner) OnSwapStart(string, []string) {} + +// grantRec is one GrantError / GrantServe call. err!=nil marks an error grant; +// otherwise it is a serve grant and serve reports whether the caller received it. +type grantRec struct { + model string + err error + serve bool +} + +type startRec struct { + model string + evict []string +} + +type stopRec struct { + timeout time.Duration + ids []string +} + +// fakeEffects is an in-memory scheduler.Effects. Tests program process states +// and GrantServe outcomes, then assert on the recorded calls. +type fakeEffects struct { + states map[string]process.ProcessState // model -> state; missing => not handled + serveResult map[string]bool // GrantServe return per model (default true) + + starts []startRec + grants []grantRec + stops []stopRec +} + +func newFakeEffects() *fakeEffects { + return &fakeEffects{ + states: map[string]process.ProcessState{}, + serveResult: map[string]bool{}, + } +} + +func (f *fakeEffects) ModelState(modelID string) (process.ProcessState, bool) { + st, ok := f.states[modelID] + return st, ok +} + +func (f *fakeEffects) RunningModels() map[string]process.ProcessState { + out := make(map[string]process.ProcessState) + for id, st := range f.states { + if st == process.StateStopped || st == process.StateShutdown { + continue + } + out[id] = st + } + return out +} + +func (f *fakeEffects) StartSwap(modelID string, evict []string) { + f.starts = append(f.starts, startRec{model: modelID, evict: evict}) +} + +func (f *fakeEffects) GrantError(req HandlerReq, err error) { + f.grants = append(f.grants, grantRec{model: req.Model, err: err}) +} + +func (f *fakeEffects) GrantServe(req HandlerReq, modelID string) bool { + ok := true + if v, set := f.serveResult[modelID]; set { + ok = v + } + f.grants = append(f.grants, grantRec{model: modelID, serve: ok}) + return ok +} + +func (f *fakeEffects) StopProcesses(timeout time.Duration, ids []string) { + f.stops = append(f.stops, stopRec{timeout: timeout, ids: ids}) +} + +// served counts grants that handed modelID a handler and were received. +func (f *fakeEffects) served(modelID string) int { + n := 0 + for _, g := range f.grants { + if g.err == nil && g.serve && g.model == modelID { + n++ + } + } + return n +} + +// errored counts error grants, optionally filtered by model ("" = any). +func (f *fakeEffects) errored(model string) int { + n := 0 + for _, g := range f.grants { + if g.err != nil && (model == "" || g.model == model) { + n++ + } + } + return n +} + +// startsFor counts StartSwap calls for modelID. +func (f *fakeEffects) startsFor(modelID string) int { + n := 0 + for _, s := range f.starts { + if s.model == modelID { + n++ + } + } + return n +} + +func newFIFO(planner Swapper, eff Effects) *FIFO { + return NewFIFO("test", logmon.NewWriter(io.Discard), planner, config.FifoConfig{}, eff) +} + +func req(model string) HandlerReq { return HandlerReq{Model: model} } + +func TestFIFO_FastPath(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateReady + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("a")) + + if got := eff.startsFor("a"); got != 0 { + t.Errorf("StartSwap calls=%d want 0 (fast path should not swap)", got) + } + if got := eff.served("a"); got != 1 { + t.Errorf("served(a)=%d want 1", got) + } +} + +func TestFIFO_ModelNotFound(t *testing.T) { + eff := newFakeEffects() // no states => model unknown + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("ghost")) + + if got := len(eff.starts); got != 0 { + t.Errorf("StartSwap calls=%d want 0", got) + } + if eff.errored("ghost") != 1 { + t.Fatalf("want 1 error grant for ghost, grants=%+v", eff.grants) + } + if !errors.Is(eff.grants[0].err, ErrModelNotFound) { + t.Errorf("err=%v want ErrModelNotFound", eff.grants[0].err) + } +} + +func TestFIFO_OnDemandStartThenServe(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("a")) + if got := eff.startsFor("a"); got != 1 { + t.Fatalf("StartSwap(a)=%d want 1", got) + } + if got := eff.served("a"); got != 0 { + t.Errorf("served(a)=%d want 0 before swap completes", got) + } + + // Swap finishes, model is now ready. + eff.states["a"] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: "a"}) + + if got := eff.served("a"); got != 1 { + t.Errorf("served(a)=%d want 1 after swap done", got) + } +} + +func TestFIFO_JoinInFlightSwap(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("a")) // starts swap + s.OnRequest(req("a")) // joins + s.OnRequest(req("a")) // joins + + if got := eff.startsFor("a"); got != 1 { + t.Fatalf("StartSwap(a)=%d want 1 (all three share one swap)", got) + } + + eff.states["a"] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: "a"}) + + if got := eff.served("a"); got != 3 { + t.Errorf("served(a)=%d want 3 (one swap serves all waiters)", got) + } +} + +func TestFIFO_SwapDoneError_FailsAllWaiters(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("a")) + s.OnRequest(req("a")) + + s.OnSwapDone(SwapDone{ModelID: "a", Err: errors.New("boom")}) + + if eff.served("a") != 0 { + t.Errorf("served(a)=%d want 0 on swap error", eff.served("a")) + } + if eff.errored("a") != 2 { + t.Errorf("errored(a)=%d want 2 (both waiters fail)", eff.errored("a")) + } +} + +// TestFIFO_QueueOnEvictionCollision covers a request whose target evicts the +// model currently being swapped: it must queue until that swap finishes AND its +// served request drains, because starting it would stop a busy process. +func TestFIFO_QueueOnEvictionCollision(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + // Loading b evicts a. + s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff) + + s.OnRequest(req("a")) // StartSwap(a) + s.OnRequest(req("b")) // collides with a's in-flight swap -> queue + if got := eff.startsFor("b"); got != 0 { + t.Fatalf("b started early: StartSwap(b)=%d want 0", got) + } + + // a becomes ready and is granted (now serving, inFlight[a]=1). + eff.states["a"] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: "a"}) + if got := eff.startsFor("b"); got != 0 { + t.Fatalf("b started while a is serving: StartSwap(b)=%d want 0", got) + } + + // a's request finishes -> a no longer in-flight -> b may now swap. + s.OnServeDone(ServeDoneEvent{ModelID: "a"}) + if got := eff.startsFor("b"); got != 1 { + t.Fatalf("StartSwap(b)=%d want 1 after a drained", got) + } + if got := eff.starts[len(eff.starts)-1].evict; len(got) != 1 || got[0] != "a" { + t.Errorf("b swap evict=%v want [a]", got) + } +} + +// TestFIFO_DisjointSwapsRunInParallel verifies two requests with +// non-conflicting evict sets both start without waiting for each other. +func TestFIFO_DisjointSwapsRunInParallel(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + s := newFIFO(&stubPlanner{}, eff) // empty evicts + + s.OnRequest(req("a")) + s.OnRequest(req("b")) + + if eff.startsFor("a") != 1 || eff.startsFor("b") != 1 { + t.Fatalf("StartSwap a=%d b=%d want 1 each (parallel)", eff.startsFor("a"), eff.startsFor("b")) + } +} + +// TestFIFO_OverlappingEvictSetsDoNotRunInParallel verifies two swaps with +// different targets that evict the *same* model do not run concurrently: the +// second must queue rather than double-evict the shared model. Neither target is +// in the other's evict set, so this is only caught by the evict-set overlap +// check in collidesWith. +func TestFIFO_OverlappingEvictSetsDoNotRunInParallel(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + eff.states["x"] = process.StateReady // shared eviction target, running + // Loading a or b both require evicting x. + s := newFIFO(&stubPlanner{evict: map[string][]string{"a": {"x"}, "b": {"x"}}}, eff) + + s.OnRequest(req("a")) // StartSwap(a, [x]) + s.OnRequest(req("b")) // overlaps a's evict set ([x]) -> queue + if eff.startsFor("a") != 1 { + t.Fatalf("StartSwap(a)=%d want 1", eff.startsFor("a")) + } + if got := eff.startsFor("b"); got != 0 { + t.Fatalf("b started in parallel while a evicts x: StartSwap(b)=%d want 0", got) + } + + // a's swap completes and x is gone; b can now evict nothing and start. + eff.states["a"] = process.StateReady + eff.states["x"] = process.StateStopped + s.OnSwapDone(SwapDone{ModelID: "a"}) + if got := eff.startsFor("b"); got != 1 { + t.Fatalf("StartSwap(b)=%d want 1 after a's swap drained", got) + } +} + +// TestFIFO_QueueDrainPromotesMultiple verifies completing one swap unblocks +// every queued request that no longer collides — they all start together. +func TestFIFO_QueueDrainPromotesMultiple(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + eff.states["c"] = process.StateStopped + // a's swap evicts both b and c; b and c evict nothing. + s := newFIFO(&stubPlanner{evict: map[string][]string{"a": {"b", "c"}}}, eff) + + s.OnRequest(req("a")) // StartSwap(a, [b,c]) + s.OnRequest(req("b")) // collides (in a's evict set) -> queue + s.OnRequest(req("c")) // collides -> queue + if eff.startsFor("b") != 0 || eff.startsFor("c") != 0 { + t.Fatalf("b/c started early") + } + + eff.states["a"] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: "a"}) + + // b and c have empty evict sets and don't evict a, so both start now. + if eff.startsFor("b") != 1 || eff.startsFor("c") != 1 { + t.Fatalf("StartSwap b=%d c=%d want 1 each after a done", eff.startsFor("b"), eff.startsFor("c")) + } + if eff.served("a") != 1 { + t.Errorf("served(a)=%d want 1", eff.served("a")) + } +} + +// TestFIFO_QueueCollation verifies duplicate requests collapse into one swap +// per model: the second request for each model joins the active swap (at arrival +// or at drain time) rather than triggering its own swap. +func TestFIFO_QueueCollation(t *testing.T) { + eff := newFakeEffects() + for _, id := range []string{"a", "b", "c"} { + eff.states[id] = process.StateStopped + } + // Each model evicts the other two: all swaps are mutually exclusive. + s := newFIFO(&stubPlanner{evict: map[string][]string{ + "a": {"b", "c"}, + "b": {"a", "c"}, + "c": {"a", "b"}, + }}, eff) + + for _, id := range []string{"a", "b", "c", "a", "b", "c"} { + s.OnRequest(req(id)) + } + + // Drain a, then its served requests, which promotes b; repeat for b -> c. + drain := func(model string, waiters int) { + eff.states[model] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: model}) + for i := 0; i < waiters; i++ { + s.OnServeDone(ServeDoneEvent{ModelID: model}) + } + } + drain("a", 2) + drain("b", 2) + drain("c", 2) + + for _, id := range []string{"a", "b", "c"} { + if got := eff.startsFor(id); got != 1 { + t.Errorf("StartSwap(%s)=%d want 1 (collation)", id, got) + } + if got := eff.served(id); got != 2 { + t.Errorf("served(%s)=%d want 2", id, got) + } + } +} + +// TestFIFO_NoSwapWhileServing verifies a model still handling requests is not +// evicted: the evicting request waits until every in-flight request drains. +func TestFIFO_NoSwapWhileServing(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateReady + eff.states["b"] = process.StateStopped + s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff) + + s.OnRequest(req("a")) // fast path, inFlight[a]=1 + s.OnRequest(req("a")) // fast path, inFlight[a]=2 + s.OnRequest(req("b")) // would evict busy a -> queue + if eff.startsFor("b") != 0 { + t.Fatalf("b started while a serving") + } + + s.OnServeDone(ServeDoneEvent{ModelID: "a"}) // inFlight[a]=1 + if eff.startsFor("b") != 0 { + t.Fatalf("b started while a still serving one request") + } + + s.OnServeDone(ServeDoneEvent{ModelID: "a"}) // inFlight[a]=0 + if eff.startsFor("b") != 1 { + t.Fatalf("StartSwap(b)=%d want 1 after a fully drained", eff.startsFor("b")) + } +} + +// TestFIFO_GrantServeFalseDoesNotLeakInFlight verifies that when a caller has +// walked away (GrantServe returns false) the in-flight count is not bumped, so a +// later evicting request is not blocked forever. +func TestFIFO_GrantServeFalseDoesNotLeakInFlight(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + eff.serveResult["a"] = false // a's waiter is gone by grant time + s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff) + + s.OnRequest(req("a")) + eff.states["a"] = process.StateReady + s.OnSwapDone(SwapDone{ModelID: "a"}) // grant fails, inFlight[a] stays 0 + + // b evicts a; since a is not in-flight, b should start immediately. + s.OnRequest(req("b")) + if eff.startsFor("b") != 1 { + t.Fatalf("StartSwap(b)=%d want 1 (no leaked in-flight on a)", eff.startsFor("b")) + } +} + +// TestFIFO_OnShutdown_FailsAllWaiters verifies shutdown errors every waiter the +// scheduler holds: active-swap waiters and queued requests alike. +func TestFIFO_OnShutdown_FailsAllWaiters(t *testing.T) { + eff := newFakeEffects() + for _, id := range []string{"a", "b", "c"} { + eff.states[id] = process.StateStopped + } + // a and b load in parallel; c collides with both and queues. + s := newFIFO(&stubPlanner{evict: map[string][]string{"c": {"a", "b"}}}, eff) + + s.OnRequest(req("a")) // StartSwap(a) + s.OnRequest(req("a")) // join a + s.OnRequest(req("b")) // StartSwap(b) + s.OnRequest(req("b")) // join b + s.OnRequest(req("c")) // queued + + s.OnShutdown(errors.New("shutting down")) + + if got := eff.errored(""); got != 5 { + t.Errorf("error grants=%d want 5 (2 a + 2 b + 1 c)", got) + } +} + +func TestFIFO_OnUnload_ReleasesActiveWaiters(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + s := newFIFO(&stubPlanner{}, eff) + + s.OnRequest(req("a")) // active swap a with one waiter + s.OnRequest(req("a")) // join + + s.OnUnload([]string{"a"}, time.Second) + + if got := eff.errored("a"); got != 2 { + t.Errorf("errored(a)=%d want 2 (active swap waiters released)", got) + } + if len(eff.stops) != 1 || len(eff.stops[0].ids) != 1 || eff.stops[0].ids[0] != "a" { + t.Errorf("StopProcesses=%+v want one call stopping [a]", eff.stops) + } + if eff.stops[0].timeout != time.Second { + t.Errorf("StopProcesses timeout=%v want 1s", eff.stops[0].timeout) + } +} + +func TestFIFO_OnUnload_DropsQueuedRequests(t *testing.T) { + eff := newFakeEffects() + eff.states["a"] = process.StateStopped + eff.states["b"] = process.StateStopped + // b evicts a, so a request for b queues while a is loading. + s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff) + + s.OnRequest(req("a")) // StartSwap(a) + s.OnRequest(req("b")) // queued + + s.OnUnload([]string{"b"}, time.Second) + + if got := eff.errored("b"); got != 1 { + t.Errorf("errored(b)=%d want 1 (queued request dropped)", got) + } + if got := eff.startsFor("b"); got != 0 { + t.Errorf("StartSwap(b)=%d want 0 (b should never start)", got) + } + // a's swap is untouched: its waiter is neither served nor errored yet. + if eff.served("a") != 0 || eff.errored("a") != 0 { + t.Errorf("a swap should be untouched: served=%d errored=%d", eff.served("a"), eff.errored("a")) + } +} + +// TestFIFO_PriorityQueueOrder verifies queued requests are ordered by descending +// priority, with arrival (FIFO) order preserved among equal-priority models. +func TestFIFO_PriorityQueueOrder(t *testing.T) { + eff := newFakeEffects() + for _, m := range []string{"z", "A", "B", "C", "D"} { + eff.states[m] = process.StateStopped + } + // z's swap evicts every other model, so any request that arrives while z is + // loading collides with z's in-flight swap and parks in the queue. + planner := &stubPlanner{evict: map[string][]string{"z": {"A", "B", "C", "D"}}} + cfg := config.FifoConfig{Priority: map[string]int{"A": 10, "B": 5, "C": 5, "D": 1}} + s := NewFIFO("test", logmon.NewWriter(io.Discard), planner, cfg, eff) + + s.OnRequest(req("z")) // StartSwap(z, [A,B,C,D]) + + // Arrive out of priority order; B before C exercises FIFO tie-breaking. + for _, m := range []string{"B", "D", "C", "A"} { + s.OnRequest(req(m)) + } + + got := make([]string, len(s.queued)) + for i, q := range s.queued { + got[i] = q.Model + } + want := []string{"A", "B", "C", "D"} + if len(got) != len(want) { + t.Fatalf("queue=%v want %v", got, want) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("queue=%v want %v", got, want) + } + } +} diff --git a/internal/router/scheduler/scheduler.go b/internal/router/scheduler/scheduler.go new file mode 100644 index 00000000..9dc9e281 --- /dev/null +++ b/internal/router/scheduler/scheduler.go @@ -0,0 +1,116 @@ +// Package scheduler contains the request-scheduling strategies used by the +// router's baseRouter. A Scheduler owns the queue, in-flight tracking, and the +// decision tree for when to start a swap versus queue a request. The baseRouter +// owns the channels, run loop, and process machinery, and exposes the +// side-effects a scheduler needs through the Effects interface. +// +// Splitting these apart lets the scheduling strategy be swapped out +// independently of both the process machinery (baseRouter) and the eviction +// policy (Swapper). FIFO is the first and currently only implementation. +package scheduler + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/mostlygeek/llama-swap/internal/logmon" + "github.com/mostlygeek/llama-swap/internal/process" +) + +// ErrModelNotFound is granted to callers whose model is not handled by this +// router. The router package aliases it so SendError can match it. +var ErrModelNotFound = fmt.Errorf("local model not found") + +// Swapper is the eviction policy: it decides which running models must be +// stopped before a target can serve. It is orthogonal to the scheduling +// strategy — any Scheduler works with any Swapper. +type Swapper interface { + // EvictionFor returns running model IDs that must be stopped before + // target can serve. running is the complete set the scheduler considers + // live: every process that is not stopped, unioned with the targets of + // in-flight swaps the scheduler has already committed to (which are not yet + // visible in process state). The planner does not inspect process state + // itself. Pure decision; must not log. + EvictionFor(target string, running []string) []string + + // OnSwapStart runs once at the start of every swap, with the same running + // set EvictionFor was given for this decision. Planners may log their + // decision here at whatever verbosity they choose. + OnSwapStart(target string, running []string) +} + +// Scheduler decides what happens to each event the router's run loop receives. +// All methods run on that single run-loop goroutine, so implementations need no +// internal locking for their own state. +type Scheduler interface { + // OnRequest handles one incoming ServeHTTP request. + OnRequest(req HandlerReq) + // OnSwapDone handles a swap goroutine reporting completion. + OnSwapDone(ev SwapDone) + // OnServeDone handles a tracked ServeHTTP finishing (in-flight decrement). + OnServeDone(ev ServeDoneEvent) + // OnUnload reconciles scheduler state for an unload, stops the targeted + // processes via Effects, and drains the queue. It must block until the + // targeted processes have stopped. + OnUnload(targets []string, timeout time.Duration) + // OnShutdown grants err to every waiter the scheduler still holds (active + // swap waiters and queued requests). Process teardown is the baseRouter's + // responsibility. + OnShutdown(err error) +} + +// Effects is implemented by the baseRouter. The scheduler calls back through it +// for every side-effect: inspecting process state, launching swaps, responding +// to callers, and stopping processes. +type Effects interface { + // ModelState returns the current state of a model's process. ok is false + // when the model is not handled by this router. + ModelState(modelID string) (process.ProcessState, bool) + // RunningModels returns the state of every process that is not stopped or + // shut down, keyed by model ID. The scheduler uses it to build the running + // set it hands the Swapper. + RunningModels() map[string]process.ProcessState + // StartSwap launches the swap goroutine for modelID, stopping evict first. + StartSwap(modelID string, evict []string) + // GrantError responds to a caller with an error. + GrantError(req HandlerReq, err error) + // GrantServe hands a caller the wrapped handler for modelID and reports + // whether the caller was still there to receive it. The scheduler bumps + // its in-flight count only when this returns true. + GrantServe(req HandlerReq, modelID string) bool + // StopProcesses stops the named processes in parallel and blocks until all + // have stopped. Unknown IDs are skipped. + StopProcesses(timeout time.Duration, ids []string) +} + +// Factory builds a Scheduler bound to a baseRouter's Effects. The concrete +// router captures its Swapper in the closure it passes as a Factory. +type Factory func(name string, logger *logmon.Monitor, eff Effects) Scheduler + +// HandlerReq is one in-flight ServeHTTP request waiting for a routing decision. +type HandlerReq struct { + Model string + Ctx context.Context + Respond chan HandlerResp + PositionCh chan int +} + +// HandlerResp is the routing decision returned to a HandlerReq's caller: either +// a handler to serve with, or an error. +type HandlerResp struct { + HandleFunc http.HandlerFunc + Err error +} + +// SwapDone is reported by a swap goroutine when its target is ready (or failed). +type SwapDone struct { + ModelID string + Err error +} + +// ServeDoneEvent is reported when a tracked ServeHTTP handler returns. +type ServeDoneEvent struct { + ModelID string +} diff --git a/internal/server/server.go b/internal/server/server.go index 5a29a43b..5f36de32 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -99,12 +99,13 @@ func New(cfg config.Config, muxlog *logmon.Monitor, proxylog *logmon.Monitor, up var local router.LocalRouter var err error - if cfg.Matrix != nil { + switch cfg.Routing.Router.Use { + case "matrix": local, err = router.NewMatrix(cfg, proxylog, upstreamlog) if err != nil { return nil, fmt.Errorf("creating matrix router: %w", err) } - } else { + default: // "group" local, err = router.NewGroup(cfg, proxylog, upstreamlog) if err != nil { return nil, fmt.Errorf("creating group router: %w", err) diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 3f9bbded..f16a915f 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -84,10 +84,15 @@ func chatRequest(model string) *http.Request { func TestServer_New_GroupConfig(t *testing.T) { discard := logmon.NewWriter(io.Discard) - s, err := New(config.Config{HealthCheckTimeout: 15}, discard, discard, discard, nil, BuildInfo{}) + cfg := config.Config{HealthCheckTimeout: 15} + cfg.Routing.Router.Use = "group" + s, err := New(cfg, discard, discard, discard, nil, BuildInfo{}) if err != nil { t.Fatalf("New (group): %v", err) } + if _, ok := s.local.(*router.Group); !ok { + t.Fatalf("localRouter=%T want *router.Group", s.local) + } if err := s.Shutdown(time.Second); err != nil { t.Fatalf("Shutdown: %v", err) } @@ -95,11 +100,16 @@ func TestServer_New_GroupConfig(t *testing.T) { func TestServer_New_MatrixConfig(t *testing.T) { discard := logmon.NewWriter(io.Discard) - cfg := config.Config{HealthCheckTimeout: 15, Matrix: &config.MatrixConfig{}} + cfg := config.Config{HealthCheckTimeout: 15} + cfg.Routing.Router.Use = "matrix" + cfg.Routing.Router.Settings.Matrix = &config.MatrixConfig{} s, err := New(cfg, discard, discard, discard, nil, BuildInfo{}) if err != nil { t.Fatalf("New (matrix): %v", err) } + if _, ok := s.local.(*router.Matrix); !ok { + t.Fatalf("localRouter=%T want *router.Matrix", s.local) + } if err := s.Shutdown(time.Second); err != nil { t.Fatalf("Shutdown: %v", err) }