Implement new scheduler (#823)
- introduce internal/router/scheduler to decouple routing, swapping and queuing into interface contracts. - introduce a new `routing` configuration section that supersedes `matrix` and `group` while maintaining backwards compatibility - add FIFO scheduler with prioritized queuing - add internal/router/design.md as developer documentation on implementing new schedulers and routers Fixes #797
This commit is contained in:
@@ -44,13 +44,10 @@ jobs:
|
||||
|
||||
echo "✓ config-schema.json is valid"
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 #v6.2.0
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
|
||||
with:
|
||||
python-version: "3.x"
|
||||
|
||||
- name: Install check-jsonschema
|
||||
run: pip install check-jsonschema
|
||||
go-version-file: go.mod
|
||||
|
||||
- name: Validate config.example.yaml against schema
|
||||
run: check-jsonschema --schemafile config-schema.json config.example.yaml
|
||||
run: go test ./internal/config/ -run TestConfig_ExampleMatchesSchema -v
|
||||
|
||||
+155
-72
@@ -82,6 +82,78 @@
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "Timeout settings for proxy connections."
|
||||
},
|
||||
"groupsConfig": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"members"
|
||||
],
|
||||
"properties": {
|
||||
"swap": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Controls model swapping behaviour within the group. True: only one model runs at a time. False: all models can run together."
|
||||
},
|
||||
"exclusive": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Controls how the group affects other groups. True: causes all other groups to unload when this group runs a model. False: does not affect other groups."
|
||||
},
|
||||
"persistent": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Prevents other groups from unloading the models in this group. Does not affect individual model behaviour."
|
||||
},
|
||||
"members": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Array of model IDs that are members of this group. Model IDs must be defined in models."
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent."
|
||||
},
|
||||
"matrixConfig": {
|
||||
"type": "object",
|
||||
"description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.",
|
||||
"required": [
|
||||
"vars",
|
||||
"sets"
|
||||
],
|
||||
"properties": {
|
||||
"vars": {
|
||||
"type": "object",
|
||||
"description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.",
|
||||
"minProperties": 1,
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
},
|
||||
"propertyNames": {
|
||||
"pattern": "^[a-zA-Z0-9]{1,8}$"
|
||||
}
|
||||
},
|
||||
"evict_costs": {
|
||||
"type": "object",
|
||||
"description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.",
|
||||
"additionalProperties": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
}
|
||||
},
|
||||
"sets": {
|
||||
"type": "object",
|
||||
"description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.",
|
||||
"minProperties": 1,
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"properties": {
|
||||
@@ -311,76 +383,10 @@
|
||||
}
|
||||
},
|
||||
"groups": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"members"
|
||||
],
|
||||
"properties": {
|
||||
"swap": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Controls model swapping behaviour within the group. True: only one model runs at a time. False: all models can run together."
|
||||
},
|
||||
"exclusive": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Controls how the group affects other groups. True: causes all other groups to unload when this group runs a model. False: does not affect other groups."
|
||||
},
|
||||
"persistent": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Prevents other groups from unloading the models in this group. Does not affect individual model behaviour."
|
||||
},
|
||||
"members": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Array of model IDs that are members of this group. Model IDs must be defined in models."
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent."
|
||||
"$ref": "#/definitions/groupsConfig"
|
||||
},
|
||||
"matrix": {
|
||||
"type": "object",
|
||||
"description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.",
|
||||
"required": [
|
||||
"vars",
|
||||
"sets"
|
||||
],
|
||||
"properties": {
|
||||
"vars": {
|
||||
"type": "object",
|
||||
"description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.",
|
||||
"minProperties": 1,
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
},
|
||||
"propertyNames": {
|
||||
"pattern": "^[a-zA-Z0-9]{1,8}$"
|
||||
}
|
||||
},
|
||||
"evict_costs": {
|
||||
"type": "object",
|
||||
"description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.",
|
||||
"additionalProperties": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
}
|
||||
},
|
||||
"sets": {
|
||||
"type": "object",
|
||||
"description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.",
|
||||
"minProperties": 1,
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
"$ref": "#/definitions/matrixConfig"
|
||||
},
|
||||
"hooks": {
|
||||
"type": "object",
|
||||
@@ -512,26 +518,103 @@
|
||||
},
|
||||
"default": {},
|
||||
"description": "A dictionary of remote peers and models they provide. Peers can be another llama-swap or any server that provides the /v1/ generative API endpoints supported by llama-swap."
|
||||
},
|
||||
"routing": {
|
||||
"type": "object",
|
||||
"description": "Canonical routing/scheduling configuration. Alternative to the legacy top-level 'groups'/'matrix' keys; a config must not use both styles.",
|
||||
"properties": {
|
||||
"scheduler": {
|
||||
"type": "object",
|
||||
"description": "Scheduler configuration. Decides the order in which queued requests are serviced.",
|
||||
"properties": {
|
||||
"use": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"fifo"
|
||||
],
|
||||
"default": "fifo",
|
||||
"description": "Scheduler to use. Only 'fifo' is currently supported."
|
||||
},
|
||||
"settings": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"fifo": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"priority": {
|
||||
"type": "object",
|
||||
"description": "Per-model priority. Keys are model IDs, values are integers (default 0). Higher values are serviced first.",
|
||||
"additionalProperties": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
"router": {
|
||||
"type": "object",
|
||||
"description": "Router configuration. Selects between the group and matrix swapping strategies.",
|
||||
"properties": {
|
||||
"use": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"group",
|
||||
"matrix"
|
||||
],
|
||||
"default": "group",
|
||||
"description": "Router to use. 'group' uses static groups, 'matrix' uses the solver-based swap matrix."
|
||||
},
|
||||
"settings": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"groups": {
|
||||
"$ref": "#/definitions/groupsConfig"
|
||||
},
|
||||
"matrix": {
|
||||
"$ref": "#/definitions/matrixConfig"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"if": {
|
||||
"required": ["groups"]
|
||||
"required": [
|
||||
"groups"
|
||||
]
|
||||
},
|
||||
"then": {
|
||||
"not": {
|
||||
"required": ["matrix"]
|
||||
"required": [
|
||||
"matrix"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"if": {
|
||||
"required": ["matrix"]
|
||||
"required": [
|
||||
"matrix"
|
||||
]
|
||||
},
|
||||
"then": {
|
||||
"not": {
|
||||
"required": ["groups"]
|
||||
"required": [
|
||||
"groups"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+170
-87
@@ -343,93 +343,6 @@ models:
|
||||
# - processes have 5 seconds to shutdown until forceful termination is attempted
|
||||
cmdStop: docker stop ${MODEL_ID}
|
||||
|
||||
# =============================================================================
|
||||
# matrix: run concurrent models with a solver-based swap DSL
|
||||
# =============================================================================
|
||||
#
|
||||
# Matrix or Groups?
|
||||
#
|
||||
# Groups are available and fully supported. The syntax may be easier to use
|
||||
# for simple use cases.
|
||||
#
|
||||
# Documentation can be found here:
|
||||
# https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396
|
||||
#
|
||||
# A config can only use a matrix (recommended) or groups. A configuration error
|
||||
# will occur if both are defined. Groups is legacy but is fully supported with
|
||||
# no plans to deprecate it.
|
||||
#
|
||||
# ~~~~~
|
||||
#
|
||||
# The matrix declares valid combinations of models that can run concurrently.
|
||||
# When a model is requested, the solver finds the cheapest way to make it
|
||||
# available by evicting as few (and least costly) running models as possible.
|
||||
#
|
||||
# Solver behavior:
|
||||
# 1. Request arrives for model X
|
||||
# 2. If X is already running, forward immediately. Done.
|
||||
# 3. Find all sets containing X
|
||||
# 4. For each candidate set, compute cost: sum of evict_costs for
|
||||
# every running model NOT in that set
|
||||
# 5. Pick lowest cost candidate. Ties broken by definition order.
|
||||
# 6. Evict what needs to stop. Start X. Forward request.
|
||||
#
|
||||
# Subset semantics: a set [a, b, c] means any subset is valid.
|
||||
# Only the requested model is started — others are not preloaded.
|
||||
#
|
||||
# A model not appearing in any set can only run alone.
|
||||
#
|
||||
matrix:
|
||||
# vars: short names for models (alphanumeric, 1-8 chars)
|
||||
# - required for sets and evict_costs settings
|
||||
# - each entry is a short name to a real model ID. Do not use an alias
|
||||
# - used to keep set DSL logic short and easier to read
|
||||
# - sets and evict_costs only use identifiers defined in vars
|
||||
vars:
|
||||
g: gemma-model
|
||||
q: qwen-model
|
||||
m: mistral-model
|
||||
v: voxtral-model
|
||||
e: reranker-model
|
||||
L: llama-70B
|
||||
sd: stable-diffusion
|
||||
|
||||
# evict_costs: relative cost of losing a running model (default: 1)
|
||||
evict_costs:
|
||||
v: 50 # vllm backend, slow cold start
|
||||
L: 30 # 70B weights, slow to load
|
||||
|
||||
# sets: named sets of concurrent model combinations
|
||||
# Values are DSL strings with operators:
|
||||
# & AND (models run together)
|
||||
# | OR (alternatives)
|
||||
# () grouping
|
||||
# +ref inline another set's expression
|
||||
#
|
||||
# Expansion examples:
|
||||
# "L" → [L]
|
||||
# "a & b" → [a, b]
|
||||
# "a | b" → [a], [b]
|
||||
# "(a | b) & c" → [a, c], [b, c]
|
||||
# "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d]
|
||||
# "+llms & v" → expands llms inline, then applies & v
|
||||
sets:
|
||||
# LLM + TTS: switching between g/q/m won't evict v
|
||||
# expands to: [g,v], [q,v], [m,v]
|
||||
standard: "(g | q | m) & v"
|
||||
|
||||
# LLM + TTS + reranker
|
||||
# expands to: [g,v,e], [q,v,e]
|
||||
with_rerank: "(g | q) & v & e"
|
||||
|
||||
# LLM + image generation, no TTS
|
||||
# expands to: [g,sd], [q,sd]
|
||||
creative: "(g | q) & sd"
|
||||
|
||||
# 70B model uses all GPUs, can only run alone
|
||||
# expands to: [L]
|
||||
full: "L"
|
||||
|
||||
# hooks: a dictionary of event triggers and actions
|
||||
# - optional, default: empty dictionary
|
||||
# - the only supported hook is on_startup
|
||||
@@ -446,6 +359,176 @@ hooks:
|
||||
preload:
|
||||
- "llama"
|
||||
|
||||
# routing:
|
||||
# Controls how llama-swap decides which models can run at the same time and
|
||||
# which get swapped out. Choose one of two swap engines:
|
||||
#
|
||||
# - group: the default engine. Simpler to configure. You define groups of
|
||||
# models that run together, and loading one group typically unloads
|
||||
# the others.
|
||||
#
|
||||
# - matrix: the newer engine. More involved to configure, but far more
|
||||
# flexible. It uses a small expression language to describe which
|
||||
# model combinations are allowed to run concurrently, enabling
|
||||
# setups that groups cannot express.
|
||||
#
|
||||
# The routing section is optional.
|
||||
routing:
|
||||
router:
|
||||
# use: a string defining which engine to use
|
||||
# - optional, default: "group"
|
||||
# - valid values: group, matrix
|
||||
use: group
|
||||
|
||||
# settings: a dictionary of settings for the specific engines
|
||||
settings:
|
||||
# groups: a dictionary of named groups
|
||||
# - optional, default: empty dictionary
|
||||
# - lets you keep some models loaded while others swap out
|
||||
# - every member must be a model ID defined in the models section
|
||||
# - a model can belong to only one group
|
||||
# - behaviour is set per group with the `swap`, `exclusive` and
|
||||
# `persistent` fields
|
||||
# - see issue #109 for details
|
||||
#
|
||||
# NOTE: the model names below are illustrative and are not defined above.
|
||||
groups:
|
||||
# group1 reproduces llama-swap's default behaviour: only one model
|
||||
# runs at a time across the entire instance.
|
||||
"group1":
|
||||
# swap: how members of this group swap among themselves
|
||||
# - optional, default: true
|
||||
# - true: only one member runs at a time
|
||||
# - false: all members can run together, no swapping
|
||||
swap: true
|
||||
|
||||
# exclusive: how this group affects other groups
|
||||
# - optional, default: true
|
||||
# - true: running a member unloads every other group
|
||||
# - false: running a member leaves other groups untouched
|
||||
exclusive: true
|
||||
|
||||
# members: the model IDs in this group
|
||||
# required
|
||||
members:
|
||||
- "llama"
|
||||
- "qwen-unlisted"
|
||||
|
||||
# group2: members all run together, but loading any other group
|
||||
# unloads them.
|
||||
"group2":
|
||||
# swap: false lets all members stay loaded at once
|
||||
swap: false
|
||||
|
||||
# exclusive: false means requesting a member loads it without
|
||||
# unloading any other group
|
||||
exclusive: false
|
||||
members:
|
||||
- "docker-llama"
|
||||
- "modelA"
|
||||
- "modelB"
|
||||
|
||||
# forever: a persistent group that other groups can never unload.
|
||||
"forever":
|
||||
# persistent: other groups cannot unload this group's members
|
||||
# - optional, default: false
|
||||
# - has no effect on swapping within the group
|
||||
persistent: true
|
||||
|
||||
# swap/exclusive: false keeps all members loaded and avoids
|
||||
# unloading other groups
|
||||
swap: false
|
||||
exclusive: false
|
||||
members:
|
||||
- "forever-modelA"
|
||||
- "forever-modelB"
|
||||
- "forever-modelc"
|
||||
|
||||
# The matrix lists the model combinations that are allowed to run
|
||||
# concurrently. When a model is requested, the solver makes room for it
|
||||
# by evicting as few running models as possible, preferring to keep the
|
||||
# costliest ones loaded.
|
||||
#
|
||||
# Solver behaviour:
|
||||
# 1. A request arrives for model X.
|
||||
# 2. If X is already running, forward the request. Done.
|
||||
# 3. Collect every set that contains X.
|
||||
# 4. For each set, add up the evict_costs of the running models that
|
||||
# are NOT in that set — that is the set's cost.
|
||||
# 5. Choose the lowest-cost set. Break ties by definition order.
|
||||
# 6. Evict the models outside that set, start X, forward the request.
|
||||
#
|
||||
# Subset semantics: a set [a, b, c] also permits any subset of itself.
|
||||
# Only the requested model is started; the others are not preloaded.
|
||||
#
|
||||
# A model that appears in no set can only run on its own.
|
||||
#
|
||||
matrix:
|
||||
# vars: short aliases for model IDs (alphanumeric, 1-8 chars)
|
||||
# - required: sets and evict_costs reference these names, not model IDs
|
||||
# - map each short name to a real model ID (not a model alias)
|
||||
# - keeps the set expressions short and readable
|
||||
vars:
|
||||
g: gemma-model
|
||||
q: qwen-model
|
||||
m: mistral-model
|
||||
v: voxtral-model
|
||||
e: reranker-model
|
||||
L: llama-70B
|
||||
sd: stable-diffusion
|
||||
|
||||
# evict_costs: relative cost of losing a running model (default: 1)
|
||||
evict_costs:
|
||||
v: 50 # vllm backend, slow cold start
|
||||
L: 30 # 70B weights, slow to load
|
||||
|
||||
# sets: named combinations of models that may run together.
|
||||
# Each value is an expression built from these operators:
|
||||
# & AND (models run together)
|
||||
# | OR (alternatives)
|
||||
# () grouping
|
||||
# +ref inline the expression of another set
|
||||
#
|
||||
# Each expression expands into one or more concrete sets:
|
||||
# "L" → [L]
|
||||
# "a & b" → [a, b]
|
||||
# "a | b" → [a], [b]
|
||||
# "(a | b) & c" → [a, c], [b, c]
|
||||
# "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d]
|
||||
# "+llms & v" → inline the llms set, then AND with v
|
||||
sets:
|
||||
# An LLM plus TTS. Switching between g/q/m keeps v loaded.
|
||||
# expands to: [g,v], [q,v], [m,v]
|
||||
standard: "(g | q | m) & v"
|
||||
|
||||
# An LLM plus TTS plus reranker.
|
||||
# expands to: [g,v,e], [q,v,e]
|
||||
with_rerank: "(g | q) & v & e"
|
||||
|
||||
# An LLM plus image generation, no TTS.
|
||||
# expands to: [g,sd], [q,sd]
|
||||
creative: "(g | q) & sd"
|
||||
|
||||
# The 70B model uses every GPU, so it can only run alone.
|
||||
# expands to: [L]
|
||||
full: "L"
|
||||
|
||||
# scheduler: how queued requests are ordered.
|
||||
# The default and only valid scheduler is "fifo"
|
||||
scheduler:
|
||||
use: fifo
|
||||
settings:
|
||||
fifo:
|
||||
# priority: a dictionary of model ID -> priority
|
||||
# - optional, default: empty dictionary
|
||||
# - models default to priority 0
|
||||
# - higher priority requests are serviced first in the queue
|
||||
priority:
|
||||
A: 10
|
||||
B: 5
|
||||
C: 5
|
||||
D: 1
|
||||
|
||||
# peers: a dictionary of remote peers and models they provide
|
||||
# - optional, default empty dictionary
|
||||
# - peers can be another llama-swap
|
||||
|
||||
@@ -9,6 +9,7 @@ require (
|
||||
github.com/charmbracelet/lipgloss v1.1.0
|
||||
github.com/fxamacker/cbor/v2 v2.9.1
|
||||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/google/jsonschema-go v0.4.3
|
||||
github.com/klauspost/compress v1.18.5
|
||||
github.com/shirou/gopsutil/v4 v4.26.4
|
||||
github.com/stretchr/testify v1.11.1
|
||||
|
||||
@@ -61,6 +61,8 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/jsonschema-go v0.4.3 h1:/DBOLZTfDow7pe2GmaJNhltueGTtDKICi8V8p+DQPd0=
|
||||
github.com/google/jsonschema-go v0.4.3/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
|
||||
|
||||
@@ -129,14 +129,17 @@ type Config struct {
|
||||
GlobalTTL int `yaml:"globalTTL"`
|
||||
Models map[string]ModelConfig `yaml:"models"` /* key is model ID */
|
||||
Profiles map[string][]string `yaml:"profiles"`
|
||||
|
||||
// routing is the canonical source for swap/scheduling configuration.
|
||||
// New code must read Routing, never the backwards-compat fields below.
|
||||
Routing RoutingConfig `yaml:"routing"`
|
||||
|
||||
// Groups and Matrix are permanent backwards-compat input fields for the
|
||||
// legacy top-level `groups:`/`matrix:` keys. They are normalized into
|
||||
// Routing by LoadConfigFromReader. New code must not read them directly.
|
||||
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
|
||||
|
||||
// swap matrix: solver-based alternative to groups
|
||||
Matrix *MatrixConfig `yaml:"matrix"`
|
||||
|
||||
// populated during validation when matrix is configured
|
||||
ExpandedSets []ExpandedSet `yaml:"-"`
|
||||
|
||||
// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
|
||||
Macros MacroList `yaml:"macros"`
|
||||
|
||||
@@ -162,6 +165,35 @@ type Config struct {
|
||||
Peers PeerDictionaryConfig `yaml:"peers"`
|
||||
}
|
||||
|
||||
// RoutingConfig is the canonical, normalized routing/scheduling configuration.
|
||||
type RoutingConfig struct {
|
||||
Scheduler SchedulerConfig `yaml:"scheduler"`
|
||||
Router RouterConfig `yaml:"router"`
|
||||
}
|
||||
|
||||
type SchedulerConfig struct {
|
||||
Use string `yaml:"use"` // default "fifo"
|
||||
Settings SchedulerSettings `yaml:"settings"`
|
||||
}
|
||||
|
||||
type SchedulerSettings struct {
|
||||
Fifo FifoConfig `yaml:"fifo"`
|
||||
}
|
||||
|
||||
type FifoConfig struct {
|
||||
Priority map[string]int `yaml:"priority"` // model ID -> priority, default 0
|
||||
}
|
||||
|
||||
type RouterConfig struct {
|
||||
Use string `yaml:"use"` // "group" (default) | "matrix"
|
||||
Settings RouterSettings `yaml:"settings"`
|
||||
}
|
||||
|
||||
type RouterSettings struct {
|
||||
Groups map[string]GroupConfig `yaml:"groups"`
|
||||
Matrix *MatrixConfig `yaml:"matrix"`
|
||||
}
|
||||
|
||||
func (c *Config) RealModelName(search string) (string, bool) {
|
||||
if _, found := c.Models[search]; found {
|
||||
return search, true
|
||||
@@ -455,6 +487,34 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
config.Models[modelId] = modelConfig
|
||||
}
|
||||
|
||||
// Normalize routing config. The legacy top-level `matrix`/`groups` keys and
|
||||
// the new `routing.router` block are mutually exclusive: a config may use
|
||||
// either style, never both.
|
||||
hasTopLevel := config.Matrix != nil || len(config.Groups) > 0
|
||||
rtr := config.Routing.Router
|
||||
hasRouting := rtr.Use != "" || rtr.Settings.Matrix != nil || len(rtr.Settings.Groups) > 0
|
||||
|
||||
if hasTopLevel && hasRouting {
|
||||
return Config{}, fmt.Errorf("config uses both the legacy top-level 'matrix'/'groups' keys and the new 'routing.router' block; please migrate the top-level keys into 'routing.router' and remove them")
|
||||
}
|
||||
|
||||
if !hasTopLevel {
|
||||
// Both groups and matrix may be defined under routing.router.settings;
|
||||
// routing.router.use selects which one is active, so there is no conflict.
|
||||
rs := config.Routing.Router.Settings
|
||||
switch config.Routing.Router.Use {
|
||||
case "matrix":
|
||||
if rs.Matrix == nil {
|
||||
return Config{}, fmt.Errorf("routing.router.use is 'matrix' but routing.router.settings.matrix is not set")
|
||||
}
|
||||
config.Matrix = rs.Matrix
|
||||
case "group", "":
|
||||
config.Groups = rs.Groups
|
||||
default:
|
||||
return Config{}, fmt.Errorf("routing.router.use: unknown router %q (valid: group, matrix)", config.Routing.Router.Use)
|
||||
}
|
||||
}
|
||||
|
||||
// groups XOR matrix
|
||||
if config.Matrix != nil && len(config.Groups) > 0 {
|
||||
return Config{}, fmt.Errorf("config cannot use both 'groups' and 'matrix'")
|
||||
@@ -465,7 +525,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
if err != nil {
|
||||
return Config{}, fmt.Errorf("matrix: %w", err)
|
||||
}
|
||||
config.ExpandedSets = expandedSets
|
||||
config.Matrix.ExpandedSets = expandedSets
|
||||
} else {
|
||||
config = AddDefaultGroupToConfig(config)
|
||||
|
||||
@@ -487,6 +547,29 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Build the canonical Config.Routing from the effective result. Both legacy
|
||||
// and new-style configs converge here. The Matrix pointer is shared so
|
||||
// ExpandedSets stays in one place.
|
||||
if config.Matrix != nil {
|
||||
config.Routing.Router.Use = "matrix"
|
||||
} else {
|
||||
config.Routing.Router.Use = "group"
|
||||
}
|
||||
config.Routing.Router.Settings.Matrix = config.Matrix
|
||||
config.Routing.Router.Settings.Groups = config.Groups
|
||||
|
||||
if config.Routing.Scheduler.Use == "" {
|
||||
config.Routing.Scheduler.Use = "fifo"
|
||||
}
|
||||
if config.Routing.Scheduler.Use != "fifo" {
|
||||
return Config{}, fmt.Errorf("routing.scheduler.use: unknown scheduler %q (valid: fifo)", config.Routing.Scheduler.Use)
|
||||
}
|
||||
for modelID := range config.Routing.Scheduler.Settings.Fifo.Priority {
|
||||
if _, found := config.RealModelName(modelID); !found {
|
||||
return Config{}, fmt.Errorf("routing.scheduler.settings.fifo.priority references unknown model %q", modelID)
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up hooks preload
|
||||
if len(config.Hooks.OnStartup.Preload) > 0 {
|
||||
var toPreload []string
|
||||
|
||||
@@ -173,6 +173,25 @@ groups:
|
||||
IdleConn: 90,
|
||||
}
|
||||
|
||||
expectedGroups := map[string]GroupConfig{
|
||||
DEFAULT_GROUP_ID: {
|
||||
Swap: true,
|
||||
Exclusive: true,
|
||||
Members: []string{"model1", "model3"},
|
||||
},
|
||||
"group1": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Members: []string{"model2"},
|
||||
},
|
||||
"forever": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Persistent: true,
|
||||
Members: []string{"model4"},
|
||||
},
|
||||
}
|
||||
|
||||
expected := Config{
|
||||
LogLevel: "info",
|
||||
LogTimeFormat: "",
|
||||
@@ -246,22 +265,16 @@ groups:
|
||||
"m2": "model2",
|
||||
"mthree": "model3",
|
||||
},
|
||||
Groups: map[string]GroupConfig{
|
||||
DEFAULT_GROUP_ID: {
|
||||
Swap: true,
|
||||
Exclusive: true,
|
||||
Members: []string{"model1", "model3"},
|
||||
Groups: expectedGroups,
|
||||
Routing: RoutingConfig{
|
||||
Router: RouterConfig{
|
||||
Use: "group",
|
||||
Settings: RouterSettings{
|
||||
Groups: expectedGroups,
|
||||
},
|
||||
"group1": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Members: []string{"model2"},
|
||||
},
|
||||
"forever": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Persistent: true,
|
||||
Members: []string{"model4"},
|
||||
Scheduler: SchedulerConfig{
|
||||
Use: "fifo",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/google/jsonschema-go/jsonschema"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// TestConfig_ExampleMatchesSchema validates that config.example.yaml conforms to
|
||||
// config-schema.json. Both files live at the repository root.
|
||||
func TestConfig_ExampleMatchesSchema(t *testing.T) {
|
||||
const (
|
||||
schemaPath = "../../config-schema.json"
|
||||
examplePath = "../../config.example.yaml"
|
||||
)
|
||||
|
||||
schemaBytes, err := os.ReadFile(schemaPath)
|
||||
if err != nil {
|
||||
t.Fatalf("reading %s: %v", schemaPath, err)
|
||||
}
|
||||
|
||||
var schema jsonschema.Schema
|
||||
if err := json.Unmarshal(schemaBytes, &schema); err != nil {
|
||||
t.Fatalf("unmarshalling schema: %v", err)
|
||||
}
|
||||
|
||||
resolved, err := schema.Resolve(&jsonschema.ResolveOptions{
|
||||
BaseURI: "https://github.com/mostlygeek/llama-swap/",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("resolving schema: %v", err)
|
||||
}
|
||||
|
||||
exampleBytes, err := os.ReadFile(examplePath)
|
||||
if err != nil {
|
||||
t.Fatalf("reading %s: %v", examplePath, err)
|
||||
}
|
||||
|
||||
// Convert YAML to a JSON-like value so numbers and keys match what the
|
||||
// validator expects.
|
||||
var yamlValue any
|
||||
if err := yaml.Unmarshal(exampleBytes, &yamlValue); err != nil {
|
||||
t.Fatalf("unmarshalling example yaml: %v", err)
|
||||
}
|
||||
jsonBytes, err := json.Marshal(yamlValue)
|
||||
if err != nil {
|
||||
t.Fatalf("converting example to json: %v", err)
|
||||
}
|
||||
var instance any
|
||||
if err := json.Unmarshal(jsonBytes, &instance); err != nil {
|
||||
t.Fatalf("unmarshalling example json: %v", err)
|
||||
}
|
||||
|
||||
if err := resolved.Validate(instance); err != nil {
|
||||
t.Fatalf("config.example.yaml does not match config-schema.json:\n%v", err)
|
||||
}
|
||||
}
|
||||
@@ -1544,3 +1544,174 @@ peers:
|
||||
assert.Equal(t, 1, peerConfig.Timeouts.ExpectContinue)
|
||||
assert.Equal(t, 90, peerConfig.Timeouts.IdleConn)
|
||||
}
|
||||
|
||||
// twoModels is a minimal models block reused by the routing tests below.
|
||||
const twoModels = `
|
||||
models:
|
||||
gemma:
|
||||
cmd: echo gemma
|
||||
proxy: http://localhost:8080
|
||||
qwen:
|
||||
cmd: echo qwen
|
||||
proxy: http://localhost:8081
|
||||
`
|
||||
|
||||
func TestConfig_Routing_LegacyTopLevelGroups(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
groups:
|
||||
g1:
|
||||
members: [gemma, qwen]
|
||||
`
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "group", cfg.Routing.Router.Use)
|
||||
// default group injected for orphaned models (none here) still leaves g1
|
||||
assert.Contains(t, cfg.Routing.Router.Settings.Groups, "g1")
|
||||
assert.Equal(t, "fifo", cfg.Routing.Scheduler.Use)
|
||||
}
|
||||
|
||||
func TestConfig_Routing_LegacyTopLevelMatrix(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
matrix:
|
||||
vars:
|
||||
g: gemma
|
||||
q: qwen
|
||||
sets:
|
||||
combo: "g | q"
|
||||
`
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "matrix", cfg.Routing.Router.Use)
|
||||
require.NotNil(t, cfg.Routing.Router.Settings.Matrix)
|
||||
assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2)
|
||||
}
|
||||
|
||||
func TestConfig_Routing_RouterUseMatrix(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
router:
|
||||
use: matrix
|
||||
settings:
|
||||
matrix:
|
||||
vars:
|
||||
g: gemma
|
||||
q: qwen
|
||||
sets:
|
||||
combo: "g | q"
|
||||
`
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "matrix", cfg.Routing.Router.Use)
|
||||
require.NotNil(t, cfg.Routing.Router.Settings.Matrix)
|
||||
assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2)
|
||||
}
|
||||
|
||||
func TestConfig_Routing_RouterUseGroup(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
router:
|
||||
use: group
|
||||
settings:
|
||||
groups:
|
||||
g1:
|
||||
members: [gemma, qwen]
|
||||
`
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "group", cfg.Routing.Router.Use)
|
||||
assert.Contains(t, cfg.Routing.Router.Settings.Groups, "g1")
|
||||
}
|
||||
|
||||
func TestConfig_Routing_DefaultsToGroup(t *testing.T) {
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(twoModels))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "group", cfg.Routing.Router.Use)
|
||||
assert.Equal(t, "fifo", cfg.Routing.Scheduler.Use)
|
||||
}
|
||||
|
||||
func TestConfig_Routing_LegacyAndRoutingConflict(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
groups:
|
||||
g1:
|
||||
members: [gemma, qwen]
|
||||
routing:
|
||||
router:
|
||||
use: group
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "migrate")
|
||||
}
|
||||
|
||||
func TestConfig_Routing_RouterUseMatrixWithoutSettings(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
router:
|
||||
use: matrix
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "routing.router.settings.matrix is not set")
|
||||
}
|
||||
|
||||
// Both groups and matrix may be defined under routing.router.settings;
|
||||
// routing.router.use selects which one is active.
|
||||
func TestConfig_Routing_RouterSettingsBothGroupsAndMatrix(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
router:
|
||||
use: group
|
||||
settings:
|
||||
groups:
|
||||
g1:
|
||||
members: [gemma, qwen]
|
||||
matrix:
|
||||
sets:
|
||||
s: "gemma"
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
// use: group means groups are active and matrix is ignored
|
||||
assert.Equal(t, "group", config.Routing.Router.Use)
|
||||
assert.Nil(t, config.Matrix)
|
||||
assert.Contains(t, config.Groups, "g1")
|
||||
}
|
||||
|
||||
func TestConfig_Routing_UnknownRouter(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
router:
|
||||
use: bogus
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "unknown router")
|
||||
}
|
||||
|
||||
func TestConfig_Routing_FifoPriorityUnknownModel(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
scheduler:
|
||||
settings:
|
||||
fifo:
|
||||
priority:
|
||||
nope: 5
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "unknown model")
|
||||
}
|
||||
|
||||
func TestConfig_Routing_FifoPriorityKnownModel(t *testing.T) {
|
||||
yaml := twoModels + `
|
||||
routing:
|
||||
scheduler:
|
||||
settings:
|
||||
fifo:
|
||||
priority:
|
||||
gemma: 5
|
||||
`
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 5, cfg.Routing.Scheduler.Settings.Fifo.Priority["gemma"])
|
||||
}
|
||||
|
||||
@@ -165,6 +165,25 @@ groups:
|
||||
IdleConn: 90,
|
||||
}
|
||||
|
||||
expectedGroups := map[string]GroupConfig{
|
||||
DEFAULT_GROUP_ID: {
|
||||
Swap: true,
|
||||
Exclusive: true,
|
||||
Members: []string{"model1", "model3"},
|
||||
},
|
||||
"group1": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Members: []string{"model2"},
|
||||
},
|
||||
"forever": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Persistent: true,
|
||||
Members: []string{"model4"},
|
||||
},
|
||||
}
|
||||
|
||||
expected := Config{
|
||||
LogLevel: "info",
|
||||
LogTimeFormat: "",
|
||||
@@ -235,22 +254,16 @@ groups:
|
||||
"m2": "model2",
|
||||
"mthree": "model3",
|
||||
},
|
||||
Groups: map[string]GroupConfig{
|
||||
DEFAULT_GROUP_ID: {
|
||||
Swap: true,
|
||||
Exclusive: true,
|
||||
Members: []string{"model1", "model3"},
|
||||
Groups: expectedGroups,
|
||||
Routing: RoutingConfig{
|
||||
Router: RouterConfig{
|
||||
Use: "group",
|
||||
Settings: RouterSettings{
|
||||
Groups: expectedGroups,
|
||||
},
|
||||
"group1": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Members: []string{"model2"},
|
||||
},
|
||||
"forever": {
|
||||
Swap: true,
|
||||
Exclusive: false,
|
||||
Persistent: true,
|
||||
Members: []string{"model4"},
|
||||
Scheduler: SchedulerConfig{
|
||||
Use: "fifo",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -15,6 +15,9 @@ type MatrixConfig struct {
|
||||
Var map[string]string `yaml:"vars"`
|
||||
EvictCosts map[string]int `yaml:"evict_costs"`
|
||||
Sets OrderedSets `yaml:"sets"`
|
||||
|
||||
// populated by ValidateMatrix; not settable from yaml
|
||||
ExpandedSets []ExpandedSet `yaml:"-"`
|
||||
}
|
||||
|
||||
// SetEntry is a single named set with its DSL expression.
|
||||
|
||||
@@ -289,7 +289,9 @@ matrix:
|
||||
cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, cfg.Matrix)
|
||||
assert.Len(t, cfg.ExpandedSets, 2)
|
||||
assert.Len(t, cfg.Matrix.ExpandedSets, 2)
|
||||
assert.Equal(t, "matrix", cfg.Routing.Router.Use)
|
||||
assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2)
|
||||
// Groups should be empty when matrix is used
|
||||
assert.Empty(t, cfg.Groups)
|
||||
}
|
||||
|
||||
+99
-412
@@ -11,6 +11,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
type shutdownReq struct {
|
||||
@@ -24,56 +25,17 @@ type unloadReq struct {
|
||||
respond chan struct{}
|
||||
}
|
||||
|
||||
type handlerReq struct {
|
||||
model string
|
||||
ctx context.Context
|
||||
respond chan handlerResp
|
||||
positionCh chan int
|
||||
}
|
||||
|
||||
type handlerResp struct {
|
||||
handleFunc http.HandlerFunc
|
||||
err error
|
||||
}
|
||||
|
||||
type swapDone struct {
|
||||
modelID string
|
||||
err error
|
||||
}
|
||||
|
||||
type serveDoneEvent struct {
|
||||
modelID string
|
||||
}
|
||||
|
||||
type activeSwap struct {
|
||||
modelID string
|
||||
evict []string
|
||||
waiters []handlerReq
|
||||
}
|
||||
|
||||
// swapPlanner is the only piece of behaviour that differs between concrete
|
||||
// routers. baseRouter never inspects its internals.
|
||||
type swapPlanner interface {
|
||||
// EvictionFor returns running model IDs that must be stopped before
|
||||
// target can serve. alsoRunning lists models the baseRouter has already
|
||||
// committed to loading (in-flight swaps) which the planner cannot see
|
||||
// via process.State() yet. Pure decision; must not log.
|
||||
EvictionFor(target string, alsoRunning []string) []string
|
||||
|
||||
// OnSwapStart runs once at the start of every swap. Planners may log
|
||||
// their decision here at whatever verbosity they choose.
|
||||
OnSwapStart(target string)
|
||||
}
|
||||
|
||||
// baseRouter owns the channels, run-loop, and orchestration code shared by
|
||||
// every concrete router. Concrete routers embed *baseRouter and supply a
|
||||
// swapPlanner that captures how their eviction set is decided.
|
||||
// baseRouter owns the channels, run-loop, and process machinery shared by every
|
||||
// concrete router. Concrete routers embed *baseRouter and supply a
|
||||
// scheduler.Factory (which captures their scheduler.Swapper) describing how
|
||||
// requests are scheduled and how their eviction set is decided. baseRouter
|
||||
// implements scheduler.Effects so the scheduler can call back for side-effects.
|
||||
type baseRouter struct {
|
||||
name string
|
||||
config config.Config
|
||||
processes map[string]process.Process
|
||||
logger *logmon.Monitor
|
||||
planner swapPlanner
|
||||
schedule scheduler.Scheduler
|
||||
|
||||
// shutdownCtx governs the request machinery: cancelling it tells grant()
|
||||
// and ServeHTTP to stop granting and reject callers. It is deliberately
|
||||
@@ -90,11 +52,11 @@ type baseRouter struct {
|
||||
procCtx context.Context
|
||||
procCancel context.CancelFunc
|
||||
|
||||
handlerCh chan handlerReq
|
||||
handlerCh chan scheduler.HandlerReq
|
||||
shutdownCh chan shutdownReq
|
||||
unloadCh chan unloadReq
|
||||
swapDoneCh chan swapDone
|
||||
serveDoneCh chan serveDoneEvent
|
||||
swapDoneCh chan scheduler.SwapDone
|
||||
serveDoneCh chan scheduler.ServeDoneEvent
|
||||
|
||||
runDone chan struct{}
|
||||
|
||||
@@ -106,26 +68,33 @@ type baseRouter struct {
|
||||
testProcessed chan struct{}
|
||||
}
|
||||
|
||||
func newBaseRouter(name string, conf config.Config, processes map[string]process.Process, planner swapPlanner, logger *logmon.Monitor) *baseRouter {
|
||||
func newBaseRouter(
|
||||
name string,
|
||||
conf config.Config,
|
||||
processes map[string]process.Process,
|
||||
logger *logmon.Monitor,
|
||||
newSched scheduler.Factory,
|
||||
) *baseRouter {
|
||||
shutdownCtx, shutdownFn := context.WithCancel(context.Background())
|
||||
procCtx, procCancel := context.WithCancel(context.Background())
|
||||
return &baseRouter{
|
||||
b := &baseRouter{
|
||||
name: name,
|
||||
config: conf,
|
||||
processes: processes,
|
||||
logger: logger,
|
||||
planner: planner,
|
||||
shutdownCtx: shutdownCtx,
|
||||
shutdownFn: shutdownFn,
|
||||
procCtx: procCtx,
|
||||
procCancel: procCancel,
|
||||
handlerCh: make(chan handlerReq),
|
||||
handlerCh: make(chan scheduler.HandlerReq),
|
||||
shutdownCh: make(chan shutdownReq),
|
||||
unloadCh: make(chan unloadReq),
|
||||
swapDoneCh: make(chan swapDone),
|
||||
serveDoneCh: make(chan serveDoneEvent),
|
||||
swapDoneCh: make(chan scheduler.SwapDone),
|
||||
serveDoneCh: make(chan scheduler.ServeDoneEvent),
|
||||
runDone: make(chan struct{}),
|
||||
}
|
||||
b.schedule = newSched(name, logger, b)
|
||||
return b
|
||||
}
|
||||
|
||||
func (b *baseRouter) notifyProcessed() {
|
||||
@@ -137,30 +106,27 @@ func (b *baseRouter) notifyProcessed() {
|
||||
func (b *baseRouter) run() {
|
||||
defer close(b.runDone)
|
||||
|
||||
active := make(map[string]*activeSwap)
|
||||
inFlight := make(map[string]int)
|
||||
var queued []handlerReq
|
||||
|
||||
for {
|
||||
select {
|
||||
case req := <-b.shutdownCh:
|
||||
b.handleShutdown(req, active, queued)
|
||||
b.handleShutdown(req)
|
||||
return
|
||||
|
||||
case req := <-b.handlerCh:
|
||||
b.handleRequest(req, active, inFlight, &queued)
|
||||
b.schedule.OnRequest(req)
|
||||
b.notifyProcessed()
|
||||
|
||||
case req := <-b.unloadCh:
|
||||
b.handleUnload(req, active, inFlight, &queued)
|
||||
b.schedule.OnUnload(req.targets, req.timeout)
|
||||
close(req.respond)
|
||||
b.notifyProcessed()
|
||||
|
||||
case ev := <-b.swapDoneCh:
|
||||
b.handleSwapDone(ev, active, inFlight, &queued)
|
||||
b.schedule.OnSwapDone(ev)
|
||||
b.notifyProcessed()
|
||||
|
||||
case ev := <-b.serveDoneCh:
|
||||
b.handleServeDone(ev, active, inFlight, &queued)
|
||||
b.schedule.OnServeDone(ev)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -177,37 +143,68 @@ func (b *baseRouter) run() {
|
||||
// down, the send never lands, one of the other select cases fires, and we
|
||||
// report back that the grant did NOT happen.
|
||||
//
|
||||
// That distinction matters for in-flight bookkeeping — see grantHandler.
|
||||
func (b *baseRouter) grant(req handlerReq, resp handlerResp) bool {
|
||||
// That distinction matters for in-flight bookkeeping — see GrantServe.
|
||||
func (b *baseRouter) grant(req scheduler.HandlerReq, resp scheduler.HandlerResp) bool {
|
||||
select {
|
||||
case req.respond <- resp:
|
||||
case req.Respond <- resp:
|
||||
return true
|
||||
case <-req.ctx.Done():
|
||||
case <-req.Ctx.Done():
|
||||
return false
|
||||
case <-b.shutdownCtx.Done():
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// grantHandler is the "this caller can now use process p" path. It does
|
||||
// two things that must stay locked together:
|
||||
//
|
||||
// 1. Hand the caller a wrapped p.ServeHTTP (via trackedServe) so when the
|
||||
// HTTP request finishes, the run loop hears about it.
|
||||
// 2. Bump inFlight[modelID] so the router knows this process is busy and
|
||||
// refuses to evict it until the count comes back down.
|
||||
//
|
||||
// The increment is gated on grant() returning true. If grant() returns
|
||||
// false, the caller already walked away and trackedServe will never run —
|
||||
// which means no matching decrement will ever arrive on serveDoneCh.
|
||||
// Incrementing in that case would strand the counter at >0 forever and
|
||||
// the router would never again be willing to swap this model out.
|
||||
//
|
||||
// In short: increment if and only if we know a decrement is coming.
|
||||
func (b *baseRouter) grantHandler(req handlerReq, modelID string, p process.Process, inFlight map[string]int) {
|
||||
if b.grant(req, handlerResp{handleFunc: b.trackedServe(modelID, p)}) {
|
||||
inFlight[modelID]++
|
||||
// ModelState implements scheduler.Effects.
|
||||
func (b *baseRouter) ModelState(modelID string) (process.ProcessState, bool) {
|
||||
p, ok := b.processes[modelID]
|
||||
if !ok {
|
||||
var zero process.ProcessState
|
||||
return zero, false
|
||||
}
|
||||
return p.State(), true
|
||||
}
|
||||
|
||||
// StartSwap implements scheduler.Effects, launching the swap goroutine.
|
||||
func (b *baseRouter) StartSwap(modelID string, evict []string) {
|
||||
go b.doSwap(modelID, evict)
|
||||
}
|
||||
|
||||
// GrantError implements scheduler.Effects.
|
||||
func (b *baseRouter) GrantError(req scheduler.HandlerReq, err error) {
|
||||
b.grant(req, scheduler.HandlerResp{Err: err})
|
||||
}
|
||||
|
||||
// GrantServe implements scheduler.Effects. It hands the caller a wrapped
|
||||
// p.ServeHTTP (via trackedServe) so the run loop hears about the request
|
||||
// finishing, and reports whether the caller received it. The scheduler bumps
|
||||
// its in-flight count only on a true return: if grant() returns false the
|
||||
// caller already walked away and trackedServe will never run, so no matching
|
||||
// decrement will ever arrive — incrementing would strand the counter at >0 and
|
||||
// the router would never again be willing to evict this model.
|
||||
func (b *baseRouter) GrantServe(req scheduler.HandlerReq, modelID string) bool {
|
||||
p := b.processes[modelID]
|
||||
return b.grant(req, scheduler.HandlerResp{HandleFunc: b.trackedServe(modelID, p)})
|
||||
}
|
||||
|
||||
// StopProcesses implements scheduler.Effects, stopping the named processes in
|
||||
// parallel and blocking until all have stopped.
|
||||
func (b *baseRouter) StopProcesses(timeout time.Duration, ids []string) {
|
||||
var wg sync.WaitGroup
|
||||
for _, id := range ids {
|
||||
p, ok := b.processes[id]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(id string, p process.Process) {
|
||||
defer wg.Done()
|
||||
if err := p.Stop(timeout); err != nil {
|
||||
b.logger.Warnf("%s: stopping %s failed: %v", b.name, id, err)
|
||||
}
|
||||
}(id, p)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// trackedServe is the wrapper that closes the loop on in-flight tracking.
|
||||
@@ -224,7 +221,7 @@ func (b *baseRouter) trackedServe(modelID string, p process.Process) http.Handle
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
defer func() {
|
||||
select {
|
||||
case b.serveDoneCh <- serveDoneEvent{modelID: modelID}:
|
||||
case b.serveDoneCh <- scheduler.ServeDoneEvent{ModelID: modelID}:
|
||||
case <-b.shutdownCtx.Done():
|
||||
}
|
||||
}()
|
||||
@@ -232,240 +229,6 @@ func (b *baseRouter) trackedServe(modelID string, p process.Process) http.Handle
|
||||
}
|
||||
}
|
||||
|
||||
// handleRequest decides what to do with one incoming ServeHTTP request. It is
|
||||
// called from run() and never blocks indefinitely: any work that has to wait
|
||||
// (starting a process, stopping siblings, waiting for ready) is deferred to
|
||||
// a swap goroutine and reported back via swapDoneCh.
|
||||
//
|
||||
// The decision tree, in order:
|
||||
//
|
||||
// 1. Unknown model — respond with ErrNoLocalModelFound and move on.
|
||||
// 2. A swap to the same model is already in flight — attach this waiter so
|
||||
// one swap serves all callers that asked for the same model.
|
||||
// 3. Fast path — the target process is already ready, the planner sees
|
||||
// nothing to evict, and no in-flight swap is evicting it. Hand back its
|
||||
// ServeHTTP immediately (wrapped so the run loop knows when it ends).
|
||||
// 4. Would collide with an in-flight swap (we'd stop their target, or
|
||||
// they're stopping us) — park in the queue for handleSwapDone to drain.
|
||||
// 5. Would evict a process that is still handling requests — park in the
|
||||
// queue. handleServeDone will retry when the busy process drains.
|
||||
// 6. Otherwise — start a new swap. This may run in parallel with other
|
||||
// active swaps when their evict sets don't intersect.
|
||||
func (b *baseRouter) handleRequest(req handlerReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
|
||||
// (1) Unknown model.
|
||||
p, ok := b.processes[req.model]
|
||||
if !ok {
|
||||
b.logger.Debugf("%s: model %s not handled by this router", b.name, req.model)
|
||||
b.grant(req, handlerResp{err: ErrNoLocalModelFound})
|
||||
return
|
||||
}
|
||||
|
||||
// (2) Join an in-flight swap for the same model.
|
||||
if s, ok := active[req.model]; ok {
|
||||
b.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", b.name, req.model, len(s.waiters)+1)
|
||||
s.waiters = append(s.waiters, req)
|
||||
return
|
||||
}
|
||||
|
||||
evict := b.planner.EvictionFor(req.model, activeTargets(active, req.model))
|
||||
|
||||
// (3) Fast path: ready, nothing to evict, and nobody is evicting us.
|
||||
if p.State() == process.StateReady && len(evict) == 0 && !collidesWith(req.model, evict, active) {
|
||||
b.logger.Debugf("%s: fast-path serving model %s (already ready)", b.name, req.model)
|
||||
b.grantHandler(req, req.model, p, inFlight)
|
||||
return
|
||||
}
|
||||
|
||||
// (4) Collision with an in-flight swap — queue.
|
||||
if collidesWith(req.model, evict, active) {
|
||||
b.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", b.name, req.model)
|
||||
*queued = append(*queued, req)
|
||||
b.broadcastQueuePositions(*queued)
|
||||
return
|
||||
}
|
||||
|
||||
// (5) Would evict a busy process — queue until it drains.
|
||||
if conflictsWithInFlight(evict, inFlight) {
|
||||
b.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", b.name, req.model)
|
||||
*queued = append(*queued, req)
|
||||
b.broadcastQueuePositions(*queued)
|
||||
return
|
||||
}
|
||||
|
||||
// (6) Start a new (possibly parallel) swap.
|
||||
b.logger.Debugf("%s: starting swap for model %s, evicting %v", b.name, req.model, evict)
|
||||
s := b.startSwap(req, evict)
|
||||
active[s.modelID] = s
|
||||
}
|
||||
|
||||
// handleSwapDone is called from run() when a swap goroutine reports that it
|
||||
// has finished. It fans out the result to every waiter that joined this swap,
|
||||
// removes the swap from the active map, and then walks the queue once,
|
||||
// promoting any items that no longer collide with the remaining active set.
|
||||
// FIFO order is preserved: items still blocked stay in place.
|
||||
func (b *baseRouter) handleSwapDone(ev swapDone, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
|
||||
s, ok := active[ev.modelID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
delete(active, ev.modelID)
|
||||
|
||||
for _, w := range s.waiters {
|
||||
if ev.err != nil {
|
||||
b.grant(w, handlerResp{err: ev.err})
|
||||
} else {
|
||||
p := b.processes[ev.modelID]
|
||||
b.grantHandler(w, ev.modelID, p, inFlight)
|
||||
}
|
||||
}
|
||||
|
||||
b.drainQueue(active, inFlight, queued)
|
||||
}
|
||||
|
||||
// handleServeDone is called from run() each time a tracked ServeHTTP
|
||||
// finishes. It decrements the per-model in-flight count and, when that
|
||||
// drops to zero, retries the queue: requests whose swap was deferred
|
||||
// because they would have evicted this (now-idle) process can now proceed.
|
||||
func (b *baseRouter) handleServeDone(ev serveDoneEvent, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
|
||||
inFlight[ev.modelID]--
|
||||
if inFlight[ev.modelID] <= 0 {
|
||||
delete(inFlight, ev.modelID)
|
||||
b.drainQueue(active, inFlight, queued)
|
||||
}
|
||||
}
|
||||
|
||||
// drainQueue walks the queued requests in order, re-running the handleRequest
|
||||
// decision tree against the (now smaller) active set. Items that can now start
|
||||
// or join become satisfied; items still blocked remain queued in original
|
||||
// order so they get another chance on the next swap completion.
|
||||
func (b *baseRouter) drainQueue(active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
|
||||
if len(*queued) == 0 {
|
||||
return
|
||||
}
|
||||
pending := *queued
|
||||
var remaining []handlerReq
|
||||
for _, req := range pending {
|
||||
p, ok := b.processes[req.model]
|
||||
if !ok {
|
||||
b.grant(req, handlerResp{err: ErrNoLocalModelFound})
|
||||
continue
|
||||
}
|
||||
if s, ok := active[req.model]; ok {
|
||||
b.logger.Debugf("%s: queued request for model %s now joining in-flight swap", b.name, req.model)
|
||||
s.waiters = append(s.waiters, req)
|
||||
continue
|
||||
}
|
||||
evict := b.planner.EvictionFor(req.model, activeTargets(active, req.model))
|
||||
if p.State() == process.StateReady && len(evict) == 0 && !collidesWith(req.model, evict, active) {
|
||||
b.logger.Debugf("%s: queued request for model %s now served fast-path", b.name, req.model)
|
||||
b.grantHandler(req, req.model, p, inFlight)
|
||||
continue
|
||||
}
|
||||
if collidesWith(req.model, evict, active) {
|
||||
remaining = append(remaining, req)
|
||||
continue
|
||||
}
|
||||
if conflictsWithInFlight(evict, inFlight) {
|
||||
remaining = append(remaining, req)
|
||||
continue
|
||||
}
|
||||
b.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", b.name, req.model, evict)
|
||||
s := b.startSwap(req, evict)
|
||||
active[s.modelID] = s
|
||||
}
|
||||
*queued = remaining
|
||||
b.broadcastQueuePositions(*queued)
|
||||
}
|
||||
|
||||
// broadcastQueuePositions sends each queued request its current 1-indexed
|
||||
// position. Sends are non-blocking: if the channel is full, the old value is
|
||||
// drained first so the consumer always sees the latest position.
|
||||
func (b *baseRouter) broadcastQueuePositions(queued []handlerReq) {
|
||||
for i, req := range queued {
|
||||
pos := i + 1
|
||||
select {
|
||||
case req.positionCh <- pos:
|
||||
default:
|
||||
select {
|
||||
case <-req.positionCh:
|
||||
default:
|
||||
}
|
||||
select {
|
||||
case req.positionCh <- pos:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *baseRouter) startSwap(initial handlerReq, evict []string) *activeSwap {
|
||||
swap := &activeSwap{
|
||||
modelID: initial.model,
|
||||
evict: evict,
|
||||
waiters: []handlerReq{initial},
|
||||
}
|
||||
b.planner.OnSwapStart(initial.model)
|
||||
go b.doSwap(initial.model, evict)
|
||||
return swap
|
||||
}
|
||||
|
||||
// activeTargets returns the IDs of every in-flight swap target except exclude.
|
||||
// baseRouter passes this to the planner so eviction decisions account for
|
||||
// models that have been committed to but have not yet transitioned to
|
||||
// StateStarting in their process state machine.
|
||||
func activeTargets(active map[string]*activeSwap, exclude string) []string {
|
||||
if len(active) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]string, 0, len(active))
|
||||
for id := range active {
|
||||
if id == exclude {
|
||||
continue
|
||||
}
|
||||
out = append(out, id)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// collidesWith reports whether a new swap with this target and evict set can
|
||||
// safely run alongside the currently active swaps. Same-target callers should
|
||||
// JOIN (handled before this) — they do not collide with themselves.
|
||||
func collidesWith(target string, evict []string, active map[string]*activeSwap) bool {
|
||||
for id, s := range active {
|
||||
if id == target {
|
||||
continue
|
||||
}
|
||||
if containsString(evict, id) {
|
||||
return true
|
||||
}
|
||||
if containsString(s.evict, target) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// conflictsWithInFlight reports whether any model in evict is still handling
|
||||
// requests. Stopping a busy process would cancel its callers' connections,
|
||||
// so the router defers the swap until those callers finish.
|
||||
func conflictsWithInFlight(evict []string, inFlight map[string]int) bool {
|
||||
for _, m := range evict {
|
||||
if inFlight[m] > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func containsString(xs []string, s string) bool {
|
||||
for _, x := range xs {
|
||||
if x == s {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (b *baseRouter) doSwap(modelID string, toStop []string) {
|
||||
timeout := b.healthCheckTimeout()
|
||||
|
||||
@@ -493,31 +256,24 @@ func (b *baseRouter) doSwap(modelID string, toStop []string) {
|
||||
err := target.WaitReady(b.shutdownCtx)
|
||||
|
||||
select {
|
||||
case b.swapDoneCh <- swapDone{modelID: modelID, err: err}:
|
||||
case b.swapDoneCh <- scheduler.SwapDone{ModelID: modelID, Err: err}:
|
||||
case <-b.shutdownCtx.Done():
|
||||
}
|
||||
}
|
||||
|
||||
func (b *baseRouter) handleShutdown(req shutdownReq, active map[string]*activeSwap, queued []handlerReq) {
|
||||
func (b *baseRouter) handleShutdown(req shutdownReq) {
|
||||
shutdownErr := fmt.Errorf("%s is shutting down", b.name)
|
||||
|
||||
// Cancel shutdownCtx first so any waiter that is currently parked on
|
||||
// its respond channel can exit via its own shutdownCtx.Done() branch.
|
||||
// The grant calls below then either land (waiter happened to receive
|
||||
// The OnShutdown grants below then either land (waiter happened to receive
|
||||
// before noticing shutdown) or fall through immediately via grant's
|
||||
// shutdownCtx case — either way the waiter sees a non-OK response.
|
||||
// This does NOT touch processes: their lifetime is procCtx, cancelled
|
||||
// only after the graceful Stop() calls below have reaped them.
|
||||
b.shutdownFn()
|
||||
|
||||
for _, s := range active {
|
||||
for _, w := range s.waiters {
|
||||
b.grant(w, handlerResp{err: shutdownErr})
|
||||
}
|
||||
}
|
||||
for _, w := range queued {
|
||||
b.grant(w, handlerResp{err: shutdownErr})
|
||||
}
|
||||
b.schedule.OnShutdown(shutdownErr)
|
||||
|
||||
stopTimeout := req.timeout
|
||||
if stopTimeout <= 0 {
|
||||
@@ -628,75 +384,6 @@ func (b *baseRouter) Unload(timeout time.Duration, models ...string) {
|
||||
<-req.respond
|
||||
}
|
||||
|
||||
// handleUnload runs on the run loop in response to an Unload call. It
|
||||
// reconciles router-owned state with the impending Stop, then performs
|
||||
// the Stop synchronously so callers of Unload remain blocked until each
|
||||
// targeted process has actually exited.
|
||||
func (b *baseRouter) handleUnload(req unloadReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
|
||||
unloadErr := fmt.Errorf("%s: model unloaded", b.name)
|
||||
|
||||
targetSet := make(map[string]bool, len(req.targets))
|
||||
for _, id := range req.targets {
|
||||
targetSet[id] = true
|
||||
}
|
||||
|
||||
// Release waiters of any in-flight swap whose target is being
|
||||
// unloaded. The swap goroutine itself is left to finish on its own;
|
||||
// when its swapDone arrives, handleSwapDone will find no entry in
|
||||
// active and silently drop it.
|
||||
for id := range targetSet {
|
||||
s, ok := active[id]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
for _, w := range s.waiters {
|
||||
b.grant(w, handlerResp{err: unloadErr})
|
||||
}
|
||||
delete(active, id)
|
||||
}
|
||||
|
||||
// Drop queued requests addressed to unloaded models. Requests for
|
||||
// other models stay queued and may benefit from drainQueue at the end.
|
||||
if len(*queued) > 0 {
|
||||
kept := (*queued)[:0]
|
||||
for _, w := range *queued {
|
||||
if targetSet[w.model] {
|
||||
b.grant(w, handlerResp{err: unloadErr})
|
||||
continue
|
||||
}
|
||||
kept = append(kept, w)
|
||||
}
|
||||
*queued = kept
|
||||
}
|
||||
|
||||
// Stop the targeted processes. Done synchronously so Unload's caller
|
||||
// can rely on "after Unload returns, the process is stopped". inFlight
|
||||
// is intentionally NOT cleared here: each dying handler will fire its
|
||||
// trackedServe defer and reach handleServeDone in the normal way once
|
||||
// the run loop is free again.
|
||||
var wg sync.WaitGroup
|
||||
for id := range targetSet {
|
||||
p, ok := b.processes[id]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(id string, p process.Process) {
|
||||
defer wg.Done()
|
||||
if err := p.Stop(req.timeout); err != nil {
|
||||
b.logger.Warnf("%s: unloading %s failed: %v", b.name, id, err)
|
||||
}
|
||||
}(id, p)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Removing entries from active above may have unblocked queued
|
||||
// requests that previously collided with the now-cancelled swaps.
|
||||
b.drainQueue(active, inFlight, queued)
|
||||
|
||||
close(req.respond)
|
||||
}
|
||||
|
||||
func (b *baseRouter) Shutdown(timeout time.Duration) error {
|
||||
if !b.shuttingDown.CompareAndSwap(false, true) {
|
||||
return fmt.Errorf("%s shutdown already in progress", b.name)
|
||||
@@ -722,14 +409,14 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
hr := handlerReq{
|
||||
model: data.ModelID,
|
||||
ctx: req.Context(),
|
||||
// Unbuffered: a successful send on respond proves the waiter is
|
||||
hr := scheduler.HandlerReq{
|
||||
Model: data.ModelID,
|
||||
Ctx: req.Context(),
|
||||
// Unbuffered: a successful send on Respond proves the waiter is
|
||||
// alive and consuming. grant() relies on this to avoid handing a
|
||||
// handleFunc to a cancelled waiter and leaking the inFlight count.
|
||||
respond: make(chan handlerResp),
|
||||
positionCh: make(chan int, 1),
|
||||
Respond: make(chan scheduler.HandlerResp),
|
||||
PositionCh: make(chan int, 1),
|
||||
}
|
||||
|
||||
select {
|
||||
@@ -757,7 +444,7 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case pos := <-hr.positionCh:
|
||||
case pos := <-hr.PositionCh:
|
||||
lw.setUpdate(fmt.Sprintf("Queue position: #%d", pos))
|
||||
case <-swapCtx.Done():
|
||||
return
|
||||
@@ -779,9 +466,9 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
var resp handlerResp
|
||||
var resp scheduler.HandlerResp
|
||||
select {
|
||||
case resp = <-hr.respond:
|
||||
case resp = <-hr.Respond:
|
||||
finishLoading()
|
||||
case <-req.Context().Done():
|
||||
finishLoading()
|
||||
@@ -792,9 +479,9 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
if resp.err != nil {
|
||||
SendError(w, req, resp.err)
|
||||
if resp.Err != nil {
|
||||
SendError(w, req, resp.Err)
|
||||
return
|
||||
}
|
||||
resp.handleFunc(w, req)
|
||||
resp.HandleFunc(w, req)
|
||||
}
|
||||
|
||||
+15
-614
@@ -5,35 +5,34 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
// stubPlanner is a swapPlanner that returns a fixed eviction list per target
|
||||
// and never logs. It lets the base-router tests cover shared run-loop
|
||||
// behaviour without dragging in either real router's eviction rules.
|
||||
type stubPlanner struct {
|
||||
evict map[string][]string
|
||||
}
|
||||
// These tests cover baseRouter's own machinery — the run loop, process
|
||||
// lifecycle (doSwap), grant/ServeHTTP plumbing, Unload, and Shutdown. The
|
||||
// scheduling decision logic (queueing, collation, eviction collisions) lives in
|
||||
// the scheduler package and is tested directly there; see fifo_test.go.
|
||||
|
||||
func (s *stubPlanner) EvictionFor(target string, _ []string) []string {
|
||||
if s.evict == nil {
|
||||
return nil
|
||||
}
|
||||
return s.evict[target]
|
||||
}
|
||||
// stubPlanner evicts nothing. baseRouter tests drive the run loop through the
|
||||
// default FIFO scheduler without exercising any particular eviction policy.
|
||||
type stubPlanner struct{}
|
||||
|
||||
func (s *stubPlanner) OnSwapStart(string) {}
|
||||
func (s *stubPlanner) EvictionFor(string, []string) []string { return nil }
|
||||
func (s *stubPlanner) OnSwapStart(string, []string) {}
|
||||
|
||||
func newTestBase(t *testing.T, processes map[string]process.Process, planner swapPlanner) *baseRouter {
|
||||
func newTestBase(t *testing.T, processes map[string]process.Process, planner scheduler.Swapper) *baseRouter {
|
||||
t.Helper()
|
||||
conf := config.Config{HealthCheckTimeout: 5}
|
||||
b := newBaseRouter("test", conf, processes, planner, logmon.NewWriter(io.Discard))
|
||||
b := newBaseRouter("test", conf, processes, logmon.NewWriter(io.Discard),
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, logger, planner, conf.Routing.Scheduler.Settings.Fifo, eff)
|
||||
})
|
||||
b.testProcessed = make(chan struct{}, 64)
|
||||
go b.run()
|
||||
t.Cleanup(func() {
|
||||
@@ -157,114 +156,6 @@ func TestBaseRouter_Unload_StopsInParallel(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_Unload_ReleasesActiveSwapWaiters verifies that Unload
|
||||
// rejoins router state: a request whose swap to the unloaded model is
|
||||
// still in progress receives an error, instead of being abandoned
|
||||
// against a process that's about to vanish.
|
||||
func TestBaseRouter_Unload_ReleasesActiveSwapWaiters(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
// autoReady=false: the swap parks on WaitReady so we can interrupt
|
||||
// it with Unload before it completes.
|
||||
|
||||
b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{})
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w, newRequest("a"))
|
||||
close(done)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1) // handlerReq absorbed; swap started
|
||||
<-a.runStarted
|
||||
|
||||
b.Unload(time.Second, "a")
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("ServeHTTP did not return after Unload")
|
||||
}
|
||||
if w.Code == http.StatusOK {
|
||||
t.Errorf("expected non-OK status after Unload, got %d body=%q", w.Code, w.Body.String())
|
||||
}
|
||||
if a.State() != process.StateStopped {
|
||||
t.Errorf("a state=%q want stopped", a.State())
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_Unload_DropsQueuedRequests verifies that queued requests
|
||||
// for an unloaded model receive an error rather than sitting forever in
|
||||
// the queue against state the router no longer maintains.
|
||||
func TestBaseRouter_Unload_DropsQueuedRequests(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
// Loading B evicts A — so a request for B while A is loading queues.
|
||||
planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, planner)
|
||||
|
||||
// r1 starts the swap to A and parks on WaitReady (autoReady=false).
|
||||
w1 := httptest.NewRecorder()
|
||||
done1 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w1, newRequest("a"))
|
||||
close(done1)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
<-a.runStarted
|
||||
|
||||
// r2 for B collides with A's in-flight swap and queues.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w2, newRequest("b"))
|
||||
close(done2)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
// Unload B — r2 (queued, targeting B) must be released with an error.
|
||||
b.Unload(time.Second, "b")
|
||||
|
||||
select {
|
||||
case <-done2:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("queued B request did not return after Unload(b)")
|
||||
}
|
||||
if w2.Code == http.StatusOK {
|
||||
t.Errorf("queued B request: expected non-OK status, got %d", w2.Code)
|
||||
}
|
||||
if got := pb.runCalls.Load(); got != 0 {
|
||||
t.Errorf("b.runCalls=%d want 0 (B should never have been started)", got)
|
||||
}
|
||||
|
||||
// Release r1 so the test cleans up cleanly.
|
||||
a.markReady()
|
||||
select {
|
||||
case <-done1:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("r1 did not complete after a.markReady")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_FastPath(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
a.markReady()
|
||||
|
||||
b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{})
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
b.ServeHTTP(w, newRequest("a"))
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
|
||||
}
|
||||
if got := a.serveCalls.Load(); got != 1 {
|
||||
t.Errorf("serveCalls=%d want 1", got)
|
||||
}
|
||||
if got := a.runCalls.Load(); got != 0 {
|
||||
t.Errorf("runCalls=%d want 0 (fast path should not start)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_OnDemandStart(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
a.autoReady = true
|
||||
@@ -285,43 +176,6 @@ func TestBaseRouter_OnDemandStart(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_ConcurrentSameModel(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
// autoReady=false so the swap parks on WaitReady until we release it.
|
||||
|
||||
b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{})
|
||||
|
||||
const N = 5
|
||||
var wg sync.WaitGroup
|
||||
codes := make([]int, N)
|
||||
for i := 0; i < N; i++ {
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
w := httptest.NewRecorder()
|
||||
b.ServeHTTP(w, newRequest("a"))
|
||||
codes[i] = w.Code
|
||||
}(i)
|
||||
}
|
||||
|
||||
waitProcessed(t, b.testProcessed, N) // all N handlerReqs absorbed by run()
|
||||
<-a.runStarted // swap goroutine reached Run()
|
||||
a.markReady()
|
||||
wg.Wait()
|
||||
|
||||
for i, c := range codes {
|
||||
if c != http.StatusOK {
|
||||
t.Errorf("request %d: status=%d", i, c)
|
||||
}
|
||||
}
|
||||
if got := a.runCalls.Load(); got != 1 {
|
||||
t.Errorf("runCalls=%d want 1 (single swap should issue one Run)", got)
|
||||
}
|
||||
if got := a.serveCalls.Load(); got != N {
|
||||
t.Errorf("serveCalls=%d want %d", got, N)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_ContextCancel(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
// autoReady=false so swap parks forever until we mark ready.
|
||||
@@ -364,459 +218,6 @@ func TestBaseRouter_ContextCancel(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_QueuedDifferentModel(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pa := newFakeProcess("b")
|
||||
|
||||
// Loading b must stop a.
|
||||
planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pa}, planner)
|
||||
|
||||
// First request starts a swap to A; A's autoReady=false so it parks.
|
||||
w1 := httptest.NewRecorder()
|
||||
done1 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w1, newRequest("a"))
|
||||
close(done1)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
<-a.runStarted
|
||||
|
||||
// Second request for B should queue while A's swap is in flight.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w2, newRequest("b"))
|
||||
close(done2)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
if got := pa.runCalls.Load(); got != 0 {
|
||||
t.Errorf("b started early: runCalls=%d want 0 while A's swap is pending", got)
|
||||
}
|
||||
|
||||
// Release A's swap. B's swap should then run.
|
||||
a.markReady()
|
||||
waitProcessed(t, b.testProcessed, 1) // swapDone for A → B's swap kicked off
|
||||
<-pa.runStarted
|
||||
|
||||
select {
|
||||
case <-done1:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("A request did not complete")
|
||||
}
|
||||
pa.markReady()
|
||||
select {
|
||||
case <-done2:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("queued B request did not complete after A's swap")
|
||||
}
|
||||
if w2.Code != http.StatusOK {
|
||||
t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String())
|
||||
}
|
||||
if got := a.stopCalls.Load(); got != 1 {
|
||||
t.Errorf("a.stopCalls=%d want 1 (B's swap must stop A)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_QueueCollation verifies that incoming requests of the form
|
||||
// a, b, c, a, b, c collapse into three swaps (one per model) and that the
|
||||
// second request for each model rides the fast path — either joining the
|
||||
// active swap, or being pulled out of the queue when handleSwapDone promotes
|
||||
// the next model.
|
||||
func TestBaseRouter_QueueCollation(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
pc := newFakeProcess("c")
|
||||
|
||||
// Each model evicts the other two so all swaps are mutually exclusive.
|
||||
planner := &stubPlanner{evict: map[string][]string{
|
||||
"a": {"b", "c"},
|
||||
"b": {"a", "c"},
|
||||
"c": {"a", "b"},
|
||||
}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner)
|
||||
|
||||
var (
|
||||
completedMu sync.Mutex
|
||||
completed []string
|
||||
)
|
||||
record := func(id string) {
|
||||
completedMu.Lock()
|
||||
defer completedMu.Unlock()
|
||||
completed = append(completed, id)
|
||||
}
|
||||
|
||||
ids := []string{"a", "b", "c", "a", "b", "c"}
|
||||
var wg sync.WaitGroup
|
||||
for _, id := range ids {
|
||||
id := id
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
w := httptest.NewRecorder()
|
||||
b.ServeHTTP(w, newRequest(id))
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("%s: status=%d body=%q", id, w.Code, w.Body.String())
|
||||
return
|
||||
}
|
||||
record(id)
|
||||
}()
|
||||
// Wait for run() to absorb this request before launching the next,
|
||||
// so handlerCh receives them in launch order.
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
}
|
||||
|
||||
// All 6 are now parked in run()'s waiters/queue. Release each swap in
|
||||
// sequence, waiting deterministically for each promotion to fire.
|
||||
<-a.runStarted
|
||||
a.markReady()
|
||||
waitProcessed(t, b.testProcessed, 1) // swapDone(a) → b swap kicked off
|
||||
|
||||
<-pb.runStarted
|
||||
pb.markReady()
|
||||
waitProcessed(t, b.testProcessed, 1) // swapDone(b) → c swap kicked off
|
||||
|
||||
<-pc.runStarted
|
||||
pc.markReady()
|
||||
wg.Wait()
|
||||
|
||||
if got := len(completed); got != 6 {
|
||||
t.Fatalf("completed=%v want 6", completed)
|
||||
}
|
||||
|
||||
// run() fans out responses in model-grouped order (a1,a2 → b1,b2 → c1,c2)
|
||||
// but waiter goroutines may be scheduled in any order after their respond
|
||||
// channel fires, so completion order isn't deterministic. Per-model counts
|
||||
// (combined with the runCalls checks below) are sufficient to prove queue
|
||||
// collation collapsed each pair into a single swap.
|
||||
aDone, bDone, cDone := 0, 0, 0
|
||||
for _, id := range completed {
|
||||
switch id {
|
||||
case "a":
|
||||
aDone++
|
||||
case "b":
|
||||
bDone++
|
||||
case "c":
|
||||
cDone++
|
||||
}
|
||||
}
|
||||
if aDone != 2 || bDone != 2 || cDone != 2 {
|
||||
t.Errorf("per-model counts: a=%d b=%d c=%d, want 2 each (order=%v)", aDone, bDone, cDone, completed)
|
||||
}
|
||||
|
||||
// Single swap per model — the second request for each must have ridden
|
||||
// the fast path (joined active swap or joined a queued sibling), not
|
||||
// triggered an extra Run.
|
||||
if got := a.runCalls.Load(); got != 1 {
|
||||
t.Errorf("a.runCalls=%d want 1", got)
|
||||
}
|
||||
if got := pb.runCalls.Load(); got != 1 {
|
||||
t.Errorf("b.runCalls=%d want 1", got)
|
||||
}
|
||||
if got := pc.runCalls.Load(); got != 1 {
|
||||
t.Errorf("c.runCalls=%d want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_ConcurrentDisjointSwaps verifies that two requests with
|
||||
// non-conflicting evict sets are loaded in parallel: both Run() calls happen
|
||||
// before either process is marked ready.
|
||||
func TestBaseRouter_ConcurrentDisjointSwaps(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
|
||||
// Empty evict sets for both: they can load in parallel.
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, &stubPlanner{})
|
||||
|
||||
w1 := httptest.NewRecorder()
|
||||
done1 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w1, newRequest("a"))
|
||||
close(done1)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w2, newRequest("b"))
|
||||
close(done2)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
// Both swaps must have reached Run() before either is marked ready —
|
||||
// proves they ran in parallel rather than serializing.
|
||||
<-a.runStarted
|
||||
<-pb.runStarted
|
||||
|
||||
a.markReady()
|
||||
pb.markReady()
|
||||
|
||||
select {
|
||||
case <-done1:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("request A did not complete")
|
||||
}
|
||||
select {
|
||||
case <-done2:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("request B did not complete")
|
||||
}
|
||||
|
||||
if w1.Code != http.StatusOK {
|
||||
t.Errorf("A status=%d body=%q", w1.Code, w1.Body.String())
|
||||
}
|
||||
if w2.Code != http.StatusOK {
|
||||
t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String())
|
||||
}
|
||||
if got := a.stopCalls.Load(); got != 0 {
|
||||
t.Errorf("a.stopCalls=%d want 0 (parallel swap, no eviction)", got)
|
||||
}
|
||||
if got := pb.stopCalls.Load(); got != 0 {
|
||||
t.Errorf("b.stopCalls=%d want 0 (parallel swap, no eviction)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_QueueDrainPromotesMultiple verifies that completing one swap
|
||||
// unblocks every queued request that no longer collides — they all start in
|
||||
// parallel rather than one-per-completion.
|
||||
func TestBaseRouter_QueueDrainPromotesMultiple(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
pc := newFakeProcess("c")
|
||||
|
||||
// A's swap evicts both B and C, so B and C must queue. Once A finishes
|
||||
// B and C themselves have empty evict sets, so they can start together.
|
||||
planner := &stubPlanner{evict: map[string][]string{
|
||||
"a": {"b", "c"},
|
||||
}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner)
|
||||
|
||||
w1 := httptest.NewRecorder()
|
||||
done1 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w1, newRequest("a"))
|
||||
close(done1)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
<-a.runStarted
|
||||
|
||||
// B and C arrive while A is loading. evict_b and evict_c are empty,
|
||||
// but collidesWith returns true because they appear in A's evict set.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w2, newRequest("b"))
|
||||
close(done2)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
w3 := httptest.NewRecorder()
|
||||
done3 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w3, newRequest("c"))
|
||||
close(done3)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
if got := pb.runCalls.Load(); got != 0 {
|
||||
t.Errorf("b started early: runCalls=%d", got)
|
||||
}
|
||||
if got := pc.runCalls.Load(); got != 0 {
|
||||
t.Errorf("c started early: runCalls=%d", got)
|
||||
}
|
||||
|
||||
// Release A. The swapDone handler should drain the queue and start
|
||||
// both B and C in parallel.
|
||||
a.markReady()
|
||||
waitProcessed(t, b.testProcessed, 1) // swapDone(A) → drainQueue starts B and C
|
||||
<-pb.runStarted
|
||||
<-pc.runStarted
|
||||
|
||||
pb.markReady()
|
||||
pc.markReady()
|
||||
|
||||
for i, ch := range []chan struct{}{done1, done2, done3} {
|
||||
select {
|
||||
case <-ch:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatalf("request %d did not complete", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_Shutdown_FailsAllInFlight verifies that shutdown returns
|
||||
// the shutdown error to every waiter on every active swap AND to every
|
||||
// queued request.
|
||||
func TestBaseRouter_Shutdown_FailsAllInFlight(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
pc := newFakeProcess("c")
|
||||
|
||||
// a and b load in parallel (empty evicts). c collides with both.
|
||||
planner := &stubPlanner{evict: map[string][]string{
|
||||
"c": {"a", "b"},
|
||||
}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner)
|
||||
|
||||
const waitersPer = 2
|
||||
var wg sync.WaitGroup
|
||||
codes := make([]int, 0, 2*waitersPer+1)
|
||||
var codesMu sync.Mutex
|
||||
record := func(code int) {
|
||||
codesMu.Lock()
|
||||
codes = append(codes, code)
|
||||
codesMu.Unlock()
|
||||
}
|
||||
|
||||
launch := func(model string) {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
w := httptest.NewRecorder()
|
||||
b.ServeHTTP(w, newRequest(model))
|
||||
record(w.Code)
|
||||
}()
|
||||
}
|
||||
|
||||
// Active swaps for a and b, each with 2 waiters.
|
||||
for i := 0; i < waitersPer; i++ {
|
||||
launch("a")
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
}
|
||||
for i := 0; i < waitersPer; i++ {
|
||||
launch("b")
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
}
|
||||
// c collides with both → queues.
|
||||
launch("c")
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
<-a.runStarted
|
||||
<-pb.runStarted
|
||||
|
||||
if err := b.Shutdown(time.Second); err != nil {
|
||||
t.Fatalf("Shutdown: %v", err)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
codesMu.Lock()
|
||||
defer codesMu.Unlock()
|
||||
if len(codes) != 2*waitersPer+1 {
|
||||
t.Fatalf("got %d responses, want %d", len(codes), 2*waitersPer+1)
|
||||
}
|
||||
for i, c := range codes {
|
||||
if c == http.StatusOK {
|
||||
t.Errorf("response %d: status=%d, want non-200 (shutdown)", i, c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestBaseRouter_NoSwapWhileServing verifies that an already-loaded model
|
||||
// is not stopped to satisfy another model's swap while it is still handling
|
||||
// a request.
|
||||
//
|
||||
// Sequence:
|
||||
// 1. r1 (A) — A loads; ServeHTTP enters and is pinned via serveBlock.
|
||||
// 2. r2 (B, planner: B evicts A) — must NOT cause A.Stop while r1 is live.
|
||||
// 3. r3 (A) — arrives next; the existing code queues it because B's swap
|
||||
// intent collides with A.
|
||||
// 4. r1 released — A finishes r1, then r3 is served by A.
|
||||
// 5. B's swap then proceeds; r2 is served by B.
|
||||
//
|
||||
// fakeProcess.stoppedWhileServing flips true if Stop is ever called while
|
||||
// a ServeHTTP is in flight — a direct, race-free signal of the violation.
|
||||
func TestBaseRouter_NoSwapWhileServing(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
// autoReady left false: we markReady manually after observing runStarted,
|
||||
// so autoReady's setState(Ready) cannot race with a later Stop and leave
|
||||
// A in Ready, masking the bug.
|
||||
a.serveBlock = make(chan struct{})
|
||||
pb := newFakeProcess("b")
|
||||
// Same reasoning for B: park its swap on WaitReady until we choose.
|
||||
|
||||
planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
|
||||
b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, planner)
|
||||
|
||||
// r1 — load A and enter its ServeHTTP (which blocks on serveBlock).
|
||||
w1 := httptest.NewRecorder()
|
||||
done1 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w1, newRequest("a"))
|
||||
close(done1)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1) // handlerReq for r1
|
||||
<-a.runStarted
|
||||
a.markReady()
|
||||
waitProcessed(t, b.testProcessed, 1) // swapDone for A
|
||||
<-a.serveStarted
|
||||
|
||||
// r2 — would evict A. A must not be stopped while r1 is in flight.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w2, newRequest("b"))
|
||||
close(done2)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
// r3 — another request for A, arrives behind r2 and queues because
|
||||
// B's swap intent (which evicts A) is recorded as active.
|
||||
w3 := httptest.NewRecorder()
|
||||
done3 := make(chan struct{})
|
||||
go func() {
|
||||
b.ServeHTTP(w3, newRequest("a"))
|
||||
close(done3)
|
||||
}()
|
||||
waitProcessed(t, b.testProcessed, 1)
|
||||
|
||||
// Release r1 (and r3 if it is fast-pathed onto the still-loaded A).
|
||||
// The router must hold off B's swap until A has drained.
|
||||
close(a.serveBlock)
|
||||
|
||||
select {
|
||||
case <-done1:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("r1 did not complete after serveBlock release")
|
||||
}
|
||||
|
||||
// Wait for B.Run before marking it ready: markReady before Run would
|
||||
// skip the Run path entirely and leave pb.runCalls at 0. In a correct
|
||||
// implementation B's swap only starts after A has drained; in the
|
||||
// current implementation it has already started — either way runStarted
|
||||
// fires.
|
||||
<-pb.runStarted
|
||||
pb.markReady()
|
||||
|
||||
select {
|
||||
case <-done2:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("r2 did not complete after B marked ready")
|
||||
}
|
||||
select {
|
||||
case <-done3:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("r3 did not complete")
|
||||
}
|
||||
|
||||
if w1.Code != http.StatusOK || w2.Code != http.StatusOK || w3.Code != http.StatusOK {
|
||||
t.Fatalf("statuses: w1=%d w2=%d w3=%d", w1.Code, w2.Code, w3.Code)
|
||||
}
|
||||
if w1.Body.String() != "ok:a" {
|
||||
t.Errorf("r1 body=%q want ok:a", w1.Body.String())
|
||||
}
|
||||
if w3.Body.String() != "ok:a" {
|
||||
t.Errorf("r3 body=%q want ok:a (r3 must be served by A)", w3.Body.String())
|
||||
}
|
||||
if w2.Body.String() != "ok:b" {
|
||||
t.Errorf("r2 body=%q want ok:b", w2.Body.String())
|
||||
}
|
||||
if a.stoppedWhileServing.Load() {
|
||||
t.Errorf("A.Stop was called while A was still handling a request — the router swapped out a busy process")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaseRouter_ModelNotFound(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
b := newTestBase(t, map[string]process.Process{"a": a}, &stubPlanner{})
|
||||
|
||||
@@ -0,0 +1,404 @@
|
||||
# Router design
|
||||
|
||||
A developer tutorial for the `internal/router` package and its `scheduler`
|
||||
sub-package.
|
||||
|
||||
## Intro
|
||||
|
||||
A llama-swap router is the component that sits behind the proxy and answers one
|
||||
question for every incoming request: _can this model serve right now, and if
|
||||
not, what has to happen first?_ Answering it means juggling three concerns that
|
||||
used to live tangled together in one type:
|
||||
|
||||
1. **Process machinery** — owning the OS processes, starting and stopping them,
|
||||
running health checks, and shuttling HTTP requests onto the right upstream.
|
||||
2. **Scheduling strategy** — the queue, in-flight bookkeeping, and the decision
|
||||
tree that turns one request into "serve now", "join an existing swap",
|
||||
"queue", or "start a swap".
|
||||
3. **Eviction policy** — given a model we want to load, which currently-running
|
||||
models have to be stopped to make room?
|
||||
|
||||
The design pulls those three apart into separate, independently replaceable
|
||||
pieces:
|
||||
|
||||
| Concern | Type | Lives in |
|
||||
| ------------------- | ------------------------------ | ------------------------------- |
|
||||
| Process machinery | `baseRouter` | `internal/router/base.go` |
|
||||
| Scheduling strategy | `scheduler.Scheduler` (`FIFO`) | `internal/router/scheduler/` |
|
||||
| Eviction policy | `scheduler.Swapper` | `groupSwapper`, `matrixSwapper` |
|
||||
|
||||
`baseRouter` keeps the channels, run loop, process lifecycle, and shutdown
|
||||
teardown, and exposes the side-effects a scheduler needs through the
|
||||
`scheduler.Effects` interface. The scheduler owns the queue and decision tree
|
||||
but performs no side-effects directly — it calls back through `Effects`. The
|
||||
`Swapper` is a pure function from "target model + currently running" to "models
|
||||
to evict", and knows nothing about queues, channels, or processes.
|
||||
|
||||
Because the seams are interfaces, you can replace the scheduling strategy
|
||||
without touching process management, or write a new eviction policy without
|
||||
touching either. `FIFO` is the first and currently only `Scheduler`;
|
||||
`groupSwapper` and `matrixSwapper` are the two `Swapper`s.
|
||||
|
||||
## Key concepts
|
||||
|
||||
### One run loop, no locks
|
||||
|
||||
`baseRouter.run()` is a single goroutine selecting over a handful of channels:
|
||||
|
||||
```go
|
||||
for {
|
||||
select {
|
||||
case req := <-b.shutdownCh: b.handleShutdown(req); return
|
||||
case req := <-b.handlerCh: b.schedule.OnRequest(req)
|
||||
case req := <-b.unloadCh: b.schedule.OnUnload(req.targets, req.timeout); close(req.respond)
|
||||
case ev := <-b.swapDoneCh: b.schedule.OnSwapDone(ev)
|
||||
case ev := <-b.serveDoneCh: b.schedule.OnServeDone(ev)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Every `Scheduler` method runs on this one goroutine. That is the single most
|
||||
important fact about the design: **the scheduler never needs a mutex for its own
|
||||
state**. All scheduler state is touched only from these callbacks, which are
|
||||
serialized by the run loop. If you write a new scheduler, you get the same
|
||||
guarantee for free — and you must not break it by spinning up goroutines that
|
||||
mutate scheduler state.
|
||||
|
||||
### Events flow in, side-effects flow out
|
||||
|
||||
The run loop turns external happenings into method calls on the scheduler:
|
||||
|
||||
- A new HTTP request becomes `OnRequest(HandlerReq)`.
|
||||
- A swap goroutine finishing becomes `OnSwapDone(SwapDone)`.
|
||||
- A tracked request handler returning becomes `OnServeDone(ServeDoneEvent)`.
|
||||
- An admin unload becomes `OnUnload(targets, timeout)`.
|
||||
- Shutdown becomes `OnShutdown(err)`.
|
||||
|
||||
The scheduler reacts by calling **back out** through `Effects`: inspect a
|
||||
process state, start a swap, grant a response to a caller, or stop processes. It
|
||||
never calls `process.Process` directly and never writes to a channel directly.
|
||||
This keeps the scheduler pure enough to unit-test against a fake `Effects` with
|
||||
no goroutines or real processes involved (see `scheduler/fifo_test.go`).
|
||||
|
||||
```
|
||||
HTTP request admin Unload / Shutdown
|
||||
│ │
|
||||
▼ ▼
|
||||
ServeHTTP ──HandlerReq──▶ baseRouter.run() ◀──unloadCh/shutdownCh
|
||||
│ (single goroutine)
|
||||
▼
|
||||
Scheduler.On*(...)
|
||||
│ calls back through
|
||||
▼
|
||||
Effects: ModelState / StartSwap /
|
||||
GrantServe / GrantError / StopProcesses
|
||||
│
|
||||
▼
|
||||
baseRouter side-effects: doSwap goroutine,
|
||||
grant() to caller, process.Stop()
|
||||
│
|
||||
swap completes ──SwapDone──▶ back into run loop
|
||||
```
|
||||
|
||||
### The swap goroutine
|
||||
|
||||
Scheduling decisions must be quick and non-blocking, but loading a model is
|
||||
slow. The two are reconciled by doing the slow part on a separate goroutine.
|
||||
|
||||
When the scheduler decides to start a swap, inside `OnRequest` it:
|
||||
|
||||
1. records "a swap for X is in flight" in its own state, then
|
||||
2. calls `Effects.StartSwap(modelID, evict)`.
|
||||
|
||||
`StartSwap` does **not** load the model itself — it just launches a detached
|
||||
goroutine (`doSwap`) and returns straight away. `doSwap` is what does the slow
|
||||
work: stop the evicted processes, start the target, wait for it to become ready.
|
||||
Because `StartSwap` returned immediately, `OnRequest` returns too, and the run
|
||||
loop is free to pick up the next event — another request, a serve-done, an
|
||||
unload — while `doSwap` runs in the background.
|
||||
|
||||
The swap's eventual result comes back as just another event: when `doSwap`
|
||||
finishes it posts a `SwapDone` onto `swapDoneCh`, which the run loop delivers as
|
||||
`OnSwapDone`. So a slow load never blocks the run loop; it brackets it with two
|
||||
quick events (`OnRequest` to start, `OnSwapDone` to finish) and everything in
|
||||
between is handled normally.
|
||||
|
||||
### In-flight tracking and `trackedServe`
|
||||
|
||||
When the scheduler grants a request, the handler it hands back is wrapped by
|
||||
`baseRouter.trackedServe`. The wrapper runs the real `ServeHTTP` and, on return,
|
||||
posts a `ServeDoneEvent` so the run loop can decrement the per-model in-flight
|
||||
count. This is why the scheduler can know whether a process is "busy": it counts
|
||||
grants out and serve-dones in. A swap that would evict a busy process is
|
||||
deferred until that process's in-flight count hits zero (`OnServeDone` then
|
||||
re-drains the queue).
|
||||
|
||||
The subtle contract here is `GrantServe`'s boolean return. The caller's
|
||||
`Respond` channel is unbuffered, so a successful send proves the HTTP goroutine
|
||||
is alive and took the handler. If the caller already disconnected, the send
|
||||
fails, `trackedServe` never runs, and **no** `ServeDoneEvent` will ever arrive —
|
||||
so the scheduler must only increment `inFlight` when `GrantServe` returns true.
|
||||
Incrementing on a false return would strand the counter above zero and the model
|
||||
could never be evicted again.
|
||||
|
||||
## The interfaces
|
||||
|
||||
All three live in `scheduler/scheduler.go`.
|
||||
|
||||
### `Scheduler`
|
||||
|
||||
```go
|
||||
type Scheduler interface {
|
||||
OnRequest(req HandlerReq)
|
||||
OnSwapDone(ev SwapDone)
|
||||
OnServeDone(ev ServeDoneEvent)
|
||||
OnUnload(targets []string, timeout time.Duration)
|
||||
OnShutdown(err error)
|
||||
}
|
||||
```
|
||||
|
||||
Owns the queue, in-flight tracking, and the decision tree. All methods run on
|
||||
the run-loop goroutine, so no internal locking is needed.
|
||||
|
||||
### `Swapper`
|
||||
|
||||
```go
|
||||
type Swapper interface {
|
||||
EvictionFor(target string, running []string) []string
|
||||
OnSwapStart(target string, running []string)
|
||||
}
|
||||
```
|
||||
|
||||
The eviction policy. `EvictionFor` is a **pure decision** — given the target and
|
||||
the complete `running` set, return the running model IDs that must stop. It must
|
||||
not log or mutate anything, and it does **not** inspect process state itself:
|
||||
the scheduler hands it `running` already assembled (every non-stopped process,
|
||||
unioned with the targets of in-flight swaps already committed but not yet
|
||||
visible in process state). That keeps the swapper a pure function of its inputs,
|
||||
with no reference to processes.
|
||||
|
||||
The reason it must not log is that it is a _speculative_ query — "what would we
|
||||
evict if we started this swap right now?" — called far more often than swaps
|
||||
actually happen. The scheduler calls it once per incoming request, and then
|
||||
**again for every still-queued request on every queue drain** (each `OnSwapDone`,
|
||||
`OnServeDone`, and `OnUnload`). Most of those calls end in "still queued",
|
||||
"collides", or "nothing to evict", not a real swap. Logging there would emit
|
||||
duplicate lines for a request that simply sits in the queue, and lines for
|
||||
decisions that never happen — the log would stop meaning "a swap occurred".
|
||||
|
||||
`OnSwapStart` is the one place a Swapper may log, because it is called exactly
|
||||
once, at the moment a swap is committed. One log line there equals one real swap,
|
||||
with the evict set that is genuinely being applied — which is why `matrixSwapper`
|
||||
re-solves and logs the full decision (set, DSL, cost) in `OnSwapStart` rather
|
||||
than in `EvictionFor`.
|
||||
|
||||
### `Effects`
|
||||
|
||||
```go
|
||||
type Effects interface {
|
||||
ModelState(modelID string) (process.ProcessState, bool)
|
||||
RunningModels() map[string]process.ProcessState
|
||||
StartSwap(modelID string, evict []string)
|
||||
GrantError(req HandlerReq, err error)
|
||||
GrantServe(req HandlerReq, modelID string) bool
|
||||
StopProcesses(timeout time.Duration, ids []string)
|
||||
}
|
||||
```
|
||||
|
||||
Implemented by `baseRouter`. This is the scheduler's entire window onto the
|
||||
outside world; everything else about the router is hidden from it. See the
|
||||
deep-dive below.
|
||||
|
||||
### `Factory` — wiring it together
|
||||
|
||||
```go
|
||||
type Factory func(name string, logger *logmon.Monitor, eff Effects) Scheduler
|
||||
```
|
||||
|
||||
`baseRouter` doesn't know which scheduler or swapper it has — it is handed a
|
||||
`Factory` at construction and calls it once, passing itself as the `Effects`.
|
||||
The concrete router captures its `Swapper` in the closure. From `group.go`:
|
||||
|
||||
```go
|
||||
swapper := &groupSwapper{ /* ... */ }
|
||||
base := newBaseRouter("group", conf, processes, proxylog,
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, logger, swapper, eff)
|
||||
})
|
||||
```
|
||||
|
||||
This closure is the single point where the three pieces meet: it binds a
|
||||
specific `Swapper` (`swapper`) and a specific `Scheduler` (`FIFO`) to the
|
||||
`baseRouter`'s `Effects` (`eff`).
|
||||
|
||||
**The swapper is a separate type from the concrete router.** There are currently two router implementations router.Group and router.Matrix. Each of these has a custom swapper that implements scheduler.Swapper for custom eviction logic. This decoupling of responsibilities makes it easy to implement custom swapping strategies.
|
||||
|
||||
### The events
|
||||
|
||||
A single goroutine in `baseRouter.run()` owns and serializes all state changes in the router. By processing events one at a time it ensures correctness and eliminates complex mutex lock logic.
|
||||
|
||||
These are the events the router currently uses:
|
||||
|
||||
```go
|
||||
type HandlerReq struct { // one in-flight ServeHTTP awaiting a decision
|
||||
Model string
|
||||
Ctx context.Context
|
||||
Respond chan HandlerResp // UNBUFFERED — see GrantServe contract
|
||||
PositionCh chan int // queue-position updates for the loading UI
|
||||
}
|
||||
|
||||
type HandlerResp struct { // the decision handed back to the caller
|
||||
HandleFunc http.HandlerFunc // serve with this, or...
|
||||
Err error // ...fail with this
|
||||
}
|
||||
|
||||
type SwapDone struct{ ModelID string; Err error } // swap goroutine finished
|
||||
type ServeDoneEvent struct{ ModelID string } // tracked handler returned
|
||||
```
|
||||
|
||||
## Deep-dive: the `Effects` interface and why it exists
|
||||
|
||||
`Effects` is the inversion-of-control boundary that makes the split possible.
|
||||
The scheduler decides and `baseRouter` _acts_. Pulling the side-effects behind this
|
||||
interface buys three things:
|
||||
|
||||
1. **Purity and testability.** The scheduler performs no I/O, starts no
|
||||
goroutines of its own, and touches no real processes. Its tests drive the
|
||||
`On*` methods directly and assert on a `fakeEffects` that just records the
|
||||
calls — synchronous, deterministic, no sleeps. (`scheduler/fifo_test.go`.)
|
||||
2. **A single, auditable side-effect surface.** Every externally-visible thing a
|
||||
scheduler can do is one of six methods. You can reason about the whole
|
||||
contract by reading one interface.
|
||||
3. **Decoupling lifetime.** The scheduler never holds a `process.Process`,
|
||||
never sees a channel, and never learns how shutdown teardown works. It only
|
||||
knows model IDs and states.
|
||||
|
||||
Method by method, as implemented in `base.go`:
|
||||
|
||||
- **`ModelState(modelID) (state, ok)`** — read-only snapshot of a process's
|
||||
state, and whether this router handles the model at all. The scheduler uses it
|
||||
for the "unknown model" check and the "already ready" fast path. Safe to call
|
||||
any time because the process map is fixed at construction and `State()` is a
|
||||
snapshot.
|
||||
|
||||
- **`RunningModels()`** — the state of every process that isn't stopped or shut
|
||||
down. The scheduler unions its keys with its own in-flight swap targets to
|
||||
build the `running` set it hands the `Swapper`, so the swapper never has to
|
||||
touch process state itself.
|
||||
|
||||
- **`StartSwap(modelID, evict)`** — fire-and-forget. `baseRouter` launches the
|
||||
`doSwap` goroutine and returns immediately; the result comes back later as a
|
||||
`SwapDone`. The scheduler records the swap as active _before_ calling this so
|
||||
that requests arriving in the meantime can join it.
|
||||
|
||||
- **`GrantError(req, err)`** — hand a caller an error response. Used for unknown
|
||||
models, failed swaps, unloads, and shutdown.
|
||||
|
||||
- **`GrantServe(req, modelID) bool`** — hand a caller the tracked handler for a
|
||||
ready model, returning whether the caller was still there to receive it. The
|
||||
scheduler increments the in-flight count **only on a true return** (see the
|
||||
in-flight contract above). This is the one `Effects` method whose return value
|
||||
carries state-machine significance.
|
||||
|
||||
- **`StopProcesses(timeout, ids)`** — stop processes in parallel and **block**
|
||||
until all have stopped. Used by `OnUnload` so an admin `Unload` call can
|
||||
guarantee the process is dead by the time it returns. (Note `StartSwap` is
|
||||
async but `StopProcesses` is sync — the difference is deliberate and tied to
|
||||
the caller's expectations.)
|
||||
|
||||
A useful way to hold it in your head: `Effects` is the scheduler's syscall
|
||||
table. The scheduler is a pure state machine; `Effects` is how it touches the
|
||||
world, and `baseRouter` is the kernel that implements those syscalls with real
|
||||
goroutines, channels, and processes.
|
||||
|
||||
## How to implement a new `Swapper`
|
||||
|
||||
A `Swapper` is a pure decision function plus a logging hook — the easiest of the three pieces to replace.
|
||||
|
||||
1. **Write the swapper type** and give it whatever config it needs to make a
|
||||
decision. It does **not** need the process map — the scheduler supplies the
|
||||
running set as an argument. `groupSwapper` holds only its group config;
|
||||
`matrixSwapper` holds only its solver and logger:
|
||||
|
||||
```go
|
||||
type mySwapper struct {
|
||||
config config.Config
|
||||
}
|
||||
```
|
||||
|
||||
2. **Implement `EvictionFor(target, running)`** as a _pure_ decision:
|
||||
- `running` is the complete live set, already assembled for you: every
|
||||
non-stopped process unioned with the targets of in-flight swaps the
|
||||
scheduler has committed to. You don't filter process state or fold in
|
||||
in-flight targets yourself, that's the scheduler's job. Just decide against the slice you're handed.
|
||||
- Return the list of model IDs in `running` that must stop for `target` to
|
||||
run. Return `nil`/empty when nothing needs evicting.
|
||||
- Do **not** mutate state here.
|
||||
- Do **not** log here. It can be called multiple times per request. Since it is pure function have tests verify the expected behaviour.
|
||||
|
||||
3. **Implement `OnSwapStart(target, running)`** — called once when a swap
|
||||
actually begins, with the same `running` set `EvictionFor` saw. This is the
|
||||
right place to log: one call equals one real swap. `matrixSwapper` re-solves
|
||||
and logs the chosen set and cost here; `groupSwapper` logs nothing.
|
||||
|
||||
4. **Wire it in** by instantiating the swapper in your router's constructor and
|
||||
capturing it in the `Factory` closure passed to `newBaseRouter` — exactly as
|
||||
`NewGroup` and `NewMatrix` do. The router struct itself only ever embeds
|
||||
`*baseRouter`; the swapper reaches the scheduler solely through that closure.
|
||||
|
||||
Reference implementations: `groupSwapper` (static group config) in `group.go`
|
||||
and `matrixSwapper` (cost-based set solver) in `matrix.go`.
|
||||
|
||||
## How to implement a new `Scheduler`
|
||||
|
||||
Replacing the scheduler means taking over the queue and the entire decision tree. Read `scheduler/fifo.go` end to end first — it is the reference implementation and the rules below are easiest to understand in context.
|
||||
|
||||
The rules you must honour:
|
||||
|
||||
- **Single goroutine.** Every method runs on the `baseRouter.run()` goroutine. Keep your state in plain maps/slices and never read or write it from another goroutine. If you need slow work done, hand it to `Effects.StartSwap` and react to the resulting `SwapDone` — do not block a method waiting for it.
|
||||
|
||||
- **Never block the run loop.** `OnRequest`, `OnSwapDone`, and `OnServeDone` must make a decision and return. The one method allowed to block is `OnUnload`, and only because it must wait on the synchronous `StopProcesses` so the admin caller's guarantee holds.
|
||||
|
||||
- **Respect the `GrantServe` boolean.** Only count a request as in-flight when `GrantServe` returns true (see the in-flight contract above). A false return means the caller is gone; no `ServeDoneEvent` will ever arrive, so incrementing on false permanently strands the counter.
|
||||
|
||||
- **Account for in-flight swaps in your running set.** When you call `Swapper.EvictionFor`, the running set you pass must include not just live processes (`Effects.RunningModels`) but also the targets of swaps you've already started that aren't yet visible in process state — otherwise the swapper contradicts decisions already in motion.
|
||||
|
||||
What each method must do:
|
||||
|
||||
- **`OnRequest(req)`** — every request must resolve to exactly one of: granted, errored, joined (piggybacks an in-flight swap), queued, or swap-started. No request may be silently dropped.
|
||||
|
||||
- **`OnSwapDone(ev)`** — deliver the result to every waiter that joined this swap (grant on success, error on `ev.Err`), drop the swap from active tracking, then re-examine anything queued — a finished swap may have unblocked it.
|
||||
|
||||
- **`OnServeDone(ev)`** — decrement the model's in-flight count; when it hits zero, re-examine the queue. Do **not** clear in-flight counts by hand; the handlers post their own `ServeDoneEvent`s on return.
|
||||
|
||||
- **`OnUnload(targets, timeout)`** — error out any waiters or queued requests for the unloaded models, call `Effects.StopProcesses` (synchronously — the admin caller relies on the process being dead afterwards), then re-examine the queue.
|
||||
|
||||
- **`OnShutdown(err)`** — error out every waiter you still hold (active swap waiters and queued requests). Don't touch processes; teardown is `baseRouter`'s job.
|
||||
|
||||
Expose a constructor matching the `Factory` shape:
|
||||
|
||||
```go
|
||||
func NewMyScheduler(name string, logger *logmon.Monitor, swapper Swapper, eff Effects) *MyScheduler {
|
||||
// ...
|
||||
}
|
||||
|
||||
// in the concrete router:
|
||||
base := newBaseRouter(name, conf, processes, proxylog,
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewMyScheduler(name, logger, swapper, eff)
|
||||
})
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
- **Schedulers** are tested as pure state machines in the `scheduler` package:
|
||||
drive the `On*` methods directly against a `fakeEffects` and assert on the
|
||||
recorded grants/starts/stops. No goroutines, no sleeps. See
|
||||
`scheduler/fifo_test.go` as the reference; follow the `TestSchedulerName_<scenario>`
|
||||
naming convention.
|
||||
- **`baseRouter` mechanism** (run loop, `grant`/`ServeHTTP`, `Unload`,
|
||||
`Shutdown`) is tested in `base_test.go`. The run loop exposes a
|
||||
`testProcessed` channel so tests can wait for an event to be fully processed
|
||||
instead of sleeping.
|
||||
- Run new tests with `go test -v -run TestMyScheduler_... ./internal/router/scheduler/`,
|
||||
then `make test-dev` for a quick `go test` + `staticcheck` pass over `proxy/`.
|
||||
+14
-19
@@ -6,6 +6,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
type Group struct {
|
||||
@@ -14,7 +15,7 @@ type Group struct {
|
||||
|
||||
func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group, error) {
|
||||
modelToGroup := make(map[string]string)
|
||||
for gid, gcfg := range conf.Groups {
|
||||
for gid, gcfg := range conf.Routing.Router.Settings.Groups {
|
||||
for _, mid := range gcfg.Members {
|
||||
if existing, dup := modelToGroup[mid]; dup {
|
||||
return nil, fmt.Errorf("model %q is in multiple groups: %q and %q", mid, existing, gid)
|
||||
@@ -23,14 +24,16 @@ func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group
|
||||
}
|
||||
}
|
||||
|
||||
planner := &groupPlanner{
|
||||
swapper := &groupSwapper{
|
||||
config: conf,
|
||||
modelToGroup: modelToGroup,
|
||||
}
|
||||
|
||||
processes := make(map[string]process.Process, len(modelToGroup))
|
||||
base := newBaseRouter("group", conf, processes, planner, proxylog)
|
||||
planner.processes = processes
|
||||
base := newBaseRouter("group", conf, processes, proxylog,
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff)
|
||||
})
|
||||
|
||||
for mid := range modelToGroup {
|
||||
modelCfg, _, ok := conf.FindConfig(mid)
|
||||
@@ -54,21 +57,20 @@ func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group
|
||||
return g, nil
|
||||
}
|
||||
|
||||
// groupPlanner decides evictions from static group configuration.
|
||||
// groupSwapper decides evictions from static group configuration.
|
||||
//
|
||||
// Same-group siblings are stopped when the group has swap=true. Cross-group
|
||||
// members are stopped only when the target's group is exclusive; loading a
|
||||
// model from a non-exclusive group leaves running exclusive groups alone,
|
||||
// matching the gotcha in the original ProcessGroup behaviour.
|
||||
type groupPlanner struct {
|
||||
type groupSwapper struct {
|
||||
config config.Config
|
||||
modelToGroup map[string]string
|
||||
processes map[string]process.Process
|
||||
}
|
||||
|
||||
func (p *groupPlanner) EvictionFor(target string, alsoRunning []string) []string {
|
||||
func (p *groupSwapper) EvictionFor(target string, running []string) []string {
|
||||
tg := p.modelToGroup[target]
|
||||
tgCfg := p.config.Groups[tg]
|
||||
tgCfg := p.config.Routing.Router.Settings.Groups[tg]
|
||||
|
||||
seen := make(map[string]struct{})
|
||||
var result []string
|
||||
@@ -89,24 +91,17 @@ func (p *groupPlanner) EvictionFor(target string, alsoRunning []string) []string
|
||||
// for backwards compatibility. The newer swap matrix approach does not
|
||||
// have this issue.
|
||||
case og != tg && tgCfg.Exclusive:
|
||||
if ogCfg := p.config.Groups[og]; !ogCfg.Persistent {
|
||||
if ogCfg := p.config.Routing.Router.Settings.Groups[og]; !ogCfg.Persistent {
|
||||
seen[mID] = struct{}{}
|
||||
result = append(result, mID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for mID, proc := range p.processes {
|
||||
st := proc.State()
|
||||
if st == process.StateStopped || st == process.StateShutdown {
|
||||
continue
|
||||
}
|
||||
consider(mID)
|
||||
}
|
||||
for _, mID := range alsoRunning {
|
||||
for _, mID := range running {
|
||||
consider(mID)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (p *groupPlanner) OnSwapStart(target string) {}
|
||||
func (p *groupSwapper) OnSwapStart(target string, running []string) {}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
// newTestGroup builds a Group directly from the supplied processes and config,
|
||||
@@ -17,17 +18,19 @@ import (
|
||||
func newTestGroup(t *testing.T, conf config.Config, processes map[string]process.Process) *Group {
|
||||
t.Helper()
|
||||
modelToGroup := make(map[string]string)
|
||||
for gid, gcfg := range conf.Groups {
|
||||
for gid, gcfg := range conf.Routing.Router.Settings.Groups {
|
||||
for _, mid := range gcfg.Members {
|
||||
modelToGroup[mid] = gid
|
||||
}
|
||||
}
|
||||
planner := &groupPlanner{
|
||||
swapper := &groupSwapper{
|
||||
config: conf,
|
||||
modelToGroup: modelToGroup,
|
||||
processes: processes,
|
||||
}
|
||||
base := newBaseRouter("group", conf, processes, planner, logmon.NewWriter(io.Discard))
|
||||
base := newBaseRouter("group", conf, processes, logmon.NewWriter(io.Discard),
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff)
|
||||
})
|
||||
base.testProcessed = make(chan struct{}, 64)
|
||||
g := &Group{baseRouter: base}
|
||||
go base.run()
|
||||
@@ -41,10 +44,10 @@ func newTestGroup(t *testing.T, conf config.Config, processes map[string]process
|
||||
|
||||
func TestGroup_NewGroup_DuplicateMembership(t *testing.T) {
|
||||
conf := config.Config{
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g1": {Swap: true, Members: []string{"a"}},
|
||||
"g2": {Swap: true, Members: []string{"a"}},
|
||||
},
|
||||
}),
|
||||
Models: map[string]config.ModelConfig{
|
||||
"a": {},
|
||||
},
|
||||
@@ -65,9 +68,9 @@ func TestGroup_ServeHTTP_SwapStopsPrevious(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g": {Swap: true, Exclusive: true, Members: []string{"a", "b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
|
||||
|
||||
@@ -97,9 +100,9 @@ func TestGroup_NonSwapGroup_NoStop(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g": {Swap: false, Exclusive: false, Members: []string{"a", "b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
|
||||
|
||||
@@ -127,10 +130,10 @@ func TestGroup_CrossGroupExclusive(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g1": {Swap: true, Exclusive: true, Members: []string{"a"}},
|
||||
"g2": {Swap: true, Exclusive: true, Members: []string{"b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
|
||||
|
||||
@@ -154,10 +157,10 @@ func TestGroup_CrossGroupNonExclusiveParallel(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g1": {Swap: true, Exclusive: false, Members: []string{"a"}},
|
||||
"g2": {Swap: true, Exclusive: false, Members: []string{"b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb})
|
||||
|
||||
@@ -202,16 +205,17 @@ func TestGroup_CrossGroupNonExclusiveParallel(t *testing.T) {
|
||||
|
||||
// TestGroup_SameGroupSwapSerialises verifies that two same-group requests
|
||||
// (Swap=true) serialise even when both arrive while neither has reached
|
||||
// StateStarting yet — the alsoRunning hint to the planner closes that race.
|
||||
// StateStarting yet — the in-flight swap target the scheduler folds into the
|
||||
// running set closes that race.
|
||||
func TestGroup_SameGroupSwapSerialises(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g": {Swap: true, Exclusive: false, Members: []string{"a", "b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb})
|
||||
|
||||
@@ -224,8 +228,9 @@ func TestGroup_SameGroupSwapSerialises(t *testing.T) {
|
||||
waitProcessed(t, g.testProcessed, 1)
|
||||
|
||||
// Request B arrives before A transitions to StateStarting in the process
|
||||
// state machine. Without the alsoRunning hint, the planner would not see
|
||||
// A as running, and B would start in parallel, violating Swap=true.
|
||||
// state machine. Without folding the in-flight swap target into the running
|
||||
// set, the swapper would not see A as running, and B would start in
|
||||
// parallel, violating Swap=true.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
@@ -269,10 +274,10 @@ func TestGroup_PersistentNotEvicted(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"persist": {Swap: true, Exclusive: false, Persistent: true, Members: []string{"a"}},
|
||||
"other": {Swap: true, Exclusive: true, Members: []string{"b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
|
||||
|
||||
@@ -306,10 +311,10 @@ func TestGroup_NonExclusiveDoesNotUnloadExclusive(t *testing.T) {
|
||||
|
||||
conf := config.Config{
|
||||
HealthCheckTimeout: 5,
|
||||
Groups: map[string]config.GroupConfig{
|
||||
Routing: groupRouting(map[string]config.GroupConfig{
|
||||
"g1": {Swap: true, Exclusive: true, Members: []string{"a"}},
|
||||
"g2": {Swap: true, Exclusive: false, Members: []string{"b"}},
|
||||
},
|
||||
}),
|
||||
}
|
||||
g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
|
||||
|
||||
|
||||
@@ -12,10 +12,23 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
)
|
||||
|
||||
// groupRouting builds a normalized RoutingConfig for the group router, mirroring
|
||||
// what config.LoadConfigFromReader produces. Tests use it to populate
|
||||
// config.Config.Routing without going through LoadConfig.
|
||||
func groupRouting(groups map[string]config.GroupConfig) config.RoutingConfig {
|
||||
return config.RoutingConfig{
|
||||
Router: config.RouterConfig{
|
||||
Use: "group",
|
||||
Settings: config.RouterSettings{Groups: groups},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// fakeProcess is an in-memory implementation of process.Process used to drive
|
||||
// the routers through their state machine without spawning real upstreams.
|
||||
type fakeProcess struct {
|
||||
|
||||
+15
-43
@@ -2,11 +2,11 @@ package router
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
type Matrix struct {
|
||||
@@ -14,20 +14,23 @@ type Matrix struct {
|
||||
}
|
||||
|
||||
func NewMatrix(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Matrix, error) {
|
||||
if conf.Matrix == nil {
|
||||
mtx := conf.Routing.Router.Settings.Matrix
|
||||
if mtx == nil {
|
||||
return nil, fmt.Errorf("matrix router requires a matrix configuration")
|
||||
}
|
||||
|
||||
planner := &matrixPlanner{
|
||||
solver: newMatrixSolver(conf.ExpandedSets, conf.Matrix.ResolvedEvictCosts()),
|
||||
swapper := &matrixSwapper{
|
||||
solver: newMatrixSolver(mtx.ExpandedSets, mtx.ResolvedEvictCosts()),
|
||||
logger: proxylog,
|
||||
}
|
||||
|
||||
// Build a process for every model in the config. Any model can run alone
|
||||
// even if it is not part of a set; this mirrors proxy.NewMatrix.
|
||||
processes := make(map[string]process.Process, len(conf.Models))
|
||||
base := newBaseRouter("matrix", conf, processes, planner, proxylog)
|
||||
planner.processes = processes
|
||||
base := newBaseRouter("matrix", conf, processes, proxylog,
|
||||
func(name string, logger *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, logger, swapper, conf.Routing.Scheduler.Settings.Fifo, eff)
|
||||
})
|
||||
|
||||
for mid, modelCfg := range conf.Models {
|
||||
procLog := logmon.NewWriter(upstreamlog)
|
||||
@@ -45,20 +48,18 @@ func NewMatrix(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Matr
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// matrixPlanner decides evictions by asking the matrix solver against the
|
||||
// current running set.
|
||||
type matrixPlanner struct {
|
||||
// matrixSwapper decides evictions by asking the matrix solver against the
|
||||
// running set the scheduler hands it.
|
||||
type matrixSwapper struct {
|
||||
solver *matrixSolver
|
||||
processes map[string]process.Process
|
||||
logger *logmon.Monitor
|
||||
}
|
||||
|
||||
func (p *matrixPlanner) EvictionFor(target string, alsoRunning []string) []string {
|
||||
return p.solver.Solve(target, p.runningSet(alsoRunning)).Evict
|
||||
func (p *matrixSwapper) EvictionFor(target string, running []string) []string {
|
||||
return p.solver.Solve(target, running).Evict
|
||||
}
|
||||
|
||||
func (p *matrixPlanner) OnSwapStart(target string) {
|
||||
running := p.runningModels()
|
||||
func (p *matrixSwapper) OnSwapStart(target string, running []string) {
|
||||
result := p.solver.Solve(target, running)
|
||||
switch {
|
||||
case len(result.Evict) > 0:
|
||||
@@ -70,32 +71,3 @@ func (p *matrixPlanner) OnSwapStart(target string) {
|
||||
p.logger.Debugf("matrix: model=%s already running in set=%s dsl=%q", target, result.SetName, result.DSL)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *matrixPlanner) runningModels() []string {
|
||||
return p.runningSet(nil)
|
||||
}
|
||||
|
||||
// runningSet returns the union of live processes (State != Stopped/Shutdown)
|
||||
// and any extra IDs the baseRouter has already committed to loading but which
|
||||
// the process state machine has not yet reflected.
|
||||
func (p *matrixPlanner) runningSet(alsoRunning []string) []string {
|
||||
seen := make(map[string]struct{}, len(p.processes))
|
||||
var running []string
|
||||
for id, proc := range p.processes {
|
||||
st := proc.State()
|
||||
if st == process.StateStopped || st == process.StateShutdown {
|
||||
continue
|
||||
}
|
||||
seen[id] = struct{}{}
|
||||
running = append(running, id)
|
||||
}
|
||||
for _, id := range alsoRunning {
|
||||
if _, dup := seen[id]; dup {
|
||||
continue
|
||||
}
|
||||
seen[id] = struct{}{}
|
||||
running = append(running, id)
|
||||
}
|
||||
sort.Strings(running)
|
||||
return running
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
)
|
||||
|
||||
// newTestMatrix builds a Matrix router from supplied processes, bypassing
|
||||
@@ -17,12 +18,14 @@ import (
|
||||
func newTestMatrix(t *testing.T, conf config.Config, expanded []config.ExpandedSet, evictCosts map[string]int, processes map[string]process.Process) *Matrix {
|
||||
t.Helper()
|
||||
logger := logmon.NewWriter(io.Discard)
|
||||
planner := &matrixPlanner{
|
||||
swapper := &matrixSwapper{
|
||||
solver: newMatrixSolver(expanded, evictCosts),
|
||||
processes: processes,
|
||||
logger: logger,
|
||||
}
|
||||
base := newBaseRouter("matrix", conf, processes, planner, logger)
|
||||
base := newBaseRouter("matrix", conf, processes, logger,
|
||||
func(name string, l *logmon.Monitor, eff scheduler.Effects) scheduler.Scheduler {
|
||||
return scheduler.NewFIFO(name, l, swapper, conf.Routing.Scheduler.Settings.Fifo, eff)
|
||||
})
|
||||
base.testProcessed = make(chan struct{}, 64)
|
||||
r := &Matrix{baseRouter: base}
|
||||
go base.run()
|
||||
@@ -153,8 +156,8 @@ func TestMatrix_CoexistingSetParallel(t *testing.T) {
|
||||
|
||||
// TestMatrix_IncompatibleQueues verifies that the second request for a model
|
||||
// that cannot coexist with the in-flight first model queues until the first
|
||||
// completes, and then evicts it. This exercises the alsoRunning hint via the
|
||||
// matrix solver's union into runningSet.
|
||||
// completes, and then evicts it. This exercises the scheduler folding in-flight
|
||||
// swap targets into the running set it hands the swapper.
|
||||
func TestMatrix_IncompatibleQueues(t *testing.T) {
|
||||
a := newFakeProcess("a")
|
||||
pb := newFakeProcess("b")
|
||||
@@ -173,8 +176,9 @@ func TestMatrix_IncompatibleQueues(t *testing.T) {
|
||||
}()
|
||||
waitProcessed(t, r.testProcessed, 1)
|
||||
|
||||
// B arrives before A transitions to StateStarting. The solver sees A via
|
||||
// alsoRunning and returns evict=[a], so collidesWith forces B to queue.
|
||||
// B arrives before A transitions to StateStarting. The running set the
|
||||
// scheduler builds includes A (an in-flight swap target), so the solver
|
||||
// returns evict=[a] and collidesWith forces B to queue.
|
||||
w2 := httptest.NewRecorder()
|
||||
done2 := make(chan struct{})
|
||||
go func() {
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
"github.com/mostlygeek/llama-swap/internal/router/scheduler"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
@@ -31,7 +32,7 @@ var (
|
||||
ErrNoModelInContext = fmt.Errorf("no model in request context")
|
||||
ErrNoRouterFound = fmt.Errorf("no router found for model")
|
||||
ErrNoPeerModelFound = fmt.Errorf("peer model not found")
|
||||
ErrNoLocalModelFound = fmt.Errorf("local model not found")
|
||||
ErrNoLocalModelFound = scheduler.ErrModelNotFound
|
||||
|
||||
ContextKey = &contextkey{"context"}
|
||||
)
|
||||
|
||||
@@ -0,0 +1,411 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
)
|
||||
|
||||
// activeSwap tracks one in-flight swap and the callers waiting on it.
|
||||
type activeSwap struct {
|
||||
modelID string
|
||||
evict []string
|
||||
waiters []HandlerReq
|
||||
}
|
||||
|
||||
// FIFO is the default scheduler. Requests are handled in a first-in, first-out order.
|
||||
// To reduce swapping requests for a model that is already running will be handled
|
||||
// immediately by the running process.
|
||||
//
|
||||
// Requests into this schedule are handled like this:
|
||||
//
|
||||
// A B C A B C --> A A B B C C
|
||||
//
|
||||
// The strategy is simple and reduces the number of swaps required.
|
||||
type FIFO struct {
|
||||
name string
|
||||
logger *logmon.Monitor
|
||||
planner Swapper
|
||||
cfg config.FifoConfig
|
||||
effects Effects
|
||||
|
||||
active map[string]*activeSwap
|
||||
inFlight map[string]int
|
||||
queued []HandlerReq
|
||||
}
|
||||
|
||||
// NewFIFO builds a FIFO scheduler. It matches scheduler.Factory once a planner
|
||||
// is captured in a closure.
|
||||
func NewFIFO(name string, logger *logmon.Monitor, planner Swapper, cfg config.FifoConfig, eff Effects) *FIFO {
|
||||
return &FIFO{
|
||||
name: name,
|
||||
logger: logger,
|
||||
planner: planner,
|
||||
cfg: cfg,
|
||||
effects: eff,
|
||||
active: make(map[string]*activeSwap),
|
||||
inFlight: make(map[string]int),
|
||||
}
|
||||
}
|
||||
|
||||
// OnRequest decides what to do with one incoming ServeHTTP request. It never
|
||||
// blocks indefinitely: any work that has to wait (starting a process, stopping
|
||||
// siblings, waiting for ready) is deferred to a swap goroutine and reported back
|
||||
// via OnSwapDone.
|
||||
//
|
||||
// The decision tree, in order:
|
||||
//
|
||||
// 1. Unknown model — respond with ErrModelNotFound and move on.
|
||||
// 2. A swap to the same model is already in flight — attach this waiter so
|
||||
// one swap serves all callers that asked for the same model.
|
||||
// 3. Fast path — the target process is already ready, the planner sees
|
||||
// nothing to evict, and no in-flight swap is evicting it. Hand back its
|
||||
// ServeHTTP immediately.
|
||||
// 4. Would collide with an in-flight swap (we'd stop their target, or they're
|
||||
// stopping us) — park in the queue for OnSwapDone to drain.
|
||||
// 5. Would evict a process that is still handling requests — park in the
|
||||
// queue. OnServeDone will retry when the busy process drains.
|
||||
// 6. Otherwise — start a new swap. This may run in parallel with other active
|
||||
// swaps when their evict sets don't intersect.
|
||||
func (s *FIFO) OnRequest(req HandlerReq) {
|
||||
// (1) Unknown model.
|
||||
state, ok := s.effects.ModelState(req.Model)
|
||||
if !ok {
|
||||
s.logger.Debugf("%s: model %s not handled by this router", s.name, req.Model)
|
||||
s.effects.GrantError(req, ErrModelNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
// (2) Join an in-flight swap for the same model.
|
||||
if sw, ok := s.active[req.Model]; ok {
|
||||
s.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", s.name, req.Model, len(sw.waiters)+1)
|
||||
sw.waiters = append(sw.waiters, req)
|
||||
return
|
||||
}
|
||||
|
||||
running := s.runningSet(req.Model)
|
||||
evict := s.planner.EvictionFor(req.Model, running)
|
||||
|
||||
// (3) Fast path: ready, nothing to evict, and nobody is evicting us.
|
||||
if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) {
|
||||
s.logger.Debugf("%s: fast-path serving model %s (already ready)", s.name, req.Model)
|
||||
s.grantHandler(req, req.Model)
|
||||
return
|
||||
}
|
||||
|
||||
// (4) Collision with an in-flight swap — queue.
|
||||
if collidesWith(req.Model, evict, s.active) {
|
||||
s.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", s.name, req.Model)
|
||||
s.enqueue(req)
|
||||
return
|
||||
}
|
||||
|
||||
// (5) Would evict a busy process — queue until it drains.
|
||||
if conflictsWithInFlight(evict, s.inFlight) {
|
||||
s.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", s.name, req.Model)
|
||||
s.enqueue(req)
|
||||
return
|
||||
}
|
||||
|
||||
// (6) Start a new (possibly parallel) swap.
|
||||
s.logger.Debugf("%s: starting swap for model %s, evicting %v", s.name, req.Model, evict)
|
||||
s.startSwap(req, evict, running)
|
||||
}
|
||||
|
||||
// OnSwapDone fans the result out to every waiter that joined this swap, removes
|
||||
// the swap from the active map, then walks the queue once, promoting any items
|
||||
// that no longer collide with the remaining active set. FIFO order is preserved:
|
||||
// items still blocked stay in place.
|
||||
func (s *FIFO) OnSwapDone(ev SwapDone) {
|
||||
sw, ok := s.active[ev.ModelID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
delete(s.active, ev.ModelID)
|
||||
|
||||
for _, w := range sw.waiters {
|
||||
if ev.Err != nil {
|
||||
s.effects.GrantError(w, ev.Err)
|
||||
} else {
|
||||
s.grantHandler(w, ev.ModelID)
|
||||
}
|
||||
}
|
||||
|
||||
s.drainQueue()
|
||||
}
|
||||
|
||||
// OnServeDone decrements the per-model in-flight count and, when that drops to
|
||||
// zero, retries the queue: requests whose swap was deferred because they would
|
||||
// have evicted this (now-idle) process can now proceed.
|
||||
func (s *FIFO) OnServeDone(ev ServeDoneEvent) {
|
||||
s.inFlight[ev.ModelID]--
|
||||
if s.inFlight[ev.ModelID] <= 0 {
|
||||
delete(s.inFlight, ev.ModelID)
|
||||
s.drainQueue()
|
||||
}
|
||||
}
|
||||
|
||||
// OnUnload reconciles router-owned state with the impending Stop, performs the
|
||||
// Stop (synchronously, via Effects) so callers of Unload remain blocked until
|
||||
// each targeted process has exited, then drains the queue.
|
||||
func (s *FIFO) OnUnload(targets []string, timeout time.Duration) {
|
||||
unloadErr := fmt.Errorf("%s: model unloaded", s.name)
|
||||
|
||||
targetSet := make(map[string]bool, len(targets))
|
||||
for _, id := range targets {
|
||||
targetSet[id] = true
|
||||
}
|
||||
|
||||
// Release waiters of any in-flight swap whose target is being unloaded.
|
||||
// The swap goroutine itself is left to finish on its own; when its
|
||||
// SwapDone arrives, OnSwapDone will find no entry in active and drop it.
|
||||
for id := range targetSet {
|
||||
sw, ok := s.active[id]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
for _, w := range sw.waiters {
|
||||
s.effects.GrantError(w, unloadErr)
|
||||
}
|
||||
delete(s.active, id)
|
||||
}
|
||||
|
||||
// Drop queued requests addressed to unloaded models. Requests for other
|
||||
// models stay queued and may benefit from drainQueue at the end.
|
||||
if len(s.queued) > 0 {
|
||||
kept := s.queued[:0]
|
||||
for _, w := range s.queued {
|
||||
if targetSet[w.Model] {
|
||||
s.effects.GrantError(w, unloadErr)
|
||||
continue
|
||||
}
|
||||
kept = append(kept, w)
|
||||
}
|
||||
s.queued = kept
|
||||
}
|
||||
|
||||
// Stop the targeted processes. Done synchronously so Unload's caller can
|
||||
// rely on "after Unload returns, the process is stopped". inFlight is
|
||||
// intentionally NOT cleared here: each dying handler will fire its tracked
|
||||
// serve and reach OnServeDone in the normal way.
|
||||
s.effects.StopProcesses(timeout, targets)
|
||||
|
||||
// Removing entries from active above may have unblocked queued requests
|
||||
// that previously collided with the now-cancelled swaps.
|
||||
s.drainQueue()
|
||||
}
|
||||
|
||||
// OnShutdown grants err to every waiter still held by the scheduler.
|
||||
func (s *FIFO) OnShutdown(err error) {
|
||||
for _, sw := range s.active {
|
||||
for _, w := range sw.waiters {
|
||||
s.effects.GrantError(w, err)
|
||||
}
|
||||
}
|
||||
for _, w := range s.queued {
|
||||
s.effects.GrantError(w, err)
|
||||
}
|
||||
}
|
||||
|
||||
// grantHandler hands the caller a tracked handler for modelID and, only if the
|
||||
// caller was still there to receive it, bumps the in-flight count. Incrementing
|
||||
// when the grant failed would strand the counter and block future evictions.
|
||||
func (s *FIFO) grantHandler(req HandlerReq, modelID string) {
|
||||
if s.effects.GrantServe(req, modelID) {
|
||||
s.inFlight[modelID]++
|
||||
}
|
||||
}
|
||||
|
||||
// startSwap records the swap as active and launches it via Effects. running is
|
||||
// the set EvictionFor saw, forwarded to OnSwapStart so the planner logs against
|
||||
// the same picture it decided on.
|
||||
func (s *FIFO) startSwap(initial HandlerReq, evict, running []string) {
|
||||
s.active[initial.Model] = &activeSwap{
|
||||
modelID: initial.Model,
|
||||
evict: evict,
|
||||
waiters: []HandlerReq{initial},
|
||||
}
|
||||
s.planner.OnSwapStart(initial.Model, running)
|
||||
s.effects.StartSwap(initial.Model, evict)
|
||||
}
|
||||
|
||||
// enqueue inserts req into the queue in priority order: it goes just before the
|
||||
// first queued item whose priority is strictly lower, so higher-priority models
|
||||
// are serviced first while equal-priority requests keep their arrival (FIFO)
|
||||
// order. Priorities come from the FifoConfig; unlisted models default to 0.
|
||||
func (s *FIFO) enqueue(req HandlerReq) {
|
||||
p := s.cfg.Priority[req.Model]
|
||||
i := len(s.queued)
|
||||
for j, q := range s.queued {
|
||||
if s.cfg.Priority[q.Model] < p {
|
||||
i = j
|
||||
break
|
||||
}
|
||||
}
|
||||
s.queued = append(s.queued, HandlerReq{})
|
||||
copy(s.queued[i+1:], s.queued[i:])
|
||||
s.queued[i] = req
|
||||
broadcastQueuePositions(s.queued)
|
||||
}
|
||||
|
||||
// drainQueue walks the queued requests in order, re-running the OnRequest
|
||||
// decision tree against the (now smaller) active set. Items that can now start
|
||||
// or join become satisfied; items still blocked remain queued in original order
|
||||
// so they get another chance on the next swap completion.
|
||||
func (s *FIFO) drainQueue() {
|
||||
if len(s.queued) == 0 {
|
||||
return
|
||||
}
|
||||
pending := s.queued
|
||||
var remaining []HandlerReq
|
||||
for _, req := range pending {
|
||||
state, ok := s.effects.ModelState(req.Model)
|
||||
if !ok {
|
||||
s.effects.GrantError(req, ErrModelNotFound)
|
||||
continue
|
||||
}
|
||||
if sw, ok := s.active[req.Model]; ok {
|
||||
s.logger.Debugf("%s: queued request for model %s now joining in-flight swap", s.name, req.Model)
|
||||
sw.waiters = append(sw.waiters, req)
|
||||
continue
|
||||
}
|
||||
running := s.runningSet(req.Model)
|
||||
evict := s.planner.EvictionFor(req.Model, running)
|
||||
if state == process.StateReady && len(evict) == 0 && !collidesWith(req.Model, evict, s.active) {
|
||||
s.logger.Debugf("%s: queued request for model %s now served fast-path", s.name, req.Model)
|
||||
s.grantHandler(req, req.Model)
|
||||
continue
|
||||
}
|
||||
if collidesWith(req.Model, evict, s.active) {
|
||||
remaining = append(remaining, req)
|
||||
continue
|
||||
}
|
||||
if conflictsWithInFlight(evict, s.inFlight) {
|
||||
remaining = append(remaining, req)
|
||||
continue
|
||||
}
|
||||
s.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", s.name, req.Model, evict)
|
||||
s.startSwap(req, evict, running)
|
||||
}
|
||||
s.queued = remaining
|
||||
broadcastQueuePositions(s.queued)
|
||||
}
|
||||
|
||||
// runningSet is the live model set handed to the Swapper: every process the
|
||||
// baseRouter reports as running, unioned with the targets of in-flight swaps
|
||||
// (excluding excludeActive, the model whose own swap is being decided — its
|
||||
// in-flight entry must not count as "already running"). The result is sorted so
|
||||
// eviction decisions derived from it are deterministic.
|
||||
func (s *FIFO) runningSet(excludeActive string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var out []string
|
||||
add := func(id string) {
|
||||
if _, dup := seen[id]; dup {
|
||||
return
|
||||
}
|
||||
seen[id] = struct{}{}
|
||||
out = append(out, id)
|
||||
}
|
||||
for id := range s.effects.RunningModels() {
|
||||
add(id)
|
||||
}
|
||||
for _, id := range activeTargets(s.active, excludeActive) {
|
||||
add(id)
|
||||
}
|
||||
sort.Strings(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// activeTargets returns the IDs of every in-flight swap target except exclude.
|
||||
// The planner uses this to account for models committed to but not yet reflected
|
||||
// in process state.
|
||||
func activeTargets(active map[string]*activeSwap, exclude string) []string {
|
||||
if len(active) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]string, 0, len(active))
|
||||
for id := range active {
|
||||
if id == exclude {
|
||||
continue
|
||||
}
|
||||
out = append(out, id)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// collidesWith reports whether a new swap with this target and evict set can
|
||||
// safely run alongside the currently active swaps. Same-target callers should
|
||||
// JOIN (handled before this) — they do not collide with themselves.
|
||||
func collidesWith(target string, evict []string, active map[string]*activeSwap) bool {
|
||||
for id, sw := range active {
|
||||
if id == target {
|
||||
continue
|
||||
}
|
||||
if containsString(evict, id) {
|
||||
return true
|
||||
}
|
||||
if containsString(sw.evict, target) {
|
||||
return true
|
||||
}
|
||||
if slicesOverlap(evict, sw.evict) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// slicesOverlap reports whether xs and ys share any common element.
|
||||
func slicesOverlap(xs, ys []string) bool {
|
||||
for _, x := range xs {
|
||||
if containsString(ys, x) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// conflictsWithInFlight reports whether any model in evict is still handling
|
||||
// requests. Stopping a busy process would cancel its callers' connections, so
|
||||
// the scheduler defers the swap until those callers finish.
|
||||
func conflictsWithInFlight(evict []string, inFlight map[string]int) bool {
|
||||
for _, m := range evict {
|
||||
if inFlight[m] > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func containsString(xs []string, s string) bool {
|
||||
for _, x := range xs {
|
||||
if x == s {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// broadcastQueuePositions sends each queued request its current 1-indexed
|
||||
// position. Sends are non-blocking: if the channel is full, the old value is
|
||||
// drained first so the consumer always sees the latest position.
|
||||
func broadcastQueuePositions(queued []HandlerReq) {
|
||||
for i, req := range queued {
|
||||
pos := i + 1
|
||||
select {
|
||||
case req.PositionCh <- pos:
|
||||
default:
|
||||
select {
|
||||
case <-req.PositionCh:
|
||||
default:
|
||||
}
|
||||
select {
|
||||
case req.PositionCh <- pos:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,537 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/config"
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
)
|
||||
|
||||
// FIFO methods all run on the router's single run-loop goroutine, so these
|
||||
// tests drive them directly and synchronously. A swap is "completed" by calling
|
||||
// OnSwapDone, a served request "finishes" by calling OnServeDone — exactly the
|
||||
// events the run loop would deliver. fakeEffects records every side-effect and
|
||||
// stubPlanner supplies a fixed eviction set per target.
|
||||
|
||||
// stubPlanner returns a fixed eviction list per target.
|
||||
type stubPlanner struct {
|
||||
evict map[string][]string
|
||||
}
|
||||
|
||||
func (s *stubPlanner) EvictionFor(target string, _ []string) []string {
|
||||
if s.evict == nil {
|
||||
return nil
|
||||
}
|
||||
return s.evict[target]
|
||||
}
|
||||
|
||||
func (s *stubPlanner) OnSwapStart(string, []string) {}
|
||||
|
||||
// grantRec is one GrantError / GrantServe call. err!=nil marks an error grant;
|
||||
// otherwise it is a serve grant and serve reports whether the caller received it.
|
||||
type grantRec struct {
|
||||
model string
|
||||
err error
|
||||
serve bool
|
||||
}
|
||||
|
||||
type startRec struct {
|
||||
model string
|
||||
evict []string
|
||||
}
|
||||
|
||||
type stopRec struct {
|
||||
timeout time.Duration
|
||||
ids []string
|
||||
}
|
||||
|
||||
// fakeEffects is an in-memory scheduler.Effects. Tests program process states
|
||||
// and GrantServe outcomes, then assert on the recorded calls.
|
||||
type fakeEffects struct {
|
||||
states map[string]process.ProcessState // model -> state; missing => not handled
|
||||
serveResult map[string]bool // GrantServe return per model (default true)
|
||||
|
||||
starts []startRec
|
||||
grants []grantRec
|
||||
stops []stopRec
|
||||
}
|
||||
|
||||
func newFakeEffects() *fakeEffects {
|
||||
return &fakeEffects{
|
||||
states: map[string]process.ProcessState{},
|
||||
serveResult: map[string]bool{},
|
||||
}
|
||||
}
|
||||
|
||||
func (f *fakeEffects) ModelState(modelID string) (process.ProcessState, bool) {
|
||||
st, ok := f.states[modelID]
|
||||
return st, ok
|
||||
}
|
||||
|
||||
func (f *fakeEffects) RunningModels() map[string]process.ProcessState {
|
||||
out := make(map[string]process.ProcessState)
|
||||
for id, st := range f.states {
|
||||
if st == process.StateStopped || st == process.StateShutdown {
|
||||
continue
|
||||
}
|
||||
out[id] = st
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (f *fakeEffects) StartSwap(modelID string, evict []string) {
|
||||
f.starts = append(f.starts, startRec{model: modelID, evict: evict})
|
||||
}
|
||||
|
||||
func (f *fakeEffects) GrantError(req HandlerReq, err error) {
|
||||
f.grants = append(f.grants, grantRec{model: req.Model, err: err})
|
||||
}
|
||||
|
||||
func (f *fakeEffects) GrantServe(req HandlerReq, modelID string) bool {
|
||||
ok := true
|
||||
if v, set := f.serveResult[modelID]; set {
|
||||
ok = v
|
||||
}
|
||||
f.grants = append(f.grants, grantRec{model: modelID, serve: ok})
|
||||
return ok
|
||||
}
|
||||
|
||||
func (f *fakeEffects) StopProcesses(timeout time.Duration, ids []string) {
|
||||
f.stops = append(f.stops, stopRec{timeout: timeout, ids: ids})
|
||||
}
|
||||
|
||||
// served counts grants that handed modelID a handler and were received.
|
||||
func (f *fakeEffects) served(modelID string) int {
|
||||
n := 0
|
||||
for _, g := range f.grants {
|
||||
if g.err == nil && g.serve && g.model == modelID {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// errored counts error grants, optionally filtered by model ("" = any).
|
||||
func (f *fakeEffects) errored(model string) int {
|
||||
n := 0
|
||||
for _, g := range f.grants {
|
||||
if g.err != nil && (model == "" || g.model == model) {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// startsFor counts StartSwap calls for modelID.
|
||||
func (f *fakeEffects) startsFor(modelID string) int {
|
||||
n := 0
|
||||
for _, s := range f.starts {
|
||||
if s.model == modelID {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func newFIFO(planner Swapper, eff Effects) *FIFO {
|
||||
return NewFIFO("test", logmon.NewWriter(io.Discard), planner, config.FifoConfig{}, eff)
|
||||
}
|
||||
|
||||
func req(model string) HandlerReq { return HandlerReq{Model: model} }
|
||||
|
||||
func TestFIFO_FastPath(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateReady
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("a"))
|
||||
|
||||
if got := eff.startsFor("a"); got != 0 {
|
||||
t.Errorf("StartSwap calls=%d want 0 (fast path should not swap)", got)
|
||||
}
|
||||
if got := eff.served("a"); got != 1 {
|
||||
t.Errorf("served(a)=%d want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_ModelNotFound(t *testing.T) {
|
||||
eff := newFakeEffects() // no states => model unknown
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("ghost"))
|
||||
|
||||
if got := len(eff.starts); got != 0 {
|
||||
t.Errorf("StartSwap calls=%d want 0", got)
|
||||
}
|
||||
if eff.errored("ghost") != 1 {
|
||||
t.Fatalf("want 1 error grant for ghost, grants=%+v", eff.grants)
|
||||
}
|
||||
if !errors.Is(eff.grants[0].err, ErrModelNotFound) {
|
||||
t.Errorf("err=%v want ErrModelNotFound", eff.grants[0].err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_OnDemandStartThenServe(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("a"))
|
||||
if got := eff.startsFor("a"); got != 1 {
|
||||
t.Fatalf("StartSwap(a)=%d want 1", got)
|
||||
}
|
||||
if got := eff.served("a"); got != 0 {
|
||||
t.Errorf("served(a)=%d want 0 before swap completes", got)
|
||||
}
|
||||
|
||||
// Swap finishes, model is now ready.
|
||||
eff.states["a"] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"})
|
||||
|
||||
if got := eff.served("a"); got != 1 {
|
||||
t.Errorf("served(a)=%d want 1 after swap done", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_JoinInFlightSwap(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // starts swap
|
||||
s.OnRequest(req("a")) // joins
|
||||
s.OnRequest(req("a")) // joins
|
||||
|
||||
if got := eff.startsFor("a"); got != 1 {
|
||||
t.Fatalf("StartSwap(a)=%d want 1 (all three share one swap)", got)
|
||||
}
|
||||
|
||||
eff.states["a"] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"})
|
||||
|
||||
if got := eff.served("a"); got != 3 {
|
||||
t.Errorf("served(a)=%d want 3 (one swap serves all waiters)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_SwapDoneError_FailsAllWaiters(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("a"))
|
||||
s.OnRequest(req("a"))
|
||||
|
||||
s.OnSwapDone(SwapDone{ModelID: "a", Err: errors.New("boom")})
|
||||
|
||||
if eff.served("a") != 0 {
|
||||
t.Errorf("served(a)=%d want 0 on swap error", eff.served("a"))
|
||||
}
|
||||
if eff.errored("a") != 2 {
|
||||
t.Errorf("errored(a)=%d want 2 (both waiters fail)", eff.errored("a"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_QueueOnEvictionCollision covers a request whose target evicts the
|
||||
// model currently being swapped: it must queue until that swap finishes AND its
|
||||
// served request drains, because starting it would stop a busy process.
|
||||
func TestFIFO_QueueOnEvictionCollision(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
// Loading b evicts a.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // StartSwap(a)
|
||||
s.OnRequest(req("b")) // collides with a's in-flight swap -> queue
|
||||
if got := eff.startsFor("b"); got != 0 {
|
||||
t.Fatalf("b started early: StartSwap(b)=%d want 0", got)
|
||||
}
|
||||
|
||||
// a becomes ready and is granted (now serving, inFlight[a]=1).
|
||||
eff.states["a"] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"})
|
||||
if got := eff.startsFor("b"); got != 0 {
|
||||
t.Fatalf("b started while a is serving: StartSwap(b)=%d want 0", got)
|
||||
}
|
||||
|
||||
// a's request finishes -> a no longer in-flight -> b may now swap.
|
||||
s.OnServeDone(ServeDoneEvent{ModelID: "a"})
|
||||
if got := eff.startsFor("b"); got != 1 {
|
||||
t.Fatalf("StartSwap(b)=%d want 1 after a drained", got)
|
||||
}
|
||||
if got := eff.starts[len(eff.starts)-1].evict; len(got) != 1 || got[0] != "a" {
|
||||
t.Errorf("b swap evict=%v want [a]", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_DisjointSwapsRunInParallel verifies two requests with
|
||||
// non-conflicting evict sets both start without waiting for each other.
|
||||
func TestFIFO_DisjointSwapsRunInParallel(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{}, eff) // empty evicts
|
||||
|
||||
s.OnRequest(req("a"))
|
||||
s.OnRequest(req("b"))
|
||||
|
||||
if eff.startsFor("a") != 1 || eff.startsFor("b") != 1 {
|
||||
t.Fatalf("StartSwap a=%d b=%d want 1 each (parallel)", eff.startsFor("a"), eff.startsFor("b"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_OverlappingEvictSetsDoNotRunInParallel verifies two swaps with
|
||||
// different targets that evict the *same* model do not run concurrently: the
|
||||
// second must queue rather than double-evict the shared model. Neither target is
|
||||
// in the other's evict set, so this is only caught by the evict-set overlap
|
||||
// check in collidesWith.
|
||||
func TestFIFO_OverlappingEvictSetsDoNotRunInParallel(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
eff.states["x"] = process.StateReady // shared eviction target, running
|
||||
// Loading a or b both require evicting x.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"a": {"x"}, "b": {"x"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // StartSwap(a, [x])
|
||||
s.OnRequest(req("b")) // overlaps a's evict set ([x]) -> queue
|
||||
if eff.startsFor("a") != 1 {
|
||||
t.Fatalf("StartSwap(a)=%d want 1", eff.startsFor("a"))
|
||||
}
|
||||
if got := eff.startsFor("b"); got != 0 {
|
||||
t.Fatalf("b started in parallel while a evicts x: StartSwap(b)=%d want 0", got)
|
||||
}
|
||||
|
||||
// a's swap completes and x is gone; b can now evict nothing and start.
|
||||
eff.states["a"] = process.StateReady
|
||||
eff.states["x"] = process.StateStopped
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"})
|
||||
if got := eff.startsFor("b"); got != 1 {
|
||||
t.Fatalf("StartSwap(b)=%d want 1 after a's swap drained", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_QueueDrainPromotesMultiple verifies completing one swap unblocks
|
||||
// every queued request that no longer collides — they all start together.
|
||||
func TestFIFO_QueueDrainPromotesMultiple(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
eff.states["c"] = process.StateStopped
|
||||
// a's swap evicts both b and c; b and c evict nothing.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"a": {"b", "c"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // StartSwap(a, [b,c])
|
||||
s.OnRequest(req("b")) // collides (in a's evict set) -> queue
|
||||
s.OnRequest(req("c")) // collides -> queue
|
||||
if eff.startsFor("b") != 0 || eff.startsFor("c") != 0 {
|
||||
t.Fatalf("b/c started early")
|
||||
}
|
||||
|
||||
eff.states["a"] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"})
|
||||
|
||||
// b and c have empty evict sets and don't evict a, so both start now.
|
||||
if eff.startsFor("b") != 1 || eff.startsFor("c") != 1 {
|
||||
t.Fatalf("StartSwap b=%d c=%d want 1 each after a done", eff.startsFor("b"), eff.startsFor("c"))
|
||||
}
|
||||
if eff.served("a") != 1 {
|
||||
t.Errorf("served(a)=%d want 1", eff.served("a"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_QueueCollation verifies duplicate requests collapse into one swap
|
||||
// per model: the second request for each model joins the active swap (at arrival
|
||||
// or at drain time) rather than triggering its own swap.
|
||||
func TestFIFO_QueueCollation(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
for _, id := range []string{"a", "b", "c"} {
|
||||
eff.states[id] = process.StateStopped
|
||||
}
|
||||
// Each model evicts the other two: all swaps are mutually exclusive.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{
|
||||
"a": {"b", "c"},
|
||||
"b": {"a", "c"},
|
||||
"c": {"a", "b"},
|
||||
}}, eff)
|
||||
|
||||
for _, id := range []string{"a", "b", "c", "a", "b", "c"} {
|
||||
s.OnRequest(req(id))
|
||||
}
|
||||
|
||||
// Drain a, then its served requests, which promotes b; repeat for b -> c.
|
||||
drain := func(model string, waiters int) {
|
||||
eff.states[model] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: model})
|
||||
for i := 0; i < waiters; i++ {
|
||||
s.OnServeDone(ServeDoneEvent{ModelID: model})
|
||||
}
|
||||
}
|
||||
drain("a", 2)
|
||||
drain("b", 2)
|
||||
drain("c", 2)
|
||||
|
||||
for _, id := range []string{"a", "b", "c"} {
|
||||
if got := eff.startsFor(id); got != 1 {
|
||||
t.Errorf("StartSwap(%s)=%d want 1 (collation)", id, got)
|
||||
}
|
||||
if got := eff.served(id); got != 2 {
|
||||
t.Errorf("served(%s)=%d want 2", id, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_NoSwapWhileServing verifies a model still handling requests is not
|
||||
// evicted: the evicting request waits until every in-flight request drains.
|
||||
func TestFIFO_NoSwapWhileServing(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateReady
|
||||
eff.states["b"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // fast path, inFlight[a]=1
|
||||
s.OnRequest(req("a")) // fast path, inFlight[a]=2
|
||||
s.OnRequest(req("b")) // would evict busy a -> queue
|
||||
if eff.startsFor("b") != 0 {
|
||||
t.Fatalf("b started while a serving")
|
||||
}
|
||||
|
||||
s.OnServeDone(ServeDoneEvent{ModelID: "a"}) // inFlight[a]=1
|
||||
if eff.startsFor("b") != 0 {
|
||||
t.Fatalf("b started while a still serving one request")
|
||||
}
|
||||
|
||||
s.OnServeDone(ServeDoneEvent{ModelID: "a"}) // inFlight[a]=0
|
||||
if eff.startsFor("b") != 1 {
|
||||
t.Fatalf("StartSwap(b)=%d want 1 after a fully drained", eff.startsFor("b"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_GrantServeFalseDoesNotLeakInFlight verifies that when a caller has
|
||||
// walked away (GrantServe returns false) the in-flight count is not bumped, so a
|
||||
// later evicting request is not blocked forever.
|
||||
func TestFIFO_GrantServeFalseDoesNotLeakInFlight(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
eff.serveResult["a"] = false // a's waiter is gone by grant time
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a"))
|
||||
eff.states["a"] = process.StateReady
|
||||
s.OnSwapDone(SwapDone{ModelID: "a"}) // grant fails, inFlight[a] stays 0
|
||||
|
||||
// b evicts a; since a is not in-flight, b should start immediately.
|
||||
s.OnRequest(req("b"))
|
||||
if eff.startsFor("b") != 1 {
|
||||
t.Fatalf("StartSwap(b)=%d want 1 (no leaked in-flight on a)", eff.startsFor("b"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_OnShutdown_FailsAllWaiters verifies shutdown errors every waiter the
|
||||
// scheduler holds: active-swap waiters and queued requests alike.
|
||||
func TestFIFO_OnShutdown_FailsAllWaiters(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
for _, id := range []string{"a", "b", "c"} {
|
||||
eff.states[id] = process.StateStopped
|
||||
}
|
||||
// a and b load in parallel; c collides with both and queues.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"c": {"a", "b"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // StartSwap(a)
|
||||
s.OnRequest(req("a")) // join a
|
||||
s.OnRequest(req("b")) // StartSwap(b)
|
||||
s.OnRequest(req("b")) // join b
|
||||
s.OnRequest(req("c")) // queued
|
||||
|
||||
s.OnShutdown(errors.New("shutting down"))
|
||||
|
||||
if got := eff.errored(""); got != 5 {
|
||||
t.Errorf("error grants=%d want 5 (2 a + 2 b + 1 c)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_OnUnload_ReleasesActiveWaiters(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
s := newFIFO(&stubPlanner{}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // active swap a with one waiter
|
||||
s.OnRequest(req("a")) // join
|
||||
|
||||
s.OnUnload([]string{"a"}, time.Second)
|
||||
|
||||
if got := eff.errored("a"); got != 2 {
|
||||
t.Errorf("errored(a)=%d want 2 (active swap waiters released)", got)
|
||||
}
|
||||
if len(eff.stops) != 1 || len(eff.stops[0].ids) != 1 || eff.stops[0].ids[0] != "a" {
|
||||
t.Errorf("StopProcesses=%+v want one call stopping [a]", eff.stops)
|
||||
}
|
||||
if eff.stops[0].timeout != time.Second {
|
||||
t.Errorf("StopProcesses timeout=%v want 1s", eff.stops[0].timeout)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFIFO_OnUnload_DropsQueuedRequests(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
eff.states["a"] = process.StateStopped
|
||||
eff.states["b"] = process.StateStopped
|
||||
// b evicts a, so a request for b queues while a is loading.
|
||||
s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff)
|
||||
|
||||
s.OnRequest(req("a")) // StartSwap(a)
|
||||
s.OnRequest(req("b")) // queued
|
||||
|
||||
s.OnUnload([]string{"b"}, time.Second)
|
||||
|
||||
if got := eff.errored("b"); got != 1 {
|
||||
t.Errorf("errored(b)=%d want 1 (queued request dropped)", got)
|
||||
}
|
||||
if got := eff.startsFor("b"); got != 0 {
|
||||
t.Errorf("StartSwap(b)=%d want 0 (b should never start)", got)
|
||||
}
|
||||
// a's swap is untouched: its waiter is neither served nor errored yet.
|
||||
if eff.served("a") != 0 || eff.errored("a") != 0 {
|
||||
t.Errorf("a swap should be untouched: served=%d errored=%d", eff.served("a"), eff.errored("a"))
|
||||
}
|
||||
}
|
||||
|
||||
// TestFIFO_PriorityQueueOrder verifies queued requests are ordered by descending
|
||||
// priority, with arrival (FIFO) order preserved among equal-priority models.
|
||||
func TestFIFO_PriorityQueueOrder(t *testing.T) {
|
||||
eff := newFakeEffects()
|
||||
for _, m := range []string{"z", "A", "B", "C", "D"} {
|
||||
eff.states[m] = process.StateStopped
|
||||
}
|
||||
// z's swap evicts every other model, so any request that arrives while z is
|
||||
// loading collides with z's in-flight swap and parks in the queue.
|
||||
planner := &stubPlanner{evict: map[string][]string{"z": {"A", "B", "C", "D"}}}
|
||||
cfg := config.FifoConfig{Priority: map[string]int{"A": 10, "B": 5, "C": 5, "D": 1}}
|
||||
s := NewFIFO("test", logmon.NewWriter(io.Discard), planner, cfg, eff)
|
||||
|
||||
s.OnRequest(req("z")) // StartSwap(z, [A,B,C,D])
|
||||
|
||||
// Arrive out of priority order; B before C exercises FIFO tie-breaking.
|
||||
for _, m := range []string{"B", "D", "C", "A"} {
|
||||
s.OnRequest(req(m))
|
||||
}
|
||||
|
||||
got := make([]string, len(s.queued))
|
||||
for i, q := range s.queued {
|
||||
got[i] = q.Model
|
||||
}
|
||||
want := []string{"A", "B", "C", "D"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("queue=%v want %v", got, want)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("queue=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
// Package scheduler contains the request-scheduling strategies used by the
|
||||
// router's baseRouter. A Scheduler owns the queue, in-flight tracking, and the
|
||||
// decision tree for when to start a swap versus queue a request. The baseRouter
|
||||
// owns the channels, run loop, and process machinery, and exposes the
|
||||
// side-effects a scheduler needs through the Effects interface.
|
||||
//
|
||||
// Splitting these apart lets the scheduling strategy be swapped out
|
||||
// independently of both the process machinery (baseRouter) and the eviction
|
||||
// policy (Swapper). FIFO is the first and currently only implementation.
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/process"
|
||||
)
|
||||
|
||||
// ErrModelNotFound is granted to callers whose model is not handled by this
|
||||
// router. The router package aliases it so SendError can match it.
|
||||
var ErrModelNotFound = fmt.Errorf("local model not found")
|
||||
|
||||
// Swapper is the eviction policy: it decides which running models must be
|
||||
// stopped before a target can serve. It is orthogonal to the scheduling
|
||||
// strategy — any Scheduler works with any Swapper.
|
||||
type Swapper interface {
|
||||
// EvictionFor returns running model IDs that must be stopped before
|
||||
// target can serve. running is the complete set the scheduler considers
|
||||
// live: every process that is not stopped, unioned with the targets of
|
||||
// in-flight swaps the scheduler has already committed to (which are not yet
|
||||
// visible in process state). The planner does not inspect process state
|
||||
// itself. Pure decision; must not log.
|
||||
EvictionFor(target string, running []string) []string
|
||||
|
||||
// OnSwapStart runs once at the start of every swap, with the same running
|
||||
// set EvictionFor was given for this decision. Planners may log their
|
||||
// decision here at whatever verbosity they choose.
|
||||
OnSwapStart(target string, running []string)
|
||||
}
|
||||
|
||||
// Scheduler decides what happens to each event the router's run loop receives.
|
||||
// All methods run on that single run-loop goroutine, so implementations need no
|
||||
// internal locking for their own state.
|
||||
type Scheduler interface {
|
||||
// OnRequest handles one incoming ServeHTTP request.
|
||||
OnRequest(req HandlerReq)
|
||||
// OnSwapDone handles a swap goroutine reporting completion.
|
||||
OnSwapDone(ev SwapDone)
|
||||
// OnServeDone handles a tracked ServeHTTP finishing (in-flight decrement).
|
||||
OnServeDone(ev ServeDoneEvent)
|
||||
// OnUnload reconciles scheduler state for an unload, stops the targeted
|
||||
// processes via Effects, and drains the queue. It must block until the
|
||||
// targeted processes have stopped.
|
||||
OnUnload(targets []string, timeout time.Duration)
|
||||
// OnShutdown grants err to every waiter the scheduler still holds (active
|
||||
// swap waiters and queued requests). Process teardown is the baseRouter's
|
||||
// responsibility.
|
||||
OnShutdown(err error)
|
||||
}
|
||||
|
||||
// Effects is implemented by the baseRouter. The scheduler calls back through it
|
||||
// for every side-effect: inspecting process state, launching swaps, responding
|
||||
// to callers, and stopping processes.
|
||||
type Effects interface {
|
||||
// ModelState returns the current state of a model's process. ok is false
|
||||
// when the model is not handled by this router.
|
||||
ModelState(modelID string) (process.ProcessState, bool)
|
||||
// RunningModels returns the state of every process that is not stopped or
|
||||
// shut down, keyed by model ID. The scheduler uses it to build the running
|
||||
// set it hands the Swapper.
|
||||
RunningModels() map[string]process.ProcessState
|
||||
// StartSwap launches the swap goroutine for modelID, stopping evict first.
|
||||
StartSwap(modelID string, evict []string)
|
||||
// GrantError responds to a caller with an error.
|
||||
GrantError(req HandlerReq, err error)
|
||||
// GrantServe hands a caller the wrapped handler for modelID and reports
|
||||
// whether the caller was still there to receive it. The scheduler bumps
|
||||
// its in-flight count only when this returns true.
|
||||
GrantServe(req HandlerReq, modelID string) bool
|
||||
// StopProcesses stops the named processes in parallel and blocks until all
|
||||
// have stopped. Unknown IDs are skipped.
|
||||
StopProcesses(timeout time.Duration, ids []string)
|
||||
}
|
||||
|
||||
// Factory builds a Scheduler bound to a baseRouter's Effects. The concrete
|
||||
// router captures its Swapper in the closure it passes as a Factory.
|
||||
type Factory func(name string, logger *logmon.Monitor, eff Effects) Scheduler
|
||||
|
||||
// HandlerReq is one in-flight ServeHTTP request waiting for a routing decision.
|
||||
type HandlerReq struct {
|
||||
Model string
|
||||
Ctx context.Context
|
||||
Respond chan HandlerResp
|
||||
PositionCh chan int
|
||||
}
|
||||
|
||||
// HandlerResp is the routing decision returned to a HandlerReq's caller: either
|
||||
// a handler to serve with, or an error.
|
||||
type HandlerResp struct {
|
||||
HandleFunc http.HandlerFunc
|
||||
Err error
|
||||
}
|
||||
|
||||
// SwapDone is reported by a swap goroutine when its target is ready (or failed).
|
||||
type SwapDone struct {
|
||||
ModelID string
|
||||
Err error
|
||||
}
|
||||
|
||||
// ServeDoneEvent is reported when a tracked ServeHTTP handler returns.
|
||||
type ServeDoneEvent struct {
|
||||
ModelID string
|
||||
}
|
||||
@@ -99,12 +99,13 @@ func New(cfg config.Config, muxlog *logmon.Monitor, proxylog *logmon.Monitor, up
|
||||
var local router.LocalRouter
|
||||
var err error
|
||||
|
||||
if cfg.Matrix != nil {
|
||||
switch cfg.Routing.Router.Use {
|
||||
case "matrix":
|
||||
local, err = router.NewMatrix(cfg, proxylog, upstreamlog)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating matrix router: %w", err)
|
||||
}
|
||||
} else {
|
||||
default: // "group"
|
||||
local, err = router.NewGroup(cfg, proxylog, upstreamlog)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating group router: %w", err)
|
||||
|
||||
@@ -84,10 +84,15 @@ func chatRequest(model string) *http.Request {
|
||||
|
||||
func TestServer_New_GroupConfig(t *testing.T) {
|
||||
discard := logmon.NewWriter(io.Discard)
|
||||
s, err := New(config.Config{HealthCheckTimeout: 15}, discard, discard, discard, nil, BuildInfo{})
|
||||
cfg := config.Config{HealthCheckTimeout: 15}
|
||||
cfg.Routing.Router.Use = "group"
|
||||
s, err := New(cfg, discard, discard, discard, nil, BuildInfo{})
|
||||
if err != nil {
|
||||
t.Fatalf("New (group): %v", err)
|
||||
}
|
||||
if _, ok := s.local.(*router.Group); !ok {
|
||||
t.Fatalf("localRouter=%T want *router.Group", s.local)
|
||||
}
|
||||
if err := s.Shutdown(time.Second); err != nil {
|
||||
t.Fatalf("Shutdown: %v", err)
|
||||
}
|
||||
@@ -95,11 +100,16 @@ func TestServer_New_GroupConfig(t *testing.T) {
|
||||
|
||||
func TestServer_New_MatrixConfig(t *testing.T) {
|
||||
discard := logmon.NewWriter(io.Discard)
|
||||
cfg := config.Config{HealthCheckTimeout: 15, Matrix: &config.MatrixConfig{}}
|
||||
cfg := config.Config{HealthCheckTimeout: 15}
|
||||
cfg.Routing.Router.Use = "matrix"
|
||||
cfg.Routing.Router.Settings.Matrix = &config.MatrixConfig{}
|
||||
s, err := New(cfg, discard, discard, discard, nil, BuildInfo{})
|
||||
if err != nil {
|
||||
t.Fatalf("New (matrix): %v", err)
|
||||
}
|
||||
if _, ok := s.local.(*router.Matrix); !ok {
|
||||
t.Fatalf("localRouter=%T want *router.Matrix", s.local)
|
||||
}
|
||||
if err := s.Shutdown(time.Second); err != nil {
|
||||
t.Fatalf("Shutdown: %v", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user