docs: update configuration.md for matrix

This commit is contained in:
Benson Wong
2026-04-14 22:01:03 -07:00
parent 4866d16c3e
commit 17233e9278
+199 -118
View File
@@ -22,7 +22,7 @@ models:
cmd: llama-server --port ${PORT} -m /path/to/third_model.gguf cmd: llama-server --port ${PORT} -m /path/to/third_model.gguf
``` ```
With this configuration models will be hot swapped and loaded on demand. The special `${PORT}` macro provides a unique port per model. Useful if you want to run multiple models at the same time with the `groups` feature. With this configuration models will be hot swapped and loaded on demand. The special `${PORT}` macro provides a unique port per model which is useful if you want to run multiple models at the same time with the `matrix` feature.
## Advanced control with `cmd` ## Advanced control with `cmd`
@@ -76,7 +76,7 @@ llama-swap supports many more features to customize how you want to manage your
| --------- | ---------------------------------------------- | | --------- | ---------------------------------------------- |
| `ttl` | automatic unloading of models after a timeout | | `ttl` | automatic unloading of models after a timeout |
| `macros` | reusable snippets to use in configurations | | `macros` | reusable snippets to use in configurations |
| `groups` | run multiple models at a time | | `matrix` | run multiple models at a time |
| `hooks` | event driven functionality | | `hooks` | event driven functionality |
| `env` | define environment variables per model | | `env` | define environment variables per model |
| `aliases` | serve a model with different names | | `aliases` | serve a model with different names |
@@ -141,6 +141,11 @@ logToStdout: "proxy"
# - useful for limiting memory usage when processing large volumes of metrics # - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000 metricsMaxInMemory: 1000
# captureBuffer: how many MBs to allocate for storing request/response captures
# - optional, default: 10
# - set to 0 to disable
captureBuffer: 15
# startPort: sets the starting port number for the automatic ${PORT} macro. # startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800 # - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings # - the ${PORT} macro can be used in model.cmd and model.proxy settings
@@ -161,15 +166,10 @@ sendLoadingState: true
# all fields except for Id so chat UIs can use the alias equivalent to the original. # all fields except for Id so chat UIs can use the alias equivalent to the original.
includeAliasesInList: false includeAliasesInList: false
# apiKeys: require an API key when making requests to inference endpoints # globalTTL: the default TTL in seconds before unloading a model
# - optional, default: [] # - optional, default: 0 (never automatically unload)
# - when empty (the default) authorization will not be checked as llama-swap is default-allow # - must be >= 0
# - each key is a non-empty string globalTTL: 0
apiKeys:
- "sk-hunter2"
# hint, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
- "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
- "sk-+QtIn0Zjj4UHjiaZYiZEnru4mrwKM9RzhmJeK5SobNXLl8QMFXxGz1/2lEuvQpkb"
# macros: a dictionary of string substitutions # macros: a dictionary of string substitutions
# - optional, default: empty dictionary # - optional, default: empty dictionary
@@ -181,6 +181,9 @@ apiKeys:
# - macro names must not be a reserved name: PORT or MODEL_ID # - macro names must not be a reserved name: PORT or MODEL_ID
# - macro values can be numbers, bools, or strings # - macro values can be numbers, bools, or strings
# - macros can contain other macros, but they must be defined before they are used # - macros can contain other macros, but they must be defined before they are used
# - environment variables can be referenced with ${env.VAR_NAME} syntax
# - env macros are substituted first, before regular macros
# - if the env var is not set, config loading will fail with an error
macros: macros:
# Example of a multi-line macro # Example of a multi-line macro
"latest-llama": > "latest-llama": >
@@ -193,6 +196,24 @@ macros:
# but they must be previously declared. # but they must be previously declared.
"default_args": "--ctx-size ${default_ctx}" "default_args": "--ctx-size ${default_ctx}"
# Example of environment variable macros
# - ${env.VAR_NAME} pulls the value from the system environment
# - useful for paths, secrets, or machine-specific configuration
"models_dir": "${env.HOME}/models"
# apiKeys: require an API key when making requests to inference endpoints
# - optional, default: []
# - when empty (the default) authorization will not be checked as llama-swap is default-allow
# - each key is a non-empty string
apiKeys:
- "sk-hunter2"
# tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
- "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
# use environment variable macros to keep secrets out of the config
- "${env.API_KEY_1}"
- "${env.API_KEY_2}"
# models: a dictionary of model configurations # models: a dictionary of model configurations
# - required # - required
# - each key is the model's ID, used in API requests # - each key is the model's ID, used in API requests
@@ -201,7 +222,7 @@ macros:
# - below are examples of all the settings a model can have # - below are examples of all the settings a model can have
models: models:
# keys are the model names used in API requests # keys are the model names used in API requests
"llama": "gpt-oss-120b":
# macros: a dictionary of string substitutions specific to this model # macros: a dictionary of string substitutions specific to this model
# - optional, default: empty dictionary # - optional, default: empty dictionary
# - macros defined here override macros defined in the global macros section # - macros defined here override macros defined in the global macros section
@@ -218,7 +239,7 @@ models:
cmd: | cmd: |
# ${latest-llama} is a macro that is defined above # ${latest-llama} is a macro that is defined above
${latest-llama} ${latest-llama}
--model path/to/llama-8B-Q4_K_M.gguf --model path/to/gpt-oss-120B.gguf
--ctx-size ${default_ctx} --ctx-size ${default_ctx}
--temperature ${temp} --temperature ${temp}
@@ -226,13 +247,13 @@ models:
# - optional, default: empty string # - optional, default: empty string
# - if set, it will be used in the v1/models API response # - if set, it will be used in the v1/models API response
# - if not set, it will be omitted in the JSON model record # - if not set, it will be omitted in the JSON model record
name: "llama 3.1 8B" name: "gpt-oss 120B"
# description: a description for the model # description: a description for the model
# - optional, default: empty string # - optional, default: empty string
# - if set, it will be used in the v1/models API response # - if set, it will be used in the v1/models API response
# - if not set, it will be omitted in the JSON model record # - if not set, it will be omitted in the JSON model record
description: "A small but capable model used for quick testing" description: "A thinking model from OpenAI"
# env: define an array of environment variables to inject into cmd's environment # env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array # - optional, default: empty array
@@ -247,14 +268,6 @@ models:
# - if you use a custom port in cmd this *must* be set # - if you use a custom port in cmd this *must* be set
proxy: http://127.0.0.1:8999 proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "gpt-4o-mini"
- "gpt-3.5-turbo"
# checkEndpoint: URL path to check if the server is ready # checkEndpoint: URL path to check if the server is ready
# - optional, default: /health # - optional, default: /health
# - endpoint is expected to return an HTTP 200 response # - endpoint is expected to return an HTTP 200 response
@@ -263,8 +276,10 @@ models:
checkEndpoint: /custom-endpoint checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after ttl seconds # ttl: automatically unload the model after ttl seconds
# - optional, default: 0 # - optional, default: -1 (use global default)
# - ttl values must be a value greater than 0 # - ttl values must be -1 or a value greater than or equal to 0
# - a ttl of -1 will use the global TTL value as the default
# - a ttl of 0 will mean never unload
# - a value of 0 disables automatic unloading of the model # - a value of 0 disables automatic unloading of the model
ttl: 60 ttl: 60
@@ -272,11 +287,11 @@ models:
# - optional, default: "" # - optional, default: ""
# - useful for when the upstream server expects a specific model name that # - useful for when the upstream server expects a specific model name that
# is different from the model's ID # is different from the model's ID
useModelName: "qwen:qwq" useModelName: "openai/gpt-oss-120B"
# filters: a dictionary of filter settings # filters: a dictionary of filter settings
# - optional, default: empty dictionary # - optional, default: empty dictionary
# - only stripParams is currently supported # - same capabilities as peer filters (stripParams, setParams)
filters: filters:
# stripParams: a comma separated list of parameters to remove from the request # stripParams: a comma separated list of parameters to remove from the request
# - optional, default: "" # - optional, default: ""
@@ -286,6 +301,43 @@ models:
# - recommended to stick to sampling parameters # - recommended to stick to sampling parameters
stripParams: "temperature, top_p, top_k" stripParams: "temperature, top_p, top_k"
# setParams: a dictionary of parameters to set/override in requests
# - optional, default: empty dictionary
# - useful for enforcing specific parameter values
# - protected params like "model" cannot be overridden
# - values can be strings, numbers, booleans, arrays, or objects
# - always runs for the model
setParams:
# Example: enforce specific sampling parameters
temperature: 0.7
top_p: 0.9
# setParamsByID: a dictionary of parameters to set based on the model ID
# - optional, default: empty dictionary
# - combine with aliases to create variant behaviour without reloading the model
# - parameters are set in the request body JSON
# - runs after setParams, so it overrides any values set there
# - protected params like "model" cannot be overridden
# - values can be strings, numbers, booleans, arrays, or objects
# - model aliases will be automatically created for each key
setParamsByID:
"${MODEL_ID}":
chat_template_kwargs:
reasoning_effort: medium
"${MODEL_ID}:high":
chat_template_kwargs:
reasoning_effort: high
"${MODEL_ID}:low":
chat_template_kwargs:
reasoning_effort: low
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "gpt-4o-mini"
# metadata: a dictionary of arbitrary values that are included in /v1/models # metadata: a dictionary of arbitrary values that are included in /v1/models
# - optional, default: empty dictionary # - optional, default: empty dictionary
# - while metadata can contain complex types it is recommended to keep it simple # - while metadata can contain complex types it is recommended to keep it simple
@@ -319,33 +371,26 @@ models:
# - recommended to be omitted and the default used # - recommended to be omitted and the default used
concurrencyLimit: 0 concurrencyLimit: 0
# timeouts: configure proxy connection timeouts for this model
# - optional, defaults shown below
# - useful for models on slower hardware that need longer timeouts
# - increase responseHeader to avoid "timeout awaiting response headers" errors
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
# connect: TCP connection timeout in seconds
# - default: 30
connect: 30
# responseHeader: time to wait for response headers in seconds
# - default: 60
# - for slow image generation or large models, consider increasing to 300+ seconds
responseHeader: 60
# tlsHandshake: TLS handshake timeout in seconds
# - default: 10
tlsHandshake: 10
# idleConn: idle connection timeout in seconds
# - default: 90
idleConn: 90
# sendLoadingState: overrides the global sendLoadingState setting for this model # sendLoadingState: overrides the global sendLoadingState setting for this model
# - optional, default: undefined (use global setting) # - optional, default: undefined (use global setting)
sendLoadingState: false sendLoadingState: false
# timeouts: configure proxy connection timeouts for this model
# - optional, defaults shown below
# - useful for models running on slower hardware that need longer timeouts
# - connect: TCP dial connection timeout in seconds, default: 30 seconds
# - keepalive: TCP connection keepalive timeout, default: 30 seconds
# - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
# - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
# - idleConn: idle connection timeout in seconds, default: 90 seconds
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
keepalive: 0
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# Unlisted model example: # Unlisted model example:
"qwen-unlisted": "qwen-unlisted":
# unlisted: boolean, true or false # unlisted: boolean, true or false
@@ -377,68 +422,83 @@ models:
# - processes have 5 seconds to shutdown until forceful termination is attempted # - processes have 5 seconds to shutdown until forceful termination is attempted
cmdStop: docker stop ${MODEL_ID} cmdStop: docker stop ${MODEL_ID}
# groups: a dictionary of group settings # =============================================================================
# - optional, default: empty dictionary # matrix: run concurrent models with a solver-based swap DSL
# - provides advanced controls over model swapping behaviour # =============================================================================
# - using groups some models can be kept loaded indefinitely, while others are swapped out
# - model IDs must be defined in the Models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
# #
# NOTE: the example below uses model names that are not defined above for demonstration purposes # Note:
groups: # A config must use either a matrix or legacy groups, not both. A configuration error
# group1 works the same as the default behaviour of llama-swap where only one model is allowed # will occur if both are defined. Configuration examples for legacy Groups can be found:
# to run a time across the whole llama-swap instance # https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396
"group1": #
# swap: controls the model swapping behaviour in within the group # The matrix declares valid combinations of models that can run concurrently.
# - optional, default: true # When a model is requested, the solver finds the cheapest way to make it
# - true : only one model is allowed to run at a time # available by evicting as few (and least costly) running models as possible.
# - false: all models can run together, no swapping #
swap: true # Solver behavior:
# 1. Request arrives for model X
# 2. If X is already running, forward immediately. Done.
# 3. Find all sets containing X
# 4. For each candidate set, compute cost: sum of evict_costs for
# every running model NOT in that set
# 5. Pick lowest cost candidate. Ties broken by definition order.
# 6. Evict what needs to stop. Start X. Forward request.
#
# Subset semantics: a set [a, b, c] means any subset is valid.
# Only the requested model is started — others are not preloaded.
#
# A model not appearing in any set can only run alone.
#
matrix:
# vars: short names for models (alphanumeric, 1-8 chars)
# - required for sets and evict_costs settings
# - each entry maps a short name to a real model ID. Do not use an alias
# - used to keep set DSL logic short and easier to read
# - sets and evict_costs only use identifiers defined in vars
vars:
g: gemma-model
q: qwen-model
m: mistral-model
v: voxtral-model
e: reranker-model
L: llama-70B
sd: stable-diffusion
# exclusive: controls how the group affects other groups # evict_costs: relative cost of losing a running model (default: 1)
# - optional, default: true evict_costs:
# - true: causes all other groups to unload when this group runs a model v: 50 # vllm backend, slow cold start
# - false: does not affect other groups L: 30 # 70B weights, slow to load
exclusive: true
# members references the models defined above # sets: named sets of concurrent model combinations
# required # Values are DSL strings with operators:
members: # & AND (models run together)
- "llama" # | OR (alternatives)
- "qwen-unlisted" # () grouping
# +ref inline another set's expression
#
# Expansion examples:
# "L" → [L]
# "a & b" → [a, b]
# "a | b" → [a], [b]
# "(a | b) & c" → [a, c], [b, c]
# "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d]
# "+llms & v" → expands llms inline, then applies & v
sets:
# LLM + TTS: switching between g/q/m won't evict v
# expands to: [g,v], [q,v], [m,v]
standard: "(g | q | m) & v"
# Example: # LLM + TTS + reranker
# - in group2 all models can run at the same time # expands to: [g,v,e], [q,v,e]
# - when a different group is loaded it causes all running models in this group to unload with_rerank: "(g | q) & v & e"
"group2":
swap: false
# exclusive: false does not unload other groups when a model in group2 is requested # LLM + image generation, no TTS
# - the models in group2 will be loaded but will not unload any other groups # expands to: [g,sd], [q,sd]
exclusive: false creative: "(g | q) & sd"
members:
- "docker-llama"
- "modelA"
- "modelB"
# Example: # 70B model uses all GPUs, can only run alone
# - a persistent group, prevents other groups from unloading it # expands to: [L]
"forever": full: "L"
# persistent: prevents over groups from unloading the models in this group
# - optional, default: false
# - does not affect individual model behaviour
persistent: true
# set swap/exclusive to false to prevent swapping inside the group
# and the unloading of other groups
swap: false
exclusive: false
members:
- "forever-modelA"
- "forever-modelB"
- "forever-modelc"
# hooks: a dictionary of event triggers and actions # hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary # - optional, default: empty dictionary
@@ -467,17 +527,6 @@ peers:
# - required # - required
# - requested path to llama-swap will be appended to the end of the proxy value # - requested path to llama-swap will be appended to the end of the proxy value
proxy: http://192.168.1.23 proxy: http://192.168.1.23
# timeouts: configure proxy connection timeouts for this peer
# - optional, defaults shown below
# - useful when the peer runs on slower hardware
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# models: a list of models served by the peer # models: a list of models served by the peer
# - required # - required
models: models:
@@ -490,7 +539,8 @@ peers:
# - optional, default: "" # - optional, default: ""
# - if blank, no key will be added to the request # - if blank, no key will be added to the request
# - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key> # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
apiKey: sk-your-openrouter-key # - can be a string or a macro
apiKey: ${env.OPENROUTER_API_KEY}
models: models:
- meta-llama/llama-3.1-8b-instruct - meta-llama/llama-3.1-8b-instruct
- qwen/qwen3-235b-a22b-2507 - qwen/qwen3-235b-a22b-2507
@@ -498,4 +548,35 @@ peers:
- z-ai/glm-4.7 - z-ai/glm-4.7
- moonshotai/kimi-k2-0905 - moonshotai/kimi-k2-0905
- minimax/minimax-m2.1 - minimax/minimax-m2.1
# timeouts: configure proxy connection timeouts for this peer
# - optional, defaults shown below
# - useful when the peer runs on slower hardware
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
keepalive: 30
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# filters: a dictionary of filter settings for peer requests
# - optional, default: empty dictionary
# - same capabilities as model filters (stripParams, setParams)
filters:
# stripParams: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for removing parameters that the peer doesn't support
# - the `model` parameter can never be removed
stripParams: "temperature, top_p"
# setParams: a dictionary of parameters to set/override in requests to this peer
# - optional, default: empty dictionary
# - useful for injecting provider-specific settings like data retention policies
# - protected params like "model" cannot be overridden
# - values can be strings, numbers, booleans, arrays, or objects
setParams:
# Example: enforce zero-data-retention for OpenRouter
provider:
data_collection: "deny"
zdr: true
``` ```