# add this modeline for validation in vscode
# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
#
# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
# This example configuration is written to be LLM friendly. Try
# copying this file into an LLM and asking it to explain or generate
# sections for you.
# ====================================

# Usage notes:
# - Below are all the available configuration options for llama-swap.
# - Settings noted as "required" must be in your configuration file
# - Settings noted as "optional" can be omitted

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# logTimeFormat: enables and sets the logging timestamp format
# - optional, default (disabled): ""
# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
#   "stamp", "stampmilli", "stampmicro", and "stampnano".
# - For more info, read: https://pkg.go.dev/time#pkg-constants
logTimeFormat: ""

# logToStdout: controls what is logged to stdout
# - optional, default: "proxy"
# - valid values:
#   - "proxy": logs generated by llama-swap when swapping models,
#     handling requests, etc.
#   - "upstream": a copy of the upstream process's stdout logs
#   - "both": both the proxy and upstream logs interleaved together
#   - "none": no logs are ever written to stdout
logToStdout: "proxy"

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# captureBuffer: how many MBs to allocate for storing request/response captures
# - optional, default: 10
# - set to 0 to disable
captureBuffer: 15

# performance: configuration for system monitoring statistics
# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
performance:
  # disabled: boolean
  # - default: false
  disabled: false

  # every: delay between polling for new performance statistics
  # - default: 5s
  # - minimum duration 5s
  every: 15s

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# sendLoadingState: inject loading status updates into the reasoning (thinking)
# field
# - optional, default: false
# - when true, a stream of loading messages will be sent to the client in the
#   reasoning field so chat UIs can show that loading is in progress.
# - see #366 for more details
sendLoadingState: true

# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
# - optional, default: false
# - when true, model aliases will be output to the API model listing duplicating
#   all fields except for Id so chat UIs can use the alias equivalent to the original.
includeAliasesInList: false

# globalTTL: the default TTL in seconds before unloading a model
# - optional, default: 0 (never automatically unload)
# - must be >= 0
globalTTL: 0

# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets
# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
# - useful for reducing common configuration settings
# - macro names are strings and must be less than 64 characters
# - macro names must match the regex ^[a-zA-Z0-9_-]+$
# - macro names must not be a reserved name: PORT or MODEL_ID
# - macro values can be numbers, bools, or strings
# - macros can contain other macros, but they must be defined before they are used
# - environment variables can be referenced with ${env.VAR_NAME} syntax
# - env macros are substituted first, before regular macros
# - if the env var is not set, config loading will fail with an error
macros:
  # Example of a multi-line macro
  "latest-llama": >
    /path/to/llama-server/llama-server-ec9e0301
    --port ${PORT}

  "default_ctx": 4096

  # Example of macro-in-macro usage. macros can contain other macros
  # but they must be previously declared.
  "default_args": "--ctx-size ${default_ctx}"

  # Example of environment variable macros
  # - ${env.VAR_NAME} pulls the value from the system environment
  # - useful for paths, secrets, or machine-specific configuration
  "models_dir": "${env.HOME}/models"

# apiKeys: require an API key when making requests to inference endpoints
# - optional, default: []
# - when empty (the default) authorization will not be checked as llama-swap is default-allow
# - each key is a non-empty string
apiKeys:
  - "sk-hunter2"
  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
  # use environment variable macros to keep secrets out of the config
  - "${env.API_KEY_1}"
  - "${env.API_KEY_2}"

# upstream: controls behaviour of the /upstream passthrough endpoint
# - optional, default: empty dictionary
# - recommended to only use in special use cases. Leaving it as the
#   default will typically be the best experience
upstream:
  # ignorePaths: list of RE2 compatible regular expressions
  # - default: (see below)
  # - any request to a path matching any of the regular expressions
  #   will be ignored and not trigger a swap
  ignorePaths:
    - '.*\.(js|json|css|png|gif|jpg|jpeg|ico|txt)$'

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
# - below are examples of all the settings a model can have
models:
  # keys are the model names used in API requests
  "gpt-oss-120b":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
    # - model level macros follow the same rules as global macros
    macros:
      "default_ctx": 16384
      "temp": 0.7

    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
    # - using `|` allows for comments in the command, these will be parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

    # name: a display name for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    name: "gpt-oss 120B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    description: "A thinking model from OpenAI"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
    # - each value is a single string
    # - in the format: ENV_NAME=value
    env:
      - "CUDA_VISIBLE_DEVICES=0,1,2"

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready or fails
    # - use "none" to skip endpoint health checking
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
    # - optional, default: -1 (use global default)
    # - ttl values must be -1 or a value greater than or equal to 0
    # - a ttl of -1 will use the global TTL value as the default
    # - a ttl of 0 disables automatic unloading (the model is never unloaded)
    ttl: 60

    # useModelName: override the model name that is sent to upstream server
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
    useModelName: "openai/gpt-oss-120B"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - same capabilities as peer filters (stripParams, setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      stripParams: "temperature, top_p, top_k"

      # setParams: a dictionary of parameters to set/override in requests
      # - optional, default: empty dictionary
      # - useful for enforcing specific parameter values
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - always runs for the model
      setParams:
        # Example: enforce specific sampling parameters
        temperature: 0.7
        top_p: 0.9

      # setParamsByID: a dictionary of parameters to set based on the model ID
      # - optional, default: empty dictionary
      # - combine with aliases to create variant behaviour without reloading the model
      # - parameters are set in the request body JSON
      # - run after setParams so it will override any settings
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - model aliases will be automatically created for each key
      setParamsByID:
        "${MODEL_ID}":
          chat_template_kwargs:
            reasoning_effort: medium
        "${MODEL_ID}:high":
          chat_template_kwargs:
            reasoning_effort: high
        "${MODEL_ID}:low":
          chat_template_kwargs:
            reasoning_effort: low

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"

    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contain complex types it is recommended to keep it simple
    # - metadata is only passed through in /v1/models responses
    metadata:
      # port will remain an integer
      port: ${PORT}

      # the ${temp} macro will remain a float
      temperature: ${temp}
      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp}, context=${default_ctx}"

      a_list:
        - 1
        - 1.23
        - "macros are OK in list and dictionary types: ${MODEL_ID}"

      an_obj:
        a: "1"
        b: 2
        # objects can contain complex types with macro substitution
        # becomes: c: [0.7, false, "model: gpt-oss-120b"]
        c: ["${temp}", false, "model: ${MODEL_ID}"]

    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0
    # - useful for limiting the number of active parallel requests a model can process
    # - must be set per model
    # - any number greater than 0 will override the internal default value of 10
    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

    # sendLoadingState: overrides the global sendLoadingState setting for this model
    # - optional, default: undefined (use global setting)
    sendLoadingState: false

    # timeouts: configure proxy connection timeouts for this model
    # - optional, defaults shown below
    # - useful for models running on slower hardware that need longer timeouts
    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
    # - keepalive: TCP connection keepalive timeout, default: 30 seconds
    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
    # - idleConn: idle connection timeout in seconds, default: 90 seconds
    # - set any value to 0 to disable that timeout (not recommended)
    timeouts:
      connect: 30
      keepalive: 0
      responseHeader: 60
      tlsHandshake: 10
      idleConn: 90

    # capabilities: defines what the model accepts for input, output and other metadata
    # - optional; omitted or all-zero means no capabilities
    # - used in v1/models to inform clients what the model can do
    capabilities:
      # in: list of modalities understood by the model
      # - default: []
      # - valid: text, audio, image
      in:
        - text
        - audio
        - image

      # out: list of modalities generated by the model
      # - default: []
      # - valid: text, audio, image
      out:
        - text
        - audio
        - image

      # tools: the model supports function calling
      # - default: false
      tools: true

      # reranker: the model supports the /v1/rerank endpoint
      # - default: false
      reranker: false

      # context: the maximum token context length supported
      # - default: 0
      # - must be an integer > 0
      context: 32000

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
    # - optional, default: false
    # - unlisted models do not show up in /v1/models api requests
    # - can be requested as normal through all apis
    unlisted: true
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can be used reliably with
  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - the upstream's process id is available in the ${PID} macro
    #
    # When empty, llama-swap has this default behaviour:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    #
    # NOTE: "llama" is an illustrative name; it is not one of the model IDs
    # defined in the models section of this example, so replace it with a
    # real model ID before use.
    preload:
      - "llama"

# routing:
# Controls how llama-swap decides which models can run at the same time and
# which get swapped out. Choose one of two swap engines:
#
# - group:  the default engine. Simpler to configure. You define groups of
#           models that run together, and loading one group typically unloads
#           the others.
#
# - matrix: the newer engine. More involved to configure, but far more
#           flexible. It uses a small expression language to describe which
#           model combinations are allowed to run concurrently, enabling
#           setups that groups cannot express.
#
# The routing section is optional.
routing:
  router:
    # use: a string defining which engine to use
    # - optional, default: "group"
    # - valid values: group, matrix
    use: group

    # settings: a dictionary of settings for the specific engines
    settings:
      # groups: a dictionary of named groups
      # - optional, default: empty dictionary
      # - lets you keep some models loaded while others swap out
      # - every member must be a model ID defined in the models section
      # - a model can belong to only one group
      # - behaviour is set per group with the `swap`, `exclusive` and
      #   `persistent` fields
      # - see issue #109 for details
      #
      # NOTE: the model names below are illustrative and are not defined above.
      groups:
        # group1 reproduces llama-swap's default behaviour: only one model
        # runs at a time across the entire instance.
        "group1":
          # swap: how members of this group swap among themselves
          # - optional, default: true
          # - true: only one member runs at a time
          # - false: all members can run together, no swapping
          swap: true

          # exclusive: how this group affects other groups
          # - optional, default: true
          # - true: running a member unloads every other group
          # - false: running a member leaves other groups untouched
          exclusive: true

          # members: the model IDs in this group
          # - required
          members:
            - "llama"
            - "qwen-unlisted"

        # group2: members all run together and loading them leaves other
        # groups untouched. The group is not persistent, so an exclusive
        # group (such as group1) still unloads its members.
        "group2":
          # swap: false lets all members stay loaded at once
          swap: false

          # exclusive: false means requesting a member loads it without
          # unloading any other group
          exclusive: false
          members:
            - "docker-llama"
            - "modelA"
            - "modelB"

        # forever: a persistent group that other groups can never unload.
        "forever":
          # persistent: other groups cannot unload this group's members
          # - optional, default: false
          # - has no effect on swapping within the group
          persistent: true

          # swap/exclusive: false keeps all members loaded and avoids
          # unloading other groups
          swap: false
          exclusive: false
          members:
            - "forever-modelA"
            - "forever-modelB"
            - "forever-modelc"

      # The matrix lists the model combinations that are allowed to run
      # concurrently. When a model is requested, the solver makes room for it
      # by evicting as few running models as possible, preferring to keep the
      # costliest ones loaded.
      #
      # Solver behaviour:
      #   1. A request arrives for model X.
      #   2. If X is already running, forward the request. Done.
      #   3. Collect every set that contains X.
      #   4. For each set, add up the evict_costs of the running models that
      #      are NOT in that set — that is the set's cost.
      #   5. Choose the lowest-cost set. Break ties by definition order.
      #   6. Evict the models outside that set, start X, forward the request.
      #
      # Subset semantics: a set [a, b, c] also permits any subset of itself.
      # Only the requested model is started; the others are not preloaded.
      #
      # A model that appears in no set can only run on its own.
      #
      matrix:
        # vars: short aliases for model IDs (alphanumeric, 1-8 chars)
        # - required: sets and evict_costs reference these names, not model IDs
        # - map each short name to a real model ID (not a model alias)
        # - keeps the set expressions short and readable
        vars:
          g: gemma-model
          q: qwen-model
          m: mistral-model
          v: voxtral-model
          e: reranker-model
          L: llama-70B
          sd: stable-diffusion

        # evict_costs: relative cost of losing a running model (default: 1)
        evict_costs:
          v: 50  # vllm backend, slow cold start
          L: 30  # 70B weights, slow to load

        # sets: named combinations of models that may run together.
        # Each value is an expression built from these operators:
        #   &     AND (models run together)
        #   |     OR (alternatives)
        #   ()    grouping
        #   +ref  inline the expression of another set
        #
        # Each expression expands into one or more concrete sets:
        #   "L"                 → [L]
        #   "a & b"             → [a, b]
        #   "a | b"             → [a], [b]
        #   "(a | b) & c"       → [a, c], [b, c]
        #   "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d]
        #   "+llms & v"         → inline the llms set, then AND with v
        sets:
          # An LLM plus TTS. Switching between g/q/m keeps v loaded.
          # expands to: [g,v], [q,v], [m,v]
          standard: "(g | q | m) & v"

          # An LLM plus TTS plus reranker.
          # expands to: [g,v,e], [q,v,e]
          with_rerank: "(g | q) & v & e"

          # An LLM plus image generation, no TTS.
          # expands to: [g,sd], [q,sd]
          creative: "(g | q) & sd"

          # The 70B model uses every GPU, so it can only run alone.
          # expands to: [L]
          full: "L"

  # scheduler: how queued requests are ordered.
  # The default and only valid scheduler is "fifo"
  scheduler:
    use: fifo
    settings:
      fifo:
        # priority: a dictionary of model ID -> priority
        # - optional, default: empty dictionary
        # - models default to priority 0
        # - higher priority requests are serviced first in the queue
        priority:
          A: 10
          B: 5
          C: 5
          D: 1

# peers: a dictionary of remote peers and models they provide
# - optional, default: empty dictionary
# - peers can be another llama-swap
# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
peers:
  # keys are the peer IDs
  llama-swap-peer:
    # proxy: a valid base URL to proxy requests to
    # - required
    # - requested path to llama-swap will be appended to the end of the proxy value
    proxy: http://192.168.1.23

    # models: a list of models served by the peer
    # - required
    models:
      - model_a
      - model_b
      - embeddings/model_c

  openrouter:
    proxy: https://openrouter.ai/api

    # apiKey: a string key to be injected into the request
    # - optional, default: ""
    # - if blank, no key will be added to the request
    # - key will be injected into the `Authorization: Bearer` and `x-api-key` headers
    # - can be a string or a macro
    apiKey: ${env.OPENROUTER_API_KEY}
    models:
      - meta-llama/llama-3.1-8b-instruct
      - qwen/qwen3-235b-a22b-2507
      - deepseek/deepseek-v3.2
      - z-ai/glm-4.7
      - moonshotai/kimi-k2-0905
      - minimax/minimax-m2.1

    # timeouts: configure proxy connection timeouts for this peer
    # - optional, defaults shown below
    # - useful when the peer runs on slower hardware
    # - set any value to 0 to disable that timeout (not recommended)
    timeouts:
      connect: 30
      keepalive: 30
      responseHeader: 60
      tlsHandshake: 10
      idleConn: 90

    # filters: a dictionary of filter settings for peer requests
    # - optional, default: empty dictionary
    # - same capabilities as model filters (stripParams, setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for removing parameters that the peer doesn't support
      # - the `model` parameter can never be removed
      stripParams: "temperature, top_p"

      # setParams: a dictionary of parameters to set/override in requests to this peer
      # - optional, default: empty dictionary
      # - useful for injecting provider-specific settings like data retention policies
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      setParams:
        # Example: enforce zero-data-retention for OpenRouter
        provider:
          data_collection: "deny"
          zdr: true