# llama-swap/config.example.yaml

# add this modeline for validation in vscode
# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
#
# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
#  This example configuration is written to be LLM friendly. Try
#  copying this file into an LLM and asking it to explain or generate
#  sections for you.
# ====================================

# Usage notes:
# - Below are all the available configuration options for llama-swap.
# - Settings noted as "required" must be in your configuration file
# - Settings noted as "optional" can be omitted

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging level
# - optional, default: info
# - valid log levels: debug, info, warn, error
logLevel: info

# logTimeFormat: enables and sets the logging timestamp format
# - optional, default (disabled): ""
# - valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
#   "stamp", "stampmilli", "stampmicro", and "stampnano".
# - for more info, read: https://pkg.go.dev/time#pkg-constants
logTimeFormat: ""

# logToStdout: controls what is logged to stdout
# - optional, default: "proxy"
# - valid values:
#   - "proxy": logs generated by llama-swap when swapping models,
#      handling requests, etc.
#   - "upstream": a copy of the upstream process's stdout logs
#   - "both": both the proxy and upstream logs interleaved together
#   - "none": no logs are ever written to stdout
logToStdout: "proxy"

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# captureBuffer: how many MB to allocate for storing request/response captures
# - optional, default: 10
# - set to 0 to disable
captureBuffer: 15

# performance: configuration for system monitoring statistics
# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
performance:
  # disabled: boolean
  # - default: false
  disabled: false

  # every: interval between polls for new performance statistics
  # - default: 5s
  # - minimum duration: 5s
  every: 15s

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# sendLoadingState: inject loading status updates into the reasoning (thinking) field
# - optional, default: false
# - when true, a stream of loading messages will be sent to the client in the
#   reasoning field so chat UIs can show that loading is in progress.
# - see issue #366 for more details
sendLoadingState: true

# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
# - optional, default: false
# - when true, model aliases will be output to the API model listing, duplicating
#   all fields except for Id, so chat UIs can use the alias equivalent to the original.
includeAliasesInList: false

# globalTTL: the default TTL in seconds before unloading a model
# - optional, default: 0 (never automatically unload)
# - must be >= 0
globalTTL: 0

# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets that cut down on repeated configuration
# - usable in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
# - naming rules:
#   - names are strings and must be less than 64 characters
#   - names must match the regex ^[a-zA-Z0-9_-]+$
#   - names must not be a reserved name: PORT or MODEL_ID
# - values can be numbers, bools, or strings
# - a macro may reference other macros, as long as those are defined before it
# - environment variables are referenced with the ${env.VAR_NAME} syntax
#   - env macros are substituted first, before regular macros
#   - if the env var is not set, config loading will fail with an error
macros:
  # A macro written as a folded (>) block scalar, so a long value can be
  # wrapped over several lines
  latest-llama: >
    /path/to/llama-server/llama-server-ec9e0301 --port ${PORT}

  default_ctx: 4096

  # Macro-in-macro usage: default_ctx is declared above, so it can be
  # referenced here.
  default_args: '--ctx-size ${default_ctx}'

  # Environment variable macro
  # - ${env.VAR_NAME} pulls the value from the system environment
  # - useful for paths, secrets, or machine-specific configuration
  models_dir: '${env.HOME}/models'

# apiKeys: require an API key when making requests to inference endpoints
# - optional, default: []
# - when empty (the default) authorization will not be checked as llama-swap is default-allow
# - each key is a non-empty string
# - the literal keys below are samples only; generate your own before use
apiKeys:
  - "sk-hunter2"
  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"

  # use environment variable macros to keep secrets out of the config
  # - config loading fails if a referenced environment variable is not set
  - "${env.API_KEY_1}"
  - "${env.API_KEY_2}"

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
# - below are examples of all the settings a model can have
models:
  # keys are the model names used in API requests
  "gpt-oss-120b":
    # macros: per-model string substitutions
    # - optional, default: empty dictionary
    # - entries here override same-named macros from the global macros section
    # - the rules for global macros apply to model level macros as well
    macros:
      default_ctx: 16384
      temp: 0.7

    # cmd: the command that starts the inference server
    # - required
    # - a plain string, written the way you would run it on the CLI
    # - with the `|` block style the command may contain comments; they are parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

    # name: display name of the model
    # - optional, default: empty string
    # - when set, it is used in the v1/models API response
    # - when not set, it is omitted from the JSON model record
    name: 'gpt-oss 120B'

    # description: short description of the model
    # - optional, default: empty string
    # - when set, it is used in the v1/models API response
    # - when not set, it is omitted from the JSON model record
    description: 'A thinking model from OpenAI'

    # env: environment variables injected into cmd's environment
    # - optional, default: empty array
    # - one string per variable, in the format ENV_NAME=value
    env:
      - 'CUDA_VISIBLE_DEVICES=0,1,2'

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    # - the cmd above takes its port from ${PORT} (through the latest-llama
    #   macro), so the proxy must point at that same ${PORT}
    proxy: "http://127.0.0.1:${PORT}"

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready or fails
    # - use "none" to skip endpoint health checking
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
    # - optional, default: -1 (use global default)
    # - must be greater than or equal to 0, apart from the special value -1
    # - a ttl of -1 uses the global TTL value (globalTTL) as the default
    # - a ttl of 0 disables automatic unloading of the model (never unload)
    ttl: 60

    # useModelName: override the model name that is sent to upstream server
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
    useModelName: "openai/gpt-oss-120B"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - same capabilities as peer filters (stripParams, setParams)
    filters:
      # stripParams: a comma-separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      stripParams: "temperature, top_p, top_k"

      # setParams: a dictionary of parameters to set/override in requests
      # - optional, default: empty dictionary
      # - useful for enforcing specific parameter values
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - always runs for the model
      setParams:
        # Example: enforce specific sampling parameters
        temperature: 0.7
        top_p: 0.9

      # setParamsByID: a dictionary of parameters to set based on the model ID
      # - optional, default: empty dictionary
      # - combine with aliases to create variant behaviour without reloading the model
      # - parameters are set in the request body JSON
      # - runs after setParams, so it overrides any settings made there
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - model aliases will be automatically created for each key
      setParamsByID:
        "${MODEL_ID}":
          chat_template_kwargs:
            reasoning_effort: medium
        "${MODEL_ID}:high":
          chat_template_kwargs:
            reasoning_effort: high
        "${MODEL_ID}:low":
          chat_template_kwargs:
            reasoning_effort: low

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"

    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contain complex types it is recommended to keep it simple
    # - metadata is only passed through in /v1/models responses
    metadata:
      # port will remain an integer
      port: ${PORT}

      # the ${temp} macro will remain a float
      temperature: ${temp}
      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
        context=${default_ctx}"

      a_list:
        - 1
        - 1.23
        - "macros are OK in list and dictionary types: ${MODEL_ID}"

      an_obj:
        a: "1"
        b: 2
        # objects can contain complex types with macro substitution
        # becomes: c: [0.7, false, "model: gpt-oss-120b"]
        c: ["${temp}", false, "model: ${MODEL_ID}"]

    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0 (keep the internal default value of 10)
    # - useful for limiting the number of active parallel requests a model can process
    # - must be set per model
    # - any number greater than 0 will override the internal default value of 10
    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

    # sendLoadingState: overrides the global sendLoadingState setting for this model
    # - optional, default: undefined (use global setting)
    sendLoadingState: false

    # timeouts: configure proxy connection timeouts for this model
    # - optional, the default for each field is listed below
    # - useful for models running on slower hardware that need longer timeouts
    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
    # - keepalive: TCP connection keepalive timeout, default: 30 seconds
    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
    # - idleConn: idle connection timeout in seconds, default: 90 seconds
    # - set any value to 0 to disable that timeout (not recommended)
    timeouts:
      connect: 30
      keepalive: 30
      # non-default example value: the documented default is 0 (no timeout)
      responseHeader: 60
      tlsHandshake: 10
      idleConn: 90

    # capabilities: describes what the model accepts for input, output and other metadata
    # - optional; omitted or all-zero means no capabilities
    # - reported in v1/models to inform clients what the model can do
    capabilities:
      # in: modalities understood by the model
      # - default: []
      # - valid: text, audio, image
      in: [text, audio, image]

      # out: modalities generated by the model
      # - default: []
      # - valid: text, audio, image
      out: [text, audio, image]

      # tools: whether the model supports function calling
      # - default: false
      tools: true

      # reranker: whether the model supports the /v1/rerank endpoint
      # - default: false
      reranker: false

      # context: the maximum token context length supported
      # - default: 0
      # - must be an integer > 0
      context: 32000

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
    # - optional, default: false
    # - unlisted models do not show up in /v1/models API responses
    # - can be requested as normal through all APIs
    unlisted: true
    # cmd uses ${PORT} and no proxy is set, so the default proxy applies
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can be used reliably with
  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    # the container is named after ${MODEL_ID} so cmdStop below can refer to it;
    # -p publishes host port ${PORT} to port 8080 inside the container
    cmd: |
      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - the upstream's process id is available in the ${PID} macro
    #
    # When empty, llama-swap has this default behaviour:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows: taskkill is called to stop the process
    # - processes have 5 seconds to shut down before forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    preload:
      - "gpt-oss-120b"

# routing:
# Controls how llama-swap decides which models can run at the same time and
# which get swapped out. Choose one of two swap engines:
#
# - group:  the default engine. Simpler to configure. You define groups of
#           models that run together, and loading one group typically unloads
#           the others.
#
# - matrix: the newer engine. More involved to configure, but far more
#           flexible. It uses a small expression language to describe which
#           model combinations are allowed to run concurrently, enabling
#           setups that groups cannot express.
#
# The routing section is optional.
routing:
  router:
    # use: a string defining which engine to use
    # - optional, default: "group"
    # - valid values: group, matrix
    # - this example shows settings for both engines under `settings`
    use: group

    # settings: a dictionary of settings for the specific engines
    settings:
      # groups: a dictionary of named groups
      # - optional, default: empty dictionary
      # - lets you keep some models loaded while others swap out
      # - every member must be a model ID defined in the models section
      # - a model can belong to only one group
      # - behaviour is set per group with the `swap`, `exclusive` and
      #   `persistent` fields
      # - see issue #109 for details
      #
      # NOTE: the model names below are illustrative and are not defined above.
      groups:
        # group1 reproduces llama-swap's default behaviour: only one model
        # runs at a time across the entire instance.
        "group1":
          # swap: how members of this group swap among themselves
          # - optional, default: true
          # - true:  only one member runs at a time
          # - false: all members can run together, no swapping
          swap: true

          # exclusive: how this group affects other groups
          # - optional, default: true
          # - true:  running a member unloads every other group
          # - false: running a member leaves other groups untouched
          exclusive: true

          # members: the model IDs in this group
          # - required
          members:
            - "llama"
            - "qwen-unlisted"

        # group2: members all run together, and loading one of them does not
        # unload other groups. An exclusive group (such as group1) still
        # unloads them when one of its own members is loaded.
        "group2":
          # swap: false lets all members stay loaded at once
          swap: false

          # exclusive: false means requesting a member loads it without
          # unloading any other group
          exclusive: false
          members:
            - "docker-llama"
            - "modelA"
            - "modelB"

        # forever: a persistent group that other groups can never unload.
        "forever":
          # persistent: other groups cannot unload this group's members
          # - optional, default: false
          # - has no effect on swapping within the group
          persistent: true

          # swap/exclusive: false keeps all members loaded and avoids
          # unloading other groups
          swap: false
          exclusive: false
          members:
            - "forever-modelA"
            - "forever-modelB"
            - "forever-modelc"

      # matrix: settings for the matrix engine
      # NOTE(review): `use` above is set to group; presumably this section only
      # takes effect when `use: matrix` is selected — confirm.
      #
      # The matrix lists the model combinations that are allowed to run
      # concurrently. When a model is requested, the solver makes room for it
      # by evicting as few running models as possible, preferring to keep the
      # costliest ones loaded.
      #
      # Solver behaviour:
      #   1. A request arrives for model X.
      #   2. If X is already running, forward the request. Done.
      #   3. Collect every set that contains X.
      #   4. For each set, add up the evict_costs of the running models that
      #      are NOT in that set — that is the set's cost.
      #   5. Choose the lowest-cost set. Break ties by definition order.
      #   6. Evict the models outside that set, start X, forward the request.
      #
      # Subset semantics: a set [a, b, c] also permits any subset of itself.
      # Only the requested model is started; the others are not preloaded.
      #
      # A model that appears in no set can only run on its own.
      #
      matrix:
        # vars: short aliases for model IDs (alphanumeric, 1-8 chars)
        # - required: sets and evict_costs reference these names, not model IDs
        # - map each short name to a real model ID (not a model alias)
        # - keeps the set expressions short and readable
        # - the model IDs below are illustrative and are not defined above
        vars:
          g: gemma-model
          q: qwen-model
          m: mistral-model
          v: voxtral-model
          e: reranker-model
          L: llama-70B
          sd: stable-diffusion

        # evict_costs: relative cost of losing a running model (default: 1)
        # - keys are the short names declared in vars
        evict_costs:
          v: 50 # vllm backend, slow cold start
          L: 30 # 70B weights, slow to load

        # sets: named combinations of models that may run together.
        # Each value is an expression built from these operators:
        #   &     AND   (models run together)
        #   |     OR    (alternatives)
        #   ()    grouping
        #   +ref  inline the expression of another set
        #
        # Each expression expands into one or more concrete sets:
        #   "L"                  → [L]
        #   "a & b"              → [a, b]
        #   "a | b"              → [a], [b]
        #   "(a | b) & c"        → [a, c], [b, c]
        #   "(a | b) & (c | d)"  → [a,c], [a,d], [b,c], [b,d]
        #   "+llms & v"          → inline the llms set, then AND with v
        sets:
          # An LLM plus TTS. Switching between g/q/m keeps v loaded.
          # expands to: [g,v], [q,v], [m,v]
          standard: "(g | q | m) & v"

          # An LLM plus TTS plus reranker.
          # expands to: [g,v,e], [q,v,e]
          with_rerank: "(g | q) & v & e"

          # An LLM plus image generation, no TTS.
          # expands to: [g,sd], [q,sd]
          creative: "(g | q) & sd"

          # The 70B model uses every GPU, so it can only run alone.
          # expands to: [L]
          full: "L"

  # scheduler: how queued requests are ordered.
  # - the default and only valid scheduler is "fifo"
  scheduler:
    use: fifo
    settings:
      fifo:
        # priority: a dictionary of model ID -> priority
        # - optional, default: empty dictionary
        # - models default to priority 0
        # - higher priority requests are serviced first in the queue
        # - A, B, C and D are placeholder model IDs that are not defined above
        priority:
          A: 10
          B: 5
          C: 5
          D: 1

# peers: a dictionary of remote peers and the models they provide
# - optional, default: empty dictionary
# - peers can be another llama-swap
# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
peers:
  # each key is the peer's ID
  llama-swap-peer:
    # proxy: a valid base URL to proxy requests to
    # - required
    # - the path requested from llama-swap is appended to the end of the proxy value
    proxy: http://192.168.1.23
    # models: a list of models served by the peer
    # - required
    models:
      - model_a
      - model_b
      - embeddings/model_c
  openrouter:
    proxy: https://openrouter.ai/api
    # apiKey: a string key to be injected into the request
    # - optional, default: ""
    # - if blank, no key will be added to the request
    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
    # - can be a string or a macro
    apiKey: ${env.OPENROUTER_API_KEY}
    models:
      - meta-llama/llama-3.1-8b-instruct
      - qwen/qwen3-235b-a22b-2507
      - deepseek/deepseek-v3.2
      - z-ai/glm-4.7
      - moonshotai/kimi-k2-0905
      - minimax/minimax-m2.1
    # timeouts: configure proxy connection timeouts for this peer
    # - optional, defaults shown below
    # - useful when the peer runs on slower hardware
    # - set any value to 0 to disable that timeout (not recommended)
    timeouts:
      connect: 30
      keepalive: 30
      responseHeader: 60
      tlsHandshake: 10
      idleConn: 90

    # filters: a dictionary of filter settings for peer requests
    # - optional, default: empty dictionary
    # - same capabilities as model filters (stripParams, setParams)
    filters:
      # stripParams: a comma-separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for removing parameters that the peer doesn't support
      # - the `model` parameter can never be removed
      stripParams: "temperature, top_p"

      # setParams: a dictionary of parameters to set/override in requests to this peer
      # - optional, default: empty dictionary
      # - useful for injecting provider-specific settings like data retention policies
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      setParams:
        # Example: enforce zero-data-retention for OpenRouter
        provider:
          data_collection: "deny"
          zdr: true