92b90447e8
internal/config,server: implement model capabilities - define the capabilities of a model using a simple config block on the model - v1/models renders out capabilities to be compatible with openrouter, huggingface chat, and mistral formats for broader compatibility - add support for capabilities in UI Fixes #734
626 lines
24 KiB
YAML
626 lines
24 KiB
YAML
# add this modeline for validation in vscode
|
|
# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
|
|
#
|
|
# llama-swap YAML configuration example
|
|
# -------------------------------------
|
|
#
|
|
# 💡 Tip - Use an LLM with this file!
|
|
# ====================================
|
|
# This example configuration is written to be LLM friendly. Try
|
|
# copying this file into an LLM and asking it to explain or generate
|
|
# sections for you.
|
|
# ====================================
|
|
|
|
# Usage notes:
|
|
# - Below are all the available configuration options for llama-swap.
|
|
# - Settings noted as "required" must be in your configuration file
|
|
# - Settings noted as "optional" can be omitted
|
|
|
|
# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
|
|
# - optional, default: 120
|
|
# - minimum value is 15 seconds, anything less will be set to this value
|
|
healthCheckTimeout: 500
|
|
|
|
# logLevel: sets the logging level
|
|
# - optional, default: info
|
|
# - Valid log levels: debug, info, warn, error
|
|
logLevel: info
|
|
|
|
# logTimeFormat: enables and sets the logging timestamp format
|
|
# - optional, default (disabled): ""
|
|
# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
|
|
# "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
|
|
# "stamp", "stampmilli", "stampmicro", and "stampnano".
|
|
# - For more info, read: https://pkg.go.dev/time#pkg-constants
|
|
logTimeFormat: ""
|
|
|
|
# logToStdout: controls what is logged to stdout
|
|
# - optional, default: "proxy"
|
|
# - valid values:
|
|
# - "proxy": logs generated by llama-swap when swapping models,
|
|
# handling requests, etc.
|
|
# - "upstream": a copy of the upstream process's stdout logs
|
|
# - "both": both the proxy and upstream logs interleaved together
|
|
# - "none": no logs are ever written to stdout
|
|
logToStdout: "proxy"
|
|
|
|
# metricsMaxInMemory: maximum number of metrics to keep in memory
|
|
# - optional, default: 1000
|
|
# - controls how many metrics are stored in memory before older ones are discarded
|
|
# - useful for limiting memory usage when processing large volumes of metrics
|
|
metricsMaxInMemory: 1000
|
|
|
|
# captureBuffer: how many MBs to allocate for storing request/response captures
|
|
# - optional, default: 10
|
|
# - set to 0 to disable
|
|
captureBuffer: 15
|
|
|
|
# performance: configuration for system monitoring statistics
|
|
# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
|
|
performance:
|
|
# disabled: boolean
|
|
# - default: false
|
|
disabled: false
|
|
|
|
# every: delay between polling for new performance statistics
|
|
# - default: 5s
|
|
# - minimum duration 5s
|
|
every: 15s
|
|
|
|
# startPort: sets the starting port number for the automatic ${PORT} macro.
|
|
# - optional, default: 5800
|
|
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
|
|
# - it is automatically incremented for every model that uses it
|
|
startPort: 10001
|
|
|
|
# sendLoadingState: inject loading status updates into the reasoning (thinking)
|
|
# field
|
|
# - optional, default: false
|
|
# - when true, a stream of loading messages will be sent to the client in the
|
|
# reasoning field so chat UIs can show that loading is in progress.
|
|
# - see #366 for more details
|
|
sendLoadingState: true
|
|
|
|
# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
|
|
# - optional, default: false
|
|
# - when true, model aliases will be output to the API model listing duplicating
|
|
# all fields except for Id so chat UIs can use the alias equivalent to the original.
|
|
includeAliasesInList: false
|
|
|
|
# globalTTL: the default TTL in seconds before unloading a model
|
|
# - optional, default: 0 (never automatically unload)
|
|
# - must be >= 0
|
|
globalTTL: 0
|
|
|
|
# macros: a dictionary of string substitutions
|
|
# - optional, default: empty dictionary
|
|
# - macros are reusable snippets
|
|
# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
|
|
# - useful for reducing common configuration settings
|
|
# - macro names are strings and must be less than 64 characters
|
|
# - macro names must match the regex ^[a-zA-Z0-9_-]+$
|
|
# - macro names must not be a reserved name: PORT or MODEL_ID
|
|
# - macro values can be numbers, bools, or strings
|
|
# - macros can contain other macros, but they must be defined before they are used
|
|
# - environment variables can be referenced with ${env.VAR_NAME} syntax
|
|
# - env macros are substituted first, before regular macros
|
|
# - if the env var is not set, config loading will fail with an error
|
|
macros:
|
|
# Example of a multi-line macro
|
|
"latest-llama": >
|
|
/path/to/llama-server/llama-server-ec9e0301 --port ${PORT}
|
|
|
|
"default_ctx": 4096
|
|
|
|
# Example of macro-in-macro usage. macros can contain other macros
|
|
# but they must be previously declared.
|
|
"default_args": "--ctx-size ${default_ctx}"
|
|
|
|
# Example of environment variable macros
|
|
# - ${env.VAR_NAME} pulls the value from the system environment
|
|
# - useful for paths, secrets, or machine-specific configuration
|
|
"models_dir": "${env.HOME}/models"
|
|
|
|
# apiKeys: require an API key when making requests to inference endpoints
|
|
# - optional, default: []
|
|
# - when empty (the default) authorization will not be checked as llama-swap is default-allow
|
|
# - each key is a non-empty string
|
|
apiKeys:
|
|
- "sk-hunter2"
|
|
# tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
|
|
- "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
|
|
|
|
# use environment variable macros to keep secrets out of the config
|
|
- "${env.API_KEY_1}"
|
|
- "${env.API_KEY_2}"
|
|
|
|
# models: a dictionary of model configurations
|
|
# - required
|
|
# - each key is the model's ID, used in API requests
|
|
# - model settings have default values that are used if they are not defined here
|
|
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
|
|
# - below are examples of all the settings a model can have
|
|
models:
|
|
# keys are the model names used in API requests
|
|
"gpt-oss-120b":
|
|
# macros: a dictionary of string substitutions specific to this model
|
|
# - optional, default: empty dictionary
|
|
# - macros defined here override macros defined in the global macros section
|
|
# - model level macros follow the same rules as global macros
|
|
macros:
|
|
"default_ctx": 16384
|
|
"temp": 0.7
|
|
|
|
# cmd: the command to run to start the inference server.
|
|
# - required
|
|
# - it is just a string, similar to what you would run on the CLI
|
|
# - using `|` allows for comments in the command, these will be parsed out
|
|
# - macros can be used within cmd
|
|
cmd: |
|
|
# ${latest-llama} is a macro that is defined above
|
|
${latest-llama}
|
|
--model path/to/gpt-oss-120B.gguf
|
|
--ctx-size ${default_ctx}
|
|
--temperature ${temp}
|
|
|
|
# name: a display name for the model
|
|
# - optional, default: empty string
|
|
# - if set, it will be used in the v1/models API response
|
|
# - if not set, it will be omitted in the JSON model record
|
|
name: "gpt-oss 120B"
|
|
|
|
# description: a description for the model
|
|
# - optional, default: empty string
|
|
# - if set, it will be used in the v1/models API response
|
|
# - if not set, it will be omitted in the JSON model record
|
|
description: "A thinking model from OpenAI"
|
|
|
|
# env: define an array of environment variables to inject into cmd's environment
|
|
# - optional, default: empty array
|
|
# - each value is a single string
|
|
# - in the format: ENV_NAME=value
|
|
env:
|
|
- "CUDA_VISIBLE_DEVICES=0,1,2"
|
|
|
|
# proxy: the URL where llama-swap routes API requests
|
|
# - optional, default: http://localhost:${PORT}
|
|
# - if you used ${PORT} in cmd this can be omitted
|
|
# - if you use a custom port in cmd this *must* be set
|
|
proxy: http://127.0.0.1:8999
|
|
|
|
# checkEndpoint: URL path to check if the server is ready
|
|
# - optional, default: /health
|
|
# - endpoint is expected to return an HTTP 200 response
|
|
# - all requests wait until the endpoint is ready or fails
|
|
# - use "none" to skip endpoint health checking
|
|
checkEndpoint: /custom-endpoint
|
|
|
|
# ttl: automatically unload the model after ttl seconds
|
|
# - optional, default: -1 (use global default)
|
|
# - ttl must be -1 or a value greater than or equal to 0
|
|
# - a ttl of -1 will use the global TTL value as the default
|
|
# - a ttl of 0 will mean never unload
|
|
# - a value of 0 disables automatic unloading of the model
|
|
ttl: 60
|
|
|
|
# useModelName: override the model name that is sent to upstream server
|
|
# - optional, default: ""
|
|
# - useful for when the upstream server expects a specific model name that
|
|
# is different from the model's ID
|
|
useModelName: "openai/gpt-oss-120B"
|
|
|
|
# filters: a dictionary of filter settings
|
|
# - optional, default: empty dictionary
|
|
# - same capabilities as peer filters (stripParams, setParams)
|
|
filters:
|
|
# stripParams: a comma separated list of parameters to remove from the request
|
|
# - optional, default: ""
|
|
# - useful for server side enforcement of sampling parameters
|
|
# - the `model` parameter can never be removed
|
|
# - can be any JSON key in the request body
|
|
# - recommended to stick to sampling parameters
|
|
stripParams: "temperature, top_p, top_k"
|
|
|
|
# setParams: a dictionary of parameters to set/override in requests
|
|
# - optional, default: empty dictionary
|
|
# - useful for enforcing specific parameter values
|
|
# - protected params like "model" cannot be overridden
|
|
# - values can be strings, numbers, booleans, arrays, or objects
|
|
# - always runs for the model
|
|
setParams:
|
|
# Example: enforce specific sampling parameters
|
|
temperature: 0.7
|
|
top_p: 0.9
|
|
|
|
# setParamsByID: a dictionary of parameters to set based on the model ID
|
|
# - optional, default: empty dictionary
|
|
# - combine with aliases to create variant behaviour without reloading the model
|
|
# - parameters are set in the request body JSON
|
|
# - run after setParams so it will override any settings
|
|
# - protected params like "model" cannot be overridden
|
|
# - values can be strings, numbers, booleans, arrays, or objects
|
|
# - model aliases will be automatically created for each key
|
|
setParamsByID:
|
|
"${MODEL_ID}":
|
|
chat_template_kwargs:
|
|
reasoning_effort: medium
|
|
"${MODEL_ID}:high":
|
|
chat_template_kwargs:
|
|
reasoning_effort: high
|
|
"${MODEL_ID}:low":
|
|
chat_template_kwargs:
|
|
reasoning_effort: low
|
|
|
|
# aliases: alternative model names that this model configuration is used for
|
|
# - optional, default: empty array
|
|
# - aliases must be unique globally
|
|
# - useful for impersonating a specific model
|
|
aliases:
|
|
- "gpt-4o-mini"
|
|
|
|
# metadata: a dictionary of arbitrary values that are included in /v1/models
|
|
# - optional, default: empty dictionary
|
|
# - while metadata can contain complex types, it is recommended to keep it simple
|
|
# - metadata is only passed through in /v1/models responses
|
|
metadata:
|
|
# port will remain an integer
|
|
port: ${PORT}
|
|
|
|
# the ${temp} macro will remain a float
|
|
temperature: ${temp}
|
|
note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
|
|
context=${default_ctx}"
|
|
|
|
a_list:
|
|
- 1
|
|
- 1.23
|
|
- "macros are OK in list and dictionary types: ${MODEL_ID}"
|
|
|
|
an_obj:
|
|
a: "1"
|
|
b: 2
|
|
# objects can contain complex types with macro substitution
|
|
# becomes: c: [0.7, false, "model: gpt-oss-120b"]
|
|
c: ["${temp}", false, "model: ${MODEL_ID}"]
|
|
|
|
# concurrencyLimit: overrides the allowed number of active parallel requests to a model
|
|
# - optional, default: 0
|
|
# - useful for limiting the number of active parallel requests a model can process
|
|
# - must be set per model
|
|
# - any number greater than 0 will override the internal default value of 10
|
|
# - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
|
|
# - recommended to be omitted and the default used
|
|
concurrencyLimit: 0
|
|
|
|
# sendLoadingState: overrides the global sendLoadingState setting for this model
|
|
# - optional, default: undefined (use global setting)
|
|
sendLoadingState: false
|
|
|
|
# timeouts: configure proxy connection timeouts for this model
|
|
# - optional, defaults shown below
|
|
# - useful for models running on slower hardware that need longer timeouts
|
|
# - connect: TCP dial connection timeout in seconds, default: 30 seconds
|
|
# - keepalive: TCP connection keepalive timeout, default: 30 seconds
|
|
# - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
|
|
# - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
|
|
# - idleConn: idle connection timeout in seconds, default: 90 seconds
|
|
# - set any value to 0 to disable that timeout (not recommended)
|
|
timeouts:
|
|
connect: 30
|
|
keepalive: 0
|
|
responseHeader: 60
|
|
tlsHandshake: 10
|
|
idleConn: 90
|
|
|
|
# capabilities: defines what the model accepts for input, output and other metadata
|
|
# - optional; omitted or all-zero means no capabilities
|
|
# - used in v1/models to inform clients what the model can do
|
|
capabilities:
|
|
# in: list of modalities understood by the model
|
|
# - default: []
|
|
# - valid: text, audio, image
|
|
in:
|
|
- text
|
|
- audio
|
|
- image
|
|
# out: list of modalities generated by the model
|
|
# - default: []
|
|
# - valid: text, audio, image
|
|
out:
|
|
- text
|
|
- audio
|
|
- image
|
|
# tools: the model supports function calling
|
|
# - default: false
|
|
tools: true
|
|
|
|
# reranker: the model supports the /v1/rerank endpoint
|
|
# - default: false
|
|
reranker: false
|
|
|
|
# context: the maximum token context length supported
|
|
# - default: 0
|
|
# - must be an integer > 0
|
|
context: 32000
|
|
|
|
# Unlisted model example:
|
|
"qwen-unlisted":
|
|
# unlisted: boolean, true or false
|
|
# - optional, default: false
|
|
# - unlisted models do not show up in /v1/models api requests
|
|
# - can be requested as normal through all apis
|
|
unlisted: true
|
|
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
|
|
|
# Docker example:
|
|
# container runtimes like Docker and Podman can be used reliably with
|
|
# a combination of cmd, cmdStop, and ${MODEL_ID}
|
|
"docker-llama":
|
|
proxy: "http://127.0.0.1:${PORT}"
|
|
cmd: |
|
|
docker run --name ${MODEL_ID}
|
|
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
|
|
ghcr.io/ggml-org/llama.cpp:server
|
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
|
|
|
# cmdStop: command to run to stop the model gracefully
|
|
# - optional, default: ""
|
|
# - useful for stopping commands managed by another system
|
|
# - the upstream's process id is available in the ${PID} macro
|
|
#
|
|
# When empty, llama-swap has this default behaviour:
|
|
# - on POSIX systems: a SIGTERM signal is sent
|
|
# - on Windows, calls taskkill to stop the process
|
|
# - processes have 5 seconds to shutdown until forceful termination is attempted
|
|
cmdStop: docker stop ${MODEL_ID}
|
|
|
|
# hooks: a dictionary of event triggers and actions
|
|
# - optional, default: empty dictionary
|
|
# - the only supported hook is on_startup
|
|
hooks:
|
|
# on_startup: a dictionary of actions to perform on startup
|
|
# - optional, default: empty dictionary
|
|
# - the only supported action is preload
|
|
on_startup:
|
|
# preload: a list of model ids to load on startup
|
|
# - optional, default: empty list
|
|
# - model names must match keys in the models section
|
|
# - when preloading multiple models at once, define a group
|
|
# otherwise models will be loaded and swapped out
|
|
preload:
|
|
- "llama"
|
|
|
|
# routing:
|
|
# Controls how llama-swap decides which models can run at the same time and
|
|
# which get swapped out. Choose one of two swap engines:
|
|
#
|
|
# - group: the default engine. Simpler to configure. You define groups of
|
|
# models that run together, and loading one group typically unloads
|
|
# the others.
|
|
#
|
|
# - matrix: the newer engine. More involved to configure, but far more
|
|
# flexible. It uses a small expression language to describe which
|
|
# model combinations are allowed to run concurrently, enabling
|
|
# setups that groups cannot express.
|
|
#
|
|
# The routing section is optional.
|
|
routing:
|
|
router:
|
|
# use: a string defining which engine to use
|
|
# - optional, default: "group"
|
|
# - valid values: group, matrix
|
|
use: group
|
|
|
|
# settings: a dictionary of settings for the specific engines
|
|
settings:
|
|
# groups: a dictionary of named groups
|
|
# - optional, default: empty dictionary
|
|
# - lets you keep some models loaded while others swap out
|
|
# - every member must be a model ID defined in the models section
|
|
# - a model can belong to only one group
|
|
# - behaviour is set per group with the `swap`, `exclusive` and
|
|
# `persistent` fields
|
|
# - see issue #109 for details
|
|
#
|
|
# NOTE: the model names below are illustrative and are not defined above.
|
|
groups:
|
|
# group1 reproduces llama-swap's default behaviour: only one model
|
|
# runs at a time across the entire instance.
|
|
"group1":
|
|
# swap: how members of this group swap among themselves
|
|
# - optional, default: true
|
|
# - true: only one member runs at a time
|
|
# - false: all members can run together, no swapping
|
|
swap: true
|
|
|
|
# exclusive: how this group affects other groups
|
|
# - optional, default: true
|
|
# - true: running a member unloads every other group
|
|
# - false: running a member leaves other groups untouched
|
|
exclusive: true
|
|
|
|
# members: the model IDs in this group
|
|
# - required
|
|
members:
|
|
- "llama"
|
|
- "qwen-unlisted"
|
|
|
|
# group2: members all run together, but loading any other group
|
|
# unloads them.
|
|
"group2":
|
|
# swap: false lets all members stay loaded at once
|
|
swap: false
|
|
|
|
# exclusive: false means requesting a member loads it without
|
|
# unloading any other group
|
|
exclusive: false
|
|
members:
|
|
- "docker-llama"
|
|
- "modelA"
|
|
- "modelB"
|
|
|
|
# forever: a persistent group that other groups can never unload.
|
|
"forever":
|
|
# persistent: other groups cannot unload this group's members
|
|
# - optional, default: false
|
|
# - has no effect on swapping within the group
|
|
persistent: true
|
|
|
|
# swap/exclusive: false keeps all members loaded and avoids
|
|
# unloading other groups
|
|
swap: false
|
|
exclusive: false
|
|
members:
|
|
- "forever-modelA"
|
|
- "forever-modelB"
|
|
- "forever-modelc"
|
|
|
|
# The matrix lists the model combinations that are allowed to run
|
|
# concurrently. When a model is requested, the solver makes room for it
|
|
# by evicting as few running models as possible, preferring to keep the
|
|
# costliest ones loaded.
|
|
#
|
|
# Solver behaviour:
|
|
# 1. A request arrives for model X.
|
|
# 2. If X is already running, forward the request. Done.
|
|
# 3. Collect every set that contains X.
|
|
# 4. For each set, add up the evict_costs of the running models that
|
|
# are NOT in that set — that is the set's cost.
|
|
# 5. Choose the lowest-cost set. Break ties by definition order.
|
|
# 6. Evict the models outside that set, start X, forward the request.
|
|
#
|
|
# Subset semantics: a set [a, b, c] also permits any subset of itself.
|
|
# Only the requested model is started; the others are not preloaded.
|
|
#
|
|
# A model that appears in no set can only run on its own.
|
|
#
|
|
matrix:
|
|
# vars: short aliases for model IDs (alphanumeric, 1-8 chars)
|
|
# - required: sets and evict_costs reference these names, not model IDs
|
|
# - map each short name to a real model ID (not a model alias)
|
|
# - keeps the set expressions short and readable
|
|
vars:
|
|
g: gemma-model
|
|
q: qwen-model
|
|
m: mistral-model
|
|
v: voxtral-model
|
|
e: reranker-model
|
|
L: llama-70B
|
|
sd: stable-diffusion
|
|
|
|
# evict_costs: relative cost of losing a running model (default: 1)
|
|
evict_costs:
|
|
v: 50 # vllm backend, slow cold start
|
|
L: 30 # 70B weights, slow to load
|
|
|
|
# sets: named combinations of models that may run together.
|
|
# Each value is an expression built from these operators:
|
|
# & AND (models run together)
|
|
# | OR (alternatives)
|
|
# () grouping
|
|
# +ref inline the expression of another set
|
|
#
|
|
# Each expression expands into one or more concrete sets:
|
|
# "L" → [L]
|
|
# "a & b" → [a, b]
|
|
# "a | b" → [a], [b]
|
|
# "(a | b) & c" → [a, c], [b, c]
|
|
# "(a | b) & (c | d)" → [a,c], [a,d], [b,c], [b,d]
|
|
# "+llms & v" → inline the llms set, then AND with v
|
|
sets:
|
|
# An LLM plus TTS. Switching between g/q/m keeps v loaded.
|
|
# expands to: [g,v], [q,v], [m,v]
|
|
standard: "(g | q | m) & v"
|
|
|
|
# An LLM plus TTS plus reranker.
|
|
# expands to: [g,v,e], [q,v,e]
|
|
with_rerank: "(g | q) & v & e"
|
|
|
|
# An LLM plus image generation, no TTS.
|
|
# expands to: [g,sd], [q,sd]
|
|
creative: "(g | q) & sd"
|
|
|
|
# The 70B model uses every GPU, so it can only run alone.
|
|
# expands to: [L]
|
|
full: "L"
|
|
|
|
# scheduler: how queued requests are ordered.
|
|
# The default and only valid scheduler is "fifo"
|
|
scheduler:
|
|
use: fifo
|
|
settings:
|
|
fifo:
|
|
# priority: a dictionary of model ID -> priority
|
|
# - optional, default: empty dictionary
|
|
# - models default to priority 0
|
|
# - higher priority requests are serviced first in the queue
|
|
priority:
|
|
A: 10
|
|
B: 5
|
|
C: 5
|
|
D: 1
|
|
|
|
# peers: a dictionary of remote peers and models they provide
|
|
# - optional, default empty dictionary
|
|
# - peers can be another llama-swap
|
|
# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
|
|
peers:
|
|
# keys are the peer IDs
|
|
llama-swap-peer:
|
|
# proxy: a valid base URL to proxy requests to
|
|
# - required
|
|
# - requested path to llama-swap will be appended to the end of the proxy value
|
|
proxy: http://192.168.1.23
|
|
# models: a list of models served by the peer
|
|
# - required
|
|
models:
|
|
- model_a
|
|
- model_b
|
|
- embeddings/model_c
|
|
openrouter:
|
|
proxy: https://openrouter.ai/api
|
|
# apiKey: a string key to be injected into the request
|
|
# - optional, default: ""
|
|
# - if blank, no key will be added to the request
|
|
# - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
|
|
# - can be a string or a macro
|
|
apiKey: ${env.OPENROUTER_API_KEY}
|
|
models:
|
|
- meta-llama/llama-3.1-8b-instruct
|
|
- qwen/qwen3-235b-a22b-2507
|
|
- deepseek/deepseek-v3.2
|
|
- z-ai/glm-4.7
|
|
- moonshotai/kimi-k2-0905
|
|
- minimax/minimax-m2.1
|
|
# timeouts: configure proxy connection timeouts for this peer
|
|
# - optional, defaults shown below
|
|
# - useful when the peer runs on slower hardware
|
|
# - set any value to 0 to disable that timeout (not recommended)
|
|
timeouts:
|
|
connect: 30
|
|
keepalive: 30
|
|
responseHeader: 60
|
|
tlsHandshake: 10
|
|
idleConn: 90
|
|
|
|
# filters: a dictionary of filter settings for peer requests
|
|
# - optional, default: empty dictionary
|
|
# - same capabilities as model filters (stripParams, setParams)
|
|
filters:
|
|
# stripParams: a comma separated list of parameters to remove from the request
|
|
# - optional, default: ""
|
|
# - useful for removing parameters that the peer doesn't support
|
|
# - the `model` parameter can never be removed
|
|
stripParams: "temperature, top_p"
|
|
|
|
# setParams: a dictionary of parameters to set/override in requests to this peer
|
|
# - optional, default: empty dictionary
|
|
# - useful for injecting provider-specific settings like data retention policies
|
|
# - protected params like "model" cannot be overridden
|
|
# - values can be strings, numbers, booleans, arrays, or objects
|
|
setParams:
|
|
# Example: enforce zero-data-retention for OpenRouter
|
|
provider:
|
|
data_collection: "deny"
|
|
zdr: true
|