Compare commits

...

11 Commits

Author SHA1 Message Date
Benson Wong d87f0ce2c5 docker/unified: publish rootless image variant (#630) 2026-04-07 03:05:53 -07:00
Leoy 06bc6a614c proxy: preserve wall-clock duration in metrics (#629)
Keep request duration from being underreported when upstream timings
only cover part of the full request lifecycle.

- compare wall-clock and upstream timing durations
- keep token and throughput values from timings
- add regression coverage for underreported timings

fixes #602
2026-04-07 01:52:41 -07:00
Ron M a37b4866d8 proxy: add configurable HTTP timeouts for models and peers (#619)
Add configurable HTTP timeout settings to both models and peers to support installations that require longer timeouts than the current hardcoded defaults.

Closes #618
2026-04-06 19:30:27 +08:00
Benson Wong 981910d734 ci: validate config.example.yaml against config-schema.json (#627)
Extend the existing config-schema workflow to also validate
config.example.yaml against config-schema.json using check-jsonschema.

- add config.example.yaml to PR and push path triggers
- install check-jsonschema via pip
- run validation of config.example.yaml against schema

https://claude.ai/code/session_01Y1oqwE6mwNs9UTJgZRgXtG

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-04-05 15:17:57 +08:00
Benson Wong a185efe37e docker: make CMAKE_CUDA_ARCHITECTURES configurable via build arg (#625)
Expose CMAKE_CUDA_ARCHITECTURES as a Docker build ARG so users can
customize CUDA architectures via --build-arg without editing the
Dockerfile.

- convert hardcoded ENV to ARG with default, feeding into ENV
- replace silent fallback defaults (:-) in scripts with :? guards
  to fail fast if the env var is missing
- add usage example to Dockerfile header

Follow up to: #624

https://claude.ai/code/session_01EWiUe7jNABX7Uz95dUGJqK

Co-authored-by: Claude <noreply@anthropic.com>
2026-04-04 08:49:59 +08:00
Benson Wong 1dd1aadf93 docker/unified: add ik_llama.cpp to CUDA container (#620) 2026-04-03 15:16:30 +08:00
Benson Wong 955900972a add /sdapi to list of supported endpoints 2026-04-01 12:01:38 +08:00
Benson Wong c2c8cfaf81 docker/unified: build llama.cpp with static libraries (#616) 2026-04-01 03:38:07 +08:00
Benson Wong 1e440770ea ci: fix matrix exclude for scheduled docker workflow (#610) 2026-03-29 20:04:28 +09:00
Benson Wong c794273c83 docker/unified,.github: fix unified build (#606) 2026-03-27 10:31:12 +09:00
dependabot[bot] 6574a52cbb build(deps): bump picomatch from 4.0.3 to 4.0.4 in /ui-svelte (#605) 2026-03-26 22:28:24 +09:00
25 changed files with 670 additions and 57 deletions
+15
View File
@@ -4,11 +4,15 @@ on:
pull_request: pull_request:
paths: paths:
- "config-schema.json" - "config-schema.json"
- "config.example.yaml"
- ".github/workflows/config-schema.yml"
push: push:
branches: branches:
- main - main
paths: paths:
- "config-schema.json" - "config-schema.json"
- "config.example.yaml"
- ".github/workflows/config-schema.yml"
workflow_dispatch: workflow_dispatch:
@@ -39,3 +43,14 @@ jobs:
fi fi
echo "✓ config-schema.json is valid" echo "✓ config-schema.json is valid"
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.x"
- name: Install check-jsonschema
run: pip install check-jsonschema
- name: Validate config.example.yaml against schema
run: check-jsonschema --schemafile config-schema.json config.example.yaml
+40 -11
View File
@@ -18,6 +18,10 @@ on:
description: "stable-diffusion.cpp commit hash, tag, or branch" description: "stable-diffusion.cpp commit hash, tag, or branch"
required: false required: false
default: "master" default: "master"
ik_llama_ref:
description: "ik_llama.cpp commit hash, tag, or branch (CUDA only)"
required: false
default: "main"
llama_swap_version: llama_swap_version:
description: "llama-swap version (e.g. v198, latest, main)" description: "llama-swap version (e.g. v198, latest, main)"
required: false required: false
@@ -38,17 +42,39 @@ permissions:
packages: write packages: write
jobs: jobs:
# setup: computes the list of backends to build and publishes it as a JSON
# array in the `matrix` output, which the build job reads with fromJSON().
setup:
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
  steps:
    - id: set-matrix
      run: |
        backends=()
        # schedule uses defaults (build both); workflow_dispatch respects inputs
        if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_cuda }}" == "true" ]]; then
          backends+=("cuda")
        fi
        if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_vulkan }}" == "true" ]]; then
          backends+=("vulkan")
        fi
        # printf with zero arguments still prints one empty line, which the jq
        # pipeline would turn into [""] rather than []. Emit the empty array
        # explicitly so the build job's "!= '[]'" guard can actually skip.
        if (( ${#backends[@]} == 0 )); then
          matrix='[]'
        else
          matrix=$(printf '%s\n' "${backends[@]}" | jq -R . | jq -sc .)
        fi
        echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
build: build:
needs: setup
if: ${{ needs.setup.outputs.matrix != '[]' }}
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
backend: backend: ${{ fromJSON(needs.setup.outputs.matrix) }}
- cuda variant:
- vulkan - name: root
exclude: uid: "0"
- backend: ${{ inputs.build_cuda == false && 'cuda' || 'none' }} suffix: ""
- backend: ${{ inputs.build_vulkan == false && 'vulkan' || 'none' }} - name: rootless
uid: "10001"
suffix: "-rootless"
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -80,13 +106,15 @@ jobs:
username: ${{ github.actor }} username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }} password: ${{ secrets.GITHUB_TOKEN }}
- name: Build unified Docker image (${{ matrix.backend }}) - name: Build unified Docker image (${{ matrix.backend }}, ${{ matrix.variant.name }})
env: env:
LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }} LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
WHISPER_REF: ${{ inputs.whisper_ref || 'master' }} WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
SD_REF: ${{ inputs.sd_ref || 'master' }} SD_REF: ${{ inputs.sd_ref || 'master' }}
IK_LLAMA_REF: ${{ inputs.ik_llama_ref || 'main' }}
LS_VERSION: ${{ inputs.llama_swap_version || 'main' }} LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }} RUN_UID: ${{ matrix.variant.uid }}
DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}${{ matrix.variant.suffix }}
# When running under act, use the local builder that has warm ccache. # When running under act, use the local builder that has warm ccache.
# On GitHub Actions, BUILDX_BUILDER is unset so docker uses the builder # On GitHub Actions, BUILDX_BUILDER is unset so docker uses the builder
# created by setup-buildx-action above. # created by setup-buildx-action above.
@@ -98,7 +126,8 @@ jobs:
- name: Push to GitHub Container Registry - name: Push to GitHub Container Registry
if: ${{ !env.ACT }} if: ${{ !env.ACT }}
run: | run: |
docker push ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }} TAG="ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}${{ matrix.variant.suffix }}"
docker push "${TAG}"
DATE_TAG=$(date -u +%Y-%m-%d) DATE_TAG=$(date -u +%Y-%m-%d)
docker tag ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }} ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}-${DATE_TAG} docker tag "${TAG}" "${TAG}-${DATE_TAG}"
docker push ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}-${DATE_TAG} docker push "${TAG}-${DATE_TAG}"
+1
View File
@@ -21,6 +21,7 @@ llama-swap is a light weight, transparent proxy server that provides automatic m
- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc. - Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written. - Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
- Run `gofmt -l .` before committing to verify formatting. Fix any reported files with `gofmt -w <file>`.
- Use `make test-dev` after running new tests for a quick over all test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory - Use `make test-dev` after running new tests for a quick over all test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
- Use `make test-all` before completing work. This includes long running concurrency tests. - Use `make test-all` before completing work. This includes long running concurrency tests.
+4
View File
@@ -32,6 +32,10 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
- `v1/rerank`, `v1/reranking`, `/rerank` - `v1/rerank`, `v1/reranking`, `/rerank`
- `/infill` - for code infilling - `/infill` - for code infilling
- `/completion` - for completion endpoint - `/completion` - for completion endpoint
- ✅ SDAPI via [stable-diffusion.cpp's server](https://github.com/leejet/stable-diffusion.cpp/tree/master/examples/server)
- `/sdapi/v1/txt2img`
- `/sdapi/v1/img2img`
- `/sdapi/v1/loras` - requires `model` in request body to fetch the correct loras
- ✅ llama-swap API - ✅ llama-swap API
- `/ui` - web UI - `/ui` - web UI
- `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31)) - `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
+71
View File
@@ -39,6 +39,43 @@
}, },
"default": {}, "default": {},
"description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them." "description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
},
"timeouts": {
"type": "object",
"properties": {
"connect": {
"type": "integer",
"minimum": 0,
"default": 30,
"description": "TCP connection timeout in seconds. Set to 0 to disable (not recommended)."
},
"responseHeader": {
"type": "integer",
"minimum": 0,
"default": 60,
"description": "Time to wait for response headers in seconds. Set to 0 to disable (not recommended)."
},
"tlsHandshake": {
"type": "integer",
"minimum": 0,
"default": 10,
"description": "TLS handshake timeout in seconds. Set to 0 to disable (not recommended)."
},
"expectContinue": {
"type": "integer",
"minimum": 0,
"default": 1,
"description": "Expect-Continue timeout in seconds. Set to 0 to disable (not recommended)."
},
"idleConn": {
"type": "integer",
"minimum": 0,
"default": 90,
"description": "Idle connection timeout in seconds. Set to 0 to disable (not recommended)."
}
},
"additionalProperties": false,
"description": "Timeout settings for proxy connections."
} }
}, },
"properties": { "properties": {
@@ -241,6 +278,9 @@
"type": "boolean", "type": "boolean",
"default": false, "default": false,
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests." "description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
},
"timeouts": {
"$ref": "#/definitions/timeouts"
} }
} }
} }
@@ -367,6 +407,37 @@
"additionalProperties": false, "additionalProperties": false,
"default": {}, "default": {},
"description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams." "description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
},
"timeouts": {
"type": "object",
"properties": {
"connect": {
"type": "integer",
"minimum": 1,
"default": 30,
"description": "TCP connection timeout in seconds."
},
"responseHeader": {
"type": "integer",
"minimum": 1,
"default": 60,
"description": "Time to wait for response headers in seconds."
},
"tlsHandshake": {
"type": "integer",
"minimum": 1,
"default": 10,
"description": "TLS handshake timeout in seconds."
},
"idleConn": {
"type": "integer",
"minimum": 1,
"default": 90,
"description": "Idle connection timeout in seconds."
}
},
"additionalProperties": false,
"description": "Timeout settings for proxy connections to this peer."
} }
} }
}, },
+25
View File
@@ -284,6 +284,21 @@ models:
# - optional, default: undefined (use global setting) # - optional, default: undefined (use global setting)
sendLoadingState: false sendLoadingState: false
# timeouts: configure proxy connection timeouts for this model
# - optional, defaults shown below
# - useful for models running on slower hardware that need longer timeouts
# - connect: TCP connection timeout in seconds
# - responseHeader: time to wait for response headers in seconds
# (increasing this helps avoid 502 errors on slow hardware)
# - tlsHandshake: TLS handshake timeout in seconds
# - idleConn: idle connection timeout in seconds
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# Unlisted model example: # Unlisted model example:
"qwen-unlisted": "qwen-unlisted":
# unlisted: boolean, true or false # unlisted: boolean, true or false
@@ -426,6 +441,16 @@ peers:
- z-ai/glm-4.7 - z-ai/glm-4.7
- moonshotai/kimi-k2-0905 - moonshotai/kimi-k2-0905
- minimax/minimax-m2.1 - minimax/minimax-m2.1
# timeouts: configure proxy connection timeouts for this peer
# - optional, defaults shown below
# - useful when the peer runs on slower hardware
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# filters: a dictionary of filter settings for peer requests # filters: a dictionary of filter settings for peer requests
# - optional, default: empty dictionary # - optional, default: empty dictionary
# - same capabilities as model filters (stripParams, setParams) # - same capabilities as model filters (stripParams, setParams)
+49 -14
View File
@@ -4,6 +4,7 @@
# Usage: # Usage:
# docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda . # docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
# docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan . # docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
# docker buildx build --build-arg BACKEND=cuda --build-arg CMAKE_CUDA_ARCHITECTURES="86;89" -t llama-swap:unified-cuda .
# #
# Each project has its own install script that handles cloning, building, # Each project has its own install script that handles cloning, building,
# and installing binaries. Build stages are independent for cache efficiency. # and installing binaries. Build stages are independent for cache efficiency.
@@ -12,10 +13,11 @@ ARG BACKEND=cuda
# ── Builder bases ────────────────────────────────────────────────────── # ── Builder bases ──────────────────────────────────────────────────────
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder-base-cuda FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder-base-cuda
ARG CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
ENV CCACHE_DIR=/ccache ENV CCACHE_DIR=/ccache
ENV CCACHE_MAXSIZE=2G ENV CCACHE_MAXSIZE=2G
ENV PATH="/usr/lib/ccache:${PATH}" ENV PATH="/usr/lib/ccache:${PATH}"
@@ -29,7 +31,7 @@ WORKDIR /build
# ── # ──
FROM ubuntu:26.04 AS builder-base-vulkan FROM ubuntu:24.04 AS builder-base-vulkan
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV CCACHE_DIR=/ccache ENV CCACHE_DIR=/ccache
@@ -78,6 +80,27 @@ RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
--mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \ --mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}" BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
# ── Build ik_llama.cpp (CUDA only) ────────────────────────────────────
#
# Two named stages allow ARG BACKEND to select at build time:
# - ik-llama-cuda : real build (from builder-base-cuda)
# - ik-llama-vulkan: no-op (empty /install/bin, skips CUDA pull entirely)
# BuildKit only evaluates the selected branch, so vulkan builds never
# pull nvidia/cuda:*-devel or compile ik_llama.cpp.
FROM builder-base-vulkan AS ik-llama-vulkan
RUN mkdir -p /install/bin
FROM builder-base-cuda AS ik-llama-cuda
ARG IK_LLAMA_COMMIT_HASH=main
COPY install-ik-llama.sh /build/
RUN --mount=type=cache,id=ccache-cuda,target=/ccache \
--mount=type=cache,id=ik-llama-cuda,target=/src/ik_llama.cpp/build \
bash /build/install-ik-llama.sh "${IK_LLAMA_COMMIT_HASH}"
ARG BACKEND=cuda
FROM ik-llama-${BACKEND} AS ik-llama-build
# ── Download llama-swap release binary ──────────────────────────────── # ── Download llama-swap release binary ────────────────────────────────
FROM builder-base AS llama-swap-download FROM builder-base AS llama-swap-download
@@ -87,14 +110,14 @@ RUN bash /build/install-llama-swap.sh "${LS_VERSION}"
# ── Runtime bases ───────────────────────────────────────────────────── # ── Runtime bases ─────────────────────────────────────────────────────
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime-cuda FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS runtime-cuda
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
ENV PATH="/usr/local/bin:${PATH}" ENV PATH="/usr/local/bin:${PATH}"
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
libgomp1 python3 python3-pip curl ca-certificates git \ libgomp1 python3 curl ca-certificates \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# CUDA stub drivers for container compatibility # CUDA stub drivers for container compatibility
@@ -103,14 +126,14 @@ COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/
# ── # ──
FROM ubuntu:26.04 AS runtime-vulkan FROM ubuntu:24.04 AS runtime-vulkan
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV PATH="/usr/local/bin:${PATH}" ENV PATH="/usr/local/bin:${PATH}"
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
libgomp1 libvulkan1 mesa-vulkan-drivers \ libgomp1 libvulkan1 mesa-vulkan-drivers \
python3 python3-pip curl ca-certificates git \ python3 curl ca-certificates \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# ── Select runtime base by BACKEND ──────────────────────────────────── # ── Select runtime base by BACKEND ────────────────────────────────────
@@ -121,13 +144,21 @@ ARG BACKEND=cuda
ARG LLAMA_COMMIT_HASH=unknown ARG LLAMA_COMMIT_HASH=unknown
ARG WHISPER_COMMIT_HASH=unknown ARG WHISPER_COMMIT_HASH=unknown
ARG SD_COMMIT_HASH=unknown ARG SD_COMMIT_HASH=unknown
ARG IK_LLAMA_COMMIT_HASH=unknown
ARG RUN_UID=0
RUN pip3 install --no-cache-dir --break-system-packages numpy sentencepiece RUN apt-get update && apt-get install -y --no-install-recommends \
python3-numpy python3-sentencepiece \
&& rm -rf /var/lib/apt/lists/*
# Create llama-swap user and config directory # Create non-root user when RUN_UID != 0
RUN useradd --system --create-home --shell /sbin/nologin llama-swap && \ RUN if [ "$RUN_UID" != "0" ]; then \
groupadd --system --gid $RUN_UID llama-swap && \
useradd --system --uid $RUN_UID --gid $RUN_UID \
--home /app --shell /sbin/nologin llama-swap; \
fi && \
mkdir -p /etc/llama-swap/config && \ mkdir -p /etc/llama-swap/config && \
chown -R llama-swap:llama-swap /etc/llama-swap chown -R ${RUN_UID}:${RUN_UID} /etc/llama-swap
WORKDIR /app WORKDIR /app
@@ -141,10 +172,12 @@ COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/ COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
COPY --from=sd-build /install/lib/ /usr/local/lib/ COPY --from=sd-build /install/lib/ /usr/local/lib/
# Copy llama.cpp binaries and libraries # Copy llama.cpp binaries (statically linked)
COPY --from=llama-build /install/bin/llama-server /usr/local/bin/ COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/ COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
COPY --from=llama-build /install/lib/ /usr/local/lib/
# Copy ik-llama-server (CUDA only; empty copy for vulkan)
COPY --from=ik-llama-build /install/bin/ /usr/local/bin/
# Copy llama-swap binary # Copy llama-swap binary
COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/ COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
@@ -158,11 +191,13 @@ COPY config.example.yaml /etc/llama-swap/config/config.yaml
RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
echo "ik_llama.cpp: ${IK_LLAMA_COMMIT_HASH}" >> /versions.txt && \
echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \ echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
echo "backend: ${BACKEND}" >> /versions.txt && \ echo "backend: ${BACKEND}" >> /versions.txt && \
echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
RUN mkdir -p /models && chown ${RUN_UID}:${RUN_UID} /models
WORKDIR /models WORKDIR /models
USER llama-swap USER ${RUN_UID}
ENTRYPOINT ["llama-swap"] ENTRYPOINT ["llama-swap"]
CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"] CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
+37 -2
View File
@@ -11,6 +11,7 @@
# WHISPER_REF=v1.0.0 ./build-image.sh --vulkan # Pin whisper.cpp to a tag # WHISPER_REF=v1.0.0 ./build-image.sh --vulkan # Pin whisper.cpp to a tag
# SD_REF=master ./build-image.sh --cuda # Pin stable-diffusion.cpp to a branch # SD_REF=master ./build-image.sh --cuda # Pin stable-diffusion.cpp to a branch
# LS_VERSION=170 ./build-image.sh --cuda # Override llama-swap version # LS_VERSION=170 ./build-image.sh --cuda # Override llama-swap version
# IK_LLAMA_REF=main ./build-image.sh --cuda # Pin ik_llama.cpp to main branch (CUDA only)
# #
set -euo pipefail set -euo pipefail
@@ -43,6 +44,7 @@ for arg in "$@"; do
echo " LLAMA_REF Pin llama.cpp to a commit, tag, or branch" echo " LLAMA_REF Pin llama.cpp to a commit, tag, or branch"
echo " WHISPER_REF Pin whisper.cpp to a commit, tag, or branch" echo " WHISPER_REF Pin whisper.cpp to a commit, tag, or branch"
echo " SD_REF Pin stable-diffusion.cpp to a commit, tag, or branch" echo " SD_REF Pin stable-diffusion.cpp to a commit, tag, or branch"
echo " IK_LLAMA_REF Pin ik_llama.cpp to a commit, tag, or branch (CUDA only)"
echo " LS_VERSION Override llama-swap version (e.g., '170' or 'latest')" echo " LS_VERSION Override llama-swap version (e.g., '170' or 'latest')"
exit 0 exit 0
;; ;;
@@ -63,6 +65,7 @@ LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git" WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git" SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git" LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
IK_LLAMA_REPO="https://github.com/ikawrakow/ik_llama.cpp.git"
# Resolve a git ref (commit hash, tag, or branch) to a full commit hash. # Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
# Requires only: git, network access to the remote. # Requires only: git, network access to the remote.
@@ -152,6 +155,24 @@ else
echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}" echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}"
fi fi
# Resolve ik_llama.cpp ref (CUDA only)
if [[ "$BACKEND" == "cuda" ]]; then
if [[ -n "${IK_LLAMA_REF:-}" ]]; then
IK_LLAMA_HASH=$(resolve_ref "${IK_LLAMA_REPO}" "${IK_LLAMA_REF}") || exit 1
echo "ik_llama.cpp: ${IK_LLAMA_REF} -> ${IK_LLAMA_HASH}"
else
IK_LLAMA_HASH=$(get_latest_hash "${IK_LLAMA_REPO}")
if [[ -z "${IK_LLAMA_HASH}" ]]; then
echo "ERROR: Could not determine latest commit for ik_llama.cpp" >&2
exit 1
fi
echo "ik_llama.cpp: latest HEAD: ${IK_LLAMA_HASH}"
fi
else
IK_LLAMA_HASH="n/a"
echo "ik_llama.cpp: skipped (vulkan build)"
fi
# Resolve llama-swap ref # Resolve llama-swap ref
if [[ -n "${LS_VERSION:-}" ]]; then if [[ -n "${LS_VERSION:-}" ]]; then
LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1 LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
@@ -178,7 +199,9 @@ BUILD_ARGS=(
--build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}" --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
--build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}" --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
--build-arg "SD_COMMIT_HASH=${SD_HASH}" --build-arg "SD_COMMIT_HASH=${SD_HASH}"
--build-arg "IK_LLAMA_COMMIT_HASH=${IK_LLAMA_HASH}"
--build-arg "LS_VERSION=${LS_HASH}" --build-arg "LS_VERSION=${LS_HASH}"
--build-arg "RUN_UID=${RUN_UID:-0}"
-t "${DOCKER_IMAGE_TAG}" -t "${DOCKER_IMAGE_TAG}"
-f "${SCRIPT_DIR}/Dockerfile" -f "${SCRIPT_DIR}/Dockerfile"
) )
@@ -203,8 +226,13 @@ echo "Verifying build artifacts..."
echo "==========================================" echo "=========================================="
echo "" echo ""
EXPECTED_BINARIES=(llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap)
if [[ "$BACKEND" == "cuda" ]]; then
EXPECTED_BINARIES+=(ik-llama-server)
fi
MISSING_BINARIES=() MISSING_BINARIES=()
for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do for binary in "${EXPECTED_BINARIES[@]}"; do
if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then
MISSING_BINARIES+=("${binary}") MISSING_BINARIES+=("${binary}")
fi fi
@@ -221,7 +249,11 @@ if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
exit 1 exit 1
fi fi
echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap" VERIFIED_LIST="llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
if [[ "$BACKEND" == "cuda" ]]; then
VERIFIED_LIST="${VERIFIED_LIST}, ik-llama-server"
fi
echo "All expected binaries verified: ${VERIFIED_LIST}"
echo "" echo ""
echo "==========================================" echo "=========================================="
@@ -234,6 +266,9 @@ echo "Built with:"
echo " llama.cpp: ${LLAMA_HASH}" echo " llama.cpp: ${LLAMA_HASH}"
echo " whisper.cpp: ${WHISPER_HASH}" echo " whisper.cpp: ${WHISPER_HASH}"
echo " stable-diffusion.cpp: ${SD_HASH}" echo " stable-diffusion.cpp: ${SD_HASH}"
if [[ "$BACKEND" == "cuda" ]]; then
echo " ik_llama.cpp: ${IK_LLAMA_HASH}"
fi
echo " llama-swap: $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)" echo " llama-swap: $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
echo "" echo ""
if [[ "$BACKEND" == "vulkan" ]]; then if [[ "$BACKEND" == "vulkan" ]]; then
+48
View File
@@ -0,0 +1,48 @@
#!/bin/bash
# Fetch, compile, and install the ik_llama.cpp server binary.
# Usage: ./install-ik-llama.sh <commit_hash>   (falls back to "main" when omitted)
# Note: CUDA only; always built against builder-base-cuda
set -e

COMMIT_HASH="${1:-main}"

mkdir -p /install/bin

# Check out the requested ref via init + fetch instead of clone, so a
# cache-mounted build dir inside the source tree doesn't break the checkout.
echo "=== Cloning ik_llama.cpp at ${COMMIT_HASH} ==="
mkdir -p /src/ik_llama.cpp
cd /src/ik_llama.cpp
if ! [ -d .git ]; then
    git init
    git remote add origin https://github.com/ikawrakow/ik_llama.cpp.git
fi
git fetch --depth=1 origin "${COMMIT_HASH}"
git checkout FETCH_HEAD

# Generic build options: portable (non-native) static release build using ccache.
CMAKE_FLAGS=(
    -DGGML_NATIVE=OFF
    -DBUILD_SHARED_LIBS=OFF
    -DCMAKE_BUILD_TYPE=Release
    -DCMAKE_C_COMPILER_LAUNCHER=ccache
    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
)

# CUDA options. The :? expansion aborts the script with the given message
# when CMAKE_CUDA_ARCHITECTURES is unset or empty.
CMAKE_FLAGS+=(
    -DGGML_CUDA=ON
    "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
    "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
    "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda -Wl,--allow-shlib-undefined"
)

# Discard any previous CMake configuration state; other build dir contents are left alone.
rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true

echo "=== Building ik_llama.cpp ==="
cmake -B build "${CMAKE_FLAGS[@]}"
cmake --build build --config Release -j"$(nproc)" --target llama-server

# Stop with an explicit message if the expected binary was not produced.
[ -f "build/bin/llama-server" ] || {
    echo "FATAL: llama-server not found in build/bin/" >&2
    exit 1
}

# Install as ik-llama-server to avoid collision with llama.cpp's llama-server
cp "build/bin/llama-server" "/install/bin/ik-llama-server"

echo "=== ik_llama.cpp build complete ==="
ls -la /install/bin/
+3 -5
View File
@@ -6,7 +6,7 @@ set -e
COMMIT_HASH="${1:-master}" COMMIT_HASH="${1:-master}"
BACKEND="${BACKEND:-cuda}" BACKEND="${BACKEND:-cuda}"
mkdir -p /install/bin /install/lib mkdir -p /install/bin
# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone) # Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
echo "=== Cloning llama.cpp at ${COMMIT_HASH} ===" echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
@@ -22,6 +22,7 @@ git checkout FETCH_HEAD
# Common cmake flags # Common cmake flags
CMAKE_FLAGS=( CMAKE_FLAGS=(
-DGGML_NATIVE=OFF -DGGML_NATIVE=OFF
-DBUILD_SHARED_LIBS=OFF
-DCMAKE_BUILD_TYPE=Release -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
@@ -32,10 +33,9 @@ if [ "$BACKEND" = "cuda" ]; then
CMAKE_FLAGS+=( CMAKE_FLAGS+=(
-DGGML_CUDA=ON -DGGML_CUDA=ON
-DGGML_VULKAN=OFF -DGGML_VULKAN=OFF
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}" "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler" "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
) )
elif [ "$BACKEND" = "vulkan" ]; then elif [ "$BACKEND" = "vulkan" ]; then
CMAKE_FLAGS+=( CMAKE_FLAGS+=(
@@ -59,7 +59,5 @@ for bin in "${TARGETS[@]}"; do
fi fi
cp "build/bin/$bin" "/install/bin/" cp "build/bin/$bin" "/install/bin/"
done done
find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
echo "=== llama.cpp build complete ===" echo "=== llama.cpp build complete ==="
ls -la /install/bin/ ls -la /install/bin/
+1 -1
View File
@@ -33,7 +33,7 @@ if [ "$BACKEND" = "cuda" ]; then
CMAKE_FLAGS+=( CMAKE_FLAGS+=(
-DGGML_CUDA=ON -DGGML_CUDA=ON
-DGGML_VULKAN=OFF -DGGML_VULKAN=OFF
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}" "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler" "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+1 -1
View File
@@ -31,7 +31,7 @@ if [ "$BACKEND" = "cuda" ]; then
CMAKE_FLAGS+=( CMAKE_FLAGS+=(
-DGGML_CUDA=ON -DGGML_CUDA=ON
-DGGML_VULKAN=OFF -DGGML_VULKAN=OFF
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}" "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler" "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+34
View File
@@ -319,6 +319,29 @@ models:
# - recommended to be omitted and the default used # - recommended to be omitted and the default used
concurrencyLimit: 0 concurrencyLimit: 0
# timeouts: configure proxy connection timeouts for this model
# - optional, defaults shown below
# - useful for models on slower hardware that need longer timeouts
# - increase responseHeader to avoid "timeout awaiting response headers" errors
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
# connect: TCP connection timeout in seconds
# - default: 30
connect: 30
# responseHeader: time to wait for response headers in seconds
# - default: 60
# - for slow image generation or large models, consider increasing to 300+ seconds
responseHeader: 60
# tlsHandshake: TLS handshake timeout in seconds
# - default: 10
tlsHandshake: 10
# idleConn: idle connection timeout in seconds
# - default: 90
idleConn: 90
# sendLoadingState: overrides the global sendLoadingState setting for this model # sendLoadingState: overrides the global sendLoadingState setting for this model
# - optional, default: undefined (use global setting) # - optional, default: undefined (use global setting)
sendLoadingState: false sendLoadingState: false
@@ -444,6 +467,17 @@ peers:
# - required # - required
# - requested path to llama-swap will be appended to the end of the proxy value # - requested path to llama-swap will be appended to the end of the proxy value
proxy: http://192.168.1.23 proxy: http://192.168.1.23
# timeouts: configure proxy connection timeouts for this peer
# - optional, defaults shown below
# - useful when the peer runs on slower hardware
# - set any value to 0 to disable that timeout (not recommended)
timeouts:
connect: 30
responseHeader: 60
tlsHandshake: 10
idleConn: 90
# models: a list of models served by the peer # models: a list of models served by the peer
# - required # - required
models: models:
+28
View File
@@ -187,6 +187,13 @@ groups:
Name: "Model 1", Name: "Model 1",
Description: "This is model 1", Description: "This is model 1",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model2": { "model2": {
Cmd: "path/to/server --arg1 one", Cmd: "path/to/server --arg1 one",
@@ -195,6 +202,13 @@ groups:
Env: []string{}, Env: []string{},
CheckEndpoint: "/", CheckEndpoint: "/",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model3": { "model3": {
Cmd: "path/to/cmd --arg1 one", Cmd: "path/to/cmd --arg1 one",
@@ -203,6 +217,13 @@ groups:
Env: []string{}, Env: []string{},
CheckEndpoint: "/", CheckEndpoint: "/",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model4": { "model4": {
Cmd: "path/to/cmd --arg1 one", Cmd: "path/to/cmd --arg1 one",
@@ -211,6 +232,13 @@ groups:
Aliases: []string{}, Aliases: []string{},
Env: []string{}, Env: []string{},
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
}, },
HealthCheckTimeout: 15, HealthCheckTimeout: 15,
+106
View File
@@ -6,6 +6,7 @@ import (
"testing" "testing"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
) )
func TestConfig_GroupMemberIsUnique(t *testing.T) { func TestConfig_GroupMemberIsUnique(t *testing.T) {
@@ -1438,3 +1439,108 @@ models:
}) })
} }
// TestConfig_TimeoutsParsing checks that the connect and responseHeader values
// given under a model's timeouts key are loaded into ModelConfig.Timeouts.
func TestConfig_TimeoutsParsing(t *testing.T) {
// Only connect and responseHeader are overridden in this config.
configYaml := `
models:
model1:
cmd: test-server --port ${PORT}
timeouts:
connect: 45
responseHeader: 120
`
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
require.NoError(t, err)
// The lookup is a precondition for the field assertions below.
modelConfig, found := config.Models["model1"]
require.True(t, found, "model1 should exist in config")
// The parsed integers must match the YAML values exactly.
assert.Equal(t, 45, modelConfig.Timeouts.Connect)
assert.Equal(t, 120, modelConfig.Timeouts.ResponseHeader)
}
// TestConfig_TimeoutsDefaults checks that a model declared without a timeouts
// key ends up with the default value for every TimeoutsConfig field.
func TestConfig_TimeoutsDefaults(t *testing.T) {
// No timeouts key is present, so every field must come from the defaults.
configYaml := `
models:
model1:
cmd: test-server --port ${PORT}
`
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
require.NoError(t, err)
// The lookup is a precondition for the field assertions below.
modelConfig, found := config.Models["model1"]
require.True(t, found, "model1 should exist in config")
// Default values should be set during unmarshaling
assert.Equal(t, 30, modelConfig.Timeouts.Connect)
assert.Equal(t, 60, modelConfig.Timeouts.ResponseHeader)
assert.Equal(t, 10, modelConfig.Timeouts.TLSHandshake)
assert.Equal(t, 1, modelConfig.Timeouts.ExpectContinue)
assert.Equal(t, 90, modelConfig.Timeouts.IdleConn)
}
// TestConfig_TimeoutsZeroAllowed checks that an explicit 0 in the config is
// kept as 0 and is not replaced by the non-zero default for that field.
func TestConfig_TimeoutsZeroAllowed(t *testing.T) {
// Both overridden fields have non-zero defaults (30 and 60), so a 0 result
// can only come from the explicit values below.
configYaml := `
models:
model1:
cmd: test-server --port ${PORT}
timeouts:
connect: 0
responseHeader: 0
`
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
require.NoError(t, err)
// The lookup is a precondition for the field assertions below.
modelConfig, found := config.Models["model1"]
require.True(t, found, "model1 should exist in config")
// Explicit 0 should be preserved (disables timeout)
assert.Equal(t, 0, modelConfig.Timeouts.Connect)
assert.Equal(t, 0, modelConfig.Timeouts.ResponseHeader)
}
// TestConfig_PeerTimeoutsParsing checks that the connect and responseHeader
// values given under a peer's timeouts key are loaded into PeerConfig.Timeouts.
func TestConfig_PeerTimeoutsParsing(t *testing.T) {
// Only connect and responseHeader are overridden for this peer.
configYaml := `
peers:
peer1:
proxy: http://example.com
models: [model1]
timeouts:
connect: 45
responseHeader: 120
`
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
require.NoError(t, err)
// The lookup is a precondition for the field assertions below.
peerConfig, found := config.Peers["peer1"]
require.True(t, found, "peer1 should exist in config")
// The parsed integers must match the YAML values exactly.
assert.Equal(t, 45, peerConfig.Timeouts.Connect)
assert.Equal(t, 120, peerConfig.Timeouts.ResponseHeader)
}
// TestConfig_PeerTimeoutsDefaults checks that a peer declared without a
// timeouts key ends up with the default value for every TimeoutsConfig field.
func TestConfig_PeerTimeoutsDefaults(t *testing.T) {
// No timeouts key is present, so every field must come from the defaults.
configYaml := `
peers:
peer1:
proxy: http://example.com
models: [model1]
`
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
require.NoError(t, err)
// The lookup is a precondition for the field assertions below.
peerConfig, found := config.Peers["peer1"]
require.True(t, found, "peer1 should exist in config")
// Default values should be set during unmarshaling
assert.Equal(t, 30, peerConfig.Timeouts.Connect)
assert.Equal(t, 60, peerConfig.Timeouts.ResponseHeader)
assert.Equal(t, 10, peerConfig.Timeouts.TLSHandshake)
assert.Equal(t, 1, peerConfig.Timeouts.ExpectContinue)
assert.Equal(t, 90, peerConfig.Timeouts.IdleConn)
}
+28
View File
@@ -173,6 +173,13 @@ groups:
Env: []string{"VAR1=value1", "VAR2=value2"}, Env: []string{"VAR1=value1", "VAR2=value2"},
CheckEndpoint: "/health", CheckEndpoint: "/health",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model2": { "model2": {
Cmd: "path/to/server --arg1 one", Cmd: "path/to/server --arg1 one",
@@ -182,6 +189,13 @@ groups:
Env: []string{}, Env: []string{},
CheckEndpoint: "/", CheckEndpoint: "/",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model3": { "model3": {
Cmd: "path/to/cmd --arg1 one", Cmd: "path/to/cmd --arg1 one",
@@ -191,6 +205,13 @@ groups:
Env: []string{}, Env: []string{},
CheckEndpoint: "/", CheckEndpoint: "/",
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
"model4": { "model4": {
Cmd: "path/to/cmd --arg1 one", Cmd: "path/to/cmd --arg1 one",
@@ -200,6 +221,13 @@ groups:
Aliases: []string{}, Aliases: []string{},
Env: []string{}, Env: []string{},
SendLoadingState: &modelLoadingState, SendLoadingState: &modelLoadingState,
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
}, },
}, },
HealthCheckTimeout: 15, HealthCheckTimeout: 15,
+19
View File
@@ -9,6 +9,15 @@ const (
MODEL_CONFIG_DEFAULT_TTL = -1 MODEL_CONFIG_DEFAULT_TTL = -1
) )
// TimeoutsConfig holds timeout settings for proxy connections.
//
// Every field is a whole number of seconds. Both ModelConfig and PeerConfig
// embed it under the "timeouts" YAML key, and their UnmarshalYAML methods
// pre-populate it with the defaults 30/60/10/1/90 (in field order) before
// decoding, so keys absent from the config keep those defaults.
//
// The values are converted to time.Duration when the upstream
// http.Transport is built:
//   - Connect        -> net.Dialer.Timeout
//   - ResponseHeader -> http.Transport.ResponseHeaderTimeout
//   - TLSHandshake   -> http.Transport.TLSHandshakeTimeout
//   - ExpectContinue -> http.Transport.ExpectContinueTimeout
//   - IdleConn       -> http.Transport.IdleConnTimeout
type TimeoutsConfig struct {
Connect int `yaml:"connect"` // seconds, 0 = no timeout (not recommended)
ResponseHeader int `yaml:"responseHeader"` // seconds, 0 = no timeout (not recommended)
TLSHandshake int `yaml:"tlsHandshake"` // seconds, 0 = no timeout (not recommended)
ExpectContinue int `yaml:"expectContinue"` // seconds, 0 = no timeout (not recommended)
IdleConn int `yaml:"idleConn"` // seconds, 0 = no timeout (not recommended)
}
type ModelConfig struct { type ModelConfig struct {
Cmd string `yaml:"cmd"` Cmd string `yaml:"cmd"`
CmdStop string `yaml:"cmdStop"` CmdStop string `yaml:"cmdStop"`
@@ -40,6 +49,9 @@ type ModelConfig struct {
// override global setting // override global setting
SendLoadingState *bool `yaml:"sendLoadingState"` SendLoadingState *bool `yaml:"sendLoadingState"`
// Timeout settings for proxy connections
Timeouts TimeoutsConfig `yaml:"timeouts"`
} }
func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -57,6 +69,13 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
ConcurrencyLimit: 0, ConcurrencyLimit: 0,
Name: "", Name: "",
Description: "", Description: "",
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
} }
// the default cmdStop to taskkill /f /t /pid ${PID} // the default cmdStop to taskkill /f /t /pid ${PID}
+10
View File
@@ -12,6 +12,9 @@ type PeerConfig struct {
ApiKey string `yaml:"apiKey"` ApiKey string `yaml:"apiKey"`
Models []string `yaml:"models"` Models []string `yaml:"models"`
Filters Filters `yaml:"filters"` Filters Filters `yaml:"filters"`
// Timeout settings for proxy connections
Timeouts TimeoutsConfig `yaml:"timeouts"`
} }
func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -21,6 +24,13 @@ func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
ApiKey: "", ApiKey: "",
Models: []string{}, Models: []string{},
Filters: Filters{}, Filters: Filters{},
Timeouts: TimeoutsConfig{
Connect: 30,
ResponseHeader: 60,
TLSHandshake: 10,
ExpectContinue: 1,
IdleConn: 90,
},
} }
if err := unmarshal(&defaults); err != nil { if err := unmarshal(&defaults); err != nil {
+7 -2
View File
@@ -365,6 +365,8 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok
} }
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) { func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
wallDurationMs := int(time.Since(start).Milliseconds())
// default values // default values
cachedTokens := -1 // unknown or missing data cachedTokens := -1 // unknown or missing data
outputTokens := 0 outputTokens := 0
@@ -373,7 +375,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
// timings data // timings data
tokensPerSecond := -1.0 tokensPerSecond := -1.0
promptPerSecond := -1.0 promptPerSecond := -1.0
durationMs := int(time.Since(start).Milliseconds()) durationMs := wallDurationMs
if usage.Exists() { if usage.Exists() {
if pt := usage.Get("prompt_tokens"); pt.Exists() { if pt := usage.Get("prompt_tokens"); pt.Exists() {
@@ -402,7 +404,10 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
outputTokens = int(timings.Get("predicted_n").Int()) outputTokens = int(timings.Get("predicted_n").Int())
promptPerSecond = timings.Get("prompt_per_second").Float() promptPerSecond = timings.Get("prompt_per_second").Float()
tokensPerSecond = timings.Get("predicted_per_second").Float() tokensPerSecond = timings.Get("predicted_per_second").Float()
durationMs = int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float()) timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
if timingsDurationMs > durationMs {
durationMs = timingsDurationMs
}
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() { if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
cachedTokens = int(cachedValue.Int()) cachedTokens = int(cachedValue.Int())
+22
View File
@@ -14,6 +14,7 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/event" "github.com/mostlygeek/llama-swap/event"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/tidwall/gjson"
) )
func TestMetricsMonitor_AddMetrics(t *testing.T) { func TestMetricsMonitor_AddMetrics(t *testing.T) {
@@ -570,6 +571,27 @@ func TestMetricsMonitor_Concurrent(t *testing.T) {
} }
func TestMetricsMonitor_ParseMetrics(t *testing.T) { func TestMetricsMonitor_ParseMetrics(t *testing.T) {
t.Run("keeps wall clock duration when timings underreport request time", func(t *testing.T) {
start := time.Now().Add(-5 * time.Second)
usage := gjson.Parse(`{"prompt_tokens": 5, "completion_tokens": 1}`)
timings := gjson.Parse(`{
"prompt_n": 5,
"predicted_n": 1,
"prompt_per_second": 10.0,
"predicted_per_second": 2.0,
"prompt_ms": 5.0,
"predicted_ms": 15.0
}`)
metrics, err := parseMetrics("test-model", start, usage, timings)
assert.NoError(t, err)
assert.Equal(t, 5, metrics.InputTokens)
assert.Equal(t, 1, metrics.OutputTokens)
assert.Equal(t, 10.0, metrics.PromptPerSecond)
assert.Equal(t, 2.0, metrics.TokensPerSecond)
assert.GreaterOrEqual(t, metrics.DurationMs, 5000)
})
t.Run("prefers timings over usage data", func(t *testing.T) { t.Run("prefers timings over usage data", func(t *testing.T) {
mm := newMetricsMonitor(testLogger, 10, 0) mm := newMetricsMonitor(testLogger, 10, 0)
+17 -15
View File
@@ -34,23 +34,25 @@ func NewPeerProxy(peers config.PeerDictionaryConfig, proxyLogger *LogMonitor) (*
} }
sort.Strings(peerIDs) sort.Strings(peerIDs)
// Create a shared transport with reasonable timeouts for peer connections
// these can be tuned with feedback later
peerTransport := &http.Transport{
DialContext: (&net.Dialer{
Timeout: 30 * time.Second, // Connection timeout
KeepAlive: 30 * time.Second,
}).DialContext,
TLSHandshakeTimeout: 10 * time.Second,
ResponseHeaderTimeout: 60 * time.Second, // Time to wait for response headers
ExpectContinueTimeout: 1 * time.Second,
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
}
for _, peerID := range peerIDs { for _, peerID := range peerIDs {
peer := peers[peerID] peer := peers[peerID]
// Create a transport with per-peer timeout configuration
peerTransport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: time.Duration(peer.Timeouts.Connect) * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
TLSHandshakeTimeout: time.Duration(peer.Timeouts.TLSHandshake) * time.Second,
ResponseHeaderTimeout: time.Duration(peer.Timeouts.ResponseHeader) * time.Second,
ExpectContinueTimeout: time.Duration(peer.Timeouts.ExpectContinue) * time.Second,
ForceAttemptHTTP2: true,
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: time.Duration(peer.Timeouts.IdleConn) * time.Second,
}
// Create reverse proxy for this peer // Create reverse proxy for this peer
reverseProxy := httputil.NewSingleHostReverseProxy(peer.ProxyURL) reverseProxy := httputil.NewSingleHostReverseProxy(peer.ProxyURL)
reverseProxy.Transport = peerTransport reverseProxy.Transport = peerTransport
+43
View File
@@ -6,6 +6,7 @@ import (
"net/url" "net/url"
"strings" "strings"
"testing" "testing"
"time"
"github.com/mostlygeek/llama-swap/proxy/config" "github.com/mostlygeek/llama-swap/proxy/config"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
@@ -266,3 +267,45 @@ func TestProxyRequest_SSEHeaderModification(t *testing.T) {
// The X-Accel-Buffering header should be set to "no" for SSE // The X-Accel-Buffering header should be set to "no" for SSE
assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering")) assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering"))
} }
func TestNewPeerProxy_CustomTimeouts(t *testing.T) {
proxyURL, _ := url.Parse("http://localhost:8080")
peers := config.PeerDictionaryConfig{
"test-peer": config.PeerConfig{
Proxy: "http://localhost:8080",
ProxyURL: proxyURL,
Models: []string{"model1"},
Timeouts: config.TimeoutsConfig{
Connect: 45,
ResponseHeader: 300,
TLSHandshake: 15,
ExpectContinue: 2,
IdleConn: 120,
},
},
}
peerProxy, err := NewPeerProxy(peers, testLogger)
assert.NoError(t, err)
assert.NotNil(t, peerProxy)
assert.True(t, peerProxy.HasPeerModel("model1"))
// Verify the timeout values are actually applied to the transport
member, found := peerProxy.proxyMap["model1"]
require.True(t, found, "model1 should exist in proxyMap")
assert.NotNil(t, member.reverseProxy)
assert.NotNil(t, member.reverseProxy.Transport)
transport, ok := member.reverseProxy.Transport.(*http.Transport)
require.True(t, ok, "Transport should be *http.Transport")
// Verify all timeout values are correctly applied
assert.Equal(t, 300*time.Second, transport.ResponseHeaderTimeout)
assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
// ForceAttemptHTTP2 should be enabled
assert.True(t, transport.ForceAttemptHTTP2)
}
+18
View File
@@ -96,6 +96,24 @@ func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, pr
var reverseProxy *httputil.ReverseProxy var reverseProxy *httputil.ReverseProxy
if proxyURL != nil { if proxyURL != nil {
reverseProxy = httputil.NewSingleHostReverseProxy(proxyURL) reverseProxy = httputil.NewSingleHostReverseProxy(proxyURL)
// Create custom transport with configured timeouts
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: time.Duration(config.Timeouts.Connect) * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
TLSHandshakeTimeout: time.Duration(config.Timeouts.TLSHandshake) * time.Second,
ResponseHeaderTimeout: time.Duration(config.Timeouts.ResponseHeader) * time.Second,
ExpectContinueTimeout: time.Duration(config.Timeouts.ExpectContinue) * time.Second,
ForceAttemptHTTP2: true,
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: time.Duration(config.Timeouts.IdleConn) * time.Second,
}
reverseProxy.Transport = transport
reverseProxy.ModifyResponse = func(resp *http.Response) error { reverseProxy.ModifyResponse = func(resp *http.Response) error {
// prevent nginx from buffering streaming responses (e.g., SSE) // prevent nginx from buffering streaming responses (e.g., SSE)
if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") { if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
+37
View File
@@ -2,6 +2,7 @@ package proxy
import ( import (
"fmt" "fmt"
"io"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
@@ -569,3 +570,39 @@ func (w *panicOnWriteResponseWriter) Write(b []byte) (int, error) {
} }
return w.ResponseRecorder.Write(b) return w.ResponseRecorder.Write(b)
} }
// TestProcess_CustomTimeouts verifies that the timeouts in a model's
// config.TimeoutsConfig are copied onto the http.Transport that NewProcess
// installs on the process's reverse proxy.
//
// Preconditions are checked with t.Fatal so that a nil process, a nil reverse
// proxy or an unexpected transport type fails the test cleanly instead of
// causing a nil-pointer panic in the assertions that follow.
func TestProcess_CustomTimeouts(t *testing.T) {
	modelConfig := config.ModelConfig{
		Cmd:           "echo test",
		Proxy:         "http://localhost:8080",
		CheckEndpoint: "/health",
		Timeouts: config.TimeoutsConfig{
			Connect:        45,
			ResponseHeader: 120,
			TLSHandshake:   15,
			ExpectContinue: 2,
			IdleConn:       120,
		},
	}

	debugLogger := NewLogMonitorWriter(io.Discard)
	process := NewProcess("test-model", 30, modelConfig, debugLogger, debugLogger)

	// Verify the process was created successfully
	if process == nil {
		t.Fatal("NewProcess returned nil")
	}
	assert.Equal(t, "test-model", process.ID)
	if process.reverseProxy == nil {
		t.Fatal("process.reverseProxy should not be nil")
	}

	// Verify it's using http.Transport (not some other type). A nil Transport
	// also fails this type assertion, so it covers the not-nil check too.
	transport, ok := process.reverseProxy.Transport.(*http.Transport)
	if !ok || transport == nil {
		t.Fatalf("Transport should be *http.Transport, got %T", process.reverseProxy.Transport)
	}

	// Verify the timeouts that are exposed as Transport fields. Connect is
	// passed to the net.Dialer behind DialContext and cannot be read back
	// from the Transport, so it is not asserted here.
	assert.Equal(t, 120*time.Second, transport.ResponseHeaderTimeout)
	assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
	assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
	assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
	assert.True(t, transport.ForceAttemptHTTP2)
}
+3 -3
View File
@@ -2781,9 +2781,9 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/picomatch": { "node_modules/picomatch": {
"version": "4.0.3", "version": "4.0.4",
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz",
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"engines": { "engines": {