Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a9d840ffd7 | |||
| 7b2b82777f | |||
| d87f0ce2c5 | |||
| 06bc6a614c | |||
| a37b4866d8 | |||
| 981910d734 | |||
| a185efe37e | |||
| 1dd1aadf93 | |||
| 955900972a | |||
| c2c8cfaf81 | |||
| 1e440770ea | |||
| c794273c83 | |||
| 6574a52cbb |
@@ -4,11 +4,15 @@ on:
|
|||||||
pull_request:
|
pull_request:
|
||||||
paths:
|
paths:
|
||||||
- "config-schema.json"
|
- "config-schema.json"
|
||||||
|
- "config.example.yaml"
|
||||||
|
- ".github/workflows/config-schema.yml"
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
paths:
|
paths:
|
||||||
- "config-schema.json"
|
- "config-schema.json"
|
||||||
|
- "config.example.yaml"
|
||||||
|
- ".github/workflows/config-schema.yml"
|
||||||
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
@@ -39,3 +43,14 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
echo "✓ config-schema.json is valid"
|
echo "✓ config-schema.json is valid"
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.x"
|
||||||
|
|
||||||
|
- name: Install check-jsonschema
|
||||||
|
run: pip install check-jsonschema
|
||||||
|
|
||||||
|
- name: Validate config.example.yaml against schema
|
||||||
|
run: check-jsonschema --schemafile config-schema.json config.example.yaml
|
||||||
|
|||||||
@@ -18,6 +18,10 @@ on:
|
|||||||
description: "stable-diffusion.cpp commit hash, tag, or branch"
|
description: "stable-diffusion.cpp commit hash, tag, or branch"
|
||||||
required: false
|
required: false
|
||||||
default: "master"
|
default: "master"
|
||||||
|
ik_llama_ref:
|
||||||
|
description: "ik_llama.cpp commit hash, tag, or branch (CUDA only)"
|
||||||
|
required: false
|
||||||
|
default: "main"
|
||||||
llama_swap_version:
|
llama_swap_version:
|
||||||
description: "llama-swap version (e.g. v198, latest, main)"
|
description: "llama-swap version (e.g. v198, latest, main)"
|
||||||
required: false
|
required: false
|
||||||
@@ -38,17 +42,32 @@ permissions:
|
|||||||
packages: write
|
packages: write
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
setup:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||||
|
steps:
|
||||||
|
- id: set-matrix
|
||||||
|
run: |
|
||||||
|
backends=()
|
||||||
|
# schedule uses defaults (build both); workflow_dispatch respects inputs
|
||||||
|
if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_cuda }}" == "true" ]]; then
|
||||||
|
backends+=("cuda")
|
||||||
|
fi
|
||||||
|
if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_vulkan }}" == "true" ]]; then
|
||||||
|
backends+=("vulkan")
|
||||||
|
fi
|
||||||
|
# Build the JSON matrix from the selected backends. printf still emits one
# empty line when the array is empty, so drop empty lines before slurping;
# otherwise the result is [""] instead of [] and the build job's
# `matrix != '[]'` guard never skips.
matrix=$(printf '%s\n' "${backends[@]}" | jq -R 'select(length > 0)' | jq -sc .)
|
||||||
|
echo "matrix=$matrix" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
build:
|
build:
|
||||||
|
needs: setup
|
||||||
|
if: ${{ needs.setup.outputs.matrix != '[]' }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
backend:
|
backend: ${{ fromJSON(needs.setup.outputs.matrix) }}
|
||||||
- cuda
|
|
||||||
- vulkan
|
|
||||||
exclude:
|
|
||||||
- backend: ${{ inputs.build_cuda == false && 'cuda' || 'none' }}
|
|
||||||
- backend: ${{ inputs.build_vulkan == false && 'vulkan' || 'none' }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -85,6 +104,7 @@ jobs:
|
|||||||
LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
|
LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
|
||||||
WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
|
WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
|
||||||
SD_REF: ${{ inputs.sd_ref || 'master' }}
|
SD_REF: ${{ inputs.sd_ref || 'master' }}
|
||||||
|
IK_LLAMA_REF: ${{ inputs.ik_llama_ref || 'main' }}
|
||||||
LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
|
LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
|
||||||
DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}
|
DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}
|
||||||
# When running under act, use the local builder that has warm ccache.
|
# When running under act, use the local builder that has warm ccache.
|
||||||
@@ -98,7 +118,14 @@ jobs:
|
|||||||
- name: Push to GitHub Container Registry
|
- name: Push to GitHub Container Registry
|
||||||
if: ${{ !env.ACT }}
|
if: ${{ !env.ACT }}
|
||||||
run: |
|
run: |
|
||||||
docker push ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}
|
BASE_TAG="ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}"
|
||||||
DATE_TAG=$(date -u +%Y-%m-%d)
|
DATE_TAG=$(date -u +%Y-%m-%d)
|
||||||
docker tag ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }} ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}-${DATE_TAG}
|
|
||||||
docker push ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}-${DATE_TAG}
|
docker push "${BASE_TAG}"
|
||||||
|
docker tag "${BASE_TAG}" "${BASE_TAG}-${DATE_TAG}"
|
||||||
|
docker push "${BASE_TAG}-${DATE_TAG}"
|
||||||
|
|
||||||
|
ROOTLESS_TAG="${BASE_TAG}-rootless"
|
||||||
|
docker push "${ROOTLESS_TAG}"
|
||||||
|
docker tag "${ROOTLESS_TAG}" "${ROOTLESS_TAG}-${DATE_TAG}"
|
||||||
|
docker push "${ROOTLESS_TAG}-${DATE_TAG}"
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ llama-swap is a light weight, transparent proxy server that provides automatic m
|
|||||||
|
|
||||||
- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
|
- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
|
||||||
- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
|
- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
|
||||||
|
- Run `gofmt -l .` before committing to verify formatting. Fix any reported files with `gofmt -w <file>`.
|
||||||
- Use `make test-dev` after running new tests for a quick overall test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory.
|
- Use `make test-dev` after running new tests for a quick overall test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory.
|
||||||
- Use `make test-all` before completing work. This includes long running concurrency tests.
|
- Use `make test-all` before completing work. This includes long running concurrency tests.
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,10 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
|
|||||||
- `v1/rerank`, `v1/reranking`, `/rerank`
|
- `v1/rerank`, `v1/reranking`, `/rerank`
|
||||||
- `/infill` - for code infilling
|
- `/infill` - for code infilling
|
||||||
- `/completion` - for completion endpoint
|
- `/completion` - for completion endpoint
|
||||||
|
- ✅ SDAPI via [stable-diffusion.cpp's server](https://github.com/leejet/stable-diffusion.cpp/tree/master/examples/server)
|
||||||
|
- `/sdapi/v1/txt2img`
|
||||||
|
- `/sdapi/v1/img2img`
|
||||||
|
- `/sdapi/v1/loras` - requires `model` in request body to fetch the correct loras
|
||||||
- ✅ llama-swap API
|
- ✅ llama-swap API
|
||||||
- `/ui` - web UI
|
- `/ui` - web UI
|
||||||
- `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
|
- `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
|
||||||
|
|||||||
@@ -39,6 +39,49 @@
|
|||||||
},
|
},
|
||||||
"default": {},
|
"default": {},
|
||||||
"description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
|
"description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
|
||||||
|
},
|
||||||
|
"timeouts": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"connect": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 30,
|
||||||
|
"description": "TCP connection timeout in seconds. Set to 0 to disable."
|
||||||
|
},
|
||||||
|
"keepalive": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 30,
|
||||||
|
"description": "TCP keepalive timeout in seconds. Set to 0 to disable."
|
||||||
|
},
|
||||||
|
"responseHeader": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 0,
|
||||||
|
"description": "Time to wait for response headers in seconds. Set to 0 to disable."
|
||||||
|
},
|
||||||
|
"tlsHandshake": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 10,
|
||||||
|
"description": "TLS handshake timeout in seconds. Set to 0 to disable."
|
||||||
|
},
|
||||||
|
"expectContinue": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 1,
|
||||||
|
"description": "Expect-Continue timeout in seconds. Set to 0 to disable."
|
||||||
|
},
|
||||||
|
"idleConn": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 90,
|
||||||
|
"description": "Idle connection timeout in seconds. Set to 0 to disable."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"description": "Timeout settings for proxy connections."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"properties": {
|
"properties": {
|
||||||
@@ -241,6 +284,9 @@
|
|||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"default": false,
|
"default": false,
|
||||||
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
|
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
|
||||||
|
},
|
||||||
|
"timeouts": {
|
||||||
|
"$ref": "#/definitions/timeouts"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -367,6 +413,43 @@
|
|||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"default": {},
|
"default": {},
|
||||||
"description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
|
"description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
|
||||||
|
},
|
||||||
|
"timeouts": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"connect": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 30,
|
||||||
|
"description": "TCP connection timeout in seconds."
|
||||||
|
},
|
||||||
|
"keepalive": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 30,
|
||||||
|
"description": "TCP keepalive connection timeout in seconds."
|
||||||
|
},
|
||||||
|
"responseHeader": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 0,
|
||||||
|
"description": "Time to wait for response headers in seconds."
|
||||||
|
},
|
||||||
|
"tlsHandshake": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 10,
|
||||||
|
"description": "TLS handshake timeout in seconds."
|
||||||
|
},
|
||||||
|
"idleConn": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"default": 90,
|
||||||
|
"description": "Idle connection timeout in seconds."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"description": "Timeout settings for proxy connections to this peer."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -284,6 +284,22 @@ models:
|
|||||||
# - optional, default: undefined (use global setting)
|
# - optional, default: undefined (use global setting)
|
||||||
sendLoadingState: false
|
sendLoadingState: false
|
||||||
|
|
||||||
|
# timeouts: configure proxy connection timeouts for this model
|
||||||
|
# - optional; per-field defaults are listed below (the values in this example are illustrative, not the defaults)
|
||||||
|
# - useful for models running on slower hardware that need longer timeouts
|
||||||
|
# - connect: TCP dial connection timeout in seconds, default: 30 seconds
|
||||||
|
# - keepalive: TCP keepalive timeout in seconds, default: 30 seconds
|
||||||
|
# - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
|
||||||
|
# - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
|
||||||
|
# - idleConn: idle connection timeout in seconds, default: 90 seconds
|
||||||
|
# - set any value to 0 to disable that timeout (not recommended)
|
||||||
|
timeouts:
|
||||||
|
connect: 30
|
||||||
|
keepalive: 0
|
||||||
|
responseHeader: 60
|
||||||
|
tlsHandshake: 10
|
||||||
|
idleConn: 90
|
||||||
|
|
||||||
# Unlisted model example:
|
# Unlisted model example:
|
||||||
"qwen-unlisted":
|
"qwen-unlisted":
|
||||||
# unlisted: boolean, true or false
|
# unlisted: boolean, true or false
|
||||||
@@ -426,6 +442,17 @@ peers:
|
|||||||
- z-ai/glm-4.7
|
- z-ai/glm-4.7
|
||||||
- moonshotai/kimi-k2-0905
|
- moonshotai/kimi-k2-0905
|
||||||
- minimax/minimax-m2.1
|
- minimax/minimax-m2.1
|
||||||
|
# timeouts: configure proxy connection timeouts for this peer
|
||||||
|
# - optional; the values below are examples (defaults: connect 30, keepalive 30, responseHeader 0, tlsHandshake 10, idleConn 90)
|
||||||
|
# - useful when the peer runs on slower hardware
|
||||||
|
# - set any value to 0 to disable that timeout (not recommended)
|
||||||
|
timeouts:
|
||||||
|
connect: 30
|
||||||
|
keepalive: 30
|
||||||
|
responseHeader: 60
|
||||||
|
tlsHandshake: 10
|
||||||
|
idleConn: 90
|
||||||
|
|
||||||
# filters: a dictionary of filter settings for peer requests
|
# filters: a dictionary of filter settings for peer requests
|
||||||
# - optional, default: empty dictionary
|
# - optional, default: empty dictionary
|
||||||
# - same capabilities as model filters (stripParams, setParams)
|
# - same capabilities as model filters (stripParams, setParams)
|
||||||
|
|||||||
+49
-14
@@ -4,6 +4,7 @@
|
|||||||
# Usage:
|
# Usage:
|
||||||
# docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
|
# docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
|
||||||
# docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
|
# docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
|
||||||
|
# docker buildx build --build-arg BACKEND=cuda --build-arg CMAKE_CUDA_ARCHITECTURES="86;89" -t llama-swap:unified-cuda .
|
||||||
#
|
#
|
||||||
# Each project has its own install script that handles cloning, building,
|
# Each project has its own install script that handles cloning, building,
|
||||||
# and installing binaries. Build stages are independent for cache efficiency.
|
# and installing binaries. Build stages are independent for cache efficiency.
|
||||||
@@ -12,10 +13,11 @@ ARG BACKEND=cuda
|
|||||||
|
|
||||||
# ── Builder bases ──────────────────────────────────────────────────────
|
# ── Builder bases ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder-base-cuda
|
FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder-base-cuda
|
||||||
|
|
||||||
|
ARG CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
|
ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
|
||||||
ENV CCACHE_DIR=/ccache
|
ENV CCACHE_DIR=/ccache
|
||||||
ENV CCACHE_MAXSIZE=2G
|
ENV CCACHE_MAXSIZE=2G
|
||||||
ENV PATH="/usr/lib/ccache:${PATH}"
|
ENV PATH="/usr/lib/ccache:${PATH}"
|
||||||
@@ -29,7 +31,7 @@ WORKDIR /build
|
|||||||
|
|
||||||
# ──
|
# ──
|
||||||
|
|
||||||
FROM ubuntu:26.04 AS builder-base-vulkan
|
FROM ubuntu:24.04 AS builder-base-vulkan
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV CCACHE_DIR=/ccache
|
ENV CCACHE_DIR=/ccache
|
||||||
@@ -78,6 +80,27 @@ RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
|
|||||||
--mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
|
--mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
|
||||||
BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
|
BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
|
||||||
|
|
||||||
|
# ── Build ik_llama.cpp (CUDA only) ────────────────────────────────────
|
||||||
|
#
|
||||||
|
# Two named stages allow ARG BACKEND to select at build time:
|
||||||
|
# - ik-llama-cuda : real build (from builder-base-cuda)
|
||||||
|
# - ik-llama-vulkan: no-op (empty /install/bin, skips CUDA pull entirely)
|
||||||
|
# BuildKit only evaluates the selected branch, so vulkan builds never
|
||||||
|
# pull nvidia/cuda:*-devel or compile ik_llama.cpp.
|
||||||
|
|
||||||
|
FROM builder-base-vulkan AS ik-llama-vulkan
|
||||||
|
RUN mkdir -p /install/bin
|
||||||
|
|
||||||
|
FROM builder-base-cuda AS ik-llama-cuda
|
||||||
|
ARG IK_LLAMA_COMMIT_HASH=main
|
||||||
|
COPY install-ik-llama.sh /build/
|
||||||
|
RUN --mount=type=cache,id=ccache-cuda,target=/ccache \
|
||||||
|
--mount=type=cache,id=ik-llama-cuda,target=/src/ik_llama.cpp/build \
|
||||||
|
bash /build/install-ik-llama.sh "${IK_LLAMA_COMMIT_HASH}"
|
||||||
|
|
||||||
|
ARG BACKEND=cuda
|
||||||
|
FROM ik-llama-${BACKEND} AS ik-llama-build
|
||||||
|
|
||||||
# ── Download llama-swap release binary ────────────────────────────────
|
# ── Download llama-swap release binary ────────────────────────────────
|
||||||
|
|
||||||
FROM builder-base AS llama-swap-download
|
FROM builder-base AS llama-swap-download
|
||||||
@@ -87,14 +110,14 @@ RUN bash /build/install-llama-swap.sh "${LS_VERSION}"
|
|||||||
|
|
||||||
# ── Runtime bases ─────────────────────────────────────────────────────
|
# ── Runtime bases ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime-cuda
|
FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS runtime-cuda
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
|
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
|
||||||
ENV PATH="/usr/local/bin:${PATH}"
|
ENV PATH="/usr/local/bin:${PATH}"
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libgomp1 python3 python3-pip curl ca-certificates git \
|
libgomp1 python3 curl ca-certificates \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# CUDA stub drivers for container compatibility
|
# CUDA stub drivers for container compatibility
|
||||||
@@ -103,14 +126,14 @@ COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/
|
|||||||
|
|
||||||
# ──
|
# ──
|
||||||
|
|
||||||
FROM ubuntu:26.04 AS runtime-vulkan
|
FROM ubuntu:24.04 AS runtime-vulkan
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV PATH="/usr/local/bin:${PATH}"
|
ENV PATH="/usr/local/bin:${PATH}"
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libgomp1 libvulkan1 mesa-vulkan-drivers \
|
libgomp1 libvulkan1 mesa-vulkan-drivers \
|
||||||
python3 python3-pip curl ca-certificates git \
|
python3 curl ca-certificates \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# ── Select runtime base by BACKEND ────────────────────────────────────
|
# ── Select runtime base by BACKEND ────────────────────────────────────
|
||||||
@@ -121,13 +144,21 @@ ARG BACKEND=cuda
|
|||||||
ARG LLAMA_COMMIT_HASH=unknown
|
ARG LLAMA_COMMIT_HASH=unknown
|
||||||
ARG WHISPER_COMMIT_HASH=unknown
|
ARG WHISPER_COMMIT_HASH=unknown
|
||||||
ARG SD_COMMIT_HASH=unknown
|
ARG SD_COMMIT_HASH=unknown
|
||||||
|
ARG IK_LLAMA_COMMIT_HASH=unknown
|
||||||
|
ARG RUN_UID=0
|
||||||
|
|
||||||
RUN pip3 install --no-cache-dir --break-system-packages numpy sentencepiece
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3-numpy python3-sentencepiece \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Create llama-swap user and config directory
|
# Create non-root user when RUN_UID != 0
|
||||||
RUN useradd --system --create-home --shell /sbin/nologin llama-swap && \
|
RUN if [ "$RUN_UID" != "0" ]; then \
|
||||||
|
groupadd --system --gid $RUN_UID llama-swap && \
|
||||||
|
useradd --system --uid $RUN_UID --gid $RUN_UID \
|
||||||
|
--home /app --shell /sbin/nologin llama-swap; \
|
||||||
|
fi && \
|
||||||
mkdir -p /etc/llama-swap/config && \
|
mkdir -p /etc/llama-swap/config && \
|
||||||
chown -R llama-swap:llama-swap /etc/llama-swap
|
chown -R ${RUN_UID}:${RUN_UID} /etc/llama-swap
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
@@ -141,10 +172,12 @@ COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
|
|||||||
COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
|
COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
|
||||||
COPY --from=sd-build /install/lib/ /usr/local/lib/
|
COPY --from=sd-build /install/lib/ /usr/local/lib/
|
||||||
|
|
||||||
# Copy llama.cpp binaries and libraries
|
# Copy llama.cpp binaries (statically linked)
|
||||||
COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
|
COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
|
||||||
COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
|
COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
|
||||||
COPY --from=llama-build /install/lib/ /usr/local/lib/
|
|
||||||
|
# Copy ik-llama-server (CUDA only; empty copy for vulkan)
|
||||||
|
COPY --from=ik-llama-build /install/bin/ /usr/local/bin/
|
||||||
|
|
||||||
# Copy llama-swap binary
|
# Copy llama-swap binary
|
||||||
COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
|
COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
|
||||||
@@ -158,11 +191,13 @@ COPY config.example.yaml /etc/llama-swap/config/config.yaml
|
|||||||
RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
|
RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
|
||||||
echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
|
echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
|
||||||
echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
|
echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
|
||||||
|
echo "ik_llama.cpp: ${IK_LLAMA_COMMIT_HASH}" >> /versions.txt && \
|
||||||
echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
|
echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
|
||||||
echo "backend: ${BACKEND}" >> /versions.txt && \
|
echo "backend: ${BACKEND}" >> /versions.txt && \
|
||||||
echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
|
echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
|
||||||
|
|
||||||
|
RUN mkdir -p /models && chown ${RUN_UID}:${RUN_UID} /models
|
||||||
WORKDIR /models
|
WORKDIR /models
|
||||||
USER llama-swap
|
USER ${RUN_UID}
|
||||||
ENTRYPOINT ["llama-swap"]
|
ENTRYPOINT ["llama-swap"]
|
||||||
CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
|
CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
|
||||||
|
|||||||
@@ -11,6 +11,7 @@
|
|||||||
# WHISPER_REF=v1.0.0 ./build-image.sh --vulkan # Pin whisper.cpp to a tag
|
# WHISPER_REF=v1.0.0 ./build-image.sh --vulkan # Pin whisper.cpp to a tag
|
||||||
# SD_REF=master ./build-image.sh --cuda # Pin stable-diffusion.cpp to a branch
|
# SD_REF=master ./build-image.sh --cuda # Pin stable-diffusion.cpp to a branch
|
||||||
# LS_VERSION=170 ./build-image.sh --cuda # Override llama-swap version
|
# LS_VERSION=170 ./build-image.sh --cuda # Override llama-swap version
|
||||||
|
# IK_LLAMA_REF=main ./build-image.sh --cuda # Pin ik_llama.cpp to main branch (CUDA only)
|
||||||
#
|
#
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
@@ -43,6 +44,7 @@ for arg in "$@"; do
|
|||||||
echo " LLAMA_REF Pin llama.cpp to a commit, tag, or branch"
|
echo " LLAMA_REF Pin llama.cpp to a commit, tag, or branch"
|
||||||
echo " WHISPER_REF Pin whisper.cpp to a commit, tag, or branch"
|
echo " WHISPER_REF Pin whisper.cpp to a commit, tag, or branch"
|
||||||
echo " SD_REF Pin stable-diffusion.cpp to a commit, tag, or branch"
|
echo " SD_REF Pin stable-diffusion.cpp to a commit, tag, or branch"
|
||||||
|
echo " IK_LLAMA_REF Pin ik_llama.cpp to a commit, tag, or branch (CUDA only)"
|
||||||
echo " LS_VERSION Override llama-swap version (e.g., '170' or 'latest')"
|
echo " LS_VERSION Override llama-swap version (e.g., '170' or 'latest')"
|
||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
@@ -63,6 +65,7 @@ LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
|
|||||||
WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
|
WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
|
||||||
SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
|
SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
|
||||||
LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
|
LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
|
||||||
|
IK_LLAMA_REPO="https://github.com/ikawrakow/ik_llama.cpp.git"
|
||||||
|
|
||||||
# Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
|
# Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
|
||||||
# Requires only: git, network access to the remote.
|
# Requires only: git, network access to the remote.
|
||||||
@@ -152,6 +155,24 @@ else
|
|||||||
echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}"
|
echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Resolve ik_llama.cpp ref (CUDA only)
|
||||||
|
if [[ "$BACKEND" == "cuda" ]]; then
|
||||||
|
if [[ -n "${IK_LLAMA_REF:-}" ]]; then
|
||||||
|
IK_LLAMA_HASH=$(resolve_ref "${IK_LLAMA_REPO}" "${IK_LLAMA_REF}") || exit 1
|
||||||
|
echo "ik_llama.cpp: ${IK_LLAMA_REF} -> ${IK_LLAMA_HASH}"
|
||||||
|
else
|
||||||
|
IK_LLAMA_HASH=$(get_latest_hash "${IK_LLAMA_REPO}")
|
||||||
|
if [[ -z "${IK_LLAMA_HASH}" ]]; then
|
||||||
|
echo "ERROR: Could not determine latest commit for ik_llama.cpp" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "ik_llama.cpp: latest HEAD: ${IK_LLAMA_HASH}"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
IK_LLAMA_HASH="n/a"
|
||||||
|
echo "ik_llama.cpp: skipped (vulkan build)"
|
||||||
|
fi
|
||||||
|
|
||||||
# Resolve llama-swap ref
|
# Resolve llama-swap ref
|
||||||
if [[ -n "${LS_VERSION:-}" ]]; then
|
if [[ -n "${LS_VERSION:-}" ]]; then
|
||||||
LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
|
LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
|
||||||
@@ -178,6 +199,7 @@ BUILD_ARGS=(
|
|||||||
--build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
|
--build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
|
||||||
--build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
|
--build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
|
||||||
--build-arg "SD_COMMIT_HASH=${SD_HASH}"
|
--build-arg "SD_COMMIT_HASH=${SD_HASH}"
|
||||||
|
--build-arg "IK_LLAMA_COMMIT_HASH=${IK_LLAMA_HASH}"
|
||||||
--build-arg "LS_VERSION=${LS_HASH}"
|
--build-arg "LS_VERSION=${LS_HASH}"
|
||||||
-t "${DOCKER_IMAGE_TAG}"
|
-t "${DOCKER_IMAGE_TAG}"
|
||||||
-f "${SCRIPT_DIR}/Dockerfile"
|
-f "${SCRIPT_DIR}/Dockerfile"
|
||||||
@@ -203,8 +225,13 @@ echo "Verifying build artifacts..."
|
|||||||
echo "=========================================="
|
echo "=========================================="
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
EXPECTED_BINARIES=(llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap)
|
||||||
|
if [[ "$BACKEND" == "cuda" ]]; then
|
||||||
|
EXPECTED_BINARIES+=(ik-llama-server)
|
||||||
|
fi
|
||||||
|
|
||||||
MISSING_BINARIES=()
|
MISSING_BINARIES=()
|
||||||
for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do
|
for binary in "${EXPECTED_BINARIES[@]}"; do
|
||||||
if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then
|
if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then
|
||||||
MISSING_BINARIES+=("${binary}")
|
MISSING_BINARIES+=("${binary}")
|
||||||
fi
|
fi
|
||||||
@@ -221,20 +248,48 @@ if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
|
VERIFIED_LIST="llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
|
||||||
|
if [[ "$BACKEND" == "cuda" ]]; then
|
||||||
|
VERIFIED_LIST="${VERIFIED_LIST}, ik-llama-server"
|
||||||
|
fi
|
||||||
|
echo "All expected binaries verified: ${VERIFIED_LIST}"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=========================================="
|
||||||
|
echo "Building rootless image..."
|
||||||
|
echo "=========================================="
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
ROOTLESS_TAG="${DOCKER_IMAGE_TAG}-rootless"
|
||||||
|
docker buildx build --load -t "${ROOTLESS_TAG}" - <<EOF
|
||||||
|
FROM ${DOCKER_IMAGE_TAG}
|
||||||
|
USER root
|
||||||
|
RUN groupadd --system --gid 10001 llama-swap && \\
|
||||||
|
useradd --system --uid 10001 --gid 10001 \\
|
||||||
|
--home /app --shell /sbin/nologin llama-swap && \\
|
||||||
|
chown -R 10001:10001 /etc/llama-swap /models
|
||||||
|
USER 10001
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "Rootless image built: ${ROOTLESS_TAG}"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "=========================================="
|
echo "=========================================="
|
||||||
echo "Build complete!"
|
echo "Build complete!"
|
||||||
echo "=========================================="
|
echo "=========================================="
|
||||||
echo ""
|
echo ""
|
||||||
echo "Image tag: ${DOCKER_IMAGE_TAG}"
|
echo "Image tags:"
|
||||||
|
echo " ${DOCKER_IMAGE_TAG}"
|
||||||
|
echo " ${ROOTLESS_TAG}"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Built with:"
|
echo "Built with:"
|
||||||
echo " llama.cpp: ${LLAMA_HASH}"
|
echo " llama.cpp: ${LLAMA_HASH}"
|
||||||
echo " whisper.cpp: ${WHISPER_HASH}"
|
echo " whisper.cpp: ${WHISPER_HASH}"
|
||||||
echo " stable-diffusion.cpp: ${SD_HASH}"
|
echo " stable-diffusion.cpp: ${SD_HASH}"
|
||||||
echo " llama-swap: $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
|
if [[ "$BACKEND" == "cuda" ]]; then
|
||||||
|
echo " ik_llama.cpp: ${IK_LLAMA_HASH}"
|
||||||
|
fi
|
||||||
|
echo " llama-swap: $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
|
||||||
echo ""
|
echo ""
|
||||||
if [[ "$BACKEND" == "vulkan" ]]; then
|
if [[ "$BACKEND" == "vulkan" ]]; then
|
||||||
echo "Run with:"
|
echo "Run with:"
|
||||||
|
|||||||
@@ -0,0 +1,48 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Install ik_llama.cpp - clone, build, and install binaries
|
||||||
|
# Usage: ./install-ik-llama.sh [commit_hash|tag|branch]  (optional; defaults to main)
|
||||||
|
# Note: CUDA only; always built against builder-base-cuda
|
||||||
|
set -e
|
||||||
|
|
||||||
|
COMMIT_HASH="${1:-main}"
|
||||||
|
|
||||||
|
mkdir -p /install/bin
|
||||||
|
|
||||||
|
# Clone and checkout (init-based so cache-mounted build dir doesn't break clone)
|
||||||
|
echo "=== Cloning ik_llama.cpp at ${COMMIT_HASH} ==="
|
||||||
|
mkdir -p /src/ik_llama.cpp
|
||||||
|
cd /src/ik_llama.cpp
|
||||||
|
if [ ! -d .git ]; then
|
||||||
|
git init
|
||||||
|
git remote add origin https://github.com/ikawrakow/ik_llama.cpp.git
|
||||||
|
fi
|
||||||
|
git fetch --depth=1 origin "${COMMIT_HASH}"
|
||||||
|
git checkout FETCH_HEAD
|
||||||
|
|
||||||
|
CMAKE_FLAGS=(
|
||||||
|
-DGGML_NATIVE=OFF
|
||||||
|
-DBUILD_SHARED_LIBS=OFF
|
||||||
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache
|
||||||
|
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
|
-DGGML_CUDA=ON
|
||||||
|
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
|
||||||
|
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
||||||
|
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda -Wl,--allow-shlib-undefined"
|
||||||
|
)
|
||||||
|
|
||||||
|
rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
|
||||||
|
|
||||||
|
echo "=== Building ik_llama.cpp ==="
|
||||||
|
cmake -B build "${CMAKE_FLAGS[@]}"
|
||||||
|
cmake --build build --config Release -j"$(nproc)" --target llama-server
|
||||||
|
|
||||||
|
if [ ! -f "build/bin/llama-server" ]; then
|
||||||
|
echo "FATAL: llama-server not found in build/bin/" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Install as ik-llama-server to avoid collision with llama.cpp's llama-server
|
||||||
|
cp "build/bin/llama-server" "/install/bin/ik-llama-server"
|
||||||
|
echo "=== ik_llama.cpp build complete ==="
|
||||||
|
ls -la /install/bin/
|
||||||
@@ -6,7 +6,7 @@ set -e
|
|||||||
COMMIT_HASH="${1:-master}"
|
COMMIT_HASH="${1:-master}"
|
||||||
BACKEND="${BACKEND:-cuda}"
|
BACKEND="${BACKEND:-cuda}"
|
||||||
|
|
||||||
mkdir -p /install/bin /install/lib
|
mkdir -p /install/bin
|
||||||
|
|
||||||
# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
|
# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
|
||||||
echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
|
echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
|
||||||
@@ -22,6 +22,7 @@ git checkout FETCH_HEAD
|
|||||||
# Common cmake flags
|
# Common cmake flags
|
||||||
CMAKE_FLAGS=(
|
CMAKE_FLAGS=(
|
||||||
-DGGML_NATIVE=OFF
|
-DGGML_NATIVE=OFF
|
||||||
|
-DBUILD_SHARED_LIBS=OFF
|
||||||
-DCMAKE_BUILD_TYPE=Release
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache
|
||||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
|
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
@@ -32,10 +33,9 @@ if [ "$BACKEND" = "cuda" ]; then
|
|||||||
CMAKE_FLAGS+=(
|
CMAKE_FLAGS+=(
|
||||||
-DGGML_CUDA=ON
|
-DGGML_CUDA=ON
|
||||||
-DGGML_VULKAN=OFF
|
-DGGML_VULKAN=OFF
|
||||||
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}"
|
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
|
||||||
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
||||||
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
||||||
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
|
||||||
)
|
)
|
||||||
elif [ "$BACKEND" = "vulkan" ]; then
|
elif [ "$BACKEND" = "vulkan" ]; then
|
||||||
CMAKE_FLAGS+=(
|
CMAKE_FLAGS+=(
|
||||||
@@ -59,7 +59,5 @@ for bin in "${TARGETS[@]}"; do
|
|||||||
fi
|
fi
|
||||||
cp "build/bin/$bin" "/install/bin/"
|
cp "build/bin/$bin" "/install/bin/"
|
||||||
done
|
done
|
||||||
find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
|
|
||||||
|
|
||||||
echo "=== llama.cpp build complete ==="
|
echo "=== llama.cpp build complete ==="
|
||||||
ls -la /install/bin/
|
ls -la /install/bin/
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ if [ "$BACKEND" = "cuda" ]; then
|
|||||||
CMAKE_FLAGS+=(
|
CMAKE_FLAGS+=(
|
||||||
-DGGML_CUDA=ON
|
-DGGML_CUDA=ON
|
||||||
-DGGML_VULKAN=OFF
|
-DGGML_VULKAN=OFF
|
||||||
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}"
|
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
|
||||||
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
||||||
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
||||||
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ if [ "$BACKEND" = "cuda" ]; then
|
|||||||
CMAKE_FLAGS+=(
|
CMAKE_FLAGS+=(
|
||||||
-DGGML_CUDA=ON
|
-DGGML_CUDA=ON
|
||||||
-DGGML_VULKAN=OFF
|
-DGGML_VULKAN=OFF
|
||||||
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:-60;61;75;86;89}"
|
"-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
|
||||||
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
"-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
|
||||||
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
"-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
||||||
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
"-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
|
||||||
|
|||||||
@@ -319,6 +319,29 @@ models:
|
|||||||
# - recommended to be omitted and the default used
|
# - recommended to be omitted and the default used
|
||||||
concurrencyLimit: 0
|
concurrencyLimit: 0
|
||||||
|
|
||||||
|
# timeouts: configure proxy connection timeouts for this model
|
||||||
|
# - optional, defaults shown below
|
||||||
|
# - useful for models on slower hardware that need longer timeouts
|
||||||
|
# - increase responseHeader to avoid "timeout awaiting response headers" errors
|
||||||
|
# - set any value to 0 to disable that timeout (not recommended)
|
||||||
|
timeouts:
|
||||||
|
# connect: TCP connection timeout in seconds
|
||||||
|
# - default: 30
|
||||||
|
connect: 30
|
||||||
|
|
||||||
|
# responseHeader: time to wait for response headers in seconds
|
||||||
|
# - default: 0 (no timeout) for models; the 60 shown below is an example value
|
||||||
|
# - for slow image generation or large models, consider increasing to 300+ seconds
|
||||||
|
responseHeader: 60
|
||||||
|
|
||||||
|
# tlsHandshake: TLS handshake timeout in seconds
|
||||||
|
# - default: 10
|
||||||
|
tlsHandshake: 10
|
||||||
|
|
||||||
|
# idleConn: idle connection timeout in seconds
|
||||||
|
# - default: 90
|
||||||
|
idleConn: 90
|
||||||
|
|
||||||
# sendLoadingState: overrides the global sendLoadingState setting for this model
|
# sendLoadingState: overrides the global sendLoadingState setting for this model
|
||||||
# - optional, default: undefined (use global setting)
|
# - optional, default: undefined (use global setting)
|
||||||
sendLoadingState: false
|
sendLoadingState: false
|
||||||
@@ -444,6 +467,17 @@ peers:
|
|||||||
# - required
|
# - required
|
||||||
# - requested path to llama-swap will be appended to the end of the proxy value
|
# - requested path to llama-swap will be appended to the end of the proxy value
|
||||||
proxy: http://192.168.1.23
|
proxy: http://192.168.1.23
|
||||||
|
|
||||||
|
# timeouts: configure proxy connection timeouts for this peer
|
||||||
|
# - optional, defaults shown below
|
||||||
|
# - useful when the peer runs on slower hardware
|
||||||
|
# - set any value to 0 to disable that timeout (not recommended)
|
||||||
|
timeouts:
|
||||||
|
connect: 30
|
||||||
|
responseHeader: 60
|
||||||
|
tlsHandshake: 10
|
||||||
|
idleConn: 90
|
||||||
|
|
||||||
# models: a list of models served by the peer
|
# models: a list of models served by the peer
|
||||||
# - required
|
# - required
|
||||||
models:
|
models:
|
||||||
|
|||||||
@@ -163,6 +163,15 @@ groups:
|
|||||||
|
|
||||||
modelLoadingState := false
|
modelLoadingState := false
|
||||||
|
|
||||||
|
defaultTimeout := TimeoutsConfig{
|
||||||
|
Connect: 30,
|
||||||
|
KeepAlive: 30,
|
||||||
|
ResponseHeader: 0,
|
||||||
|
TLSHandshake: 10,
|
||||||
|
ExpectContinue: 1,
|
||||||
|
IdleConn: 90,
|
||||||
|
}
|
||||||
|
|
||||||
expected := Config{
|
expected := Config{
|
||||||
LogLevel: "info",
|
LogLevel: "info",
|
||||||
LogTimeFormat: "",
|
LogTimeFormat: "",
|
||||||
@@ -187,6 +196,7 @@ groups:
|
|||||||
Name: "Model 1",
|
Name: "Model 1",
|
||||||
Description: "This is model 1",
|
Description: "This is model 1",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model2": {
|
"model2": {
|
||||||
Cmd: "path/to/server --arg1 one",
|
Cmd: "path/to/server --arg1 one",
|
||||||
@@ -195,6 +205,7 @@ groups:
|
|||||||
Env: []string{},
|
Env: []string{},
|
||||||
CheckEndpoint: "/",
|
CheckEndpoint: "/",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model3": {
|
"model3": {
|
||||||
Cmd: "path/to/cmd --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
@@ -203,6 +214,7 @@ groups:
|
|||||||
Env: []string{},
|
Env: []string{},
|
||||||
CheckEndpoint: "/",
|
CheckEndpoint: "/",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model4": {
|
"model4": {
|
||||||
Cmd: "path/to/cmd --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
@@ -211,6 +223,7 @@ groups:
|
|||||||
Aliases: []string{},
|
Aliases: []string{},
|
||||||
Env: []string{},
|
Env: []string{},
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
HealthCheckTimeout: 15,
|
HealthCheckTimeout: 15,
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestConfig_GroupMemberIsUnique(t *testing.T) {
|
func TestConfig_GroupMemberIsUnique(t *testing.T) {
|
||||||
@@ -1438,3 +1439,108 @@ models:
|
|||||||
})
|
})
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConfig_TimeoutsParsing(t *testing.T) {
|
||||||
|
configYaml := `
|
||||||
|
models:
|
||||||
|
model1:
|
||||||
|
cmd: test-server --port ${PORT}
|
||||||
|
timeouts:
|
||||||
|
connect: 45
|
||||||
|
responseHeader: 120
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
modelConfig, found := config.Models["model1"]
|
||||||
|
require.True(t, found, "model1 should exist in config")
|
||||||
|
|
||||||
|
assert.Equal(t, 45, modelConfig.Timeouts.Connect)
|
||||||
|
assert.Equal(t, 120, modelConfig.Timeouts.ResponseHeader)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_TimeoutsDefaults(t *testing.T) {
|
||||||
|
configYaml := `
|
||||||
|
models:
|
||||||
|
model1:
|
||||||
|
cmd: test-server --port ${PORT}
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
modelConfig, found := config.Models["model1"]
|
||||||
|
require.True(t, found, "model1 should exist in config")
|
||||||
|
|
||||||
|
// Default values should be set during unmarshaling
|
||||||
|
assert.Equal(t, 30, modelConfig.Timeouts.Connect)
|
||||||
|
assert.Equal(t, 0, modelConfig.Timeouts.ResponseHeader)
|
||||||
|
assert.Equal(t, 10, modelConfig.Timeouts.TLSHandshake)
|
||||||
|
assert.Equal(t, 1, modelConfig.Timeouts.ExpectContinue)
|
||||||
|
assert.Equal(t, 90, modelConfig.Timeouts.IdleConn)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_TimeoutsZeroAllowed(t *testing.T) {
|
||||||
|
configYaml := `
|
||||||
|
models:
|
||||||
|
model1:
|
||||||
|
cmd: test-server --port ${PORT}
|
||||||
|
timeouts:
|
||||||
|
connect: 0
|
||||||
|
responseHeader: 0
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
modelConfig, found := config.Models["model1"]
|
||||||
|
require.True(t, found, "model1 should exist in config")
|
||||||
|
|
||||||
|
// Explicit 0 should be preserved (disables timeout)
|
||||||
|
assert.Equal(t, 0, modelConfig.Timeouts.Connect)
|
||||||
|
assert.Equal(t, 0, modelConfig.Timeouts.ResponseHeader)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_PeerTimeoutsParsing(t *testing.T) {
|
||||||
|
configYaml := `
|
||||||
|
peers:
|
||||||
|
peer1:
|
||||||
|
proxy: http://example.com
|
||||||
|
models: [model1]
|
||||||
|
timeouts:
|
||||||
|
connect: 45
|
||||||
|
responseHeader: 120
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
peerConfig, found := config.Peers["peer1"]
|
||||||
|
require.True(t, found, "peer1 should exist in config")
|
||||||
|
|
||||||
|
assert.Equal(t, 45, peerConfig.Timeouts.Connect)
|
||||||
|
assert.Equal(t, 120, peerConfig.Timeouts.ResponseHeader)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_PeerTimeoutsDefaults(t *testing.T) {
|
||||||
|
configYaml := `
|
||||||
|
peers:
|
||||||
|
peer1:
|
||||||
|
proxy: http://example.com
|
||||||
|
models: [model1]
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(configYaml))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
peerConfig, found := config.Peers["peer1"]
|
||||||
|
require.True(t, found, "peer1 should exist in config")
|
||||||
|
|
||||||
|
// Default values should be set during unmarshaling
|
||||||
|
assert.Equal(t, 30, peerConfig.Timeouts.Connect)
|
||||||
|
assert.Equal(t, 60, peerConfig.Timeouts.ResponseHeader)
|
||||||
|
assert.Equal(t, 10, peerConfig.Timeouts.TLSHandshake)
|
||||||
|
assert.Equal(t, 1, peerConfig.Timeouts.ExpectContinue)
|
||||||
|
assert.Equal(t, 90, peerConfig.Timeouts.IdleConn)
|
||||||
|
}
|
||||||
|
|||||||
@@ -155,6 +155,15 @@ groups:
|
|||||||
|
|
||||||
modelLoadingState := false
|
modelLoadingState := false
|
||||||
|
|
||||||
|
defaultTimeout := TimeoutsConfig{
|
||||||
|
Connect: 30,
|
||||||
|
KeepAlive: 30,
|
||||||
|
ResponseHeader: 0,
|
||||||
|
TLSHandshake: 10,
|
||||||
|
ExpectContinue: 1,
|
||||||
|
IdleConn: 90,
|
||||||
|
}
|
||||||
|
|
||||||
expected := Config{
|
expected := Config{
|
||||||
LogLevel: "info",
|
LogLevel: "info",
|
||||||
LogTimeFormat: "",
|
LogTimeFormat: "",
|
||||||
@@ -173,6 +182,7 @@ groups:
|
|||||||
Env: []string{"VAR1=value1", "VAR2=value2"},
|
Env: []string{"VAR1=value1", "VAR2=value2"},
|
||||||
CheckEndpoint: "/health",
|
CheckEndpoint: "/health",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model2": {
|
"model2": {
|
||||||
Cmd: "path/to/server --arg1 one",
|
Cmd: "path/to/server --arg1 one",
|
||||||
@@ -182,6 +192,7 @@ groups:
|
|||||||
Env: []string{},
|
Env: []string{},
|
||||||
CheckEndpoint: "/",
|
CheckEndpoint: "/",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model3": {
|
"model3": {
|
||||||
Cmd: "path/to/cmd --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
@@ -191,6 +202,7 @@ groups:
|
|||||||
Env: []string{},
|
Env: []string{},
|
||||||
CheckEndpoint: "/",
|
CheckEndpoint: "/",
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
"model4": {
|
"model4": {
|
||||||
Cmd: "path/to/cmd --arg1 one",
|
Cmd: "path/to/cmd --arg1 one",
|
||||||
@@ -200,6 +212,7 @@ groups:
|
|||||||
Aliases: []string{},
|
Aliases: []string{},
|
||||||
Env: []string{},
|
Env: []string{},
|
||||||
SendLoadingState: &modelLoadingState,
|
SendLoadingState: &modelLoadingState,
|
||||||
|
Timeouts: defaultTimeout,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
HealthCheckTimeout: 15,
|
HealthCheckTimeout: 15,
|
||||||
|
|||||||
@@ -9,6 +9,17 @@ const (
|
|||||||
MODEL_CONFIG_DEFAULT_TTL = -1
|
MODEL_CONFIG_DEFAULT_TTL = -1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TimeoutsConfig holds timeout settings for proxy connections
|
||||||
|
// 0 = no timeout
|
||||||
|
type TimeoutsConfig struct {
|
||||||
|
Connect int `yaml:"connect"`
|
||||||
|
KeepAlive int `yaml:"keepalive"`
|
||||||
|
ResponseHeader int `yaml:"responseHeader"`
|
||||||
|
TLSHandshake int `yaml:"tlsHandshake"`
|
||||||
|
ExpectContinue int `yaml:"expectContinue"`
|
||||||
|
IdleConn int `yaml:"idleConn"`
|
||||||
|
}
|
||||||
|
|
||||||
type ModelConfig struct {
|
type ModelConfig struct {
|
||||||
Cmd string `yaml:"cmd"`
|
Cmd string `yaml:"cmd"`
|
||||||
CmdStop string `yaml:"cmdStop"`
|
CmdStop string `yaml:"cmdStop"`
|
||||||
@@ -40,6 +51,9 @@ type ModelConfig struct {
|
|||||||
|
|
||||||
// override global setting
|
// override global setting
|
||||||
SendLoadingState *bool `yaml:"sendLoadingState"`
|
SendLoadingState *bool `yaml:"sendLoadingState"`
|
||||||
|
|
||||||
|
// Timeout settings for proxy connections
|
||||||
|
Timeouts TimeoutsConfig `yaml:"timeouts"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||||
@@ -57,6 +71,16 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|||||||
ConcurrencyLimit: 0,
|
ConcurrencyLimit: 0,
|
||||||
Name: "",
|
Name: "",
|
||||||
Description: "",
|
Description: "",
|
||||||
|
|
||||||
|
// matches http.DefaultTransport
|
||||||
|
Timeouts: TimeoutsConfig{
|
||||||
|
Connect: 30,
|
||||||
|
KeepAlive: 30,
|
||||||
|
ResponseHeader: 0,
|
||||||
|
TLSHandshake: 10,
|
||||||
|
ExpectContinue: 1,
|
||||||
|
IdleConn: 90,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// the default cmdStop to taskkill /f /t /pid ${PID}
|
// the default cmdStop to taskkill /f /t /pid ${PID}
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ type PeerConfig struct {
|
|||||||
ApiKey string `yaml:"apiKey"`
|
ApiKey string `yaml:"apiKey"`
|
||||||
Models []string `yaml:"models"`
|
Models []string `yaml:"models"`
|
||||||
Filters Filters `yaml:"filters"`
|
Filters Filters `yaml:"filters"`
|
||||||
|
|
||||||
|
// Timeout settings for proxy connections
|
||||||
|
Timeouts TimeoutsConfig `yaml:"timeouts"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||||
@@ -21,6 +24,17 @@ func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|||||||
ApiKey: "",
|
ApiKey: "",
|
||||||
Models: []string{},
|
Models: []string{},
|
||||||
Filters: Filters{},
|
Filters: Filters{},
|
||||||
|
|
||||||
|
// mostly matches http.DefaultTransport but with a 60s ResponseHeader timeout
|
||||||
|
// to match the pre PR #619 functionality
|
||||||
|
Timeouts: TimeoutsConfig{
|
||||||
|
Connect: 30,
|
||||||
|
KeepAlive: 30,
|
||||||
|
ResponseHeader: 60,
|
||||||
|
TLSHandshake: 10,
|
||||||
|
ExpectContinue: 1,
|
||||||
|
IdleConn: 90,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := unmarshal(&defaults); err != nil {
|
if err := unmarshal(&defaults); err != nil {
|
||||||
|
|||||||
@@ -365,6 +365,8 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok
|
|||||||
}
|
}
|
||||||
|
|
||||||
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
|
func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
|
||||||
|
wallDurationMs := int(time.Since(start).Milliseconds())
|
||||||
|
|
||||||
// default values
|
// default values
|
||||||
cachedTokens := -1 // unknown or missing data
|
cachedTokens := -1 // unknown or missing data
|
||||||
outputTokens := 0
|
outputTokens := 0
|
||||||
@@ -373,7 +375,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
|
|||||||
// timings data
|
// timings data
|
||||||
tokensPerSecond := -1.0
|
tokensPerSecond := -1.0
|
||||||
promptPerSecond := -1.0
|
promptPerSecond := -1.0
|
||||||
durationMs := int(time.Since(start).Milliseconds())
|
durationMs := wallDurationMs
|
||||||
|
|
||||||
if usage.Exists() {
|
if usage.Exists() {
|
||||||
if pt := usage.Get("prompt_tokens"); pt.Exists() {
|
if pt := usage.Get("prompt_tokens"); pt.Exists() {
|
||||||
@@ -402,7 +404,10 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
|
|||||||
outputTokens = int(timings.Get("predicted_n").Int())
|
outputTokens = int(timings.Get("predicted_n").Int())
|
||||||
promptPerSecond = timings.Get("prompt_per_second").Float()
|
promptPerSecond = timings.Get("prompt_per_second").Float()
|
||||||
tokensPerSecond = timings.Get("predicted_per_second").Float()
|
tokensPerSecond = timings.Get("predicted_per_second").Float()
|
||||||
durationMs = int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
|
timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
|
||||||
|
if timingsDurationMs > durationMs {
|
||||||
|
durationMs = timingsDurationMs
|
||||||
|
}
|
||||||
|
|
||||||
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
|
if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
|
||||||
cachedTokens = int(cachedValue.Int())
|
cachedTokens = int(cachedValue.Int())
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import (
|
|||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/mostlygeek/llama-swap/event"
|
"github.com/mostlygeek/llama-swap/event"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/tidwall/gjson"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMetricsMonitor_AddMetrics(t *testing.T) {
|
func TestMetricsMonitor_AddMetrics(t *testing.T) {
|
||||||
@@ -570,6 +571,27 @@ func TestMetricsMonitor_Concurrent(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestMetricsMonitor_ParseMetrics(t *testing.T) {
|
func TestMetricsMonitor_ParseMetrics(t *testing.T) {
|
||||||
|
t.Run("keeps wall clock duration when timings underreport request time", func(t *testing.T) {
|
||||||
|
start := time.Now().Add(-5 * time.Second)
|
||||||
|
usage := gjson.Parse(`{"prompt_tokens": 5, "completion_tokens": 1}`)
|
||||||
|
timings := gjson.Parse(`{
|
||||||
|
"prompt_n": 5,
|
||||||
|
"predicted_n": 1,
|
||||||
|
"prompt_per_second": 10.0,
|
||||||
|
"predicted_per_second": 2.0,
|
||||||
|
"prompt_ms": 5.0,
|
||||||
|
"predicted_ms": 15.0
|
||||||
|
}`)
|
||||||
|
|
||||||
|
metrics, err := parseMetrics("test-model", start, usage, timings)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, 5, metrics.InputTokens)
|
||||||
|
assert.Equal(t, 1, metrics.OutputTokens)
|
||||||
|
assert.Equal(t, 10.0, metrics.PromptPerSecond)
|
||||||
|
assert.Equal(t, 2.0, metrics.TokensPerSecond)
|
||||||
|
assert.GreaterOrEqual(t, metrics.DurationMs, 5000)
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("prefers timings over usage data", func(t *testing.T) {
|
t.Run("prefers timings over usage data", func(t *testing.T) {
|
||||||
mm := newMetricsMonitor(testLogger, 10, 0)
|
mm := newMetricsMonitor(testLogger, 10, 0)
|
||||||
|
|
||||||
|
|||||||
+17
-15
@@ -34,23 +34,25 @@ func NewPeerProxy(peers config.PeerDictionaryConfig, proxyLogger *LogMonitor) (*
|
|||||||
}
|
}
|
||||||
sort.Strings(peerIDs)
|
sort.Strings(peerIDs)
|
||||||
|
|
||||||
// Create a shared transport with reasonable timeouts for peer connections
|
|
||||||
// these can be tuned with feedback later
|
|
||||||
peerTransport := &http.Transport{
|
|
||||||
DialContext: (&net.Dialer{
|
|
||||||
Timeout: 30 * time.Second, // Connection timeout
|
|
||||||
KeepAlive: 30 * time.Second,
|
|
||||||
}).DialContext,
|
|
||||||
TLSHandshakeTimeout: 10 * time.Second,
|
|
||||||
ResponseHeaderTimeout: 60 * time.Second, // Time to wait for response headers
|
|
||||||
ExpectContinueTimeout: 1 * time.Second,
|
|
||||||
MaxIdleConns: 100,
|
|
||||||
MaxIdleConnsPerHost: 10,
|
|
||||||
IdleConnTimeout: 90 * time.Second,
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, peerID := range peerIDs {
|
for _, peerID := range peerIDs {
|
||||||
peer := peers[peerID]
|
peer := peers[peerID]
|
||||||
|
|
||||||
|
// Create a transport with per-peer timeout configuration
|
||||||
|
peerTransport := &http.Transport{
|
||||||
|
Proxy: http.ProxyFromEnvironment,
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
|
Timeout: time.Duration(peer.Timeouts.Connect) * time.Second,
|
||||||
|
KeepAlive: time.Duration(peer.Timeouts.KeepAlive) * time.Second,
|
||||||
|
}).DialContext,
|
||||||
|
TLSHandshakeTimeout: time.Duration(peer.Timeouts.TLSHandshake) * time.Second,
|
||||||
|
ResponseHeaderTimeout: time.Duration(peer.Timeouts.ResponseHeader) * time.Second,
|
||||||
|
ExpectContinueTimeout: time.Duration(peer.Timeouts.ExpectContinue) * time.Second,
|
||||||
|
ForceAttemptHTTP2: true,
|
||||||
|
MaxIdleConns: 100,
|
||||||
|
MaxIdleConnsPerHost: 10,
|
||||||
|
IdleConnTimeout: time.Duration(peer.Timeouts.IdleConn) * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
// Create reverse proxy for this peer
|
// Create reverse proxy for this peer
|
||||||
reverseProxy := httputil.NewSingleHostReverseProxy(peer.ProxyURL)
|
reverseProxy := httputil.NewSingleHostReverseProxy(peer.ProxyURL)
|
||||||
reverseProxy.Transport = peerTransport
|
reverseProxy.Transport = peerTransport
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/mostlygeek/llama-swap/proxy/config"
|
"github.com/mostlygeek/llama-swap/proxy/config"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
@@ -266,3 +267,45 @@ func TestProxyRequest_SSEHeaderModification(t *testing.T) {
|
|||||||
// The X-Accel-Buffering header should be set to "no" for SSE
|
// The X-Accel-Buffering header should be set to "no" for SSE
|
||||||
assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering"))
|
assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewPeerProxy_CustomTimeouts(t *testing.T) {
|
||||||
|
proxyURL, _ := url.Parse("http://localhost:8080")
|
||||||
|
|
||||||
|
peers := config.PeerDictionaryConfig{
|
||||||
|
"test-peer": config.PeerConfig{
|
||||||
|
Proxy: "http://localhost:8080",
|
||||||
|
ProxyURL: proxyURL,
|
||||||
|
Models: []string{"model1"},
|
||||||
|
Timeouts: config.TimeoutsConfig{
|
||||||
|
Connect: 45,
|
||||||
|
ResponseHeader: 300,
|
||||||
|
TLSHandshake: 15,
|
||||||
|
ExpectContinue: 2,
|
||||||
|
IdleConn: 120,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
peerProxy, err := NewPeerProxy(peers, testLogger)
|
||||||
|
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.NotNil(t, peerProxy)
|
||||||
|
assert.True(t, peerProxy.HasPeerModel("model1"))
|
||||||
|
|
||||||
|
// Verify the timeout values are actually applied to the transport
|
||||||
|
member, found := peerProxy.proxyMap["model1"]
|
||||||
|
require.True(t, found, "model1 should exist in proxyMap")
|
||||||
|
assert.NotNil(t, member.reverseProxy)
|
||||||
|
assert.NotNil(t, member.reverseProxy.Transport)
|
||||||
|
|
||||||
|
transport, ok := member.reverseProxy.Transport.(*http.Transport)
|
||||||
|
require.True(t, ok, "Transport should be *http.Transport")
|
||||||
|
|
||||||
|
// Verify all timeout values are correctly applied
|
||||||
|
assert.Equal(t, 300*time.Second, transport.ResponseHeaderTimeout)
|
||||||
|
assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
|
||||||
|
assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
|
||||||
|
assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
|
||||||
|
// ForceAttemptHTTP2 should be enabled
|
||||||
|
assert.True(t, transport.ForceAttemptHTTP2)
|
||||||
|
}
|
||||||
|
|||||||
@@ -96,6 +96,24 @@ func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, pr
|
|||||||
var reverseProxy *httputil.ReverseProxy
|
var reverseProxy *httputil.ReverseProxy
|
||||||
if proxyURL != nil {
|
if proxyURL != nil {
|
||||||
reverseProxy = httputil.NewSingleHostReverseProxy(proxyURL)
|
reverseProxy = httputil.NewSingleHostReverseProxy(proxyURL)
|
||||||
|
|
||||||
|
// Create custom transport with configured timeouts
|
||||||
|
transport := &http.Transport{
|
||||||
|
Proxy: http.ProxyFromEnvironment,
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
|
Timeout: time.Duration(config.Timeouts.Connect) * time.Second,
|
||||||
|
KeepAlive: time.Duration(config.Timeouts.KeepAlive) * time.Second,
|
||||||
|
}).DialContext,
|
||||||
|
TLSHandshakeTimeout: time.Duration(config.Timeouts.TLSHandshake) * time.Second,
|
||||||
|
ResponseHeaderTimeout: time.Duration(config.Timeouts.ResponseHeader) * time.Second,
|
||||||
|
ExpectContinueTimeout: time.Duration(config.Timeouts.ExpectContinue) * time.Second,
|
||||||
|
ForceAttemptHTTP2: true,
|
||||||
|
MaxIdleConns: 100,
|
||||||
|
MaxIdleConnsPerHost: 10,
|
||||||
|
IdleConnTimeout: time.Duration(config.Timeouts.IdleConn) * time.Second,
|
||||||
|
}
|
||||||
|
reverseProxy.Transport = transport
|
||||||
|
|
||||||
reverseProxy.ModifyResponse = func(resp *http.Response) error {
|
reverseProxy.ModifyResponse = func(resp *http.Response) error {
|
||||||
// prevent nginx from buffering streaming responses (e.g., SSE)
|
// prevent nginx from buffering streaming responses (e.g., SSE)
|
||||||
if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
|
if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package proxy
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -569,3 +570,39 @@ func (w *panicOnWriteResponseWriter) Write(b []byte) (int, error) {
|
|||||||
}
|
}
|
||||||
return w.ResponseRecorder.Write(b)
|
return w.ResponseRecorder.Write(b)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProcess_CustomTimeouts(t *testing.T) {
|
||||||
|
modelConfig := config.ModelConfig{
|
||||||
|
Cmd: "echo test",
|
||||||
|
Proxy: "http://localhost:8080",
|
||||||
|
CheckEndpoint: "/health",
|
||||||
|
Timeouts: config.TimeoutsConfig{
|
||||||
|
Connect: 45,
|
||||||
|
ResponseHeader: 120,
|
||||||
|
TLSHandshake: 15,
|
||||||
|
ExpectContinue: 2,
|
||||||
|
IdleConn: 120,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
debugLogger := NewLogMonitorWriter(io.Discard)
|
||||||
|
process := NewProcess("test-model", 30, modelConfig, debugLogger, debugLogger)
|
||||||
|
|
||||||
|
// Verify the process was created successfully
|
||||||
|
assert.NotNil(t, process)
|
||||||
|
assert.Equal(t, "test-model", process.ID)
|
||||||
|
assert.NotNil(t, process.reverseProxy)
|
||||||
|
assert.NotNil(t, process.reverseProxy.Transport)
|
||||||
|
|
||||||
|
// Verify it's using http.Transport (not some other type)
|
||||||
|
transport, ok := process.reverseProxy.Transport.(*http.Transport)
|
||||||
|
assert.True(t, ok, "Transport should be *http.Transport")
|
||||||
|
assert.NotNil(t, transport)
|
||||||
|
|
||||||
|
// Verify the timeouts are correctly applied
|
||||||
|
assert.Equal(t, 120*time.Second, transport.ResponseHeaderTimeout)
|
||||||
|
assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
|
||||||
|
assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
|
||||||
|
assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
|
||||||
|
assert.True(t, transport.ForceAttemptHTTP2)
|
||||||
|
}
|
||||||
|
|||||||
Generated
+3
-3
@@ -2781,9 +2781,9 @@
|
|||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
},
|
},
|
||||||
"node_modules/picomatch": {
|
"node_modules/picomatch": {
|
||||||
"version": "4.0.3",
|
"version": "4.0.4",
|
||||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz",
|
||||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"engines": {
|
"engines": {
|
||||||
|
|||||||
Reference in New Issue
Block a user