config,server: add upstream.ignorePaths (#869 )

Add upstream.ignorePaths config to prevent model swaps for static-asset requests made through the /upstream/<model>/<path> passthrough endpoint. - add UpstreamConfig with compiled *regexp.Regexp slice; invalid regex returns an error at load time - apply a default pattern matching common static-asset suffixes (.js/.json/.css/.png/.gif/.jpg/.jpeg/.ico/.txt) when unset - in handleUpstream, return 409 Conflict when a path matches and the local model is not already loaded; peer and already-loaded models fall through to normal dispatch - update config-schema.json and config.example.yaml Updates discussion: #868
feat: hide performance menu item if disabled (#832 )
2026-06-21 13:49:53 -07:00 · 2026-06-21 13:38:29 -07:00 · 2026-06-20 11:50:35 -07:00 · 2026-06-18 20:55:02 -07:00 · 2026-06-17 17:38:52 -07:00 · 2026-06-16 21:49:09 -07:00
252 changed files with 46770 additions and 1956 deletions
@@ -0,0 +1,24 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+language: "en-US"
+early_access: false
+reviews:
+  profile: "chill"
+  request_changes_workflow: false
+  high_level_summary: false
+  poem: false
+  review_status: true
+  collapse_walkthrough: false
+  sequence_diagrams: false
+  finishing_touches:
+    docstrings:
+      enabled: false
+  auto_review:
+    enabled: false
+    drafts: false
+  unit_tests:
+    enabled: false
+chat:
+  auto_reply: true
+issue_enrichment:
+  planning:
+    enabled: false
@@ -0,0 +1,39 @@
+---
+name: Bug Report
+about: I found a defect
+title: ''
+labels: 'unconfirmed bug'
+assignees: ''
+
+---
+> [!IMPORTANT]
+> If you have questions about llama-swap please post in the Q&A in Discussions. Use bug reports when you've found a defect and wish to discuss a fix.
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Expected behaviour**
+A clear and concise description of what you expected to happen.
+
+**Operating system and version**
+
+- OS: (linux, osx, windows, freebsd, etc)
+- GPUs: (list architecture)
+
+**My Configuration**
+
+```yaml
+# copy / paste your configuration here
+```
+
+**Proxy Logs**
+
+```
+# copy / paste from /logs
+```
+
+**Upstream Logs**
+
+```
+# copy/paste from /logs
+```
@@ -0,0 +1,23 @@
+# https://docs.github.com/en/actions/use-cases-and-examples/project-management/closing-inactive-issues
+name: Close inactive issues  # labels idle issues "stale", then closes them if they stay idle
+on:
+  schedule:
+    - cron: "32 1 * * *"  # once a day, at minute 32 of hour 1
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:  # scopes granted to this job's GITHUB_TOKEN
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f #v10.2.0
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 30
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open without activity for 30 days. Please remove the stale label if this was an error."
+          close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
+          days-before-pr-stale: -1  # NOTE(review): -1 presumably exempts pull requests from stale marking; confirm against actions/stale docs
+          days-before-pr-close: -1  # NOTE(review): likewise presumably never auto-closes pull requests; confirm
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,53 @@
+name: Validate JSON Schema
+
+on:
+  pull_request:
+    paths:
+      - "config-schema.json"
+      - "config.example.yaml"
+      - ".github/workflows/config-schema.yml"
+  push:
+    branches:
+      - main
+    paths:
+      - "config-schema.json"
+      - "config.example.yaml"
+      - ".github/workflows/config-schema.yml"
+
+  workflow_dispatch:
+
+jobs:
+  validate-schema:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Validate JSON Schema
+        run: |
+          # Check if the file is valid JSON
+          if ! jq empty config-schema.json 2>/dev/null; then
+            echo "Error: config-schema.json is not valid JSON"
+            exit 1
+          fi
+
+          # Validate that it's a valid JSON Schema
+          # Check for required $schema field
+          if ! jq -e '."$schema"' config-schema.json > /dev/null; then
+            echo "Warning: config-schema.json should have a \$schema field"
+          fi
+
+          # Check that it has either properties or definitions
+          if ! jq -e '.properties or .definitions or ."$defs"' config-schema.json > /dev/null; then
+            echo "Warning: JSON Schema should contain properties, definitions, or \$defs"
+          fi
+
+          echo "✓ config-schema.json is valid"
+
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
+        with:
+          go-version-file: go.mod
+
+      - name: Validate config.example.yaml against schema
+        run: go test ./internal/config/ -run TestConfig_ExampleMatchesSchema -v
@@ -0,0 +1,97 @@
+name: Build Containers
+
+on:
+  # time has no specific meaning, trying to time it after
+  # the llama.cpp daily packages have time to build and publish (~8hr after llama.cpp project's cron)
+  # https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/docker.yml
+  schedule:
+    - cron: "00 12,18 * * *"
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+    inputs:
+      dryrun:
+        description: "Run cleanup step in dry-run mode (log what would be deleted, delete nothing)"
+        type: boolean
+        default: false
+
+  # Run on workflow file changes (without pushing)
+  push:
+    paths:
+      - '.github/workflows/containers.yml'
+      - 'docker/build-container.sh'
+      - 'docker/*.Containerfile'
+
+# grant permissions on GITHUB_TOKEN to publish packages
+# ref: https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#publishing-a-package-using-an-action
+permissions:
+  contents: read
+  packages: write
+  id-token: write
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        platform: [intel, cuda, cuda13, vulkan, cpu, musa, rocm]
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Free up disk space
+        if: matrix.platform == 'rocm'
+        run: |
+          echo "Before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker system prune -af
+          echo "After cleanup:"
+          df -h
+
+      # QEMU enables arm64 cross-builds on the amd64 GitHub runner.
+      # Currently only the cpu backend goes multi-arch; the action is a
+      # no-op for amd64-only builds, so leaving it on for every matrix
+      # entry keeps the workflow simple.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a #v4.0.0
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 #v4.1.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run build-container
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: ./docker/build-container.sh ${{ matrix.platform }} ${{ github.event_name != 'push' }}
+
+  # actions/delete-package-versions can't see manifest lists: pushing
+  # a multi-arch image with `docker buildx --push` creates a tagged OCI
+  # index plus one untagged per-platform manifest per arch, and
+  # `delete-only-untagged-versions: true` then nukes the per-platform
+  # children, leaving the index dangling — `docker pull :cpu` 404s on
+  # the referenced digest. dataaxiom/ghcr-cleanup-action walks tagged
+  # manifest lists and excludes their children from deletion.
+  delete-untagged-containers:
+    needs: build-and-push
+    # Skip on forks — the delete API requires package-admin on the
+    # upstream account and would otherwise red-x every fork CI run.
+    if: github.repository == 'mostlygeek/llama-swap'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: dataaxiom/ghcr-cleanup-action@cd0cdb900b5dbf3a6f2cc869f0dbb0b8211f50c4 # v1.0.16
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          package: llama-swap
+          delete-untagged: true
+          dry-run: ${{ inputs.dryrun || false }}
@@ -0,0 +1,66 @@
+name: Windows CI
+
+on:
+  push:
+    branches: [ "main" ]
+    # only run when backend source changes
+    # cmd/ is excluded because it contains utilities without tests
+    paths:
+      - '**/*.go'
+      - '!cmd/**'
+      - 'go.mod'
+      - 'go.sum'
+      - 'Makefile'
+      - '.github/workflows/go-ci-windows.yml'
+
+  pull_request:
+    branches: [ "main" ]
+    paths:
+      - '**/*.go'
+      - '!cmd/**'
+      - 'go.mod'
+      - 'go.sum'
+      - 'Makefile'
+      - '.github/workflows/go-ci-windows.yml'
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+
+jobs:
+
+  run-tests:
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+    - name: Set up Go
+      uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
+      with:
+        go-version-file: go.mod
+
+    # cache simple-responder to save the build time
+    - name: Restore Simple Responder
+      id: restore-simple-responder
+      uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}
+
+    # necessary for testing proxy/Process swapping
+    - name: Create simple-responder
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      shell: bash
+      run: make simple-responder-windows
+
+    - name: Save Simple Responder
+      # nothing new to save ... skip this step
+      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+      id: save-simple-responder
+      uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+      with:
+        path: ./build
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}
+
+    - name: Test all
+      shell: bash
+      run: make test-all
@@ -0,0 +1,70 @@
+name: Linux CI
+
+on:
+  push:
+    branches: ["main"]
+    # only run when backend source changes
+    # cmd/ is excluded because it contains utilities without tests
+    paths:
+      - "**/*.go"
+      - "!cmd/**"
+      - "go.mod"
+      - "go.sum"
+      - "Makefile"
+      - ".github/workflows/go-ci.yml"
+
+  pull_request:
+    branches: ["main"]
+    paths:
+      - "**/*.go"
+      - "!cmd/**"
+      - "go.mod"
+      - "go.sum"
+      - "Makefile"
+      - ".github/workflows/go-ci.yml"
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+
+jobs:
+  run-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
+        with:
+          go-version-file: go.mod
+
+      # Only run in this linux based runner
+      - name: Check Formatting
+        run: |
+          if [ "$(gofmt -l . | wc -l)" -gt 0 ]; then
+            gofmt -l .
+            exit 1
+          fi
+      # cache simple-responder to save the build time
+      - name: Restore Simple Responder
+        id: restore-simple-responder
+        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        with:
+          path: ./build
+          key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}
+
+      # necessary for testing proxy/Process swapping
+      - name: Create simple-responder
+        if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+        run: make simple-responder
+
+      - name: Save Simple Responder
+        # nothing new to save ... skip this step
+        if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+        id: save-simple-responder
+        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        with:
+          path: ./build
+          key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}
+
+      - name: Test all
+        run: make test-all
@@ -3,7 +3,14 @@ name: goreleaser
 on:
  push:
    tags:
-      - '*'
+      - "*"
+
+  # Allows manual triggering of the workflow
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Tag version to release (e.g. v144)"
+        required: true

 permissions:
  contents: write
@@ -12,22 +19,56 @@ jobs:
  goreleaser:
    runs-on: ubuntu-latest
    steps:
-      -
-        name: Checkout
-        uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          fetch-depth: 0
-      -
-        name: Set up Go
-        uses: actions/setup-go@v5
-      -
-        name: Run GoReleaser
-        uses: goreleaser/goreleaser-action@v6
+          ref: ${{ github.event.inputs.tag || github.ref }}
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
+        with:
+          go-version-file: go.mod
+      - name: Set up Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # 6.4.0
+        with:
+          node-version: "24"
+      - name: Build UI
+        run: |
+          make ui
+
+      - name: Run GoReleaser
+        uses: goreleaser/goreleaser-action@1a80836c5c9d9e5755a25cb59ec6f45a3b5f41a8 #7.2.1
        with:
          # either 'goreleaser' (default) or 'goreleaser-pro'
          distribution: goreleaser
          # 'latest', 'nightly', or a semver
-          version: '~> v2'
+          version: "~> v2"
          args: release --clean
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  trigger-tap-update:
+    runs-on: ubuntu-latest
+    needs: goreleaser
+    steps:
+      - name: "Resolve tag to dispatch"
+        id: tag
+        env:
+          # Hand the tag to the shell through env rather than expanding the
+          # expression inside the script, so a crafted tag name cannot inject shell.
+          RELEASE_TAG: ${{ github.event.inputs.tag || github.ref_name }}
+        run: |
+          echo "tag=${RELEASE_TAG}" >> "$GITHUB_OUTPUT"
+
+      - name: "Trigger tap repository update"
+        uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 #4.0.1
+        with:
+          token: ${{ secrets.TAP_REPO_PAT }}
+          repository: mostlygeek/homebrew-llama-swap
+          event-type: new-release
+          client-payload: |
+            {
+              "release": {
+                "tag_name": "${{ steps.tag.outputs.tag }}"
+              }
+            }
@@ -0,0 +1,33 @@
+name: UI Tests
+
+on:
+  push:
+    branches: [ "main" ]
+    paths:
+      - 'ui-svelte/**'
+      - '.github/workflows/ui-tests.yml'
+
+  pull_request:
+    branches: [ "main" ]
+    paths:
+      - 'ui-svelte/**'
+      - '.github/workflows/ui-tests.yml'
+
+  workflow_dispatch:
+
+jobs:
+
+  run-tests:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+    - name: Set up Node.js
+      uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # 6.4.0
+      with:
+        node-version: '24'
+        cache: 'npm'
+        cache-dependency-path: ui-svelte/package-lock.json
+
+    - name: Run UI tests
+      run: make test-ui
@@ -0,0 +1,136 @@
+name: Build Unified Docker Image
+
+on:
+  schedule:
+    - cron: "37 5 * * *"
+
+  workflow_dispatch:
+    inputs:
+      llama_cpp_ref:
+        description: "llama.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      whisper_ref:
+        description: "whisper.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      sd_ref:
+        description: "stable-diffusion.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      ik_llama_ref:
+        description: "ik_llama.cpp commit hash, tag, or branch (CUDA only)"
+        required: false
+        default: "main"
+      llama_swap_version:
+        description: "llama-swap version (e.g. v198, latest, main)"
+        required: false
+        default: "main"
+      build_cuda:
+        description: "Build CUDA image"
+        type: boolean
+        required: false
+        default: true
+      build_vulkan:
+        description: "Build Vulkan image"
+        type: boolean
+        required: false
+        default: true
+      push_to_ghcr:
+        description: "Push images to ghcr.io"
+        type: boolean
+        required: false
+        default: true
+
+permissions:
+  contents: read
+  packages: write
+
+jobs:
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        run: |
+          backends=()
+          # schedule uses defaults (build both); workflow_dispatch respects inputs
+          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_cuda }}" == "true" ]]; then
+            backends+=("cuda")
+          fi
+          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_vulkan }}" == "true" ]]; then
+            backends+=("vulkan")
+          fi
+          matrix=$(jq -cn '$ARGS.positional' --args "${backends[@]}") # [] when nothing is selected; printf | jq -R produced [""]
+          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
+
+  build:
+    needs: setup
+    if: ${{ needs.setup.outputs.matrix != '[]' }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Free up disk space
+        run: |
+          echo "Before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker system prune -af
+          echo "After cleanup:"
+          df -h
+
+      # On GitHub Actions runners, create a fresh builder.
+      # When running locally under act, skip this and reuse the existing
+      # llama-swap-builder (which has ccache warm) to avoid exhausting disk.
+      - name: Set up Docker Buildx
+        if: ${{ !env.ACT }}
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
+
+      - name: Log in to GitHub Container Registry
+        if: ${{ !env.ACT }}
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 #v4.1.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build unified Docker image (${{ matrix.backend }})
+        env:
+          LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
+          WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
+          SD_REF: ${{ inputs.sd_ref || 'master' }}
+          IK_LLAMA_REF: ${{ inputs.ik_llama_ref || 'main' }}
+          LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
+          DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}
+          # When running under act, use the local builder that has warm ccache.
+          # On GitHub Actions, BUILDX_BUILDER is unset so docker uses the builder
+          # created by setup-buildx-action above.
+          BUILDX_BUILDER: ${{ env.ACT == 'true' && 'llama-swap-builder' || '' }}
+        run: |
+          chmod +x docker/unified/build-image.sh
+          docker/unified/build-image.sh --${{ matrix.backend }}
+
+      - name: Push to GitHub Container Registry
+        if: ${{ !env.ACT && (github.event_name == 'schedule' || inputs.push_to_ghcr == true) }}
+        run: |
+          BASE_TAG="ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}"
+          DATE_TAG=$(date -u +%Y-%m-%d)
+
+          docker push "${BASE_TAG}"
+          docker tag "${BASE_TAG}" "${BASE_TAG}-${DATE_TAG}"
+          docker push "${BASE_TAG}-${DATE_TAG}"
+
+          ROOTLESS_TAG="${BASE_TAG}-rootless"
+          docker push "${ROOTLESS_TAG}"
+          docker tag "${ROOTLESS_TAG}" "${ROOTLESS_TAG}-${DATE_TAG}"
+          docker push "${ROOTLESS_TAG}-${DATE_TAG}"
@@ -2,4 +2,9 @@
 .env
 build/
 dist/
-.vscode
+.vscode
+.DS_Store
+.dev/
+
+# UI build output; placeholder.txt is kept so the go:embed succeeds.
+internal/server/ui_dist/*
@@ -6,6 +6,27 @@ builds:
    goos:
      - linux
      - darwin
+      - freebsd
+      - windows
    goarch:
      - amd64
-      - arm64
+      - arm64
+    ignore:
+      - goos: freebsd
+        goarch: arm64
+      - goos: windows
+        goarch: arm64
+
+archives:
+  - id: default
+    formats:
+      - tar.gz
+    name_template: "{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
+    builds_info:
+      group: root
+      owner: root
+    format_overrides:
+      # use zip format for windows
+      - goos: windows
+        formats:
+          - zip
@@ -0,0 +1,53 @@
+## Project Description:
+
+llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
+
+## Tech stack
+
+- golang
+- typescript, vite and svelte 5 for UI (located in ui-svelte/)
+
+## Workflow Tasks
+
+- when summarizing changes only include details that require further action
+- just say "Done." when there is no further action
+- use the github CLI `gh` to create pull requests and work with github
+- Rules for creating pull requests:
+  - keep them short and focused on changes.
+  - never include a test plan
+  - write the summary using the same style rules as commit messages
+
+## Testing
+
+- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
+- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
+- Run `gofmt -w <file>` before committing to fix any formatting
+- Build go binaries into the ./build/ subdirectory
+- Use `make test-dev` after running new tests for a quick overall test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
+- Use `make test-all` before completing work. This includes long running concurrency tests.
+- Use `make test-ui` after making changes to the UI in ui-svelte/
+
+### Commit message example format:
+
+```
+proxy: add new feature
+
+Add new feature that implements functionality X and Y.
+
+- key change 1
+- key change 2
+- key change 3
+
+fixes #123
+```
+
+## Code Reviews
+
+- use three levels High, Medium, Low severity
+- label each discovered issue with a label like H1, M2, L3 respectively
+- High severity are must fix issues (security, race conditions, critical bugs)
+- Medium severity are recommended improvements (coding style, missing functionality, inconsistencies)
+- Low severity are nice to have changes and nits
+- Include a suggestion with each discovered item
+- Limit your code review to three items with the highest priority first
+- Double check your discovered items and recommended remediations
@@ -0,0 +1 @@
+@AGENTS.md
@@ -2,6 +2,16 @@
 APP_NAME = llama-swap
 BUILD_DIR = build

+# Get the current Git hash
+GIT_HASH := $(shell git rev-parse --short HEAD)
+ifneq ($(shell git status --porcelain),)
+    # The working tree has uncommitted changes (modified, staged or untracked files); mark the hash with "+"
+    GIT_HASH := $(GIT_HASH)+
+endif
+
+# Capture the current build date in RFC3339 format
+BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
+
 # Default target: Builds binaries for both OSX and Linux
 all: mac linux simple-responder

@@ -9,31 +19,84 @@ all: mac linux simple-responder
 clean:
 	rm -rf $(BUILD_DIR)

-test:
-	go test -short -v ./proxy
+# use cached test results while developing
+test-dev:
+	go test -short ./...
+	staticcheck ./... || true

+test:
+	go test -short -count=1 ./internal/...
+
+# for CI - full test (takes longer)
 test-all:
-	go test -v ./proxy
+	go test -race -count=1 ./internal/...
+
+ui-svelte/node_modules: ui-svelte/package-lock.json
+	cd ui-svelte && npm install
+
+# build the Svelte UI; the node_modules target is named after the directory npm install actually creates
+ui: ui-svelte/node_modules
+	cd ui-svelte && npm run build
+	touch internal/server/ui_dist/placeholder.txt

 # Build OSX binary
-mac:
+mac: ui
 	@echo "Building Mac binary..."
-	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
+	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64

 # Build Linux binary
-linux:
-	@echo "Building Linux binary..."
-	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+linux: linux-arm64 linux-amd64

-# for testing proxy.Process
+linux-amd64: ui
+	@echo "Building Linux AMD64 binary..."
+	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+
+linux-arm64: ui
+	@echo "Building Linux ARM64 binary..."
+	GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64
+
+# Build Windows binary
+windows: ui
+	@echo "Building Windows binary..."
+	GOOS=windows GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-windows-amd64.exe
+
+# for testing with real external processes
 simple-responder:
 	@echo "Building simple responder"
-	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 misc/simple-responder/simple-responder.go
-	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder_linux_amd64 misc/simple-responder/simple-responder.go
+	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 cmd/simple-responder/simple-responder.go
+	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder_linux_amd64 cmd/simple-responder/simple-responder.go
+
+simple-responder-windows:
+	@echo "Building simple responder for windows"
+	GOOS=windows GOARCH=amd64 go build -o $(BUILD_DIR)/simple-responder.exe cmd/simple-responder/simple-responder.go

 # Ensure build directory exists
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)

+# Create a new release tag
+release:
+	@echo "Checking for unstaged changes..."
+	@if [ -n "$(shell git status --porcelain)" ]; then \
+		echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
+		exit 1; \
+	fi
+
+# Get the highest tag in v{number} format, increment it, and create a new tag
+	@highest_tag=$$(git tag --sort=-v:refname | grep -E '^v[0-9]+$$' | head -n 1 || echo "v0"); \
+	new_tag="v$$(( $${highest_tag#v} + 1 ))"; \
+	echo "tagging new version: $$new_tag"; \
+	git tag "$$new_tag";
+
+GOOS ?= $(shell go env GOOS 2>/dev/null || echo linux)
+GOARCH ?= $(shell go env GOARCH 2>/dev/null || echo amd64)
+wol-proxy: $(BUILD_DIR)
+	@echo "Building wol-proxy"
+	go build -o $(BUILD_DIR)/wol-proxy-$(GOOS)-$(GOARCH)-$(shell date +%Y-%m-%d) cmd/wol-proxy/wol-proxy.go
+
+test-ui:
+	cd ui-svelte && npm ci && npm run check && npm test
+
 # Phony targets
-.PHONY: all clean osx linux
+.PHONY: all clean ui mac windows simple-responder simple-responder-windows test test-all test-dev test-ui wol-proxy
+.PHONY: linux linux-arm64 linux-amd64 release
@@ -1,139 +1,293 @@
+![llama-swap header image](docs/assets/hero3.webp)
+![GitHub Downloads (all assets, all releases)](https://img.shields.io/github/downloads/mostlygeek/llama-swap/total)
+![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/mostlygeek/llama-swap/go-ci.yml)
+![GitHub Repo stars](https://img.shields.io/github/stars/mostlygeek/llama-swap)
+
 # llama-swap

-![llama-swap header image](header.jpeg)
+Run multiple generative AI models on your machine and hot-swap between them on demand. llama-swap works with any OpenAI and Anthropic API compatible server and is used by thousands of people to power their local AI workflows.

-# Introduction
-llama-swap is an OpenAI API compatible server that gives you complete control over how you use your hardware. It automatically swaps to the configuration of your choice for serving a model. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
+Built in Go for performance and simplicity, llama-swap has zero dependencies and is incredibly easy to set up. Get started in minutes - just one binary and one configuration file.

-Features:
+## Features:

- ✅ Easy to deploy: single binary with no dependencies
- ✅ Single yaml configuration file
+- ✅ Easy to deploy and configure: one binary, one configuration file, no external dependencies
 - ✅ On-demand model switching
- ✅ Full control over server settings per model
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
- ✅ Multiple GPU support
- ✅ Run multiple models at once with `profiles`
- ✅ Remote log monitoring at `/log`
- ✅ Automatic unloading of models from GPUs after timeout
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, stable-diffusion.cpp, etc.)
+  - future proof, upgrade your inference servers at any time.
+- ✅ OpenAI API supported endpoints:
+  - `v1/completions`
+  - `v1/chat/completions`
+  - `v1/responses`
+  - `v1/embeddings`
+  - `v1/models` - list available models
+  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
+  - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
+  - `v1/audio/voices`
+  - `v1/images/generations`
+  - `v1/images/edits`
+- ✅ Anthropic API supported endpoints:
+  - `v1/messages`
+  - `v1/messages/count_tokens`
+- ✅ llama-server (llama.cpp) supported endpoints:
+  - `v1/rerank`, `v1/reranking`, `/rerank`
+  - `/infill` - for code infilling
+  - `/completion` - for completion endpoint
+- ✅ SDAPI via [stable-diffusion.cpp's server](https://github.com/leejet/stable-diffusion.cpp/tree/master/examples/server)
+  - `/sdapi/v1/txt2img`
+  - `/sdapi/v1/img2img`
+  - `/sdapi/v1/loras` - requires `model` in request body to fetch the correct loras
+- ✅ llama-swap API
+  - `/ui` - web UI
+  - `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
+  - `/running` - list currently running models ([#61](https://github.com/mostlygeek/llama-swap/issues/61))
+  - `POST /api/models/unload` - manually unload all running models ([#58](https://github.com/mostlygeek/llama-swap/issues/58))
+  - `POST /api/models/unload/:model_id` - unload a specific model
+  - `/logs` - remote log monitoring
+    - `GET /logs` returns buffered plain text logs.
+      - If `Accept: text/html` is sent, `/logs` redirects to `/ui/`.
+    - `GET /logs/stream` keeps the connection open for live log streaming.
+      - Stream endpoints send buffered history first by default; add `?no-history` to stream only new lines.
+    - `GET /logs/stream/proxy` streams proxy logs only.
+    - `GET /logs/stream/upstream` streams upstream process logs only.
+    - `GET /logs/stream/{model_id}` streams logs for one model (including IDs with slashes, like `author/model`).
+  - `/health` - just returns "OK"
+  - `/metrics` - system and GPU metrics for prometheus
+- ✅ API Key support - define keys to restrict access to API endpoints
+- ✅ Customizable
+  - Run concurrent models with a custom DSL swap matrix ([#643](https://github.com/mostlygeek/llama-swap/issues/643))
+  - Automatic unloading of models after timeout by setting a `ttl`
+  - Docker and Podman support using `cmd` and `cmdStop` together
+  - Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
+  - Apply filters to requests to control inference with `stripParams`, `setParams` and `setParamsByID`

-## Releases
+### Web UI

-Builds for Linux and OSX are available on the [Releases](https://github.com/mostlygeek/llama-swap/releases) page.
+llama-swap includes a real time web interface with a playground for testing out all sorts of local models:

-### Building from source
+<img width="1125" height="876" alt="image" src="https://github.com/user-attachments/assets/8ee41947-97af-463d-b0f0-8e9c478fac07" />

-1. Install golang for your system
-1. `git clone git@github.com:mostlygeek/llama-swap.git`
-1. `make clean all`
-1. Binaries will be in `build/` subdirectory
+View detailed token metrics:

-## config.yaml
+<img width="1111" height="515" alt="image" src="https://github.com/user-attachments/assets/64bfb280-d7a3-4126-971a-a128fd40410c" />

-llama-swap's configuration is purposefully simple.
+Inspect requests and responses:

-```yaml
-# Seconds to wait for llama.cpp to load and be ready to serve requests
-# Default (and minimum) is 15 seconds
-healthCheckTimeout: 60
+<img width="1111" height="720" alt="image" src="https://github.com/user-attachments/assets/24fe4aca-1448-4d7c-b9e8-a967589bda6c" />

-# define valid model values and the upstream server start
-models:
-  "llama":
-    cmd: llama-server --port 8999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf
+Manually load and unload models:

-    # where to reach the server started by cmd, make sure the ports match
-    proxy: http://127.0.0.1:8999
+<img width="1109" height="719" alt="image" src="https://github.com/user-attachments/assets/02b1e1f2-abd0-4050-84ae-facd66ff01c4" />

-    # aliases names to use this model for
-    aliases:
-    - "gpt-4o-mini"
-    - "gpt-3.5-turbo"
+Real time log streaming:

-    # check this path for an HTTP 200 OK before serving requests
-    # default: /health to match llama.cpp
-    # use "none" to skip endpoint checking, but may cause HTTP errors
-    # until the model is ready
-    checkEndpoint: /custom-endpoint
-
-    # automatically unload the model after this many seconds
-    # ttl values must be a value greater than 0
-    # default: 0 = never unload model
-    ttl: 60
-
-  "qwen":
-    # environment variables to pass to the command
-    env:
-      - "CUDA_VISIBLE_DEVICES=0"
-
-    # multiline for readability
-    cmd: >
-      llama-server --port 8999
-      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
-    proxy: http://127.0.0.1:8999
-
-# profiles make it easy to managing multi model (and gpu) configurations.
-#
-# Tips:
-#  - each model must be listening on a unique address and port
-#  - the model name is in this format: "profile_name:model", like "coding:qwen"
-#  - the profile will load and unload all models in the profile at the same time
-profiles:
-  coding:
-    - "qwen"
-    - "llama"
-```
-
-More [examples](examples/README.md) are available for different use cases.
+<img width="1107" height="559" alt="image" src="https://github.com/user-attachments/assets/39669a10-cff2-409e-836a-5bad8bd0140c" />

 ## Installation

-1. Create a configuration file, see [config.example.yaml](config.example.yaml)
-1. Download a [release](https://github.com/mostlygeek/llama-swap/releases) appropriate for your OS and architecture.
-    * _Note: Windows currently untested._
-1. Run the binary with `llama-swap --config path/to/config.yaml`
+llama-swap can be installed in multiple ways:

-## Monitoring Logs
+1. Docker
+2. Homebrew (macOS and Linux)
+3. MacPorts (macOS)
+4. WinGet
+5. From release binaries
+6. From source

-Open the `http://<host>/logs` with your browser to get a web interface with streaming logs.
+### Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

-Of course, CLI access is also supported:
+Two types of container images are built nightly for llama-swap:
+
+1. A unified container with llama-server, ik-llama-server, stable-diffusion.cpp, whisper.cpp and llama-swap built from source. This is only available for cuda and vulkan but has more capabilities. This one is recommended for use.
+2. A legacy image that is based on llama.cpp's images and llama-swap copied into the container. Use this one if you prefer to stay close to llama.cpp's container images.
+
+#### Unified container (Recommended)
+
+```shell
+$ docker pull ghcr.io/mostlygeek/llama-swap:unified-cuda
+
+# run with a custom configuration and models directory
+$ docker run -it --rm --runtime nvidia -p 9292:8080 \
+ -v /path/to/models:/models \
+ -v /path/to/custom/config.yaml:/etc/llama-swap/config/config.yaml \
+ ghcr.io/mostlygeek/llama-swap:unified-cuda
+```
+
+#### Legacy container
+
+```shell
+$ docker pull ghcr.io/mostlygeek/llama-swap:cuda
+
+# run with a custom configuration and models directory
+$ docker run -it --rm --runtime nvidia -p 9292:8080 \
+ -v /path/to/models:/models \
+ -v /path/to/custom/config.yaml:/app/config.yaml \
+ ghcr.io/mostlygeek/llama-swap:cuda
+```
+
+<details>
+<summary>
+more examples
+</summary>
+
+```shell
+# pull latest images per platform
+docker pull ghcr.io/mostlygeek/llama-swap:cpu
+docker pull ghcr.io/mostlygeek/llama-swap:cuda
+docker pull ghcr.io/mostlygeek/llama-swap:vulkan
+docker pull ghcr.io/mostlygeek/llama-swap:intel
+docker pull ghcr.io/mostlygeek/llama-swap:musa
+
+# tagged llama-swap, platform and llama-server version images
+docker pull ghcr.io/mostlygeek/llama-swap:v166-cuda-b6795
+
+# non-root cuda
+docker pull ghcr.io/mostlygeek/llama-swap:cuda-non-root

 ```
-# sends up to the last 10KB of logs
-curl http://host/logs'

-# streams logs
-curl -Ns 'http://host/logs/stream'
+</details>
+
+### Homebrew Install (macOS/Linux)
+
+```shell
+brew tap mostlygeek/llama-swap
+brew install llama-swap
+llama-swap --config path/to/config.yaml --listen localhost:8080
+```
+
+### MacPorts (macOS)
+
+> [!NOTE]
+> Maintained by MacPorts community - [llama-swap port](https://ports.macports.org/port/llama-swap). It is not an official part of llama-swap.
+
+```shell
+sudo port install llama-swap
+llama-swap --config path/to/config.yaml --listen localhost:8080
+```
+
+### WinGet Install (Windows)
+
+> [!NOTE]
+> WinGet is maintained by community contributor [Dvd-Znf](https://github.com/Dvd-Znf) ([#327](https://github.com/mostlygeek/llama-swap/issues/327)). It is not an official part of llama-swap.
+
+```shell
+# install
+C:\> winget install llama-swap
+
+# upgrade
+C:\> winget upgrade llama-swap
+```
+
+### Pre-built Binaries
+
+Binaries are available on the [release](https://github.com/mostlygeek/llama-swap/releases) page for Linux, Mac, Windows and FreeBSD.
+
+### Building from source
+
+1. Building requires Go and Node.js (for UI).
+1. `git clone https://github.com/mostlygeek/llama-swap.git`
+1. `make clean all`
+1. look in the `build/` subdirectory for the llama-swap binary
+
+## Configuration
+
+```yaml
+# minimum viable config.yaml
+
+models:
+  model1:
+    cmd: llama-server --port ${PORT} --model /path/to/model.gguf
+```
+
+That's all you need to get started:
+
+1. `models` - holds all model configurations
+2. `model1` - the ID used in API calls
+3. `cmd` - the command to run to start the server.
+4. `${PORT}` - an automatically assigned port number
+
+Almost all configuration settings are optional and can be added one step at a time:
+
+- Advanced features
+  - `matrix` to run concurrent models with a custom swap logic DSL
+  - `hooks` to run things on startup
+  - `macros` for reusable snippets
+- Model customization
+  - `ttl` to automatically unload models
+  - `aliases` to use familiar model names (e.g., "gpt-4o-mini")
+  - `env` to pass custom environment variables to inference servers
+  - `cmdStop` to gracefully stop Docker/Podman containers
+  - `useModelName` to override model names sent to upstream servers
+  - `${PORT}` automatic port variables for dynamic port assignment
+  - `filters` rewrite parts of requests before sending to the upstream server
+
+See the [configuration documentation](docs/configuration.md) for all options.
+
+## How does llama-swap work?
+
+When a request is made to an OpenAI compatible endpoint, llama-swap will extract the `model` value and load the appropriate server configuration to serve it. If the wrong upstream server is running, it will be replaced with the correct one. This is where the "swap" part comes in. The upstream server is automatically swapped to handle the request correctly.
+
+In the most basic configuration llama-swap handles one model at a time. For more advanced use cases, using a `matrix` allows multiple models to be loaded at the same time. You have complete control over how your system resources are used.
+
+## Reverse Proxy Configuration (nginx)
+
+If you deploy llama-swap behind nginx, disable response buffering for streaming endpoints. By default, nginx buffers responses which breaks Server‑Sent Events (SSE) and streaming chat completion. ([#236](https://github.com/mostlygeek/llama-swap/issues/236))
+
+Recommended nginx configuration snippets:
+
+```nginx
+# SSE for UI events/logs
+location /api/events {
+    proxy_pass http://your-llama-swap-backend;
+    proxy_buffering off;
+    proxy_cache off;
+}
+
+# Streaming chat completions (stream=true)
+location /v1/chat/completions {
+    proxy_pass http://your-llama-swap-backend;
+    proxy_buffering off;
+    proxy_cache off;
+}
+```
+
+As a safeguard, llama-swap also sets `X-Accel-Buffering: no` on SSE responses. However, explicitly disabling `proxy_buffering` at your reverse proxy is still recommended for reliable streaming behavior.
+
+## Monitoring Logs on the CLI
+
+```sh
+# sends up to the last 10KB of logs
+curl http://host/logs
+
+# streams combined logs
+curl -Ns http://host/logs/stream
+
+# stream llama-swap's proxy status logs
+curl -Ns http://host/logs/stream/proxy
+
+# stream logs from upstream processes that llama-swap loads
+curl -Ns http://host/logs/stream/upstream
+
+# stream logs only from a specific model
+curl -Ns http://host/logs/stream/{model_id}

 # stream and filter logs with linux pipes
 curl -Ns http://host/logs/stream | grep 'eval time'

-# skips history and just streams new log entries
+# appending ?no-history will disable sending buffered history first
 curl -Ns 'http://host/logs/stream?no-history'
 ```

-## Systemd Unit Files
+## Do I need to use llama.cpp's server (llama-server)?

-Use this unit file to start llama-swap on boot. This is only tested on Ubuntu.
+Any OpenAI compatible server would work. llama-swap was originally designed for llama-server and it is the best supported.

-`/etc/systemd/system/llama-swap.service`
-```
-[Unit]
-Description=llama-swap
-After=network.target
+For Python based inference servers like vllm or tabbyAPI it is recommended to run them via podman or docker. This provides clean environment isolation as well as responding correctly to `SIGTERM` signals for proper shutdown.

-[Service]
-User=nobody
+## Star History

-# set this to match your environment
-ExecStart=/path/to/llama-swap --config /path/to/llama-swap.config.yml
+> [!NOTE]
+> Thank you to everyone who has given this project a ⭐️!

-Restart=on-failure
-RestartSec=3
-StartLimitBurst=3
-StartLimitInterval=30
-
-[Install]
-WantedBy=multi-user.target
-```
+[![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -0,0 +1,85 @@
+# Replace ring.Ring with Efficient Circular Byte Buffer
+
+## Overview
+
+Replace the inefficient `container/ring.Ring` implementation in `logMonitor.go` with a simple circular byte buffer that uses a single contiguous `[]byte` slice. This eliminates per-write allocations, improves cache locality, and correctly implements a 10KB buffer.
+
+## Current Issues
+
+1. `ring.New(10 * 1024)` creates 10,240 ring **elements**, not 10KB of storage
+2. Every `Write()` call allocates a new `[]byte` slice inside the lock
+3. `GetHistory()` iterates all 10,240 elements and appends repeatedly (geometric reallocs)
+4. Linked list structure has poor cache locality and pointer overhead
+
+## Design Requirements
+
+### New CircularBuffer Type
+
+Create a simple circular byte buffer with:
+- Single pre-allocated `[]byte` of fixed capacity (10KB)
+- `head` and `size` integers to track write position and data length
+- No per-write allocations
+
+### API Requirements
+
+The new buffer must support:
+1. **Write(p []byte)** - Append bytes, overwriting oldest data when full
+2. **GetHistory() []byte** - Return all buffered data in correct order (oldest to newest)
+
+### Implementation Details
+
+```go
+type circularBuffer struct {
+    data []byte  // pre-allocated capacity
+    head int     // next write position
+    size int     // current number of bytes stored (0 to cap)
+}
+```
+
+**Write logic:**
+- If `len(p) >= capacity`: just keep the last `capacity` bytes
+- Otherwise: write bytes at `head`, wrapping around if needed
+- Update `head` and `size` accordingly
+- Data is copied into the internal buffer (not stored by reference)
+
+**GetHistory logic:**
+- Calculate start position: `(head - size + cap) % cap`
+- If not wrapped: single slice copy
+- If wrapped: two copies (end of buffer + beginning)
+- Returns a **new slice** (copy), not a view into internal buffer
+
+### Immutability Guarantees (must preserve)
+
+Per existing tests:
+1. Modifying input `[]byte` after `Write()` must not affect stored data
+2. `GetHistory()` returns independent copy - modifications don't affect buffer
+
+## Files to Modify
+
+- `proxy/logMonitor.go` - Replace `buffer *ring.Ring` with new circular buffer
+
+## Testing Plan
+
+Existing tests in `logMonitor_test.go` should continue to pass:
+- `TestLogMonitor` - Basic write/read and subscriber notification
+- `TestWrite_ImmutableBuffer` - Verify writes don't affect returned history
+- `TestWrite_LogTimeFormat` - Timestamp formatting
+
+Add new tests:
+- Test buffer wrap-around behavior
+- Test large writes that exceed buffer capacity
+- Test exact capacity boundary conditions
+
+## Checklist
+
+- [ ] Create `circularBuffer` struct in `logMonitor.go`
+- [ ] Implement `Write()` method for circular buffer
+- [ ] Implement `GetHistory()` method for circular buffer
+- [ ] Update `LogMonitor` struct to use new buffer
+- [ ] Update `NewLogMonitorWriter()` to initialize new buffer
+- [ ] Update `LogMonitor.Write()` to use new buffer
+- [ ] Update `LogMonitor.GetHistory()` to use new buffer
+- [ ] Remove `"container/ring"` import
+- [ ] Run `make test-dev` to verify existing tests pass
+- [ ] Add wrap-around test case
+- [ ] Run `make test-all` for final validation
@@ -0,0 +1,183 @@
+# Improve Testability (#655)
+
+## Current Pain Points
+
+1. **Tests bypass config loading** - ~80% of tests build `config.Config` structs directly, skipping YAML parsing, env var substitution, macro expansion, and `${PORT}` assignment. Config bugs in those paths go untested.
+
+2. **simple-responder is everywhere** - Every proxy/routing test launches a real subprocess, waits for health checks (~healthCheckTimeout: 15), and manages process lifecycle just to test HTTP routing. Most of that overhead is wasted.
+
+3. **Port counter is fragile** - A global `nextTestPort` counter starting at 12000 with a mutex. Parallel tests or leftover processes can collide.
+
+## Stages
+
+### Stage 1: YAML-based test config helper
+
+**Goal:** Tests go through the real `LoadConfigFromReader` path instead of hand-building structs.
+
+**Effort:** Low | **Impact:** Config bugs caught earlier | **Risk:** None
+
+Create a test helper in `proxy/helpers_test.go`:
+
+```go
+// testConfigFromYAML substitutes simple-responder paths and loads through
+// the real config pipeline (env vars, macros, port assignment, etc.)
+func testConfigFromYAML(t *testing.T, yamlTmpl string) config.Config {
+    t.Helper()
+    yamlStr := strings.ReplaceAll(yamlTmpl, "{{RESPONDER}}", filepath.ToSlash(simpleResponderPath))
+    cfg, err := config.LoadConfigFromReader(strings.NewReader(yamlStr))
+    require.NoError(t, err)
+    return cfg
+}
+```
+
+Tests would then look like:
+
+```go
+func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
+    config := testConfigFromYAML(t, `
+healthCheckTimeout: 15
+logLevel: error
+models:
+  model1:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model1
+  model2:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model2
+`)
+    proxy := New(config)
+    // ... same assertions
+}
+```
+
+**Why this stage first:** Zero production code changes. Pure test-side refactoring. Can be done incrementally - migrate tests one at a time. Each migrated test now validates the full config pipeline.
+
+**Scope:** ~20-30 tests in `proxymanager_test.go`, `processgroup_test.go`, `peerproxy_test.go`.
+
+### Stage 2: Injected test handler (eliminate simple-responder for routing tests)
+
+**Goal:** Replace simple-responder subprocess launches with an injected `http.Handler` for tests that don't specifically test process lifecycle.
+
+**Effort:** Medium | **Impact:** 10-100x faster routing tests | **Risk:** Low (additive, no existing code broken)
+
+Add a `testHandler http.Handler` field to `Process`. When set, `ProxyRequest` delegates directly to this handler instead of going through the reverse proxy. No subprocess, no health checks, no TCP roundtrip.
+
+**2a. Add testHandler to Process:**
+
+```go
+// In Process struct (process.go):
+testHandler http.Handler  // set only in tests; bypasses subprocess and reverse proxy
+```
+
+In `Process.start()`, skip subprocess + health check when handler is set:
+
+```go
+func (p *Process) start() error {
+    if p.testHandler != nil {
+        p.setState(StateReady)
+        return nil
+    }
+    // existing subprocess logic...
+}
+```
+
+In `Process.ProxyRequest()`, delegate directly to the handler:
+
+```go
+// Before the reverseProxy.ServeHTTP call:
+if p.testHandler != nil {
+    p.testHandler.ServeHTTP(w, r)
+    return
+}
+```
+
+**2b. Test helper to create the handler:**
+
+```go
+// newTestHandler returns an http.Handler that mimics llama.cpp's API
+// (same endpoints as simple-responder).
+func newTestHandler(respond string) http.Handler {
+    mux := http.NewServeMux()
+    mux.HandleFunc("/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) { ... })
+    mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { ... })
+    // ... other endpoints
+    return mux
+}
+```
+
+Tests for routing/auth/CORS/streaming then become:
+
+```go
+func TestProxyManager_AuthRequired(t *testing.T) {
+    handler := newTestHandler("model1")
+
+    config := testConfigFromYAML(t, `
+healthCheckTimeout: 15
+logLevel: error
+requiredAPIKeys: [test-key]
+models:
+  model1:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model1
+`)
+    pm := NewProxyManager(config)
+    // inject handler — skips subprocess, health check, port allocation
+    pm.processGroups["model1"].process.testHandler = handler
+}
+```
+
+**Why this matters:** The handler is called directly in-process. No subprocess spawn, no health check timeout, no port allocation, no TCP roundtrip, no reverse proxy overhead. Routing tests go from ~100ms each (process startup + health check) to ~1ms. Unlike an `httptest.Server` approach, there are zero network hops.
+
+**Why not blank-cmd + proxy URL:** A blank `cmd` with a `proxy` field pointing at `httptest.Server` still requires a real TCP roundtrip through the reverse proxy and introduces "external process" semantics to the config schema. Injecting the handler directly keeps it purely a test concern with no config changes.
+
+**Scope:** Most tests in `proxymanager_test.go` (auth, CORS, model listing, streaming, peer proxy), `peerproxy_test.go`, `metrics_monitor_test.go`.
+
+### Stage 3: Migrate tests incrementally
+
+**Goal:** Convert existing tests to use the Stage 1 + Stage 2 helpers.
+
+**Effort:** Medium | **Impact:** Cleaner, more reliable tests | **Risk:** None
+
+Priority order:
+1. `proxymanager_test.go` routing tests (highest count, most repetition)
+2. `peerproxy_test.go` (straightforward, all HTTP routing)
+3. `metrics_monitor_test.go` (capture logic doesn't need real processes)
+4. `processgroup_test.go` swap tests (keep simple-responder for actual swap lifecycle tests)
+
+Tests that **must keep simple-responder:**
+- Process lifecycle: start/stop, SIGKILL, SIGTERM, TTL expiry, health check failures, failed start counting
+- ProcessGroup swap concurrency (the port-collision test in `TestProcessGroup_ProxyRequestSwapIsTrueParallel`)
+
+**Scope:** ~60-70% of tests can drop simple-responder.
+
+### Stage 4 (optional): Process interface for ProcessGroup
+
+**Goal:** Enable pure unit tests of ProcessGroup's swap/exclusive/concurrency logic without any HTTP server at all.
+
+**Effort:** High | **Impact:** Pure unit tests possible | **Risk:** Medium (refactor core code)
+
+```go
+type ProcessController interface {
+    Start() error
+    Stop(StopStrategy)
+    ProxyRequest(http.ResponseWriter, *http.Request) error
+    CurrentState() ProcessState
+    ID() string
+    SetState(ProcessState)  // for test setup
+}
+```
+
+This requires:
+- Extracting the interface
+- A `MockProcess` implementation
+- Refactoring `ProcessGroup` to use the interface instead of `*Process`
+
+**Recommendation:** Only do this if ProcessGroup grows significantly more complex. Stages 1-3 give 80% of the benefit for 20% of the effort.
+
+## Effort/Impact Summary
+
+| Stage | Effort | Impact | Risk |
+|-------|--------|--------|------|
+| 1. YAML config helper | Low | Config bugs caught earlier | None |
+| 2. Injected test handler | Medium | 10-100x faster routing tests | Low |
+| 3. Migrate tests | Medium | Cleaner, more reliable tests | None |
+| 4. Process interface | High | Pure unit tests possible | Medium |
+
+**Recommended approach:** Do stages 1-3 in order. Each stage is independently valuable and can ship on its own. Stage 4 is deferred unless there's a specific need.
@@ -0,0 +1,292 @@
+# Add Model Metadata Support with Typed Macros
+
+## Overview
+
+Implement support for arbitrary metadata on model configurations that can be exposed through the `/v1/models` API endpoint. This feature extends the existing macro system to support scalar types (string, int, float, bool) instead of only strings, enabling type-safe metadata values.
+
+The metadata will be schemaless, allowing users to define any key-value pairs they need. Macro substitution will work within metadata values, preserving types when macros are used directly and converting to strings when macros are interpolated within strings.
+
+## Design Requirements
+
+### 1. Enhanced Macro System
+
+**Current State:**
+
+- Macros are defined as `map[string]string` at both global and model levels
+- Only string substitution is supported
+- Macros are replaced in: `cmd`, `cmdStop`, `proxy`, `checkEndpoint`, `filters.stripParams`
+
+**Required Changes:**
+
+- Change `MacroList` type from `map[string]string` to `map[string]any`
+- Support scalar types: `string`, `int`, `float64`, `bool`
+- Implement type-preserving macro substitution:
+  - Direct macro usage (`key: ${macro}`) preserves the macro's type
+  - Interpolated usage (`key: "text ${macro}"`) converts to string
+- Add validation to ensure macro values are scalar types only
+- Update existing macro substitution logic in [proxy/config/config.go](proxy/config/config.go) to handle `any` types
+
+**Implementation Details:**
+
+- Create a generic helper function to perform macro substitution that:
+  - Takes a value of type `any`
+  - Recursively processes maps, slices, and scalar values
+  - Replaces `${macro_name}` patterns with macro values
+  - Preserves types for direct substitution
+  - Converts to strings for interpolated substitution
+- Update `validateMacro()` function to accept `any` type and validate scalar types
+- Maintain backward compatibility with existing string-only macros
+
+### 2. Metadata Field in ModelConfig
+
+**Location:** [proxy/config/model_config.go](proxy/config/model_config.go)
+
+**Required Changes:**
+
+- Add `Metadata map[string]any` field to `ModelConfig` struct
+- Support YAML unmarshaling of arbitrary structures (maps, arrays, scalars)
+- Apply macro substitution to metadata values during config loading
+
+**Schema Requirements:**
+
+- Metadata is optional (default: empty/nil map)
+- Supports nested structures (objects within objects, arrays, etc.)
+- All string values within metadata undergo macro substitution
+- Type preservation rules apply as described above
+
+### 3. Macro Substitution in Metadata
+
+**Location:** [proxy/config/config.go](proxy/config/config.go) in `LoadConfigFromReader()`
+
+**Process Flow:**
+
+1. After loading YAML configuration
+2. After model-level and global macro merging
+3. Apply macro substitution to `ModelConfig.Metadata` field
+4. Use the same merged macros available to `cmd`, `proxy`, etc.
+5. Process recursively through all nested structures
+
+**Substitution Rules:**
+
+- `port: ${PORT}` → keeps integer type from PORT macro
+- `temperature: ${temp}` → keeps float type from temp macro
+- `note: "Running on ${PORT}"` → converts to string `"Running on 10001"`
+- Arrays and nested objects are processed recursively
+- Unknown macros should cause configuration load error (consistent with existing behavior)
+
+### 4. API Response Updates
+
+**Location:** [proxy/proxymanager.go:350](proxy/proxymanager.go#L350) `listModelsHandler()`
+
+**Current Behavior:**
+
+- Returns model records with: `id`, `object`, `created`, `owned_by`
+- Optionally includes: `name`, `description`
+
+**Required Changes:**
+
+- Add metadata to each model record under the key `llamaswap_meta`
+- Only include `llamaswap_meta` if metadata is non-empty
+- Preserve all types when marshaling to JSON
+- Maintain existing sorting by model ID
+
+**Example Response:**
+
+```json
+{
+  "object": "list",
+  "data": [
+    {
+      "id": "llama",
+      "object": "model",
+      "created": 1234567890,
+      "owned_by": "llama-swap",
+      "name": "llama 3.1 8B",
+      "description": "A small but capable model",
+      "llamaswap_meta": {
+        "port": 10001,
+        "temperature": 0.7,
+        "note": "The llama is running on port 10001 temp=0.7, context=16384",
+        "a_list": [1, 1.23, "macros are OK in list and dictionary types: llama"],
+        "an_obj": {
+          "a": "1",
+          "b": 2,
+          "c": [0.7, false, "model: llama"]
+        }
+      }
+    }
+  ]
+}
+```
+
+### 5. Validation and Error Handling
+
+**Macro Validation:**
+
+- Extend `validateMacro()` to accept values of type `any`
+- Verify macro values are scalar types: `string`, `int`, `float64`, `bool`
+- Reject complex types (maps, slices, structs) as macro values
+- Maintain existing validation for macro names and lengths
+
+**Configuration Loading:**
+
+- Fail fast if unknown macros are found in metadata
+- Provide clear error messages indicating which model and field contains errors
+- Ensure macros in metadata follow same rules as macros in cmd/proxy fields
+
+## Testing Plan
+
+### Test 1: Model-Level Macros with Different Types
+
+**File:** [proxy/config/model_config_test.go](proxy/config/model_config_test.go)
+
+**Test Cases:**
+
+- Define model with macros of each scalar type
+- Verify metadata correctly substitutes and preserves types
+- Test direct substitution (`port: ${PORT}`)
+- Test string interpolation (`note: "Port is ${PORT}"`)
+- Verify nested objects and arrays work correctly
+
+### Test 2: Global and Model Macro Precedence
+
+**File:** [proxy/config/config_test.go](proxy/config/config_test.go)
+
+**Test Cases:**
+
+- Define same macro at global and model level with different types
+- Verify model-level macro takes precedence
+- Test metadata uses correct macro value
+- Verify type is preserved from the winning macro
+
+### Test 3: Macro Validation
+
+**File:** [proxy/config/config_test.go](proxy/config/config_test.go)
+
+**Test Cases:**
+
+- Test that complex types (maps, arrays) are rejected as macro values
+  - Verify error message includes: macro name and type that was rejected
+- Test that scalar types (string, int, float, bool) are accepted
+  - Each type should load without error
+- Test macro name validation still works with `any` types
+  - Invalid characters, reserved names, length limits should still be enforced
+
+### Test 4: Metadata in API Response
+
+**File:** [proxy/proxymanager_test.go](proxy/proxymanager_test.go)
+
+**Existing Test:** `TestProxyManager_ListModelsHandler`
+
+**Test Cases:**
+
+- Model with metadata → verify `llamaswap_meta` key appears
+- Model without metadata → verify `llamaswap_meta` key is absent
+- Verify all types are correctly marshaled to JSON
+- Verify nested structures are preserved
+- Verify macro substitution has occurred before serialization
+
+### Test 5: Unknown Macros in Metadata
+
+**File:** [proxy/config/config_test.go](proxy/config/config_test.go)
+
+**Test Cases:**
+
+- Use undefined macro in metadata
+- Verify configuration loading fails with clear error
+- Error should indicate model name and that macro is undefined
+
+### Test 6: Recursive Substitution
+
+**File:** [proxy/config/config_test.go](proxy/config/config_test.go)
+
+**Test Cases:**
+
+- Metadata with deeply nested structures
+- Arrays containing objects with macros
+- Objects containing arrays with macros
+- Mixed string interpolation and direct substitution at various nesting levels
+
+## Checklist
+
+### Configuration Schema Changes
+
+- [x] Change `MacroList` type from `map[string]string` to `map[string]any` in [proxy/config/config.go:19](proxy/config/config.go#L19)
+- [x] Add `Metadata map[string]any` field to `ModelConfig` struct in [proxy/config/model_config.go:37](proxy/config/model_config.go#L37)
+- [x] Update `validateMacro()` function signature to accept `any` type for values
+- [x] Add validation logic to ensure macro values are scalar types only
+
+### Macro Substitution Logic
+
+- [x] Create generic recursive function `substituteMetadataMacros()` to handle `any` types
+- [x] Implement type-preserving direct substitution logic
+- [x] Implement string interpolation with type conversion
+- [x] Handle maps: recursively process all values
+- [x] Handle slices: recursively process all elements
+- [x] Handle scalar types: perform string-based macro substitution if value is string
+- [x] Integrate macro substitution into `LoadConfigFromReader()` after existing macro expansion
+- [x] Update existing macro substitution calls to use merged macros with correct types
+
+### API Response Changes
+
+- [x] Modify `listModelsHandler()` in [proxy/proxymanager.go:350](proxy/proxymanager.go#L350)
+- [x] Add `llamaswap_meta` field to model records when metadata exists
+- [x] Ensure empty metadata results in omitted `llamaswap_meta` key
+- [x] Verify JSON marshaling preserves all types correctly
+
+### Testing - Config Package
+
+- [x] Add test for string macros in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for int macros in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for float macros in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for bool macros in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for string interpolation in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for model-level macro precedence: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for nested structures in metadata: [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for unknown macro in metadata (should error): [proxy/config/config_test.go](proxy/config/config_test.go)
+- [x] Add test for invalid macro type validation: [proxy/config/config_test.go](proxy/config/config_test.go)
+
+### Testing - Model Config Package
+
+- [x] Add test cases to [proxy/config/model_config_test.go](proxy/config/model_config_test.go) for metadata unmarshaling
+- [x] Test metadata with various scalar types
+- [x] Test metadata with nested objects and arrays
+
+### Testing - Proxy Manager
+
+- [x] Update `TestProxyManager_ListModelsHandler` in [proxy/proxymanager_test.go](proxy/proxymanager_test.go)
+- [x] Add test case for model with metadata
+- [x] Add test case for model without metadata
+- [x] Verify `llamaswap_meta` key presence/absence
+- [x] Verify type preservation in JSON output
+- [x] Verify macro substitution has occurred
+
+### Documentation
+
+- [x] Verify [config.example.yaml](config.example.yaml) already has complete metadata examples (lines 149-171)
+- [x] No additional documentation needed per project instructions
+
+## Known Issues and Considerations
+
+### Inconsistencies
+
+None identified. The plan references the correct existing example in [config.example.yaml:149-171](config.example.yaml#L149-L171).
+
+### Design Decisions
+
+1. **Why `llamaswap_meta` instead of merging into record?**
+
+   - Avoids potential collisions with OpenAI API standard fields
+   - Makes it clear this is llama-swap specific metadata
+   - Easier for clients to distinguish standard vs. custom fields
+
+2. **Why support nested structures?**
+
+   - Provides maximum flexibility for users
+   - Aligns with the schemaless design principle
+   - Example config already demonstrates this capability
+
+3. **Why validate macro types?**
+   - Prevents confusing behavior (e.g., substituting a map)
+   - Makes configuration errors explicit at load time
+   - Simpler implementation and testing
@@ -0,0 +1,397 @@
+# Improve macro-in-macro support
+
+**Status: COMPLETED ✅**
+
+## Title
+
+Fix macro substitution ordering by preserving definition order using ordered YAML parsing
+
+## Overview
+
+The current macro implementation uses `map[string]any` which does not preserve insertion order. This causes issues when macros reference other macros - if macro `B` contains `${A}` but `B` is processed before `A`, the reference won't be substituted, leading to "unknown macro" errors.
+
+**Goal:** Ensure macros are substituted in reverse definition order (LIFO - last in, first out) so that macros can reliably reference previously-defined macros.
+
+**Outcomes:**
+- Macros can reference other macros defined earlier in the config
+- Macro substitution is deterministic and order-dependent
+- Single-pass substitution prevents circular dependencies
+- Use `yaml.Node` from `gopkg.in/yaml.v3` to preserve macro definition order
+- All existing tests pass
+- New tests validate substitution order and self-reference detection
+
+## Design Requirements
+
+### 1. YAML Parsing Strategy
+- **Continue using:** `gopkg.in/yaml.v3` (current library)
+- **Use:** `yaml.Node` for ordered parsing of macros
+- **Reason:** `yaml.Node` preserves document structure and order, avoiding need for migration
+
+### 2. Data Structure Changes
+
+#### Current Implementation (config.go:19)
+```go
+type MacroList map[string]any
+```
+
+#### New Implementation
+```go
+type MacroList []MacroEntry
+
+type MacroEntry struct {
+    Name  string
+    Value any
+}
+```
+
+**Implementation Note:** Parse macros using `yaml.Node` to extract key-value pairs in document order, then construct the ordered `MacroList`.
+
+### 3. Macro Substitution Order Rules
+
+The substitution must follow this hierarchy (from most specific to least):
+
+1. **Reserved macros** (last): `PORT`, `MODEL_ID` - substituted last, highest priority
+2. **Model-level macros** (middle): Defined in specific model config, overrides global
+3. **Global macros** (first): Defined at config root level
+
+Within each level, macros are substituted in **reverse definition order** (LIFO):
+- The last macro defined is substituted first
+- This allows later macros to reference earlier ones
+- Single-pass substitution prevents circular dependencies
+
+### 4. Macro Reference Rules
+
+**Allowed:**
+- Macro can reference any macro defined **before** it (earlier in the file)
+- Model macros can reference global macros
+- Macros can reference reserved macros (`${PORT}`, `${MODEL_ID}`)
+
+**Prohibited:**
+- Macro cannot reference itself (e.g., `foo: "value ${foo}"`)
+- Macro cannot reference macros defined **after** it
+- No circular references (prevented by single-pass, ordered substitution)
+
+### 5. Validation Requirements
+
+Add validation to detect:
+- **Self-references:** Macro value contains reference to its own name
+- **Unknown macros:** After substitution, any remaining `${...}` references
+
+Error messages should be clear:
+```
+macro 'foo' contains self-reference
+unknown macro '${bar}' in model.cmd
+```
+
+### 6. Implementation Changes
+
+#### Files to Modify
+
+1. **[proxy/config/config.go](proxy/config/config.go)**
+   - Line 19: Change `MacroList` type definition
+   - Line 69: Update `Macros MacroList` field
+   - Line 153-157: Update macro validation loop to work with ordered structure
+   - Line 175-188: Update model-level macro validation
+   - Line 181-188: **NEW** Implement proper macro merging respecting order
+   - Line 193-202: **NEW** Implement ordered macro substitution in LIFO order
+   - Line 389-415: Update `validateMacro` to detect self-references
+   - Line 420-475: Update `substituteMetadataMacros` to accept ordered MacroList
+
+2. **[proxy/config/model_config.go](proxy/config/model_config.go)**
+   - Line 33: Update `Macros MacroList` field type
+
+3. **All test files**
+   - Update test fixtures to use ordered macro definitions
+   - Ensure tests specify macro order explicitly
+
+#### Core Algorithm
+
+Replace the macro substitution logic in [config.go:181-252](proxy/config/config.go#L181-L252) with:
+
+```go
+// Merge global config and model macros. Model macros take precedence
+mergedMacros := make(MacroList, 0, len(config.Macros)+len(modelConfig.Macros)+2)
+
+// Add global macros first
+for _, entry := range config.Macros {
+	mergedMacros = append(mergedMacros, entry)
+}
+
+// Add model macros (can override global)
+for _, entry := range modelConfig.Macros {
+	// Remove any existing global macro with same name
+	found := false
+	for i, existing := range mergedMacros {
+		if existing.Name == entry.Name {
+			mergedMacros[i] = entry // Override
+			found = true
+			break
+		}
+	}
+	if !found {
+		mergedMacros = append(mergedMacros, entry)
+	}
+}
+
+// Add reserved MODEL_ID macro at the end
+mergedMacros = append(mergedMacros, MacroEntry{Name: "MODEL_ID", Value: modelId})
+
+// Check if PORT macro is needed
+if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
+	// enforce ${PORT} used in both cmd and proxy
+	if !strings.Contains(modelConfig.Cmd, "${PORT}") && strings.Contains(modelConfig.Proxy, "${PORT}") {
+		return Config{}, fmt.Errorf("model %s: proxy uses ${PORT} but cmd does not - ${PORT} is only available when used in cmd", modelId)
+	}
+
+	// Add PORT macro to the end (highest priority)
+	mergedMacros = append(mergedMacros, MacroEntry{Name: "PORT", Value: nextPort})
+	nextPort++
+}
+
+// Single-pass substitution: Substitute all macros in LIFO order (last defined first)
+// This allows later macros to reference earlier ones
+for i := len(mergedMacros) - 1; i >= 0; i-- {
+	entry := mergedMacros[i]
+	macroSlug := fmt.Sprintf("${%s}", entry.Name)
+	macroStr := fmt.Sprintf("%v", entry.Value)
+
+	// Substitute in command fields
+	modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
+	modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
+	modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
+	modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroStr)
+	modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroStr)
+
+	// Substitute in metadata (recursive)
+	if len(modelConfig.Metadata) > 0 {
+		var err error
+		modelConfig.Metadata, err = substituteMacroInValue(modelConfig.Metadata, entry.Name, entry.Value)
+		if err != nil {
+			return Config{}, fmt.Errorf("model %s metadata: %s", modelId, err.Error())
+		}
+	}
+}
+```
+
+Add this new helper function to replace `substituteMetadataMacros`:
+
+```go
+// substituteMacroInValue recursively substitutes a single macro in a value structure
+// This is called once per macro, allowing LIFO substitution order
+func substituteMacroInValue(value any, macroName string, macroValue any) (any, error) {
+	macroSlug := fmt.Sprintf("${%s}", macroName)
+	macroStr := fmt.Sprintf("%v", macroValue)
+
+	switch v := value.(type) {
+	case string:
+		// Check if this is a direct macro substitution
+		if v == macroSlug {
+			return macroValue, nil
+		}
+		// Handle string interpolation
+		if strings.Contains(v, macroSlug) {
+			return strings.ReplaceAll(v, macroSlug, macroStr), nil
+		}
+		return v, nil
+
+	case map[string]any:
+		// Recursively process map values
+		newMap := make(map[string]any)
+		for key, val := range v {
+			newVal, err := substituteMacroInValue(val, macroName, macroValue)
+			if err != nil {
+				return nil, err
+			}
+			newMap[key] = newVal
+		}
+		return newMap, nil
+
+	case []any:
+		// Recursively process slice elements
+		newSlice := make([]any, len(v))
+		for i, val := range v {
+			newVal, err := substituteMacroInValue(val, macroName, macroValue)
+			if err != nil {
+				return nil, err
+			}
+			newSlice[i] = newVal
+		}
+		return newSlice, nil
+
+	default:
+		// Return scalar types as-is
+		return value, nil
+	}
+}
+```
+
+### 7. Self-Reference Detection
+
+Add to `validateMacro` function:
+
+```go
+func validateMacro(name string, value any) error {
+    // ... existing validation ...
+
+    // Check for self-reference
+    if str, ok := value.(string); ok {
+        macroSlug := fmt.Sprintf("${%s}", name)
+        if strings.Contains(str, macroSlug) {
+            return fmt.Errorf("macro '%s' contains self-reference", name)
+        }
+    }
+
+    return nil
+}
+```
+
+## Testing Plan
+
+### 1. Migration Tests
+- **Test:** All existing macro tests still pass after switching macro parsing to ordered `yaml.Node` (the YAML library itself is not migrated)
+- **Files:** All `*_test.go` files with macro tests
+
+### 2. Macro Order Tests
+
+#### Test: Macro-in-macro substitution order
+```yaml
+macros:
+  "A": "value-A"
+  "B": "prefix-${A}-suffix"
+
+models:
+  test:
+    cmd: "echo ${B}"
+```
+**Expected:** `cmd` becomes `"echo prefix-value-A-suffix"`
+
+#### Test: LIFO substitution order
+```yaml
+macros:
+  "base": "/models"
+  "path": "${base}/llama"
+  "full": "${path}/model.gguf"
+
+models:
+  test:
+    cmd: "load ${full}"
+```
+**Expected:** `cmd` becomes `"load /models/llama/model.gguf"`
+
+#### Test: Model macro overrides global
+```yaml
+macros:
+  "tag": "global"
+  "msg": "value-${tag}"
+
+models:
+  test:
+    macros:
+      "tag": "model-level"
+    cmd: "echo ${msg}"
+```
+**Expected:** `cmd` becomes `"echo value-model-level"` (model macro overrides global)
+
+### 3. Reserved Macro Tests
+
+#### Test: MODEL_ID substituted in macro
+```yaml
+macros:
+  "podman-llama": "podman run --name ${MODEL_ID} ghcr.io/ggml-org/llama.cpp:server-cuda"
+
+models:
+  my-model:
+    cmd: "${podman-llama} -m model.gguf"
+```
+**Expected:** `cmd` becomes `"podman run --name my-model ghcr.io/ggml-org/llama.cpp:server-cuda -m model.gguf"`
+
+### 4. Error Detection Tests
+
+#### Test: Self-reference detection
+```yaml
+macros:
+  "recursive": "value-${recursive}"
+```
+**Expected:** Error: `macro 'recursive' contains self-reference`
+
+#### Test: Undefined macro reference
+```yaml
+macros:
+  "A": "value-${UNDEFINED}"
+```
+**Expected:** Error: `unknown macro '${UNDEFINED}' found in macros.A` (or similar)
+
+### 5. Regression Tests
+- Run all existing macro tests: `TestConfig_MacroReplacement`, `TestConfig_MacroReservedNames`, etc.
+- Ensure all pass without modification (except test fixtures if needed)
+
+## Checklist
+
+### Phase 1: Data Structure Changes
+- [ ] Implement custom `UnmarshalYAML` method for `MacroList` that uses `yaml.Node`
+- [ ] Define new ordered `MacroList` type as `[]MacroEntry`
+- [ ] Update `MacroList` type definition in [config.go](proxy/config/config.go#L19)
+- [ ] Update `Config.Macros` field type in [config.go](proxy/config/config.go#L69)
+- [ ] Update `ModelConfig.Macros` field type in [model_config.go](proxy/config/model_config.go#L33)
+- [ ] Implement helper functions:
+  - [ ] `func (ml MacroList) Get(name string) (any, bool)` - lookup by name
+  - [ ] `func (ml MacroList) Set(name string, value any) MacroList` - add/override entry
+  - [ ] `func (ml MacroList) ToMap() map[string]any` - convert to map if needed
+
+### Phase 2: Macro Validation Updates
+- [ ] Update macro validation loop at [config.go:153-157](proxy/config/config.go#L153-L157)
+- [ ] Update model macro validation at [config.go:175-179](proxy/config/config.go#L175-L179)
+- [ ] Add self-reference detection to `validateMacro` function [config.go:389](proxy/config/config.go#L389)
+- [ ] Test self-reference detection with new test case
+
+### Phase 3: Macro Substitution Algorithm
+- [ ] Implement ordered macro merging (global → model → reserved) at [config.go:181-188](proxy/config/config.go#L181-L188)
+- [ ] Implement single-pass LIFO substitution loop (reverse iteration) at [config.go:193-202](proxy/config/config.go#L193-L202)
+  - [ ] Substitute in all string fields (cmd, cmdStop, proxy, checkEndpoint, stripParams)
+  - [ ] Substitute in metadata within same loop
+- [ ] Ensure `MODEL_ID` is added to merged macros before substitution
+- [ ] Ensure `PORT` is added after port assignment (if needed)
+- [ ] Replace `substituteMetadataMacros` with new `substituteMacroInValue` function that processes one macro at a time [config.go:420](proxy/config/config.go#L420)
+- [ ] Remove old metadata substitution code that was separate from main loop [config.go:245-251](proxy/config/config.go#L245-L251)
+
+### Phase 4: Testing
+- [ ] Run `make test-dev` - fix any static checking errors
+- [ ] Add test: macro-in-macro basic substitution
+- [ ] Add test: LIFO substitution order with 3+ macro levels
+- [ ] Add test: MODEL_ID in global macro used by model
+- [ ] Add test: PORT in global macro used by model
+- [ ] Add test: model macro overrides global macro in substitution
+- [ ] Add test: self-reference detection error
+- [ ] Add test: undefined macro reference error
+- [ ] Verify all existing macro tests pass: `TestConfig_Macro*`
+- [ ] Run `make test-all` - ensure all tests including concurrency tests pass
+
+### Phase 5: Documentation
+- [ ] Update plan status in this file (mark completed)
+- [ ] Update CLAUDE.md if macro behavior needs documentation
+- [ ] Verify no new error messages need user documentation
+
+## Bug Example (Original Issue)
+
+```yaml
+macros:
+  "podman-llama": >
+    podman run --name ${MODEL_ID}
+    --init --rm -p ${PORT}:8080 -v /home/alex/ai/models:/models:z --gpus=all
+    ghcr.io/ggml-org/llama.cpp:server-cuda
+
+  "standard-options": >
+    --no-mmap --jinja
+
+  "kv8": >
+    -fa on -ctk q8_0 -ctv q8_0
+```
+
+**Current Bug:**
+- During macro substitution, if `${MODEL_ID}` is processed before `${podman-llama}`, the `${MODEL_ID}` reference inside `podman-llama` remains unsubstituted
+- Results in error: `unknown macro '${MODEL_ID}' found in model.cmd`
+
+**After Fix:**
+- Macros substituted in LIFO order: `kv8` → `standard-options` → `podman-llama`
+- `MODEL_ID` is a reserved macro, substituted last (after all user macros)
+- `${MODEL_ID}` inside `podman-llama` is correctly replaced with the model name
@@ -0,0 +1,306 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/tidwall/gjson"
+)
+
+// loremWords is the fixed vocabulary used to fabricate response text. The
+// string is split on whitespace once at package initialisation; callers index
+// into the resulting slice modulo its length to produce any number of words.
+var loremWords = strings.Fields(
+	"Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor " +
+		"incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis nostrud " +
+		"exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat Duis aute " +
+		"irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla " +
+		"pariatur Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia " +
+		"deserunt mollit anim id est laborum Sed ut perspiciatis unde omnis iste natus error " +
+		"sit voluptatem accusantium doloremque laudantium totam rem aperiam eaque ipsa quae " +
+		"ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo " +
+		"Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit",
+)
+
+// Command-line flags controlling the fake server.
+var (
+	// flagListen is the address passed to http.Server.Addr.
+	flagListen = flag.String("listen", "localhost:9898", "listen address")
+	// flagTokens is the number of words returned per completion.
+	flagTokens = flag.Int("tokens", 1000, "number of tokens to return")
+	// flagTPS paces the response; values <= 0 are treated as 1 by chatHandler.
+	flagTPS    = flag.Float64("tps", 75, "tokens per second")
+	// flagLoad is parsed with time.ParseDuration in main and handed to startLoading.
+	flagLoad   = flag.String("load", "0s", "simulated load duration (e.g. 2s, 500ms)")
+)
+
+// chunkDelta is the incremental payload of one streamed chunk. Both fields are
+// omitted from the JSON when empty, so a chunk carries only the role or only
+// the content it was built with.
+type chunkDelta struct {
+	Role    string `json:"role,omitempty"`
+	Content string `json:"content,omitempty"`
+}
+
+// chunkChoice wraps a delta for the streaming response. FinishReason has no
+// omitempty tag, so a nil pointer is serialised as JSON null.
+type chunkChoice struct {
+	Index        int        `json:"index"`
+	Delta        chunkDelta `json:"delta"`
+	FinishReason *string    `json:"finish_reason"`
+}
+
+// chatChunk is the top-level object written on each "data:" line of the
+// streamed (server-sent events) response.
+type chatChunk struct {
+	ID      string        `json:"id"`
+	Object  string        `json:"object"`
+	Created int64         `json:"created"`
+	Model   string        `json:"model"`
+	Choices []chunkChoice `json:"choices"`
+}
+
+// completionMessage is the full assistant message of a non-streaming response.
+type completionMessage struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+// completionChoice is a single choice entry of a non-streaming response.
+type completionChoice struct {
+	Index        int               `json:"index"`
+	Message      completionMessage `json:"message"`
+	FinishReason string            `json:"finish_reason"`
+}
+
+// completionUsage reports token counts in a non-streaming response.
+type completionUsage struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
+// chatCompletion is the top-level JSON body returned by chatHandler when the
+// request did not ask for streaming.
+type chatCompletion struct {
+	ID      string             `json:"id"`
+	Object  string             `json:"object"`
+	Created int64              `json:"created"`
+	Model   string             `json:"model"`
+	Choices []completionChoice `json:"choices"`
+	Usage   completionUsage    `json:"usage"`
+}
+
+// loremText returns n words taken cyclically from loremWords, joined by single
+// spaces. A non-positive n yields the empty string; previously a negative n
+// (reachable through a negative -tokens flag) panicked inside make.
+func loremText(n int) string {
+	if n <= 0 {
+		return ""
+	}
+	words := make([]string, n)
+	for i := range words {
+		words[i] = loremWords[i%len(loremWords)]
+	}
+	return strings.Join(words, " ")
+}
+
+// sendChunk serialises one streamed content chunk and writes it to w as a
+// server-sent-events "data:" record. It returns the marshalling or write error,
+// if any. Flushing is left to the caller.
+func sendChunk(w http.ResponseWriter, content string, finishReason *string) error {
+	choice := chunkChoice{
+		Index:        0,
+		Delta:        chunkDelta{Content: content},
+		FinishReason: finishReason,
+	}
+
+	payload, marshalErr := json.Marshal(chatChunk{
+		ID:      "chatcmpl-fake",
+		Object:  "chat.completion.chunk",
+		Created: time.Now().Unix(),
+		Model:   "fake-model",
+		Choices: []chunkChoice{choice},
+	})
+	if marshalErr != nil {
+		return marshalErr
+	}
+
+	if _, writeErr := fmt.Fprintf(w, "data: %s\n\n", payload); writeErr != nil {
+		return writeErr
+	}
+	return nil
+}
+
+// startLoading simulates a model load. It returns a channel that is closed once
+// loadDur has elapsed; while waiting, a background goroutine logs the remaining
+// time once per second.
+//
+// If loadDur is zero or negative the channel is already closed on return.
+// time.ParseDuration accepts negative values, and treating them as "no load"
+// avoids a nonsensical "loading... -2s remaining" log line.
+func startLoading(loadDur time.Duration) <-chan struct{} {
+	ready := make(chan struct{})
+	if loadDur <= 0 {
+		close(ready)
+		return ready
+	}
+	go func() {
+		deadline := time.Now().Add(loadDur)
+		log.Printf("loading... %s remaining", loadDur.Round(time.Second))
+		ticker := time.NewTicker(time.Second)
+		defer ticker.Stop()
+		timer := time.NewTimer(loadDur)
+		defer timer.Stop()
+		for {
+			select {
+			case <-timer.C:
+				close(ready)
+				log.Printf("ready")
+				return
+			case <-ticker.C:
+				// Skip the message when rounding leaves nothing to report.
+				if rem := time.Until(deadline).Round(time.Second); rem > 0 {
+					log.Printf("loading... %s remaining", rem)
+				}
+			}
+		}
+	}()
+	return ready
+}
+
+// healthHandler reports readiness: 200 OK once the ready channel has been
+// closed, 503 Service Unavailable before that. It never blocks.
+func healthHandler(ready <-chan struct{}) http.HandlerFunc {
+	return func(w http.ResponseWriter, _ *http.Request) {
+		status := http.StatusServiceUnavailable
+		select {
+		case <-ready:
+			status = http.StatusOK
+		default:
+			// still loading; keep the 503
+		}
+		w.WriteHeader(status)
+	}
+}
+
+// chatHandler returns the handler for the chat-completions endpoint.
+//
+// Only POST is accepted. The handler waits until ready is closed (or the client
+// disconnects), then fabricates a reply of -tokens words paced at -tps words
+// per second. If the request body has "stream": true the reply is sent as
+// server-sent events, one word per chunk, terminated by "data: [DONE]";
+// otherwise a single JSON completion is written after the equivalent delay.
+func chatHandler(ready <-chan struct{}) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+
+		body, err := io.ReadAll(r.Body)
+		if err != nil {
+			http.Error(w, "failed to read body", http.StatusBadRequest)
+			return
+		}
+
+		streaming := gjson.GetBytes(body, "stream").Bool()
+		ctx := r.Context()
+
+		// Block until the simulated load has finished or the client gives up.
+		select {
+		case <-ready:
+		case <-ctx.Done():
+			return
+		}
+
+		tokens := *flagTokens
+		if tokens < 0 {
+			// Treat a negative -tokens as zero so the slice allocation in
+			// loremText cannot panic.
+			tokens = 0
+		}
+		tps := *flagTPS
+		if tps <= 0 {
+			tps = 1
+		}
+
+		if !streaming {
+			// Emulate generation time for the whole reply in one wait.
+			delay := time.Duration(float64(tokens) / tps * float64(time.Second))
+			select {
+			case <-time.After(delay):
+			case <-ctx.Done():
+				return
+			}
+			text := loremText(tokens)
+			resp := chatCompletion{
+				ID:      "chatcmpl-fake",
+				Object:  "chat.completion",
+				Created: time.Now().Unix(),
+				Model:   "fake-model",
+				Choices: []completionChoice{
+					{
+						Index:        0,
+						Message:      completionMessage{Role: "assistant", Content: text},
+						FinishReason: "stop",
+					},
+				},
+				Usage: completionUsage{
+					PromptTokens:     0,
+					CompletionTokens: tokens,
+					TotalTokens:      tokens,
+				},
+			}
+			w.Header().Set("Content-Type", "application/json")
+			if err := json.NewEncoder(w).Encode(resp); err != nil {
+				log.Printf("failed to write completion response: %v", err)
+			}
+			return
+		}
+
+		// Verify streaming support before setting any SSE headers so the error
+		// response is not decorated with them.
+		flusher, ok := w.(http.Flusher)
+		if !ok {
+			http.Error(w, "streaming not supported", http.StatusInternalServerError)
+			return
+		}
+
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.Header().Set("Cache-Control", "no-cache")
+		w.Header().Set("Connection", "keep-alive")
+
+		// Send role delta first
+		first := chatChunk{
+			ID:      "chatcmpl-fake",
+			Object:  "chat.completion.chunk",
+			Created: time.Now().Unix(),
+			Model:   "fake-model",
+			Choices: []chunkChoice{
+				{Index: 0, Delta: chunkDelta{Role: "assistant"}},
+			},
+		}
+		if data, err := json.Marshal(first); err == nil {
+			fmt.Fprintf(w, "data: %s\n\n", data)
+			flusher.Flush()
+		}
+
+		interval := time.Duration(float64(time.Second) / tps)
+		if interval <= 0 {
+			// A very large -tps truncates to zero, and time.NewTicker panics
+			// on a non-positive interval.
+			interval = time.Nanosecond
+		}
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		stop := "stop"
+		for i := 0; i < tokens; i++ {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+
+			word := loremWords[i%len(loremWords)]
+			if i < tokens-1 {
+				if err := sendChunk(w, word+" ", nil); err != nil {
+					return
+				}
+			} else {
+				// The last chunk carries the finish reason and no trailing space.
+				if err := sendChunk(w, word, &stop); err != nil {
+					return
+				}
+			}
+			flusher.Flush()
+		}
+
+		fmt.Fprintf(w, "data: [DONE]\n\n")
+		flusher.Flush()
+	}
+}
+
+// main parses the flags, starts the simulated load, serves /health and
+// /v1/chat/completions, and shuts the server down gracefully (5s limit) on
+// SIGINT or SIGTERM.
+func main() {
+	flag.Parse()
+
+	loadDur, err := time.ParseDuration(*flagLoad)
+	if err != nil {
+		log.Fatalf("invalid -load value %q: %v", *flagLoad, err)
+	}
+
+	modelReady := startLoading(loadDur)
+
+	routes := http.NewServeMux()
+	routes.HandleFunc("/health", healthHandler(modelReady))
+	routes.HandleFunc("/v1/chat/completions", chatHandler(modelReady))
+
+	server := &http.Server{Addr: *flagListen, Handler: routes}
+
+	// Serve in the background so the main goroutine can wait for a signal.
+	go func() {
+		log.Printf("listening on %s (tokens=%d tps=%.1f load=%s)",
+			*flagListen, *flagTokens, *flagTPS, loadDur)
+		serveErr := server.ListenAndServe()
+		if serveErr == nil || serveErr == http.ErrServerClosed {
+			return
+		}
+		log.Fatalf("server error: %v", serveErr)
+	}()
+
+	signals := make(chan os.Signal, 1)
+	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
+	<-signals
+
+	log.Println("shutting down...")
+	shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if shutdownErr := server.Shutdown(shutdownCtx); shutdownErr != nil {
+		log.Printf("shutdown error: %v", shutdownErr)
+	}
+}
@@ -0,0 +1,159 @@
+package main
+
+// created for issue: #252 https://github.com/mostlygeek/llama-swap/issues/252
+// this simple benchmark tool sends a lot of small chat completion requests to llama-swap
+// to make sure all the requests are accounted for.
+//
+// requests can be sent in parallel, and the tool will report the results.
+// usage: go run main.go -baseurl http://localhost:8080/v1 -model llama3 -requests 1000 -par 5
+
+import (
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"sync"
+	"time"
+)
+
+// main sends -requests chat-completion requests to <baseurl>/chat/completions
+// using -par concurrent workers, prints progress once per second, and finally
+// prints how many responses were received per HTTP status code. Requests that
+// fail before a response is received are counted under the pseudo-code -1.
+func main() {
+	// ----- CLI arguments ----------------------------------------------------
+	var (
+		baseurl         string
+		modelName       string
+		totalRequests   int
+		parallelization int
+	)
+
+	flag.StringVar(&baseurl, "baseurl", "http://localhost:8080/v1", "Base URL of the API (e.g., https://api.example.com)")
+	flag.StringVar(&modelName, "model", "", "Model name to use")
+	flag.IntVar(&totalRequests, "requests", 1, "Total number of requests to send")
+	flag.IntVar(&parallelization, "par", 1, "Maximum number of concurrent requests")
+	flag.Parse()
+
+	if baseurl == "" || modelName == "" {
+		fmt.Println("Error: both -baseurl and -model are required.")
+		flag.Usage()
+		os.Exit(1)
+	}
+	if totalRequests <= 0 {
+		fmt.Println("Error: -requests must be greater than 0.")
+		os.Exit(1)
+	}
+	if parallelization <= 0 {
+		// The flag is registered as -par; name it correctly in the message.
+		fmt.Println("Error: -par must be greater than 0.")
+		os.Exit(1)
+	}
+
+	// ----- Request payload ---------------------------------------------------
+	// Encode with encoding/json so a model name containing quotes or
+	// backslashes still yields valid JSON. The payload and URL are the same for
+	// every request, so they are built once here.
+	type chatMessage struct {
+		Role    string `json:"role"`
+		Content string `json:"content"`
+	}
+	payload, err := json.Marshal(struct {
+		Model     string        `json:"model"`
+		MaxTokens int           `json:"max_tokens"`
+		Stream    bool          `json:"stream"`
+		Messages  []chatMessage `json:"messages"`
+	}{
+		Model:     modelName,
+		MaxTokens: 100,
+		Stream:    false,
+		Messages:  []chatMessage{{Role: "user", Content: "write a snake game in python"}},
+	})
+	if err != nil {
+		log.Fatalf("failed to encode request payload: %v", err)
+	}
+	endpoint := fmt.Sprintf("%s/chat/completions", baseurl)
+
+	// ----- HTTP client -------------------------------------------------------
+	client := &http.Client{
+		Timeout: 30 * time.Second,
+	}
+
+	// ----- Tracking response codes -------------------------------------------
+	statusCounts := make(map[int]int) // map[statusCode]count
+	var mu sync.Mutex                 // protects statusCounts
+
+	// ----- Request queue (buffered channel) ----------------------------------
+	requests := make(chan int, 10) // Buffered channel with capacity 10
+
+	// Goroutine to fill the request queue
+	go func() {
+		for i := 0; i < totalRequests; i++ {
+			requests <- i + 1
+		}
+		close(requests)
+	}()
+
+	// ----- Worker pool -------------------------------------------------------
+	var wg sync.WaitGroup
+	for i := 0; i < parallelization; i++ {
+		wg.Add(1)
+		go func(workerID int) {
+			defer wg.Done()
+
+			for reqID := range requests {
+				// Send POST request; each request gets its own reader over the
+				// shared, read-only payload bytes.
+				req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(payload))
+				if err != nil {
+					log.Printf("[worker %d][req %d] request creation error: %v", workerID, reqID, err)
+					mu.Lock()
+					statusCounts[-1]++
+					mu.Unlock()
+					continue
+				}
+				req.Header.Set("Content-Type", "application/json")
+
+				resp, err := client.Do(req)
+				if err != nil {
+					log.Printf("[worker %d][req %d] HTTP request error: %v", workerID, reqID, err)
+					mu.Lock()
+					statusCounts[-1]++
+					mu.Unlock()
+					continue
+				}
+				// Drain and close the body so the connection can be reused.
+				io.Copy(io.Discard, resp.Body)
+				resp.Body.Close()
+
+				// Record status code
+				mu.Lock()
+				statusCounts[resp.StatusCode]++
+				mu.Unlock()
+			}
+		}(i + 1)
+	}
+
+	// ----- Status ticker (prints every second) -------------------------------
+	done := make(chan struct{})
+	tickerDone := make(chan struct{})
+	go func() {
+		ticker := time.NewTicker(1 * time.Second)
+		defer ticker.Stop()
+		startTime := time.Now()
+		for {
+			select {
+			case <-ticker.C:
+				mu.Lock()
+				// Compute how many requests have completed so far
+				completed := 0
+				for _, cnt := range statusCounts {
+					completed += cnt
+				}
+				// Calculate duration and progress
+				duration := time.Since(startTime)
+				progress := completed * 100 / totalRequests
+				fmt.Printf("Duration: %v, Completed: %d%% requests\n", duration, progress)
+				mu.Unlock()
+			case <-done:
+				duration := time.Since(startTime)
+				fmt.Printf("Duration: %v, Completed: %d%% requests\n", duration, 100)
+				close(tickerDone)
+				return
+			}
+		}
+	}()
+
+	// Wait for all workers to finish
+	wg.Wait()
+	close(done)  // stops the status-update goroutine
+	<-tickerDone // give ticker time to finish / print
+
+	// ----- Summary ------------------------------------------------------------
+	fmt.Println("\n\n=== HTTP response code summary ===")
+	mu.Lock()
+	for code, cnt := range statusCounts {
+		if code == -1 {
+			fmt.Printf("Client-side errors (no HTTP response): %d\n", cnt)
+		} else {
+			fmt.Printf("%d : %d\n", code, cnt)
+		}
+	}
+	mu.Unlock()
+}
@@ -0,0 +1,91 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"os/signal"
+	"syscall"
+	"time"
+)
+
+/*
+Test how exec.CommandContext behaves under certain conditions:
+
+  - process is killed externally, what happens with cmd.Wait()
+    ✔︎ it returns. catches crashes.
+
+  - process ignores SIGTERM
+    ✔︎ `kill()` is called after cmd.WaitDelay
+
+  - this process exits, what happens with children (kill -9 <this process' pid>)
+    x they stick around. have to be manually killed.
+
+  - .WithTimeout()'s cancel is called
+    ✔︎ process is killed after it ignores sigterm, cmd.Wait() catches it.
+
+  - parent receives SIGINT/SIGTERM, what happens
+    ✔︎ waits for child process to exit, then exits gracefully.
+*/
+func main() {
+
+	// swap between these to use kill -9 <pid> on the cli to sim external crash
+	ctx, cancel := context.WithCancel(context.Background())
+	//ctx, cancel := context.WithTimeout(context.Background(), 1000*time.Millisecond)
+	defer cancel()
+
+	//cmd := exec.CommandContext(ctx, "sleep", "1")
+	cmd := exec.CommandContext(ctx,
+		"../../build/simple-responder_darwin_arm64",
+		//"-ignore-sig-term", /* so it doesn't exit on receiving SIGTERM, test cmd.WaitDelay */
+	)
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	// set a wait delay before sending SIGKILL
+	cmd.WaitDelay = 500 * time.Millisecond
+	cmd.Cancel = func() error {
+		fmt.Println("✔︎ Cancel() called, sending SIGTERM")
+		// Report a failed delivery instead of dropping it silently; the
+		// experiment's outcome is hard to interpret if SIGTERM never arrived.
+		if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
+			fmt.Println("x failed to send SIGTERM:", err)
+		}
+
+		//return nil
+
+		// this error is returned by cmd.Wait(), and can be used to
+		// signal an error when the process couldn't be normally terminated
+		// but since a SIGTERM is sent, it's probably ok to return a nil
+		// as WaitDelay timing out will override any error set here.
+		//
+		// test by enabling/disabling -ignore-sig-term on the process
+		// with -ignore-sig-term enabled, cmd.Wait() will have "signal: killed"
+		// without it, it will show the error returned below
+		return errors.New("error from cmd.Cancel()") // sets error returned by cmd.Wait()
+	}
+
+	if err := cmd.Start(); err != nil {
+		fmt.Println("Error starting process:", err)
+		return
+	}
+
+	// catch signals. Calls cancel() which will cause cmd.Wait() to return and
+	// this program to eventually exit gracefully.
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	go func() {
+		// named sig (not signal) so the os/signal package is not shadowed
+		sig := <-sigChan
+		fmt.Printf("✔︎ Received signal: %d, Killing process... with cancel before exiting\n", sig)
+		cancel()
+	}()
+
+	fmt.Printf("✔︎ Parent Pid: %d, Process Pid: %d\n", os.Getpid(), cmd.Process.Pid)
+	fmt.Println("✔︎ Process started, cmd.Wait() ... ")
+	if err := cmd.Wait(); err != nil {
+		fmt.Println("✔︎ cmd.Wait returned, Error:", err)
+	} else {
+		fmt.Println("✔︎ cmd.Wait returned, Process exited on its own")
+	}
+	fmt.Println("✔︎ Child process exited, Done.")
+}
@@ -0,0 +1,4 @@
+The rerank-test.json data is from https://github.com/ggerganov/llama.cpp/pull/9510
+
+To run it:
+> curl http://127.0.0.1:8080/v1/rerank -H "Content-Type: application/json" -d @reranker-test.json  -v | jq .
@@ -0,0 +1,17 @@
+{
+  "model": "bge-reranker",
+  "query": "Organic skincare products for sensitive skin",
+  "top_n": 3,
+  "documents": [
+    "Organic skincare for sensitive skin with aloe vera and chamomile: Imagine the soothing embrace of nature with our organic skincare range, crafted specifically for sensitive skin. Infused with the calming properties of aloe vera and chamomile, each product provides gentle nourishment and protection. Say goodbye to irritation and hello to a glowing, healthy complexion.",
+    "New makeup trends focus on bold colors and innovative techniques: Step into the world of cutting-edge beauty with this seasons makeup trends. Bold, vibrant colors and groundbreaking techniques are redefining the art of makeup. From neon eyeliners to holographic highlighters, unleash your creativity and make a statement with every look.",
+    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille: Erleben Sie die wohltuende Wirkung unserer Bio-Hautpflege, speziell für empfindliche Haut entwickelt. Mit den beruhigenden Eigenschaften von Aloe Vera und Kamille pflegen und schützen unsere Produkte Ihre Haut auf natürliche Weise. Verabschieden Sie sich von Hautirritationen und genießen Sie einen strahlenden Teint.",
+    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken: Tauchen Sie ein in die Welt der modernen Schönheit mit den neuesten Make-up-Trends. Kräftige, lebendige Farben und innovative Techniken setzen neue Maßstäbe. Von auffälligen Eyelinern bis hin zu holografischen Highlightern – lassen Sie Ihrer Kreativität freien Lauf und setzen Sie jedes Mal ein Statement.",
+    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla: Descubre el poder de la naturaleza con nuestra línea de cuidado de la piel orgánico, diseñada especialmente para pieles sensibles. Enriquecidos con aloe vera y manzanilla, estos productos ofrecen una hidratación y protección suave. Despídete de las irritaciones y saluda a una piel radiante y saludable.",
+    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras: Entra en el fascinante mundo del maquillaje con las tendencias más actuales. Colores vivos y técnicas innovadoras están revolucionando el arte del maquillaje. Desde delineadores neón hasta iluminadores holográficos, desata tu creatividad y destaca en cada look.",
+    "针对敏感肌专门设计的天然有机护肤产品：体验由芦荟和洋甘菊提取物带来的自然呵护。我们的护肤产品特别为敏感肌设计，温和滋润，保护您的肌肤不受刺激。让您的肌肤告别不适，迎来健康光彩。",
+    "新的化妆趋势注重鲜艳的颜色和创新的技巧：进入化妆艺术的新纪元，本季的化妆趋势以大胆的颜色和创新的技巧为主。无论是霓虹眼线还是全息高光，每一款妆容都能让您脱颖而出，展现独特魅力。",
+    "敏感肌のために特別に設計された天然有機スキンケア製品: アロエベラとカモミールのやさしい力で、自然の抱擁を感じてください。敏感肌用に特別に設計された私たちのスキンケア製品は、肌に優しく栄養を与え、保護します。肌トラブルにさようなら、輝く健康な肌にこんにちは。",
+    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています: 今シーズンのメイクアップトレンドは、大胆な色彩と革新的な技術に注目しています。ネオンアイライナーからホログラフィックハイライターまで、クリエイティビティを解き放ち、毎回ユニークなルックを演出しましょう。"
+  ]
+}
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/perf"
+)
+
+// printSysStat writes one system snapshot to stdout: a timestamped header,
+// then per-core CPU utilisation, memory, swap and load-average lines.
+func printSysStat(s perf.SysStat) {
+	// Format every core's utilisation as a percentage with one decimal.
+	perCore := make([]string, 0, len(s.CpuUtilPerCore))
+	for _, util := range s.CpuUtilPerCore {
+		perCore = append(perCore, fmt.Sprintf("%.1f%%", util))
+	}
+	stamp := s.Timestamp.Format("15:04:05")
+	cpuLine := strings.Join(perCore, "  ")
+
+	fmt.Printf("[SYS %s]\n", stamp)
+	fmt.Printf("  CPU:  %s\n", cpuLine)
+	fmt.Printf("  Mem:  %d MB used / %d MB total (%d MB free)\n", s.MemUsedMB, s.MemTotalMB, s.MemFreeMB)
+	fmt.Printf("  Swap: %d MB used / %d MB total\n", s.SwapUsedMB, s.SwapTotalMB)
+	fmt.Printf("  Load: %.2f  %.2f  %.2f  (1m 5m 15m)\n", s.LoadAvg1, s.LoadAvg5, s.LoadAvg15)
+}
+
+// printGpuStats writes a four-line report to stdout for every GPU in gpus:
+// identity, utilisation, memory, and temperature/fan/power.
+func printGpuStats(gpus []perf.GpuStat) {
+	for i := range gpus {
+		gpu := gpus[i]
+		fmt.Printf("[GPU %d %s]\n", gpu.ID, gpu.Name)
+		fmt.Printf("  Util:  GPU %.1f%%  Mem %.1f%%\n", gpu.GpuUtilPct, gpu.MemUtilPct)
+		fmt.Printf("  Mem:   %d MB used / %d MB total\n", gpu.MemUsedMB, gpu.MemTotalMB)
+		fmt.Printf("  Temp:  %d°C   Fan: %.1f%%   Power: %.1f W\n", gpu.TempC, gpu.FanSpeedPct, gpu.PowerDrawW)
+	}
+}
+
+// main prints one system snapshot and (when a GPU channel is available) one
+// GPU snapshot, then, with -stream, keeps printing snapshots published by a
+// perf monitor until the process is killed.
+//
+// Flags:
+//
+//	-stream  keep streaming stats after the initial snapshot
+//	-t       polling interval, clamped to the range 1s–1h
+func main() {
+	stream := flag.Bool("stream", false, "stream stats")
+	interval := flag.Duration("t", time.Second, "polling interval (clamped to 1s–1h)")
+	flag.Parse()
+
+	// Clamp the interval to the range advertised in the flag help.
+	every := *interval
+	if every < time.Second {
+		every = time.Second
+	} else if every > time.Hour {
+		every = time.Hour
+	}
+
+	l := logmon.New()
+	l.SetLogLevel(logmon.LevelDebug)
+
+	// ErrNotImplemented is tolerated: the (zero-valued) snapshot is still printed.
+	// errors.Is is used so a wrapped sentinel is treated the same way as in the
+	// GPU check below.
+	s, err := perf.ReadSysStats()
+	if err != nil && !errors.Is(err, perf.ErrNotImplemented) {
+		fmt.Println("Sys Error:", err)
+		return
+	}
+	printSysStat(s)
+
+	// Bound the wait for the first GPU sample to five seconds.
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	gpuCh, err := perf.GetGpuStats(ctx, every, l)
+	if err != nil && !errors.Is(err, perf.ErrNotImplemented) && !errors.Is(err, perf.ErrNoGpuTool) {
+		fmt.Println("GPU Init Error:", err)
+		return
+	}
+
+	if gpuCh != nil {
+		select {
+		case g := <-gpuCh:
+			printGpuStats(g)
+		case <-ctx.Done():
+			fmt.Println("GPU: timed out waiting for stats")
+		}
+	}
+
+	if *stream {
+		// Previously the second return value was discarded; surface it so a
+		// failed init is reported instead of (presumably) dereferencing a nil
+		// monitor on m.Start().
+		m, err := perf.New(config.PerformanceConfig{Every: every}, l)
+		if err != nil {
+			fmt.Println("Perf Init Error:", err)
+			return
+		}
+		m.Start()
+		defer m.Stop()
+		sysCh, gpuCh, unsub := m.Subscribe()
+		defer unsub()
+		// This loop has no exit condition; it runs until the process is killed.
+		for {
+			select {
+			case s := <-sysCh:
+				printSysStat(s)
+			case g := <-gpuCh:
+				printGpuStats(g)
+			}
+		}
+	}
+}
@@ -0,0 +1,374 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/gin-gonic/gin"
+	"github.com/tidwall/gjson"
+)
+
+// main runs a small Gin HTTP server on 127.0.0.1:<port> that imitates a few
+// inference-server endpoints with canned responses, then blocks until it
+// receives a shutdown signal.
+//
+// Flags:
+//
+//	-port             port to listen on (default "8080")
+//	-model            model name /v1/audio/speech expects in the request body
+//	-respond          message echoed by several endpoints
+//	-silent           disable all logging
+//	-ignore-sig-term  log SIGTERM instead of exiting on it
+func main() {
+	gin.SetMode(gin.TestMode)
+	// Define a command-line flag for the port
+	port := flag.String("port", "8080", "port to listen on")
+	expectedModel := flag.String("model", "TheExpectedModel", "model name to expect")
+
+	// Define a command-line flag for the response message
+	responseMessage := flag.String("respond", "hi", "message to respond with")
+
+	silent := flag.Bool("silent", false, "disable all logging")
+
+	ignoreSigTerm := flag.Bool("ignore-sig-term", false, "ignore SIGTERM signal")
+
+	flag.Parse() // Parse the command-line flags
+
+	// Create a new Gin router
+	r := gin.New()
+
+	// Set up the handler function using the provided response message
+	r.POST("/v1/chat/completions", func(c *gin.Context) {
+		// A read error is ignored here; the body is only echoed back in the
+		// non-streaming response.
+		bodyBytes, _ := io.ReadAll(c.Request.Body)
+
+		// Check if streaming is requested
+		// Query is checked instead of JSON body since that event stream conflicts with other tests
+		isStreaming := c.Query("stream") == "true"
+
+		if isStreaming {
+			// Set headers for streaming
+			c.Header("Content-Type", "text/event-stream")
+			c.Header("Cache-Control", "no-cache")
+			c.Header("Connection", "keep-alive")
+			c.Header("Transfer-Encoding", "chunked")
+
+			// add a wait to simulate a slow query
+			// (a missing or unparsable "wait" value means no delay)
+			if wait, err := time.ParseDuration(c.Query("wait")); err == nil {
+				time.Sleep(wait)
+			}
+
+			// Send 10 "asdf" tokens
+			for i := 0; i < 10; i++ {
+				data := gin.H{
+					"created": time.Now().Unix(),
+					"choices": []gin.H{
+						{
+							"index": 0,
+							"delta": gin.H{
+								"content": "asdf",
+							},
+							"finish_reason": nil,
+						},
+					},
+				}
+				c.SSEvent("message", data)
+				c.Writer.Flush()
+			}
+
+			// Send final data with usage info
+			finalData := gin.H{
+				"usage": gin.H{
+					"completion_tokens": 10,
+					"prompt_tokens":     25,
+					"total_tokens":      35,
+				},
+				// add timings to simulate llama.cpp
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
+			}
+			c.SSEvent("message", finalData)
+			c.Writer.Flush()
+
+			// Send [DONE]
+			c.SSEvent("message", "[DONE]")
+			c.Writer.Flush()
+		} else {
+			c.Header("Content-Type", "application/json")
+
+			// add a wait to simulate a slow query
+			if wait, err := time.ParseDuration(c.Query("wait")); err == nil {
+				time.Sleep(wait)
+			}
+
+			// Echo the request body and Content-Length header so callers can
+			// inspect what the server received.
+			c.JSON(http.StatusOK, gin.H{
+				"responseMessage":  *responseMessage,
+				"h_content_length": c.Request.Header.Get("Content-Length"),
+				"request_body":     string(bodyBytes),
+				"usage": gin.H{
+					"completion_tokens": 10,
+					"prompt_tokens":     25,
+					"total_tokens":      35,
+				},
+				"timings": gin.H{
+					"prompt_n":             25,
+					"prompt_ms":            13,
+					"predicted_n":          10,
+					"predicted_ms":         17,
+					"predicted_per_second": 10,
+				},
+			})
+		}
+	})
+
+	// for issue #62 to check model name strips profile slug
+	// has to be one of the openAI API endpoints that llama-swap proxies
+	// curl http://localhost:8080/v1/audio/speech -d '{"model":"profile:TheExpectedModel"}'
+	r.POST("/v1/audio/speech", func(c *gin.Context) {
+		body, err := io.ReadAll(c.Request.Body)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+			return
+		}
+		defer c.Request.Body.Close()
+		// Respond 400 unless the body's "model" field equals the -model flag.
+		modelName := gjson.GetBytes(body, "model").String()
+		if modelName != *expectedModel {
+			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Invalid model: %s, expected: %s", modelName, *expectedModel)})
+			return
+		} else {
+			c.JSON(http.StatusOK, gin.H{"message": "ok"})
+		}
+	})
+
+	// OpenAI-style completions endpoint: fixed message and usage counts.
+	r.POST("/v1/completions", func(c *gin.Context) {
+		c.Header("Content-Type", "application/json")
+		c.JSON(http.StatusOK, gin.H{
+			"responseMessage": *responseMessage,
+			"usage": gin.H{
+				"completion_tokens": 10,
+				"prompt_tokens":     25,
+				"total_tokens":      35,
+			},
+		})
+
+	})
+
+	// llama-server compatibility: /completion
+	r.POST("/completion", func(c *gin.Context) {
+		c.Header("Content-Type", "application/json")
+		c.JSON(http.StatusOK, gin.H{
+			"responseMessage": *responseMessage,
+			"usage": gin.H{
+				"completion_tokens": 10,
+				"prompt_tokens":     25,
+				"total_tokens":      35,
+			},
+		})
+	})
+
+	// issue #41
+	r.POST("/v1/audio/transcriptions", func(c *gin.Context) {
+		// Parse the multipart form
+		if err := c.Request.ParseMultipartForm(10 << 20); err != nil { // 10 MB max memory
+			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Error parsing multipart form: %s", err)})
+			return
+		}
+
+		// Get the model from the form values
+		model := c.Request.FormValue("model")
+
+		if model == "" {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "Missing model parameter"})
+			return
+		}
+
+		// Get the file from the form
+		file, _, err := c.Request.FormFile("file")
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Error getting file: %s", err)})
+			return
+		}
+		defer file.Close()
+
+		// Read the file content to get its size
+		fileBytes, err := io.ReadAll(file)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("Error reading file: %s", err)})
+			return
+		}
+
+		fileSize := len(fileBytes)
+
+		// Return a JSON response with the model and transcription text including file size
+		c.JSON(http.StatusOK, gin.H{
+			"text":  fmt.Sprintf("The length of the file is %d bytes", fileSize),
+			"model": model,
+
+			// expose some header values for testing
+			"h_content_type":   c.GetHeader("Content-Type"),
+			"h_content_length": c.GetHeader("Content-Length"),
+		})
+	})
+
+	// Returns a fixed voice list and echoes the "model" query parameter.
+	r.GET("/v1/audio/voices", func(c *gin.Context) {
+		model := c.Query("model")
+		c.JSON(http.StatusOK, gin.H{"voices": []string{"voice1"}, "model": model})
+	})
+
+	// Writes the "echo" query value one character at a time, flushing and
+	// pausing for "delay" (default 100ms) after each character.
+	r.GET("/slow-respond", func(c *gin.Context) {
+		echo := c.Query("echo")
+		delay := c.Query("delay")
+
+		if echo == "" {
+			echo = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+		}
+
+		// Parse the duration
+		if delay == "" {
+			delay = "100ms"
+		}
+
+		t, err := time.ParseDuration(delay)
+		if err != nil {
+			c.Header("Content-Type", "text/plain")
+			c.String(http.StatusBadRequest, fmt.Sprintf("Invalid duration: %s", err))
+			return
+		}
+
+		c.Header("Content-Type", "text/plain")
+		for _, char := range echo {
+			c.Writer.Write([]byte(string(char)))
+			c.Writer.Flush()
+
+			// wait
+			<-time.After(t)
+		}
+	})
+
+	// Plain-text endpoint returning the -respond message.
+	r.GET("/test", func(c *gin.Context) {
+		c.Header("Content-Type", "text/plain")
+		c.String(200, *responseMessage)
+	})
+
+	// Returns the -respond message followed by every environment variable.
+	// Nothing is written between entries, so they are concatenated without a
+	// separator in the response body.
+	r.GET("/env", func(c *gin.Context) {
+		c.Header("Content-Type", "text/plain")
+		c.String(200, *responseMessage)
+
+		// Get environment variables
+		envVars := os.Environ()
+
+		// Write each environment variable to the response
+		for _, envVar := range envVars {
+			c.String(200, envVar)
+		}
+	})
+
+	// Set up the /health endpoint handler function
+	r.GET("/health", func(c *gin.Context) {
+		c.Header("Content-Type", "application/json")
+		c.JSON(200, gin.H{"status": "ok"})
+	})
+
+	// Root endpoint echoes the request method and path as plain text.
+	r.GET("/", func(c *gin.Context) {
+		c.Header("Content-Type", "text/plain")
+		c.String(200, fmt.Sprintf("%s %s", c.Request.Method, c.Request.URL.Path))
+	})
+
+	// SD API endpoints
+	// Both image endpoints echo the body's "model" field with an empty image list.
+	r.POST("/sdapi/v1/txt2img", func(c *gin.Context) {
+		body, err := io.ReadAll(c.Request.Body)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+			return
+		}
+		defer c.Request.Body.Close()
+
+		modelName := gjson.GetBytes(body, "model").String()
+		c.JSON(http.StatusOK, gin.H{
+			"model":  modelName,
+			"images": []string{},
+		})
+	})
+
+	r.POST("/sdapi/v1/img2img", func(c *gin.Context) {
+		body, err := io.ReadAll(c.Request.Body)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+			return
+		}
+		defer c.Request.Body.Close()
+
+		modelName := gjson.GetBytes(body, "model").String()
+		c.JSON(http.StatusOK, gin.H{
+			"model":  modelName,
+			"images": []string{},
+		})
+	})
+
+	r.GET("/sdapi/v1/loras", func(c *gin.Context) {
+		c.JSON(http.StatusOK, gin.H{
+			"loras": []string{},
+		})
+	})
+
+	address := "127.0.0.1:" + *port // Address with the specified port
+
+	srv := &http.Server{
+		Addr:    address,
+		Handler: r.Handler(),
+	}
+
+	// Disable logging if the --silent flag is set
+	if *silent {
+		gin.SetMode(gin.ReleaseMode)
+		gin.DefaultWriter = io.Discard
+		log.SetOutput(io.Discard)
+	}
+
+	if !*silent {
+		fmt.Printf("My PID: %d\n", os.Getpid())
+	}
+
+	// Serve in the background so this goroutine can wait for signals below.
+	go func() {
+		log.Printf("simple-responder listening on %s\n", address)
+		// service connections
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.Fatalf("simple-responder err: %s\n", err)
+		}
+	}()
+
+	// Block until a shutdown signal arrives.
+	// NOTE(review): srv.Shutdown is never called in this function, so leaving
+	// the loop below simply returns from main; there is no graceful drain or
+	// shutdown timeout here.
+	sigChan := make(chan os.Signal, 1)
+	// kill (no param) sends syscall.SIGTERM by default
+	// kill -2 is syscall.SIGINT
+	// kill -9 is syscall.SIGKILL, which cannot be caught, so it is not registered
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	countSigInt := 0
+
+	// Exit rules: the second SIGINT exits; SIGTERM exits unless
+	// -ignore-sig-term is set; any other signal exits.
+runloop:
+	for {
+		signal := <-sigChan
+		switch signal {
+		case syscall.SIGINT:
+			countSigInt++
+			if countSigInt > 1 {
+				break runloop
+			} else {
+				log.Println("Received SIGINT, send another SIGINT to shutdown")
+			}
+		case syscall.SIGTERM:
+			if *ignoreSigTerm {
+				log.Println("Ignoring SIGTERM")
+			} else {
+				log.Println("Received SIGTERM, shutting down")
+				break runloop
+			}
+		default:
+			break runloop
+		}
+	}
+
+	log.Println("simple-responder shutting down")
+}
@@ -0,0 +1,96 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"sync"
+	"time"
+
+	tea "github.com/charmbracelet/bubbletea"
+)
+
+// main parses the flags and positional arguments (<base-url> followed by one
+// or more model names), starts one streaming request per model in the order
+// given, shows the output in a Bubble Tea UI, and prints a summary once the
+// UI has exited and every request goroutine has finished.
+func main() {
+	prompt := flag.String("prompt", "Write a few sentences about the history of computing.", "user message sent to each model")
+	maxTokens := flag.Int("max-tokens", 256, "max_tokens per request")
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: %s [flags] <base-url> <model> [model...]\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, "Example: %s -max-tokens 400 http://localhost:8080 A B C D\n\n", os.Args[0])
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	positional := flag.Args()
+	if len(positional) < 2 {
+		flag.Usage()
+		os.Exit(1)
+	}
+	baseURL, models := positional[0], positional[1:]
+
+	m := newModel(models)
+	prog := tea.NewProgram(m, tea.WithAltScreen(), tea.WithMouseCellMotion())
+
+	// One buffered trigger per model: worker i waits on its own trigger and
+	// fires the next one before sending, so requests start in argument order.
+	triggers := make([]chan struct{}, len(models))
+	for i := range triggers {
+		triggers[i] = make(chan struct{}, 1)
+	}
+	triggers[0] <- struct{}{}
+
+	var wg sync.WaitGroup
+	start := time.Now()
+
+	// forward relays every streamed fragment to the UI.
+	forward := func(i int, text string) {
+		prog.Send(deltaMsg{idx: i, text: text})
+	}
+
+	// runOne performs the request for a single panel and reports its outcome.
+	runOne := func(idx int, mdl string) {
+		defer wg.Done()
+
+		<-triggers[idx]
+
+		reqStart := time.Now()
+		prog.Send(statusMsg{idx: idx, status: statusStreaming})
+
+		if next := idx + 1; next < len(triggers) {
+			triggers[next] <- struct{}{}
+		}
+
+		err := sendRequest(baseURL, mdl, *prompt, *maxTokens, idx, forward)
+
+		outcome := statusMsg{idx: idx, status: statusDone, elapsed: time.Since(reqStart)}
+		if err != nil {
+			outcome.status = statusError
+			outcome.err = err
+		}
+		prog.Send(outcome)
+	}
+
+	for i, name := range models {
+		wg.Add(1)
+		go runOne(i, name)
+	}
+
+	if _, err := prog.Run(); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+
+	wg.Wait()
+	printSummary(m, start)
+}
+
+// printSummary prints one line per panel (index, model, final status and,
+// for finished requests, the elapsed time) followed by the total wall-clock
+// time since start.
+func printSummary(m *model, start time.Time) {
+	fmt.Println("Summary:")
+	for _, p := range m.panels {
+		took := p.elapsed.Round(time.Millisecond)
+		if p.status == statusError {
+			fmt.Printf("  [%d] %-20s ERROR   elapsed=%s err=%v\n",
+				p.idx, p.model, took, p.err)
+		} else if p.status == statusDone {
+			fmt.Printf("  [%d] %-20s done    elapsed=%s\n",
+				p.idx, p.model, took)
+		} else {
+			// Still waiting or streaming: print the status name only.
+			fmt.Printf("  [%d] %-20s %s\n", p.idx, p.model, p.status)
+		}
+	}
+	total := time.Since(start).Round(time.Millisecond)
+	fmt.Printf("all done in %s\n", total)
+}
@@ -0,0 +1,88 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// deltaSink receives streamed text fragments for a given model panel.
+type deltaSink func(idx int, text string)
+
+// streamDelta holds the text fields read from one choice's "delta" object.
+type streamDelta struct {
+	Content          string `json:"content"`           // assistant text fragment
+	ReasoningContent string `json:"reasoning_content"` // reasoning text fragment
+}
+
+// streamChoice is one entry of a chunk's "choices" array; only its delta is decoded.
+type streamChoice struct {
+	Delta streamDelta `json:"delta"`
+}
+
+// streamChunk is the subset of a streamed response event that sendRequest decodes.
+type streamChunk struct {
+	Choices []streamChoice `json:"choices"`
+}
+
+// sendRequest POSTs a streaming chat completion for model to
+// baseURL+"/v1/chat/completions" and passes every non-empty reasoning and
+// content fragment to sink, tagged with idx. Both kinds of text go to the
+// same sink so they render together.
+//
+// It returns an error when the payload cannot be encoded, the request fails,
+// the response status is not 200 (the body text is included), or reading the
+// stream fails.
+func sendRequest(baseURL, model, prompt string, maxTokens, idx int, sink deltaSink) error {
+	body, err := json.Marshal(map[string]any{
+		"model": model,
+		"messages": []map[string]string{
+			{"role": "user", "content": prompt},
+		},
+		"max_tokens": maxTokens,
+		"stream":     true,
+	})
+	if err != nil {
+		return err
+	}
+
+	resp, err := http.Post(baseURL+"/v1/chat/completions", "application/json", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		raw, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("status %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
+	}
+
+	// Allow event lines of up to 1 MiB.
+	events := bufio.NewScanner(resp.Body)
+	events.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	for events.Scan() {
+		line := events.Text()
+		if !strings.HasPrefix(line, "data:") {
+			continue
+		}
+		payload := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
+		if payload == "[DONE]" {
+			// End-of-stream marker.
+			break
+		}
+		if payload == "" {
+			continue
+		}
+
+		var chunk streamChunk
+		if json.Unmarshal([]byte(payload), &chunk) != nil {
+			// Events that do not decode are skipped.
+			continue
+		}
+		for _, choice := range chunk.Choices {
+			// Reasoning text is emitted before assistant content.
+			for _, text := range [2]string{choice.Delta.ReasoningContent, choice.Delta.Content} {
+				if text != "" {
+					sink(idx, text)
+				}
+			}
+		}
+	}
+
+	return events.Err()
+}
@@ -0,0 +1,343 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/charmbracelet/bubbles/viewport"
+	tea "github.com/charmbracelet/bubbletea"
+	"github.com/charmbracelet/lipgloss"
+)
+
+// panelStatus is the lifecycle state of a single panel.
+type panelStatus int
+
+const (
+	statusWaiting   panelStatus = iota // zero value; no status update received yet
+	statusStreaming                    // request in flight
+	statusDone                         // request finished without error
+	statusError                        // request finished with an error
+)
+
+// String returns the lowercase display name of s. Any value other than
+// streaming, done or error is reported as "waiting".
+func (s panelStatus) String() string {
+	if s == statusStreaming {
+		return "streaming"
+	}
+	if s == statusDone {
+		return "done"
+	}
+	if s == statusError {
+		return "error"
+	}
+	return "waiting"
+}
+
+// deltaMsg appends streamed text to a panel.
+type deltaMsg struct {
+	idx  int    // index into model.panels
+	text string // fragment appended to the panel's buffer
+}
+
+// statusMsg updates a panel's lifecycle state.
+type statusMsg struct {
+	idx     int           // index into model.panels
+	status  panelStatus   // new status for the panel
+	elapsed time.Duration // request duration; zero when not supplied
+	err     error         // non-nil when the request failed
+}
+
+// panel is the per-request UI state: which model it belongs to, the text
+// accumulated so far, and the request's status.
+type panel struct {
+	idx     int             // position in model.panels
+	model   string          // model name shown in the header
+	color   lipgloss.Color  // header color assigned to this model name
+	status  panelStatus     // current lifecycle state
+	buf     strings.Builder // streamed text (plus any rendered error text)
+	elapsed time.Duration   // duration copied from the latest statusMsg
+	err     error           // error copied from the latest statusMsg
+}
+
+const (
+	minPanelWidth = 28
+	maxCols       = 3
+	panelHeight   = 9 // total box height including border + header
+)
+
+// model is the Bubble Tea model for the panel grid. A single viewport is
+// shared and always shows the focused panel's text.
+type model struct {
+	panels  []*panel       // one panel per requested model, in argument order
+	focused int            // index of the panel bound to vp
+	vp      viewport.Model // scrollable view of the focused panel's text
+	width   int            // terminal width from the last WindowSizeMsg
+	height  int            // terminal height from the last WindowSizeMsg
+	cols    int            // grid columns computed by relayout
+	pw      int // inner panel content width
+	ready   bool           // set once relayout has run
+}
+
+// newModel builds the UI model with one waiting panel per entry in models.
+// Repeated model names share a color; colors are handed out from
+// modelPalette in order of first appearance and wrap around when exhausted.
+func newModel(models []string) *model {
+	assigned := make(map[string]lipgloss.Color)
+	panels := make([]*panel, 0, len(models))
+	for i, name := range models {
+		if _, seen := assigned[name]; !seen {
+			// len(assigned) is read before the insert, so the first name gets palette[0].
+			assigned[name] = modelPalette[len(assigned)%len(modelPalette)]
+		}
+		panels = append(panels, &panel{
+			idx:    i,
+			model:  name,
+			color:  assigned[name],
+			status: statusWaiting,
+		})
+	}
+	return &model{panels: panels, focused: 0}
+}
+
+// Init implements tea.Model; no initial command is needed.
+func (m *model) Init() tea.Cmd { return nil }
+
+// Update implements tea.Model. It handles window resizes, keyboard and mouse
+// input, and the deltaMsg/statusMsg messages sent by the request goroutines.
+// It always returns the receiver as the next model.
+func (m *model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
+	switch msg := msg.(type) {
+	case tea.WindowSizeMsg:
+		// Recompute the grid for the new size and re-bind the viewport.
+		m.width = msg.Width
+		m.height = msg.Height
+		m.relayout()
+		m.refreshViewport(true)
+		return m, nil
+
+	case tea.KeyMsg:
+		switch msg.String() {
+		case "q", "ctrl+c", "esc":
+			return m, tea.Quit
+		case "tab", "right", "l":
+			m.setFocus(m.focused + 1)
+			return m, nil
+		case "shift+tab", "left", "h":
+			m.setFocus(m.focused - 1)
+			return m, nil
+		}
+		// Any other key goes to the viewport (scrolling).
+		var cmd tea.Cmd
+		m.vp, cmd = m.vp.Update(msg)
+		return m, cmd
+
+	case tea.MouseMsg:
+		// A left-button press focuses the panel under the cursor; all other
+		// mouse events are forwarded to the viewport.
+		if msg.Action == tea.MouseActionPress && msg.Button == tea.MouseButtonLeft {
+			if idx, ok := m.panelAt(msg.X, msg.Y); ok {
+				m.setFocus(idx)
+			}
+			return m, nil
+		}
+		var cmd tea.Cmd
+		m.vp, cmd = m.vp.Update(msg)
+		return m, cmd
+
+	case deltaMsg:
+		// NOTE(review): msg.idx is used unchecked; assumes senders only use valid panel indices.
+		p := m.panels[msg.idx]
+		p.buf.WriteString(msg.text)
+		if msg.idx == m.focused {
+			// Keep following the tail only if the view was already at the bottom.
+			atBottom := m.vp.AtBottom()
+			m.refreshViewport(false)
+			if atBottom {
+				m.vp.GotoBottom()
+			}
+		}
+		return m, nil
+
+	case statusMsg:
+		p := m.panels[msg.idx]
+		p.status = msg.status
+		p.elapsed = msg.elapsed
+		p.err = msg.err
+		if msg.err != nil {
+			// Append the error text (styled) to the panel's buffer so it shows in the body.
+			errTxt := lipgloss.NewStyle().Foreground(lipgloss.Color("196")).Render("\n" + msg.err.Error())
+			p.buf.WriteString(errTxt)
+			if msg.idx == m.focused {
+				m.refreshViewport(false)
+				m.vp.GotoBottom()
+			}
+		}
+		return m, nil
+	}
+
+	return m, nil
+}
+
+// setFocus moves focus to panel idx, wrapping to the last panel when idx is
+// negative and to the first when idx is past the end. The viewport is
+// re-bound only when the focused panel actually changes.
+func (m *model) setFocus(idx int) {
+	count := len(m.panels)
+	if count == 0 {
+		return
+	}
+	switch {
+	case idx < 0:
+		idx = count - 1
+	case idx >= count:
+		idx = 0
+	}
+	if idx != m.focused {
+		m.focused = idx
+		m.refreshViewport(true)
+	}
+}
+
+// relayout recomputes the column count and inner panel width from the
+// current terminal width, then replaces the viewport with one sized for the
+// new layout and marks the model ready.
+func (m *model) relayout() {
+	cols := 1
+	if m.width >= minPanelWidth+4 {
+		cols = m.width / (minPanelWidth + 2)
+		if cols > maxCols {
+			cols = maxCols
+		}
+		if cols > len(m.panels) {
+			cols = len(m.panels)
+		}
+		if cols < 1 {
+			cols = 1
+		}
+	}
+	m.cols = cols
+
+	// inner content width: total width / cols, minus gap (1) and borders+padding (4).
+	inner := (m.width/cols - 1) - 4
+	if inner < 8 {
+		inner = 8
+	}
+	m.pw = inner
+
+	m.vp = viewport.New(inner, panelHeight-2)
+	m.ready = true
+}
+
+// refreshViewport loads the focused panel's text, wrapped to the panel
+// width, into the viewport. When reset is true the viewport also jumps to
+// the bottom. It does nothing before the first layout or with no panels.
+func (m *model) refreshViewport(reset bool) {
+	if len(m.panels) == 0 || !m.ready {
+		return
+	}
+	text := m.panels[m.focused].buf.String()
+	wrapped := lipgloss.NewStyle().Width(m.pw).Render(text)
+	m.vp.SetContent(wrapped)
+	if !reset {
+		return
+	}
+	m.vp.GotoBottom()
+}
+
+// panelAt maps screen coordinates to a panel index based on the grid layout.
+func (m *model) panelAt(x, y int) (int, bool) {
+	if m.cols == 0 {
+		return 0, false
+	}
+	boxOuterW := m.width/m.cols + 1
+	col := x / boxOuterW
+	row := y / panelHeight
+	idx := row*m.cols + col
+	if col < m.cols && idx >= 0 && idx < len(m.panels) {
+		return idx, true
+	}
+	return 0, false
+}
+
+// View implements tea.Model. It renders the panels as a grid with m.cols
+// panels per row, followed by a faint footer listing the key bindings.
+// Before the first layout it returns a placeholder.
+func (m *model) View() string {
+	if !m.ready {
+		return "loading..."
+	}
+
+	var rows []string
+	for first := 0; first < len(m.panels); first += m.cols {
+		last := first + m.cols
+		if last > len(m.panels) {
+			last = len(m.panels)
+		}
+		cells := make([]string, 0, last-first)
+		for i := first; i < last; i++ {
+			cells = append(cells, m.renderPanel(m.panels[i], i == m.focused))
+		}
+		rows = append(rows, lipgloss.JoinHorizontal(lipgloss.Top, cells...))
+	}
+
+	grid := lipgloss.JoinVertical(lipgloss.Left, rows...)
+	help := "tab/click: focus panel  •  wheel/↑↓/pgup/pgdn: scroll focused  •  q: quit"
+	footer := lipgloss.NewStyle().Faint(true).Render(help)
+	return grid + "\n" + footer
+}
+
+// modelPalette gives each panel a distinct, readable color for its name.
+var modelPalette = []lipgloss.Color{
+	"39",  // blue
+	"213", // magenta
+	"214", // orange
+	"45",  // cyan
+	"141", // purple
+	"203", // salmon
+	"82",  // lime
+	"227", // light yellow
+}
+
+// statusColor returns the color used to draw a panel's status text.
+func statusColor(s panelStatus) lipgloss.Color {
+	code := "244" // gray - waiting
+	switch s {
+	case statusStreaming:
+		code = "220" // yellow - active
+	case statusDone:
+		code = "42" // green - success
+	case statusError:
+		code = "196" // red - error
+	}
+	return lipgloss.Color(code)
+}
+
+// renderPanel draws one panel as a bordered box: a header line (model name on
+// the left, status and elapsed time on the right) above the body text. The
+// focused panel uses a double border and shows the shared viewport; other
+// panels show the tail of their buffer.
+func (m *model) renderPanel(p *panel, focused bool) string {
+	border := lipgloss.RoundedBorder()
+	if focused {
+		border = lipgloss.DoubleBorder()
+	}
+	style := lipgloss.NewStyle().
+		Border(border).
+		BorderForeground(lipgloss.Color("240"))
+
+	statusTxt := p.status.String()
+	if p.elapsed > 0 {
+		statusTxt += " " + p.elapsed.Round(time.Millisecond).String()
+	}
+
+	// Header: model name (left, model color) + status/timer (right, status color).
+	name := fmt.Sprintf("[%d] %s", p.idx, p.model)
+	gap := m.pw - lipgloss.Width(name) - lipgloss.Width(statusTxt)
+	if gap < 1 {
+		// Not enough room: shorten the name so at least one space separates it from the status.
+		name = truncate(name, m.pw-lipgloss.Width(statusTxt)-1)
+		gap = m.pw - lipgloss.Width(name) - lipgloss.Width(statusTxt)
+	}
+	if gap < 1 {
+		gap = 1
+	}
+	header := lipgloss.NewStyle().Bold(true).Foreground(p.color).Render(name) +
+		strings.Repeat(" ", gap) +
+		lipgloss.NewStyle().Foreground(statusColor(p.status)).Render(statusTxt)
+
+	var bodyLines string
+	if focused {
+		bodyLines = m.vp.View()
+	} else {
+		bodyLines = tailLines(p.buf.String(), m.pw, panelHeight-2)
+	}
+
+	// NOTE(review): content is the header plus panelHeight-2 body lines, i.e.
+	// panelHeight-1 lines, while the style height below is panelHeight-2 —
+	// confirm the rendered box height matches what panelHeight is meant to be.
+	content := lipgloss.JoinVertical(lipgloss.Left, header, bodyLines)
+	return style.Width(m.pw).Height(panelHeight - 2).Render(content)
+}
+
+// truncate shortens s so that its display width is at most w terminal cells.
+// It returns "" when w <= 0 and s unchanged when it already fits.
+//
+// The cut is first made at w runes, then further runes are dropped until the
+// measured width fits. The rune count alone is not enough because
+// double-width runes (e.g. CJK) occupy two cells each, which previously let
+// the result overflow w.
+func truncate(s string, w int) string {
+	if w <= 0 {
+		return ""
+	}
+	if lipgloss.Width(s) <= w {
+		return s
+	}
+	r := []rune(s)
+	if len(r) > w {
+		r = r[:w]
+	}
+	// For single-width text this loop does not run and the result is unchanged.
+	for len(r) > 0 && lipgloss.Width(string(r)) > w {
+		r = r[:len(r)-1]
+	}
+	return string(r)
+}
+
+// tailLines wraps s to width w and returns exactly n lines joined by
+// newlines: the last n wrapped lines, padded with empty lines at the end
+// when the text is shorter than n lines.
+func tailLines(s string, w, n int) string {
+	lines := strings.Split(lipgloss.NewStyle().Width(w).Render(s), "\n")
+	if surplus := len(lines) - n; surplus > 0 {
+		lines = lines[surplus:]
+	}
+	if missing := n - len(lines); missing > 0 {
+		lines = append(lines, make([]string, missing)...)
+	}
+	return strings.Join(lines, "\n")
+}
@@ -0,0 +1,27 @@
+# wol-proxy
+
+wol-proxy automatically wakes up a suspended llama-swap server using Wake-on-LAN when requests are received.
+
+When a request arrives and llama-swap is unavailable, wol-proxy sends a WOL packet and holds the request until the server becomes available. If the server doesn't respond within the timeout period (default: 60 seconds), the request is dropped.
+
+This utility helps conserve energy by allowing GPU-heavy servers to remain suspended when idle, as they can consume hundreds of watts even when not actively processing requests.
+
+## Usage
+
+```shell
+# minimal
+$ ./wol-proxy -mac BA:DC:0F:FE:E0:00 -upstream http://192.168.1.13:8080
+
+# everything
+$ ./wol-proxy -mac BA:DC:0F:FE:E0:00 -upstream http://192.168.1.13:8080 \
+    # use debug log level
+    -log debug \
+    # alternative listening port
+    -listen localhost:9999 \
+    # seconds to hold requests waiting for upstream to be ready
+    -timeout 30
+```
+
+## API
+
+`GET /status` - that's it. Everything else is proxied to the upstream server.
@@ -0,0 +1,64 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>Loading...</title>
+<style>
+body {
+    font-family: sans-serif;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    height: 100vh;
+    margin: 0;
+    background: #f5f5f5;
+}
+.loader {
+    text-align: center;
+}
+.stats {
+    font-size: 18px;
+    color: #333;
+    margin: 20px 0;
+}
+.stats-label {
+    color: #666;
+    font-size: 14px;
+}
+</style>
+</head>
+<body>
+<div class="loader">
+    <p>Waking up upstream server...</p>
+    <div class="stats">
+        <div><span class="stats-label">Time elapsed:</span> <span id="elapsed">0s</span></div>
+        <div><span id="attempts">&nbsp;</span></div>
+    </div>
+</div>
+<script>
+// Timestamp of page load; the elapsed-time display is computed relative to it.
+var startTime = Date.now();
+// Number of status polls issued so far; drives the dot animation below.
+var attempts = 0;
+
+// Refresh the elapsed-time display ten times per second.
+setInterval(function() {
+    var elapsed = (Date.now() - startTime) / 1000;
+    document.getElementById('elapsed').textContent = elapsed.toFixed(1) + 's';
+}, 100);
+
+// Check status every second
+setInterval(function() {
+    attempts++;
+    // Cycle through 1..10 dots; "|| 10" turns the 0 remainder into a full row.
+    var dots = '.'.repeat((attempts % 10) || 10);
+    document.getElementById('attempts').textContent = dots;
+
+    // Reload the current page as soon as the /status body contains
+    // "status: ready". Fetch errors are deliberately ignored so polling
+    // simply continues on the next tick.
+    fetch('/status')
+        .then(function(r) { return r.text(); })
+        .then(function(t) {
+            if (t.indexOf('status: ready') !== -1) {
+                location.reload();
+            }
+        })
+        .catch(function() {});
+}, 1000);
+</script>
+</body>
+</html>
@@ -0,0 +1,333 @@
+package main
+
+import (
+	"bufio"
+	"context"
+	_ "embed"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"log/slog"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"os"
+	"os/signal"
+	"strings"
+	"sync"
+	"time"
+)
+
+// loadingPageHTML holds the contents of index.html, embedded at build time.
+// ServeHTTP writes it as the response body for "/" and "/ui/" requests while
+// the upstream is not ready.
+//
+//go:embed index.html
+var loadingPageHTML string
+
+// Command-line flags. main validates them after flag.Parse.
+var (
+	flagMac      = flag.String("mac", "", "mac address to send WoL packet to")
+	flagUpstream = flag.String("upstream", "", "upstream proxy address to send requests to")
+	flagListen   = flag.String("listen", ":8080", "listen address to listen on")
+	flagLog      = flag.String("log", "info", "log level (debug, info, warn, error)")
+	flagTimeout  = flag.Int("timeout", 60, "seconds requests wait for upstream response before failing")
+)
+
+func main() {
+	flag.Parse()
+
+	switch *flagLog {
+	case "debug":
+		slog.SetLogLoggerLevel(slog.LevelDebug)
+	case "info":
+		slog.SetLogLoggerLevel(slog.LevelInfo)
+	case "warn":
+		slog.SetLogLoggerLevel(slog.LevelWarn)
+	case "error":
+		slog.SetLogLoggerLevel(slog.LevelError)
+	default:
+		slog.Error("invalid log level", "logLevel", *flagLog)
+		return
+	}
+
+	// Validate flags
+	if *flagListen == "" {
+		slog.Error("listen address is required")
+		return
+	}
+
+	if *flagMac == "" {
+		slog.Error("mac address is required")
+		return
+	}
+
+	if *flagTimeout < 1 {
+		slog.Error("timeout must be greater than 0")
+		return
+	}
+
+	var upstreamURL *url.URL
+	var err error
+	// validate mac address
+	if _, err = net.ParseMAC(*flagMac); err != nil {
+		slog.Error("invalid mac address", "error", err)
+		return
+	}
+
+	if *flagUpstream == "" {
+		slog.Error("upstream proxy address is required")
+		return
+	} else {
+		upstreamURL, err = url.ParseRequestURI(*flagUpstream)
+		if err != nil {
+			slog.Error("error parsing upstream url", "error", err)
+			return
+		}
+	}
+
+	proxy := newProxy(upstreamURL)
+	server := &http.Server{
+		Addr:    *flagListen,
+		Handler: proxy,
+	}
+
+	// start the server
+	go func() {
+		slog.Info("server starting on", "address", *flagListen)
+		if err := server.ListenAndServe(); err != nil {
+			slog.Error("error starting server", "error", err)
+		}
+	}()
+
+	// graceful shutdown
+	ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
+	<-ctx.Done()
+	server.Close()
+}
+
+// upstreamStatus is the availability state of the upstream as tracked by the
+// monitoring goroutine started in newProxy. Its string value is what the
+// /status endpoint prints.
+type upstreamStatus string
+
+const (
+	notready upstreamStatus = "not ready"
+	ready    upstreamStatus = "ready"
+)
+
+// proxyServer is the http.Handler that fronts the upstream: it forwards
+// requests through upstreamProxy and keeps the current upstream status and a
+// failure counter.
+type proxyServer struct {
+	upstreamProxy *httputil.ReverseProxy
+	// failCount and status are both read and written under statusMutex.
+	failCount   int
+	statusMutex sync.RWMutex
+	status      upstreamStatus
+}
+
+// newProxy builds a proxyServer that forwards requests to url and starts a
+// background goroutine that tracks upstream availability.
+//
+// The goroutine keeps a GET request open against <scheme>://<host>/api/events
+// on the upstream. While that stream is being read the status is ready. A
+// failure to create the request, to connect, to get a 200 response, or the end
+// of the stream marks the upstream not ready, increments the failure counter
+// and retries after a fixed wait. The goroutine never exits.
+func newProxy(url *url.URL) *proxyServer {
+	p := httputil.NewSingleHostReverseProxy(url)
+	proxy := &proxyServer{
+		upstreamProxy: p,
+		status:        notready,
+		failCount:     0,
+	}
+
+	// start a goroutine to monitor upstream status via SSE
+	go func() {
+		eventsUrl := url.Scheme + "://" + url.Host + "/api/events"
+		client := &http.Client{
+			Timeout: 0, // No timeout for SSE connection
+		}
+
+		// Single retry delay shared by every failure path below.
+		waitDuration := 10 * time.Second
+
+		for {
+			slog.Debug("connecting to SSE endpoint", "url", eventsUrl)
+
+			req, err := http.NewRequest("GET", eventsUrl, nil)
+			if err != nil {
+				slog.Warn("failed to create SSE request", "error", err)
+				proxy.setStatus(notready)
+				proxy.incFail(1)
+				time.Sleep(waitDuration)
+				continue
+			}
+
+			req.Header.Set("Accept", "text/event-stream")
+			req.Header.Set("Cache-Control", "no-cache")
+			req.Header.Set("Connection", "keep-alive")
+
+			resp, err := client.Do(req)
+			if err != nil {
+				slog.Error("failed to connect to SSE endpoint", "error", err)
+				proxy.setStatus(notready)
+				proxy.incFail(1)
+				time.Sleep(waitDuration)
+				continue
+			}
+
+			if resp.StatusCode != http.StatusOK {
+				slog.Warn("SSE endpoint returned non-OK status", "status", resp.StatusCode)
+				// Drain and close the body before retrying.
+				_, _ = io.Copy(io.Discard, resp.Body)
+				_ = resp.Body.Close()
+				proxy.setStatus(notready)
+				proxy.incFail(1)
+				time.Sleep(waitDuration)
+				continue
+			}
+
+			// Successfully connected to SSE endpoint
+			slog.Info("connected to SSE endpoint, upstream ready")
+			proxy.setStatus(ready)
+			proxy.resetFailures()
+
+			// Read from the SSE stream to detect disconnection
+			scanner := bufio.NewScanner(resp.Body)
+
+			// use a fairly large buffer to avoid scanner errors when reading large SSE events
+			buf := make([]byte, 0, 1024*1024*2)
+			scanner.Buffer(buf, 1024*1024*2)
+
+			// The event counter is printed straight to stdout and only at
+			// debug level; every print, including the closing newline, is
+			// gated on the same check so other levels emit nothing.
+			debug := slog.Default().Enabled(context.Background(), slog.LevelDebug)
+			events := 0
+			if debug {
+				fmt.Print("Events: ")
+			}
+			for scanner.Scan() {
+				if debug {
+					// Just read the events to keep connection alive
+					// We don't need to process the event data
+					events++
+					fmt.Printf("%d, ", events)
+				}
+			}
+			if debug {
+				fmt.Println()
+			}
+			if err := scanner.Err(); err != nil {
+				slog.Error("error reading from SSE stream", "error", err)
+			}
+
+			// Connection closed or error occurred
+			_ = resp.Body.Close()
+			slog.Info("SSE connection closed, upstream not ready")
+			proxy.setStatus(notready)
+			proxy.incFail(1)
+
+			// Wait before reconnecting
+			time.Sleep(waitDuration)
+		}
+	}()
+
+	return proxy
+}
+
+// ServeHTTP implements http.Handler.
+//
+// GET /status is answered locally with the current status and failure count.
+// When the upstream is ready every other request is proxied to it. When it is
+// not ready:
+//   - paths starting with /api/events get 204 No Content and do not trigger a wake-up;
+//   - all other requests send a Wake-on-LAN packet to the configured MAC;
+//   - "/" and "/ui/" paths then receive the embedded loading page;
+//   - any other request is held until the upstream becomes ready and is then
+//     proxied, or fails with 408 after the -timeout flag's number of seconds.
+func (p *proxyServer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	if r.Method == "GET" && r.URL.Path == "/status" {
+		status := string(p.getStatus())
+		failCount := p.getFailures()
+		w.Header().Set("Content-Type", "text/plain")
+		w.WriteHeader(200)
+		fmt.Fprintf(w, "status: %s\n", status)
+		fmt.Fprintf(w, "failures: %d\n", failCount)
+		return
+	}
+
+	if p.getStatus() == notready {
+		path := r.URL.Path
+		if strings.HasPrefix(path, "/api/events") {
+			slog.Debug("Skipping wake up", "req", path)
+			w.WriteHeader(http.StatusNoContent)
+			return
+		}
+
+		slog.Info("upstream not ready, sending magic packet", "req", path, "from", r.RemoteAddr)
+		if err := sendMagicPacket(*flagMac); err != nil {
+			slog.Warn("failed to send magic WoL packet", "error", err)
+		}
+
+		// For root or UI path requests, return loading page with status polling
+		// the web page will do the polling and redirect when ready
+		if path == "/" || strings.HasPrefix(path, "/ui/") {
+			w.Header().Set("Content-Type", "text/html; charset=utf-8")
+			w.WriteHeader(http.StatusOK)
+			fmt.Fprint(w, loadingPageHTML)
+			return
+		}
+
+		// Derive the deadline from the request context so a client that
+		// disconnects stops the wait instead of polling until the timeout.
+		waitCtx, cancel := context.WithTimeout(r.Context(), time.Duration(*flagTimeout)*time.Second)
+		isReady := p.waitUntilReady(waitCtx)
+		cancel()
+		if !isReady {
+			if r.Context().Err() != nil {
+				// Nobody is left to receive a response.
+				slog.Debug("client went away while waiting for upstream", "req", path)
+				return
+			}
+			slog.Info("timeout waiting for upstream to be ready")
+			http.Error(w, "timeout", http.StatusRequestTimeout)
+			return
+		}
+	}
+
+	p.upstreamProxy.ServeHTTP(w, r)
+}
+
+// waitUntilReady polls the upstream status every 250ms and reports true as
+// soon as it is ready, or false when ctx is done first. The ticker is always
+// stopped before returning.
+func (p *proxyServer) waitUntilReady(ctx context.Context) bool {
+	ticker := time.NewTicker(250 * time.Millisecond)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return false
+		case <-ticker.C:
+			if p.getStatus() == ready {
+				return true
+			}
+		}
+	}
+}
+
+// getStatus returns the current upstream status, read under the shared lock.
+func (p *proxyServer) getStatus() upstreamStatus {
+	p.statusMutex.RLock()
+	current := p.status
+	p.statusMutex.RUnlock()
+	return current
+}
+
+// setStatus replaces the upstream status while holding the write lock.
+func (p *proxyServer) setStatus(status upstreamStatus) {
+	p.statusMutex.Lock()
+	p.status = status
+	p.statusMutex.Unlock()
+}
+
+// incFail adds num to the failure counter while holding the write lock.
+func (p *proxyServer) incFail(num int) {
+	p.statusMutex.Lock()
+	p.failCount = p.failCount + num
+	p.statusMutex.Unlock()
+}
+
+// getFailures returns the current failure counter, read under the shared lock.
+func (p *proxyServer) getFailures() int {
+	p.statusMutex.RLock()
+	count := p.failCount
+	p.statusMutex.RUnlock()
+	return count
+}
+
+// resetFailures sets the failure counter back to zero under the write lock.
+func (p *proxyServer) resetFailures() {
+	p.statusMutex.Lock()
+	p.failCount = 0
+	p.statusMutex.Unlock()
+}
+
+func sendMagicPacket(macAddr string) error {
+	hwAddr, err := net.ParseMAC(macAddr)
+	if err != nil {
+		return err
+	}
+
+	if len(hwAddr) != 6 {
+		return errors.New("invalid MAC address")
+	}
+
+	// Create the magic packet.
+	packet := make([]byte, 102)
+	// Add 6 bytes of 0xFF.
+	for i := 0; i < 6; i++ {
+		packet[i] = 0xFF
+	}
+	// Repeat the MAC address 16 times.
+	for i := 1; i <= 16; i++ {
+		copy(packet[i*6:], hwAddr)
+	}
+
+	// Send the packet using UDP.
+	addr := net.UDPAddr{
+		IP:   net.IPv4bcast,
+		Port: 9,
+	}
+	conn, err := net.DialUDP("udp", nil, &addr)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	_, err = conn.Write(packet)
+	return err
+}
@@ -0,0 +1,693 @@
+{
+    "$schema": "https://json-schema.org/draft-07/schema#",
+    "$id": "llama-swap-config-schema.json",
+    "title": "llama-swap configuration",
+    "description": "Configuration file for llama-swap",
+    "type": "object",
+    "required": [
+        "models"
+    ],
+    "definitions": {
+        "macros": {
+            "type": "object",
+            "additionalProperties": {
+                "oneOf": [
+                    {
+                        "type": "string",
+                        "minLength": 0,
+                        "maxLength": 1024
+                    },
+                    {
+                        "type": "number"
+                    },
+                    {
+                        "type": "boolean"
+                    }
+                ]
+            },
+            "propertyNames": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 64,
+                "pattern": "^[a-zA-Z0-9_-]+$",
+                "not": {
+                    "enum": [
+                        "PORT",
+                        "MODEL_ID"
+                    ]
+                }
+            },
+            "default": {},
+            "description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
+        },
+        "timeouts": {
+            "type": "object",
+            "properties": {
+                "connect": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 30,
+                    "description": "TCP connection timeout in seconds. Set to 0 to disable."
+                },
+                "keepalive": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 30,
+                    "description": "TCP keepalive timeout in seconds. Set to 0 to disable."
+                },
+                "responseHeader": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 0,
+                    "description": "Time to wait for response headers in seconds. Set to 0 to disable."
+                },
+                "tlsHandshake": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 10,
+                    "description": "TLS handshake timeout in seconds. Set to 0 to disable."
+                },
+                "expectContinue": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 1,
+                    "description": "Expect-Continue timeout in seconds. Set to 0 to disable."
+                },
+                "idleConn": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 90,
+                    "description": "Idle connection timeout in seconds. Set to 0 to disable."
+                }
+            },
+            "additionalProperties": false,
+            "description": "Timeout settings for proxy connections."
+        },
+        "groupsConfig": {
+            "type": "object",
+            "additionalProperties": {
+                "type": "object",
+                "required": [
+                    "members"
+                ],
+                "properties": {
+                    "swap": {
+                        "type": "boolean",
+                        "default": true,
+                        "description": "Controls model swapping behaviour within the group. True: only one model runs at a time. False: all models can run together."
+                    },
+                    "exclusive": {
+                        "type": "boolean",
+                        "default": true,
+                        "description": "Controls how the group affects other groups. True: causes all other groups to unload when this group runs a model. False: does not affect other groups."
+                    },
+                    "persistent": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "Prevents other groups from unloading the models in this group. Does not affect individual model behaviour."
+                    },
+                    "members": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "description": "Array of model IDs that are members of this group. Model IDs must be defined in models."
+                    }
+                }
+            },
+            "description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent."
+        },
+        "matrixConfig": {
+            "type": "object",
+            "description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.",
+            "required": [
+                "vars",
+                "sets"
+            ],
+            "properties": {
+                "vars": {
+                    "type": "object",
+                    "description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.",
+                    "minProperties": 1,
+                    "additionalProperties": {
+                        "type": "string"
+                    },
+                    "propertyNames": {
+                        "pattern": "^[a-zA-Z0-9]{1,8}$"
+                    }
+                },
+                "evict_costs": {
+                    "type": "object",
+                    "description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.",
+                    "additionalProperties": {
+                        "type": "integer",
+                        "minimum": 1
+                    }
+                },
+                "sets": {
+                    "type": "object",
+                    "description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.",
+                    "minProperties": 1,
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                }
+            },
+            "additionalProperties": false
+        }
+    },
+    "properties": {
+        "healthCheckTimeout": {
+            "type": "integer",
+            "minimum": 15,
+            "default": 120,
+            "description": "Number of seconds to wait for a model to be ready to serve requests."
+        },
+        "globalTTL": {
+            "type": "integer",
+            "minimum": 0,
+            "default": 0,
+            "description": "Default TTL for all models in seconds, 0 means no TTL and models will never be automatically unloaded"
+        },
+        "logLevel": {
+            "type": "string",
+            "enum": [
+                "debug",
+                "info",
+                "warn",
+                "error"
+            ],
+            "default": "info",
+            "description": "Sets the logging value. Valid values: debug, info, warn, error."
+        },
+        "logTimeFormat": {
+            "type": "string",
+            "enum": [
+                "",
+                "ansic",
+                "unixdate",
+                "rubydate",
+                "rfc822",
+                "rfc822z",
+                "rfc850",
+                "rfc1123",
+                "rfc1123z",
+                "rfc3339",
+                "rfc3339nano",
+                "kitchen",
+                "stamp",
+                "stampmilli",
+                "stampmicro",
+                "stampnano"
+            ],
+            "default": "",
+            "description": "Enables and sets the logging timestamp format. Valid values: \"\", \"ansic\", \"unixdate\", \"rubydate\", \"rfc822\", \"rfc822z\", \"rfc850\", \"rfc1123\", \"rfc1123z\", \"rfc3339\", \"rfc3339nano\", \"kitchen\", \"stamp\", \"stampmilli\", \"stampmicro\", and \"stampnano\". For more info, read: https://pkg.go.dev/time#pkg-constants"
+        },
+        "metricsMaxInMemory": {
+            "type": "integer",
+            "default": 1000,
+            "description": "Maximum number of metrics to keep in memory. Controls how many metrics are stored before older ones are discarded."
+        },
+        "captureBuffer": {
+            "type": "integer",
+            "minimum": 0,
+            "default": 5,
+            "description": "Size in megabytes of the buffer for storing request/response captures. Set to 0 to disable captures."
+        },
+        "performance": {
+            "type": "object",
+            "properties": {
+                "disabled": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Disable system performance monitoring."
+                },
+                "every": {
+                    "type": "string",
+                    "pattern": "^[-+]?(\\d+(\\.\\d+)?(ns|us|ms|s|m|h))+$",
+                    "default": "15s",
+                    "description": "Delay between polling for new performance statistics. Minimum duration is 1s. Lower values use more RAM as stats are kept in memory."
+                }
+            },
+            "additionalProperties": false,
+            "default": {},
+            "description": "Configuration for CPU, RAM and GPU monitoring statistics."
+        },
+        "startPort": {
+            "type": "integer",
+            "default": 5800,
+            "description": "Starting port number for the automatic ${PORT} macro. The ${PORT} macro is incremented for every model that uses it."
+        },
+        "sendLoadingState": {
+            "type": "boolean",
+            "default": false,
+            "description": "Inject loading status updates into the reasoning field. When true, a stream of loading messages will be sent to the client."
+        },
+        "includeAliasesInList": {
+            "type": "boolean",
+            "default": false,
+            "description": "Present aliases within the /v1/models OpenAI API listing. When true, model aliases will be output to the API model listing, duplicating all fields except for Id so chat UIs can use the alias equivalent to the original."
+        },
+        "macros": {
+            "$ref": "#/definitions/macros"
+        },
+        "models": {
+            "type": "object",
+            "description": "A dictionary of model configurations. Each key is a model's ID. Model settings have defaults if not defined. The model's ID is available as ${MODEL_ID}.",
+            "additionalProperties": {
+                "type": "object",
+                "required": [
+                    "cmd"
+                ],
+                "properties": {
+                    "macros": {
+                        "$ref": "#/definitions/macros"
+                    },
+                    "cmd": {
+                        "type": "string",
+                        "minLength": 1,
+                        "description": "Command to run to start the inference server. Macros can be used. Comments allowed with |."
+                    },
+                    "cmdStop": {
+                        "type": "string",
+                        "default": "",
+                        "description": "Command to run to stop the model gracefully. Uses ${PID} macro for upstream process id. If empty, default shutdown behavior is used."
+                    },
+                    "name": {
+                        "type": "string",
+                        "default": "",
+                        "maxLength": 128,
+                        "description": "Display name for the model. Used in v1/models API response."
+                    },
+                    "description": {
+                        "type": "string",
+                        "default": "",
+                        "maxLength": 1024,
+                        "description": "Description for the model. Used in v1/models API response."
+                    },
+                    "env": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "pattern": "^[A-Z_][A-Z0-9_]*=.*$"
+                        },
+                        "default": [],
+                        "description": "Array of environment variables to inject into cmd's environment. Each value is a string in ENV_NAME=value format."
+                    },
+                    "proxy": {
+                        "type": "string",
+                        "default": "http://localhost:${PORT}",
+                        "format": "uri",
+                        "description": "URL where llama-swap routes API requests. If custom port is used in cmd, this must be set."
+                    },
+                    "aliases": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "minLength": 1
+                        },
+                        "default": [],
+                        "description": "Alternative model names for this configuration. Must be unique globally."
+                    },
+                    "checkEndpoint": {
+                        "type": "string",
+                        "default": "/health",
+                        "pattern": "^/.*$|^none$",
+                        "description": "URL path to check if the server is ready. Use 'none' to skip health checking."
+                    },
+                    "ttl": {
+                        "type": "integer",
+                        "minimum": -1,
+                        "default": -1,
+                        "description": "Automatically unload the model after ttl seconds. -1 uses the global TTL value, 0 disables unloading. Must be >0 to enable."
+                    },
+                    "useModelName": {
+                        "type": "string",
+                        "default": "",
+                        "description": "Override the model name sent to upstream server. Useful if upstream expects a different name."
+                    },
+                    "filters": {
+                        "type": "object",
+                        "properties": {
+                            "stripParams": {
+                                "type": "string",
+                                "default": "",
+                                "pattern": "^[a-zA-Z0-9_, ]*$",
+                                "description": "Comma separated list of parameters to remove from the request. Used for server-side enforcement of sampling parameters."
+                            },
+                            "setParams": {
+                                "type": "object",
+                                "additionalProperties": true,
+                                "default": {},
+                                "description": "Dictionary of parameters to set/override in requests. Useful for enforcing specific parameter values. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
+                            },
+                            "setParamsByID": {
+                                "type": "object",
+                                "additionalProperties": {
+                                    "type": "object",
+                                    "additionalProperties": true
+                                },
+                                "default": {},
+                                "description": "Dictionary mapping requested model IDs (or aliases) to parameters to set/override in requests. Applied after setParams and can override those values. Useful with aliases to vary behaviour depending on which alias the client used (e.g. different reasoning_effort per alias). Keys support ${MODEL_ID} macro substitution. Protected params like 'model' cannot be overridden."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "default": {},
+                        "description": "Dictionary of filter settings. Supports stripParams, setParams, and setParamsByID."
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": true,
+                        "default": {},
+                        "description": "Dictionary of arbitrary values included in /v1/models. Can contain complex types. Only passed through in /v1/models responses."
+                    },
+                    "concurrencyLimit": {
+                        "type": "integer",
+                        "minimum": 0,
+                        "default": 0,
+                        "description": "Overrides allowed number of active parallel requests to a model. 0 uses internal default of 10. >0 overrides default. Requests exceeding limit get HTTP 429."
+                    },
+                    "sendLoadingState": {
+                        "type": "boolean",
+                        "description": "Overrides the global sendLoadingState for this model. Omitting this property will use the global setting."
+                    },
+                    "unlisted": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
+                    },
+                    "timeouts": {
+                        "$ref": "#/definitions/timeouts"
+                    },
+                    "capabilities": {
+                        "type": "object",
+                        "properties": {
+                            "in": {
+                                "type": "array",
+                                "minItems": 1,
+                                "uniqueItems": true,
+                                "default": [],
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "text",
+                                        "audio",
+                                        "image"
+                                    ]
+                                },
+                                "description": "List of input modalities understood by the model."
+                            },
+                            "out": {
+                                "type": "array",
+                                "minItems": 1,
+                                "uniqueItems": true,
+                                "default": [],
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "text",
+                                        "audio",
+                                        "image"
+                                    ]
+                                },
+                                "description": "List of output modalities generated by the model."
+                            },
+                            "tools": {
+                                "type": "boolean",
+                                "default": false,
+                                "description": "Whether the model supports function calling."
+                            },
+                            "reranker": {
+                                "type": "boolean",
+                                "default": false,
+                                "description": "Whether the model supports the /v1/rerank endpoint."
+                            },
+                            "context": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 0,
+                                "description": "Maximum token context length supported by the model."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "description": "Defines what the model accepts for input, output and other metadata. Used in v1/models to inform clients what the model can do. An empty capabilities block (all zero values) is treated as not configured."
+                    }
+                }
+            }
+        },
+        "groups": {
+            "$ref": "#/definitions/groupsConfig"
+        },
+        "matrix": {
+            "$ref": "#/definitions/matrixConfig"
+        },
+        "hooks": {
+            "type": "object",
+            "properties": {
+                "on_startup": {
+                    "type": "object",
+                    "properties": {
+                        "preload": {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            },
+                            "default": [],
+                            "description": "List of model IDs to load on startup. Model names must match keys in models. When preloading multiple models, define a group to prevent swapping."
+                        }
+                    },
+                    "additionalProperties": false,
+                    "description": "Actions to perform on startup. Only supported action is preload."
+                }
+            },
+            "additionalProperties": false,
+            "description": "A dictionary of event triggers and actions. Only supported hook is on_startup."
+        },
+        "logToStdout": {
+            "type": "string",
+            "enum": [
+                "proxy",
+                "upstream",
+                "both",
+                "none"
+            ],
+            "default": "proxy",
+            "description": "Controls what is logged to stdout. 'proxy': logs generated by llama-swap, 'upstream': copy of upstream process stdout logs, 'both': both interleaved together, 'none': no logs written to stdout."
+        },
+        "apiKeys": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "minLength": 1
+            },
+            "default": [],
+            "description": "Require an API key when making requests to inference endpoints. When empty, authorization will not be checked. Each key is a non-empty string."
+        },
+        "peers": {
+            "type": "object",
+            "additionalProperties": {
+                "type": "object",
+                "required": [
+                    "proxy",
+                    "models"
+                ],
+                "properties": {
+                    "proxy": {
+                        "type": "string",
+                        "format": "uri",
+                        "description": "A valid base URL to proxy requests to. Requested path to llama-swap will be appended to the end of the proxy value."
+                    },
+                    "apiKey": {
+                        "type": "string",
+                        "default": "",
+                        "description": "A string key to be injected into the request. If blank, no key will be added. Key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>."
+                    },
+                    "models": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "minLength": 1
+                        },
+                        "description": "A list of models served by the peer."
+                    },
+                    "filters": {
+                        "type": "object",
+                        "properties": {
+                            "stripParams": {
+                                "type": "string",
+                                "default": "",
+                                "pattern": "^[a-zA-Z0-9_, ]*$",
+                                "description": "Comma separated list of parameters to remove from the request. Useful for removing parameters that the peer doesn't support."
+                            },
+                            "setParams": {
+                                "type": "object",
+                                "additionalProperties": true,
+                                "default": {},
+                                "description": "Dictionary of parameters to set/override in requests to this peer. Useful for injecting provider-specific settings. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "default": {},
+                        "description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
+                    },
+                    "timeouts": {
+                        "type": "object",
+                        "properties": {
+                            "connect": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 30,
+                                "description": "TCP connection timeout in seconds."
+                            },
+                            "keepalive": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 30,
+                                "description": "TCP keepalive connection timeout in seconds."
+                            },
+                            "responseHeader": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 0,
+                                "description": "Time to wait for response headers in seconds."
+                            },
+                            "tlsHandshake": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 10,
+                                "description": "TLS handshake timeout in seconds."
+                            },
+                            "idleConn": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 90,
+                                "description": "Idle connection timeout in seconds."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "description": "Timeout settings for proxy connections to this peer."
+                    }
+                }
+            },
+            "default": {},
+            "description": "A dictionary of remote peers and models they provide. Peers can be another llama-swap or any server that provides the /v1/ generative API endpoints supported by llama-swap."
+        },
+        "upstream": {
+            "type": "object",
+            "description": "Controls behaviour of the /upstream passthrough endpoint. Recommended to only use in special use cases; leaving it as the default will typically be the best experience.",
+            "properties": {
+                "ignorePaths": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "default": [
+                        ".*\\.(js|json|css|png|gif|jpg|jpeg|ico|txt)$"
+                    ],
+                    "description": "List of RE2 compatible regular expressions. Any request to a path matching any of the regular expressions will be ignored and not trigger a swap. When not specified, defaults to a pattern matching common static-asset suffixes (.js, .json, .css, .png, .gif, .jpg, .jpeg, .ico, .txt)."
+                }
+            },
+            "additionalProperties": false,
+            "default": {}
+        },
+        "routing": {
+            "type": "object",
+            "description": "Canonical routing/scheduling configuration. Alternative to the legacy top-level 'groups'/'matrix' keys; a config must not use both styles.",
+            "properties": {
+                "scheduler": {
+                    "type": "object",
+                    "description": "Scheduler configuration. Decides the order in which queued requests are serviced.",
+                    "properties": {
+                        "use": {
+                            "type": "string",
+                            "enum": [
+                                "fifo"
+                            ],
+                            "default": "fifo",
+                            "description": "Scheduler to use. Only 'fifo' is currently supported."
+                        },
+                        "settings": {
+                            "type": "object",
+                            "properties": {
+                                "fifo": {
+                                    "type": "object",
+                                    "properties": {
+                                        "priority": {
+                                            "type": "object",
+                                            "description": "Per-model priority. Keys are model IDs, values are integers (default 0). Higher values are serviced first.",
+                                            "additionalProperties": {
+                                                "type": "integer"
+                                            }
+                                        }
+                                    },
+                                    "additionalProperties": false
+                                }
+                            },
+                            "additionalProperties": false
+                        }
+                    },
+                    "additionalProperties": false
+                },
+                "router": {
+                    "type": "object",
+                    "description": "Router configuration. Selects between the group and matrix swapping strategies.",
+                    "properties": {
+                        "use": {
+                            "type": "string",
+                            "enum": [
+                                "group",
+                                "matrix"
+                            ],
+                            "default": "group",
+                            "description": "Router to use. 'group' uses static groups, 'matrix' uses the solver-based swap matrix."
+                        },
+                        "settings": {
+                            "type": "object",
+                            "properties": {
+                                "groups": {
+                                    "$ref": "#/definitions/groupsConfig"
+                                },
+                                "matrix": {
+                                    "$ref": "#/definitions/matrixConfig"
+                                }
+                            },
+                            "additionalProperties": false
+                        }
+                    },
+                    "additionalProperties": false
+                }
+            },
+            "additionalProperties": false
+        }
+    },
+    "allOf": [
+        {
+            "if": {
+                "required": [
+                    "groups"
+                ]
+            },
+            "then": {
+                "not": {
+                    "required": [
+                        "matrix"
+                    ]
+                }
+            }
+        },
+        {
+            "if": {
+                "required": [
+                    "matrix"
+                ]
+            },
+            "then": {
+                "not": {
+                    "required": [
+                        "groups"
+                    ]
+                }
+            }
+        }
+    ]
+}
@@ -1,53 +1,637 @@
-# Seconds to wait for llama.cpp to be available to serve requests
-# Default (and minimum): 15 seconds
-healthCheckTimeout: 15
+# add this modeline for validation in vscode
+# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
+#
+# llama-swap YAML configuration example
+# -------------------------------------
+#
+# 💡 Tip - Use an LLM with this file!
+# ====================================
+#  This example configuration is written to be LLM friendly. Try
+#  copying this file into an LLM and asking it to explain or generate
+#  sections for you.
+# ====================================

+# Usage notes:
+# - Below are all the available configuration options for llama-swap.
+# - Settings noted as "required" must be in your configuration file
+# - Settings noted as "optional" can be omitted
+
+# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
+# - optional, default: 120
+# - minimum value is 15 seconds, anything less will be set to this value
+healthCheckTimeout: 500
+
+# logLevel: sets the logging value
+# - optional, default: info
+# - Valid log levels: debug, info, warn, error
+logLevel: info
+
+# logTimeFormat: enables and sets the logging timestamp format
+# - optional, default (disabled): ""
+# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
+#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
+#   "stamp", "stampmilli", "stampmicro", and "stampnano".
+# - For more info, read: https://pkg.go.dev/time#pkg-constants
+logTimeFormat: ""
+
+# logToStdout: controls what is logged to stdout
+# - optional, default: "proxy"
+# - valid values:
+#   - "proxy": logs generated by llama-swap when swapping models,
+#      handling requests, etc.
+#   - "upstream": a copy of the upstream process's stdout logs
+#   - "both": both the proxy and upstream logs interleaved together
+#   - "none": no logs are ever written to stdout
+logToStdout: "proxy"
+
+# metricsMaxInMemory: maximum number of metrics to keep in memory
+# - optional, default: 1000
+# - controls how many metrics are stored in memory before older ones are discarded
+# - useful for limiting memory usage when processing large volumes of metrics
+metricsMaxInMemory: 1000
+
+# captureBuffer: how many MBs to allocate for storing request/response captures
+# - optional, default: 10
+# - set to 0 to disable
+captureBuffer: 15
+
+# performance: configuration for system monitoring statistics
+# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
+performance:
+  # disabled: boolean
+  # - default: false
+  disabled: false
+
+  # every: delay between polling for new performance statistics
+  # - default: 5s
+  # - minimum duration 5s
+  every: 15s
+
+# startPort: sets the starting port number for the automatic ${PORT} macro.
+# - optional, default: 5800
+# - the ${PORT} macro can be used in model.cmd and model.proxy settings
+# - it is automatically incremented for every model that uses it
+startPort: 10001
+
+# sendLoadingState: inject loading status updates into the reasoning (thinking)
+# field
+# - optional, default: false
+# - when true, a stream of loading messages will be sent to the client in the
+#   reasoning field so chat UIs can show that loading is in progress.
+# - see #366 for more details
+sendLoadingState: true
+
+# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
+# - optional, default: false
+# - when true, each alias is added to the API model listing as its own entry,
+#   duplicating all fields of the original except for Id, so chat UIs can use the alias.
+includeAliasesInList: false
+
+# globalTTL: the default TTL in seconds before unloading a model
+# - optional, default: 0 (never automatically unload)
+# - must be >= 0
+globalTTL: 0
+
+# macros: a dictionary of string substitutions
+# - optional, default: empty dictionary
+# - macros are reusable snippets
+# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
+# - useful for reducing common configuration settings
+# - macro names are strings and must be less than 64 characters
+# - macro names must match the regex ^[a-zA-Z0-9_-]+$
+# - macro names must not be a reserved name: PORT or MODEL_ID
+# - macro values can be numbers, bools, or strings
+# - macros can contain other macros, but they must be defined before they are used
+# - environment variables can be referenced with ${env.VAR_NAME} syntax
+#   - env macros are substituted first, before regular macros
+#   - if the env var is not set, config loading will fail with an error
+macros:
+  # Example of a multi-line macro
+  "latest-llama": >
+    /path/to/llama-server/llama-server-ec9e0301 --port ${PORT}
+
+  "default_ctx": 4096
+
+  # Example of macro-in-macro usage. macros can contain other macros
+  # but they must be previously declared.
+  "default_args": "--ctx-size ${default_ctx}"
+
+  # Example of environment variable macros
+  # - ${env.VAR_NAME} pulls the value from the system environment
+  # - useful for paths, secrets, or machine-specific configuration
+  "models_dir": "${env.HOME}/models"
+
+# apiKeys: require an API key when making requests to inference endpoints
+# - optional, default: []
+# - when empty (the default) authorization will not be checked as llama-swap is default-allow
+# - each key is a non-empty string
+apiKeys:
+  - "sk-hunter2"
+  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
+  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
+
+  # use environment variable macros to keep secrets out of the config
+  - "${env.API_KEY_1}"
+  - "${env.API_KEY_2}"
+
+# upstream: controls behaviour of the /upstream passthrough endpoint
+# - optional, default: empty dictionary
+# - recommended to only use in special use cases. Leaving it as the
+#   default will typically be the best experience
+upstream:
+  # ignorePaths: list of RE2 compatible regular expressions
+  # - default: (see below)
+  # - any request to a path matching any of the regular expressions
+  #   will be ignored and not trigger a swap
+  ignorePaths:
+    - '.*\.(js|json|css|png|gif|jpg|jpeg|ico|txt)$'
+
+# models: a dictionary of model configurations
+# - required
+# - each key is the model's ID, used in API requests
+# - model settings have default values that are used if they are not defined here
+# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
+# - below are examples of all the settings a model can have
 models:
-  "llama":
-    cmd: >
-      models/llama-server-osx
-      --port 9001
-      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
-    proxy: http://127.0.0.1:9001
+  # keys are the model names used in API requests
+  "gpt-oss-120b":
+    # macros: a dictionary of string substitutions specific to this model
+    # - optional, default: empty dictionary
+    # - macros defined here override macros defined in the global macros section
+    # - model level macros follow the same rules as global macros
+    macros:
+      "default_ctx": 16384
+      "temp": 0.7

-    # list of model name aliases this llama.cpp instance can serve
-    aliases:
-    - gpt-4o-mini
+    # cmd: the command to run to start the inference server.
+    # - required
+    # - it is just a string, similar to what you would run on the CLI
+    # - using `|` allows for comments in the command, these will be parsed out
+    # - macros can be used within cmd
+    cmd: |
+      # ${latest-llama} is a macro that is defined above
+      ${latest-llama}
+      --model path/to/gpt-oss-120B.gguf
+      --ctx-size ${default_ctx}
+      --temperature ${temp}

-    # check this path for a HTTP 200 response for the server to be ready
-    checkEndpoint: /health
+    # name: a display name for the model
+    # - optional, default: empty string
+    # - if set, it will be used in the v1/models API response
+    # - if not set, it will be omitted in the JSON model record
+    name: "gpt-oss 120B"

-    # unload model after 5 seconds
-    ttl: 5
+    # description: a description for the model
+    # - optional, default: empty string
+    # - if set, it will be used in the v1/models API response
+    # - if not set, it will be omitted in the JSON model record
+    description: "A thinking model from OpenAI"

-  "qwen":
-    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:9002
-    aliases:
-    - gpt-3.5-turbo
-
-  "simple":
-    # example of setting environment variables
+    # env: define an array of environment variables to inject into cmd's environment
+    # - optional, default: empty array
+    # - each value is a single string
+    # - in the format: ENV_NAME=value
    env:
-      - CUDA_VISIBLE_DEVICES=0,1
-      - env1=hello
-    cmd: build/simple-responder --port 8999
+      - "CUDA_VISIBLE_DEVICES=0,1,2"
+
+    # proxy: the URL where llama-swap routes API requests
+    # - optional, default: http://localhost:${PORT}
+    # - if you used ${PORT} in cmd this can be omitted
+    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

-    # use "none" to skip check. Caution this may cause some requests to fail
-    # until the upstream server is ready for traffic
-    checkEndpoint: none
+    # checkEndpoint: URL path to check if the server is ready
+    # - optional, default: /health
+    # - endpoint is expected to return an HTTP 200 response
+    # - all requests wait until the endpoint is ready or fails
+    # - use "none" to skip endpoint health checking
+    checkEndpoint: /custom-endpoint

-  # don't use these, just for testing if things are broken
-  "broken":
-    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
-    proxy: http://127.0.0.1:8999
-  "broken_timeout":
-    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:9000
+    # ttl: automatically unload the model after ttl seconds
+    # - optional, default: -1 (use global default)
+    # - ttl values must be -1 or a value greater than or equal to 0
+    # - a ttl of -1 will use the global TTL value (globalTTL) as the default
+    # - a ttl of 0 disables automatic unloading, so the model is
+    #   never unloaded
+    ttl: 60

-# creating a coding profile with models for code generation and general questions
-profiles:
-  coding:
-    - "qwen"
-    - "llama"
+    # useModelName: override the model name that is sent to upstream server
+    # - optional, default: ""
+    # - useful for when the upstream server expects a specific model name that
+    #   is different from the model's ID
+    useModelName: "openai/gpt-oss-120B"
+
+    # filters: a dictionary of filter settings
+    # - optional, default: empty dictionary
+    # - same capabilities as peer filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for server side enforcement of sampling parameters
+      # - the `model` parameter can never be removed
+      # - can be any JSON key in the request body
+      # - recommended to stick to sampling parameters
+      stripParams: "temperature, top_p, top_k"
+
+      # setParams: a dictionary of parameters to set/override in requests
+      # - optional, default: empty dictionary
+      # - useful for enforcing specific parameter values
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - always runs for the model
+      setParams:
+        # Example: enforce specific sampling parameters
+        temperature: 0.7
+        top_p: 0.9
+
+      # setParamsByID: a dictionary of parameters to set based on the model ID
+      # - optional, default: empty dictionary
+      # - combine with aliases to create variant behaviour without reloading the model
+      # - parameters are set in the request body JSON
+      # - runs after setParams, so it overrides any values set there
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - model aliases will be automatically created for each key
+      setParamsByID:
+        "${MODEL_ID}":
+          chat_template_kwargs:
+            reasoning_effort: medium
+        "${MODEL_ID}:high":
+          chat_template_kwargs:
+            reasoning_effort: high
+        "${MODEL_ID}:low":
+          chat_template_kwargs:
+            reasoning_effort: low
+
+    # aliases: alternative model names that this model configuration is used for
+    # - optional, default: empty array
+    # - aliases must be unique globally
+    # - useful for impersonating a specific model
+    aliases:
+      - "gpt-4o-mini"
+
+    # metadata: a dictionary of arbitrary values that are included in /v1/models
+    # - optional, default: empty dictionary
+    # - while metadata can contain complex types, it is recommended to keep it simple
+    # - metadata is only passed through in /v1/models responses
+    metadata:
+      # port will remain an integer
+      port: ${PORT}
+
+      # the ${temp} macro will remain a float
+      temperature: ${temp}
+      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
+        context=${default_ctx}"
+
+      a_list:
+        - 1
+        - 1.23
+        - "macros are OK in list and dictionary types: ${MODEL_ID}"
+
+      an_obj:
+        a: "1"
+        b: 2
+        # objects can contain complex types with macro substitution
+        # becomes: c: [0.7, false, "model: gpt-oss-120b"]
+        c: ["${temp}", false, "model: ${MODEL_ID}"]
+
+    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
+    # - optional, default: 0
+    # - useful for limiting the number of active parallel requests a model can process
+    # - must be set per model
+    # - any number greater than 0 will override the internal default value of 10
+    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
+    # - recommended to be omitted and the default used
+    concurrencyLimit: 0
+
+    # sendLoadingState: overrides the global sendLoadingState setting for this model
+    # - optional, default: undefined (use global setting)
+    sendLoadingState: false
+
+    # timeouts: configure proxy connection timeouts for this model
+    # - optional, the default for each setting is listed below
+    # - useful for models running on slower hardware that need longer timeouts
+    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
+    # - keepalive: TCP connection keepalive timeout, default: 30 seconds
+    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
+    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
+    # - idleConn: idle connection timeout in seconds, default: 90 seconds
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 0
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+    # capabilities: defines what the model accepts for input, output and other metadata
+    # - optional; omitted or all-zero means no capabilities
+    # - used in v1/models to inform clients what the model can do
+    capabilities:
+      # in: list of modalities understood by the model
+      # - default: []
+      # - valid: text, audio, image
+      in:
+        - text
+        - audio
+        - image
+      # out: list of modalities generated by the model
+      # - default: []
+      # - valid: text, audio, image
+      out:
+        - text
+        - audio
+        - image
+      # tools: the model supports function calling
+      # - default: false
+      tools: true
+
+      # reranker: the model supports the /v1/rerank endpoint
+      # - default: false
+      reranker: false
+
+      # context: the maximum token context length supported
+      # - default: 0
+      # - must be an integer >= 0
+      context: 32000
+
+  # Unlisted model example:
+  "qwen-unlisted":
+    # unlisted: boolean, true or false
+    # - optional, default: false
+    # - unlisted models do not show up in /v1/models api requests
+    # - can be requested as normal through all apis
+    unlisted: true
+    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+
+  # Docker example:
+  # container runtimes like Docker and Podman can be used reliably with
+  # a combination of cmd, cmdStop, and ${MODEL_ID}
+  "docker-llama":
+    proxy: "http://127.0.0.1:${PORT}"
+    cmd: |
+      docker run --name ${MODEL_ID}
+      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggml-org/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
+
+    # cmdStop: command to run to stop the model gracefully
+    # - optional, default: ""
+    # - useful for stopping commands managed by another system
+    # - the upstream's process id is available in the ${PID} macro
+    #
+    # When empty, llama-swap has this default behaviour:
+    # - on POSIX systems: a SIGTERM signal is sent
+    # - on Windows, calls taskkill to stop the process
+    # - processes have 5 seconds to shut down before forceful termination is attempted
+    cmdStop: docker stop ${MODEL_ID}
+
+# hooks: a dictionary of event triggers and actions
+# - optional, default: empty dictionary
+# - the only supported hook is on_startup
+hooks:
+  # on_startup: a dictionary of actions to perform on startup
+  # - optional, default: empty dictionary
+  # - the only supported action is preload
+  on_startup:
+    # preload: a list of model ids to load on startup
+    # - optional, default: empty list
+    # - model names must match keys in the models section ("llama" is illustrative)
+    # - when preloading multiple models at once, define a group
+    #   otherwise models will be loaded and swapped out
+    preload:
+      - "llama"
+
+# routing:
+# Controls how llama-swap decides which models can run at the same time and
+# which get swapped out. Choose one of two swap engines:
+#
+# - group:  the default engine. Simpler to configure. You define groups of
+#           models that run together, and loading one group typically unloads
+#           the others.
+#
+# - matrix: the newer engine. More involved to configure, but far more
+#           flexible. It uses a small expression language to describe which
+#           model combinations are allowed to run concurrently, enabling
+#           setups that groups cannot express.
+#
+# The routing section is optional.
+routing:
+  router:
+    # use: a string defining which engine to use
+    # - optional, default: "group"
+    # - valid values: group, matrix
+    use: group
+
+    # settings: a dictionary of settings for the specific engines
+    settings:
+      # groups: a dictionary of named groups
+      # - optional, default: empty dictionary
+      # - lets you keep some models loaded while others swap out
+      # - every member must be a model ID defined in the models section
+      # - a model can belong to only one group
+      # - behaviour is set per group with the `swap`, `exclusive` and
+      #   `persistent` fields
+      # - see issue #109 for details
+      #
+      # NOTE: the model names below are illustrative and are not defined above.
+      groups:
+        # group1 reproduces llama-swap's default behaviour: only one model
+        # runs at a time across the entire instance.
+        "group1":
+          # swap: how members of this group swap among themselves
+          # - optional, default: true
+          # - true:  only one member runs at a time
+          # - false: all members can run together, no swapping
+          swap: true
+
+          # exclusive: how this group affects other groups
+          # - optional, default: true
+          # - true:  running a member unloads every other group
+          # - false: running a member leaves other groups untouched
+          exclusive: true
+
+          # members: the model IDs in this group
+          # - required
+          members:
+            - "llama"
+            - "qwen-unlisted"
+
+        # group2: members all run together, but loading a member of an
+        # exclusive group (such as group1) unloads them.
+        "group2":
+          # swap: false lets all members stay loaded at once
+          swap: false
+
+          # exclusive: false means requesting a member loads it without
+          # unloading any other group
+          exclusive: false
+          members:
+            - "docker-llama"
+            - "modelA"
+            - "modelB"
+
+        # forever: a persistent group that other groups can never unload.
+        "forever":
+          # persistent: other groups cannot unload this group's members
+          # - optional, default: false
+          # - has no effect on swapping within the group
+          persistent: true
+
+          # swap/exclusive: false keeps all members loaded and avoids
+          # unloading other groups
+          swap: false
+          exclusive: false
+          members:
+            - "forever-modelA"
+            - "forever-modelB"
+            - "forever-modelc"
+
+      # The matrix lists the model combinations that are allowed to run
+      # concurrently. When a model is requested, the solver makes room for it
+      # by evicting as few running models as possible, preferring to keep the
+      # costliest ones loaded.
+      #
+      # Solver behaviour:
+      #   1. A request arrives for model X.
+      #   2. If X is already running, forward the request. Done.
+      #   3. Collect every set that contains X.
+      #   4. For each set, add up the evict_costs of the running models that
+      #      are NOT in that set — that is the set's cost.
+      #   5. Choose the lowest-cost set. Break ties by definition order.
+      #   6. Evict the models outside that set, start X, forward the request.
+      #
+      # Subset semantics: a set [a, b, c] also permits any subset of itself.
+      # Only the requested model is started; the others are not preloaded.
+      #
+      # A model that appears in no set can only run on its own.
+      #
+      matrix:
+        # vars: short aliases for model IDs (alphanumeric, 1-8 chars)
+        # - required: sets and evict_costs reference these names, not model IDs
+        # - map each short name to a real model ID (not a model alias)
+        # - keeps the set expressions short and readable
+        vars:
+          g: gemma-model
+          q: qwen-model
+          m: mistral-model
+          v: voxtral-model
+          e: reranker-model
+          L: llama-70B
+          sd: stable-diffusion
+
+        # evict_costs: relative cost of losing a running model (default: 1)
+        evict_costs:
+          v: 50 # vllm backend, slow cold start
+          L: 30 # 70B weights, slow to load
+
+        # sets: named combinations of models that may run together.
+        # Each value is an expression built from these operators:
+        #   &     AND   (models run together)
+        #   |     OR    (alternatives)
+        #   ()    grouping
+        #   +ref  inline the expression of another set
+        #
+        # Each expression expands into one or more concrete sets:
+        #   "L"                  → [L]
+        #   "a & b"              → [a, b]
+        #   "a | b"              → [a], [b]
+        #   "(a | b) & c"        → [a, c], [b, c]
+        #   "(a | b) & (c | d)"  → [a,c], [a,d], [b,c], [b,d]
+        #   "+llms & v"          → inline the llms set, then AND with v
+        sets:
+          # An LLM plus TTS. Switching between g/q/m keeps v loaded.
+          # expands to: [g,v], [q,v], [m,v]
+          standard: "(g | q | m) & v"
+
+          # An LLM plus TTS plus reranker.
+          # expands to: [g,v,e], [q,v,e]
+          with_rerank: "(g | q) & v & e"
+
+          # An LLM plus image generation, no TTS.
+          # expands to: [g,sd], [q,sd]
+          creative: "(g | q) & sd"
+
+          # The 70B model uses every GPU, so it can only run alone.
+          # expands to: [L]
+          full: "L"
+
+  # scheduler: how queued requests are ordered.
+  # The default and only valid scheduler is "fifo"
+  scheduler:
+    use: fifo
+    settings:
+      fifo:
+        # priority: a dictionary of model ID -> priority
+        # - optional, default: empty dictionary
+        # - models default to priority 0
+        # - higher priority requests are serviced first in the queue
+        priority:
+          A: 10
+          B: 5
+          C: 5
+          D: 1
+
+# peers: a dictionary of remote peers and models they provide
+# - optional, default empty dictionary
+# - peers can be another llama-swap
+# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
+peers:
+  # each key is the peer's ID
+  llama-swap-peer:
+    # proxy: a valid base URL to proxy requests to
+    # - required
+    # - requested path to llama-swap will be appended to the end of the proxy value
+    proxy: http://192.168.1.23
+    # models: a list of models served by the peer
+    # - required
+    models:
+      - model_a
+      - model_b
+      - embeddings/model_c
+  openrouter:
+    proxy: https://openrouter.ai/api
+    # apiKey: a string key to be injected into the request
+    # - optional, default: ""
+    # - if blank, no key will be added to the request
+    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
+    # - can be a string or a macro
+    apiKey: ${env.OPENROUTER_API_KEY}
+    models:
+      - meta-llama/llama-3.1-8b-instruct
+      - qwen/qwen3-235b-a22b-2507
+      - deepseek/deepseek-v3.2
+      - z-ai/glm-4.7
+      - moonshotai/kimi-k2-0905
+      - minimax/minimax-m2.1
+    # timeouts: configure proxy connection timeouts for this peer
+    # - optional, defaults shown below
+    # - useful when the peer runs on slower hardware
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 30
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+    # filters: a dictionary of filter settings for peer requests
+    # - optional, default: empty dictionary
+    # - same capabilities as model filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for removing parameters that the peer doesn't support
+      # - the `model` parameter can never be removed
+      stripParams: "temperature, top_p"
+
+      # setParams: a dictionary of parameters to set/override in requests to this peer
+      # - optional, default: empty dictionary
+      # - useful for injecting provider-specific settings like data retention policies
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      setParams:
+        # Example: enforce zero-data-retention for OpenRouter
+        provider:
+          data_collection: "deny"
+          zdr: true
@@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Run from the directory that contains this script. The command substitution
+# is quoted so a checkout path containing spaces is not word-split.
+cd "$(dirname "$0")"
+
+# use this to test locally, example:
+# GITHUB_TOKEN=$(gh auth token) LOG_DEBUG=1 DEBUG_ABORT_BUILD=1 ./docker/build-container.sh rocm
+# you need read:package scope on the token. Generate a personal access token with
+# the scopes: gist, read:org, repo, write:packages
+# then: gh auth login (and copy/paste the new token)
+
+LOG_DEBUG=${LOG_DEBUG:-0}
+DEBUG_ABORT_BUILD=${DEBUG_ABORT_BUILD:-}
+
+# log_debug prints a "[DEBUG]"-prefixed message when LOG_DEBUG is "1".
+# The message goes to stderr: functions in this script return values on
+# stdout and are called via $(...), so a log line written to stdout would be
+# captured as part of the returned value.
+log_debug() {
+    if [ "$LOG_DEBUG" = "1" ]; then
+        echo "[DEBUG] $*" >&2
+    fi
+}
+
+log_info() {
+    echo "[INFO] $*"
+}
+
+# ARCH is required. It defaults to empty so that a missing argument reaches
+# the validation below instead of tripping `set -u` with "unbound variable".
+ARCH=${1:-}
+PUSH_IMAGES=${2:-false}
+
+# List of allowed architectures
+ALLOWED_ARCHS=("intel" "vulkan" "musa" "cuda" "cuda13" "cpu" "rocm")
+
+# Check if ARCH is in the allowed list. Entries are compared one by one so
+# only an exact match is accepted (a substring test against the joined list
+# would also accept values such as "intel vulkan").
+ARCH_ALLOWED=false
+for allowed_arch in "${ALLOWED_ARCHS[@]}"; do
+  if [[ "$ARCH" == "$allowed_arch" ]]; then
+    ARCH_ALLOWED=true
+    break
+  fi
+done
+
+if [[ "$ARCH_ALLOWED" != "true" ]]; then
+  log_info "Error: ARCH must be one of the following: ${ALLOWED_ARCHS[*]}"
+  exit 1
+fi
+
+# Check if GITHUB_TOKEN is set and not empty
+if [[ -z "${GITHUB_TOKEN:-}" ]]; then
+  log_info "Error: GITHUB_TOKEN is not set or is empty."
+  exit 1
+fi
+
+# Set llama.cpp base image, customizable using the BASE_LLAMACPP_IMAGE environment
+# variable, this permits testing with forked llama.cpp repositories
+BASE_IMAGE=${BASE_LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp}
+SD_IMAGE=${BASE_SDCPP_IMAGE:-ghcr.io/leejet/stable-diffusion.cpp}
+
+# LS_REPO is the destination of the built container image — defaults to the
+# current GitHub repository so forked CI builds publish to the fork's own
+# ghcr.io namespace without code changes.
+LS_REPO=${GITHUB_REPOSITORY:-mostlygeek/llama-swap}
+
+# LS_BINARY_REPO is where the llama-swap release tarball is downloaded
+# from. Decoupled from LS_REPO so forks (which usually have no releases of
+# their own) can still build a container by pulling the canonical binary
+# from upstream. Override via the LS_BINARY_REPO env var when you maintain
+# fork-side releases.
+LS_BINARY_REPO=${LS_BINARY_REPO:-mostlygeek/llama-swap}
+
+# the most recent llama-swap tag
+# have to strip out the 'v' due to .tar.gz file naming.
+# Authenticated request — unauth'd github.com API is 60/hr per IP and GHA
+# runners share IPs, so the call regularly returns rate-limit JSON and
+# `.tag_name` then resolves to "null", producing a bogus `vnull` URL below.
+# Only a leading 'v' is removed; an unanchored 's/v//' would instead delete
+# the first 'v' found anywhere in the tag name.
+LS_VER=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+    "https://api.github.com/repos/${LS_BINARY_REPO}/releases/latest" \
+    | jq -r .tag_name | sed 's/^v//')
+
+if [[ -z "$LS_VER" || "$LS_VER" == "null" ]]; then
+    log_info "Error: could not resolve latest llama-swap release tag from ${LS_BINARY_REPO}"
+    exit 1
+fi
+
+# Fetches the most recent llama.cpp tag matching the given prefix
+# Handles pagination to search beyond the first 100 results
+# $1 - tag_prefix (e.g., "server" or "server-vulkan")
+# Prints:  the last '-'-separated field of the matching tag (the version
+#          number) on stdout
+# Returns: 0 when a tag was found; 1 on a request/API error, when the pages
+#          run out, or when the 50-page safety limit is reached
+#
+# Callers capture stdout with $(...), so every diagnostic is sent to stderr
+# explicitly; otherwise it would become part of the returned version.
+fetch_llama_tag() {
+    local tag_prefix=$1
+    local page=1
+    local per_page=100
+    # Declared separately from their assignments so that `local` does not
+    # mask the exit status of the command substitutions.
+    local response error_msg found_tag
+
+    while true; do
+        log_debug "Fetching page $page for tag prefix: $tag_prefix" >&2
+
+        # Stop immediately on a transport failure instead of walking through
+        # every page with an empty response.
+        if ! response=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+            "https://api.github.com/users/ggml-org/packages/container/llama.cpp/versions?per_page=${per_page}&page=${page}"); then
+            log_info "Request to the GitHub packages API failed (page $page)" >&2
+            return 1
+        fi
+
+        # Check for API errors
+        if echo "$response" | jq -e '.message' > /dev/null 2>&1; then
+            error_msg=$(echo "$response" | jq -r '.message')
+            log_info "GitHub API error: $error_msg" >&2
+            return 1
+        fi
+
+        # Check if response is empty array (no more pages)
+        if [ "$(echo "$response" | jq 'length')" -eq 0 ]; then
+            log_debug "No more pages (empty response)" >&2
+            return 1
+        fi
+
+        # Extract matching tag from this page. The prefix is passed to jq as
+        # a variable rather than spliced into the program text.
+        found_tag=$(echo "$response" | jq -r --arg prefix "$tag_prefix" \
+            '.[] | .metadata.container.tags[]? | select(startswith($prefix))' \
+            | sort -r | head -n1)
+
+        if [ -n "$found_tag" ]; then
+            log_debug "Found tag: $found_tag on page $page" >&2
+            echo "$found_tag" | awk -F '-' '{print $NF}'
+            return 0
+        fi
+
+        page=$((page + 1))
+
+        # Safety limit to prevent infinite loops
+        if [ "$page" -gt 50 ]; then
+            log_info "Reached pagination safety limit (50 pages)" >&2
+            return 1
+        fi
+    done
+}
+
+# fetch_llama_tag reports failure through its exit status. Under `set -e` a
+# failing bare assignment would terminate the script right here, so fall back
+# to an empty tag and let the explicit check below print the abort message.
+if [ "$ARCH" == "cpu" ]; then
+    LCPP_TAG=$(fetch_llama_tag "server") || LCPP_TAG=""
+    BASE_TAG=server-${LCPP_TAG}
+else
+    LCPP_TAG=$(fetch_llama_tag "server-${ARCH}") || LCPP_TAG=""
+    BASE_TAG=server-${ARCH}-${LCPP_TAG}
+fi
+
+SD_TAG=master-${ARCH}
+
+# Abort if LCPP_TAG is empty.
+if [[ -z "$LCPP_TAG" ]]; then
+    log_info "Abort: Could not find llama-server container for arch: $ARCH"
+    exit 1
+else
+    log_info "LCPP_TAG: $LCPP_TAG"
+fi
+
+# Local dry-run switch: stop after tag resolution without building anything.
+if [[ -n "$DEBUG_ABORT_BUILD" ]]; then
+    log_info "Abort: DEBUG_ABORT_BUILD set"
+    exit 0
+fi
+
+# cpu is the only backend with a multi-arch upstream base
+# (ghcr.io/ggml-org/llama.cpp:server-bXXXX ships amd64+arm64); GPU backends
+# are amd64-only and stay on the original `docker build` path so the
+# sd-server layer can still FROM the just-built image via the local
+# dockerd image store (buildx's container driver has a separate store
+# that doesn't share with dockerd, which breaks the sd build).
+if [ "$ARCH" == "cpu" ]; then
+    # Both platforms are always built. Without --push this is a smoke build:
+    # buildx on the docker-container driver defaults to cacheonly when
+    # neither --push nor --load is given, so each arch fully builds and a
+    # regression in either fails CI — without materializing the image or
+    # needing to --load (which is multi-arch-incompatible).
+    BUILDX_FLAGS="--platform linux/amd64,linux/arm64"
+    if [ "$PUSH_IMAGES" == "true" ]; then
+        BUILDX_FLAGS="--push ${BUILDX_FLAGS}"
+    fi
+fi
+
+for CONTAINER_TYPE in non-root root; do
+  CONTAINER_TAG="ghcr.io/${LS_REPO}:v${LS_VER}-${ARCH}-${LCPP_TAG}"
+  CONTAINER_LATEST="ghcr.io/${LS_REPO}:${ARCH}"
+  USER_UID=0
+  USER_GID=0
+  USER_HOME=/root
+
+  if [ "$CONTAINER_TYPE" == "non-root" ]; then
+    CONTAINER_TAG="${CONTAINER_TAG}-non-root"
+    CONTAINER_LATEST="${CONTAINER_LATEST}-non-root"
+    USER_UID=10001
+    USER_GID=10001
+    USER_HOME=/app
+  fi
+
+  log_info "Building $CONTAINER_TYPE $CONTAINER_TAG $LS_VER"
+  if [ "$ARCH" == "cpu" ]; then
+    docker buildx build $BUILDX_FLAGS --provenance=false \
+      -f llama-swap.Containerfile \
+      --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
+      --build-arg LS_REPO=${LS_BINARY_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} \
+      --build-arg BASE_IMAGE=${BASE_IMAGE} \
+      -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} .
+  else
+    docker build --provenance=false -f llama-swap.Containerfile \
+      --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
+      --build-arg LS_REPO=${LS_BINARY_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} \
+      -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} \
+      --build-arg BASE_IMAGE=${BASE_IMAGE} .
+  fi
+
+  # For architectures with stable-diffusion.cpp support, layer sd-server on top.
+  # Stays on `docker build` so the base resolves from local dockerd.
+  case "$ARCH" in
+    "musa" | "vulkan")
+      log_info "Adding sd-server to $CONTAINER_TAG"
+      docker build --provenance=false -f llama-swap-sd.Containerfile \
+        --build-arg BASE=${CONTAINER_TAG} \
+        --build-arg SD_IMAGE=${SD_IMAGE} --build-arg SD_TAG=${SD_TAG} \
+        --build-arg UID=${USER_UID} --build-arg GID=${USER_GID} \
+        -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} . ;;
+  esac
+
+  # cpu builds push inline via buildx --push; all other archs push here.
+  if [ "$ARCH" != "cpu" ] && [ "$PUSH_IMAGES" == "true" ]; then
+    docker push ${CONTAINER_TAG}
+    docker push ${CONTAINER_LATEST}
+  fi
+done
@@ -0,0 +1,305 @@
+#!/bin/bash
+#
+# Build script for llama-swap-docker with commit hash pinning
+#
+# Usage:
+#   ./build-image.sh --cuda                    # Build CUDA image
+#   ./build-image.sh --vulkan                  # Build Vulkan image
+#   ./build-image.sh --cuda --no-cache         # Build CUDA image without cache
+#   LLAMA_COMMIT_HASH=abc123 ./build-image.sh --cuda      # Override llama.cpp commit
+#   LLAMA_COMMIT_HASH=b8429 ./build-image.sh --vulkan    # Override llama.cpp release tag (vulkan uses prebuilt binaries)
+#   WHISPER_COMMIT_HASH=def456 ./build-image.sh --vulkan  # Override whisper.cpp commit
+#   SD_COMMIT_HASH=ghi789 ./build-image.sh --cuda        # Override stable-diffusion.cpp commit
+#
+# Features:
+#   - Auto-detects latest commit hashes from git repos
+#   - Builds llama-swap from local source code
+#   - Allows environment variable overrides for reproducible builds
+#   - Cache-friendly: changing commit hash busts cache appropriately
+#   - Supports both CUDA and Vulkan backends (requires explicit flag)
+#
+
+set -euo pipefail
+
+# Parse command line arguments
+BACKEND=""
+NO_CACHE=false
+
+# print_usage writes the usage/help text to stdout. It is shared by the
+# "no arguments", "--help" and "unknown argument" paths.
+print_usage() {
+    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+    echo ""
+    echo "Options:"
+    echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
+    echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
+    echo "  --no-cache  Force rebuild without using Docker cache"
+    echo "  --help, -h  Show this help message"
+    echo ""
+    echo "Environment variables:"
+    echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)"
+    echo "  LLAMA_COMMIT_HASH    Override llama.cpp commit hash"
+    echo "  WHISPER_COMMIT_HASH  Override whisper.cpp commit hash"
+    echo "  SD_COMMIT_HASH       Override stable-diffusion.cpp commit hash"
+}
+
+if [[ $# -eq 0 ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    echo ""
+    print_usage
+    exit 1
+fi
+
+for arg in "$@"; do
+    case $arg in
+        --cuda)
+            BACKEND="cuda"
+            ;;
+        --vulkan)
+            BACKEND="vulkan"
+            ;;
+        --no-cache)
+            NO_CACHE=true
+            ;;
+        --help|-h)
+            print_usage
+            exit 0
+            ;;
+        *)
+            # Reject anything unrecognized: a mistyped flag (e.g. --nocache)
+            # would otherwise be ignored without any warning.
+            echo "Error: Unknown argument: ${arg}" >&2
+            echo "" >&2
+            print_usage >&2
+            exit 1
+            ;;
+    esac
+done
+
+# Validate backend selection
+if [[ -z "$BACKEND" ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    exit 1
+fi
+
+# Configuration
+# A DOCKER_IMAGE_TAG supplied by the user is kept as-is; otherwise the tag is
+# derived from the selected backend.
+if [[ -z "${DOCKER_IMAGE_TAG:-}" ]]; then
+    case "$BACKEND" in
+        vulkan) DOCKER_IMAGE_TAG="llama-swap:vulkan" ;;
+        *)      DOCKER_IMAGE_TAG="llama-swap:cuda" ;;
+    esac
+fi
+DOCKER_BUILDKIT="${DOCKER_BUILDKIT:-1}"
+
+# Single unified Dockerfile, backend selected via build arg
+DOCKERFILE="Dockerfile"
+case "$BACKEND" in
+    vulkan) echo "Building for: Vulkan (AMD GPUs and compatible hardware)" ;;
+    *)      echo "Building for: CUDA (NVIDIA GPUs)" ;;
+esac
+
+# Git repository URLs
+LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
+WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
+SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
+
+# Function to get the latest commit hash from a git repo's default branch
+get_latest_commit() {
+    local repo_url="$1"
+    local branch="${2:-master}"
+
+    # Try to get the latest commit hash for the specified branch
+    git ls-remote --heads "${repo_url}" "${branch}" 2>/dev/null | head -1 | cut -f1
+}
+
+# Function to get the default branch name (master or main)
+# $1 - repository URL (or path)
+# Prints "master" or "main" on stdout; falls back to "master" when neither
+# branch can be found (including when the repository is unreachable).
+get_default_branch() {
+    local repo_url="$1"
+
+    # --exit-code makes ls-remote return non-zero when no ref matches.
+    # Without it the command succeeds with empty output, so the first branch
+    # tested would always be reported as present.
+    # Check for master first
+    if git ls-remote --exit-code --heads "${repo_url}" refs/heads/master &>/dev/null; then
+        echo "master"
+    elif git ls-remote --exit-code --heads "${repo_url}" refs/heads/main &>/dev/null; then
+        echo "main"
+    else
+        echo "master"  # fallback
+    fi
+}
+
+# Function to get the latest release tag from a GitHub repo
+# $1 - repository in "owner/name" form
+# Prints the value of the first "tag_name" field in the API response.
+get_latest_release_tag() {
+    local owner_repo="$1"
+    local api_url="https://api.github.com/repos/${owner_repo}/releases/latest"
+
+    # Field 4 of the '"'-separated line is the tag value itself.
+    curl -fsSL "${api_url}" | grep '"tag_name"' | head -1 | cut -d'"' -f4
+}
+
+echo "=========================================="
+echo "llama-swap-docker Build Script"
+echo "=========================================="
+echo ""
+
+# Determine commit hashes / release tags - use env vars or auto-detect
+# For vulkan builds, llama and sd use GitHub release tags (prebuilt binaries).
+# For cuda builds (or whisper on any backend), use git commit hashes.
+#
+# The lookup helpers end in a pipeline, so with `set -e` and `pipefail` a
+# failed lookup would abort the script on the assignment itself. `|| true`
+# keeps control flowing to the empty-value checks, which print the error.
+if [[ -n "${LLAMA_COMMIT_HASH:-}" ]]; then
+    LLAMA_HASH="${LLAMA_COMMIT_HASH}"
+    echo "llama.cpp: Using provided version: ${LLAMA_HASH}"
+elif [[ "$BACKEND" == "vulkan" ]]; then
+    LLAMA_HASH=$(get_latest_release_tag "ggml-org/llama.cpp") || true
+    if [[ -z "${LLAMA_HASH}" ]]; then
+        echo "ERROR: Could not determine latest release tag for llama.cpp" >&2
+        exit 1
+    fi
+    echo "llama.cpp: Auto-detected latest release tag: ${LLAMA_HASH}"
+else
+    LLAMA_BRANCH=$(get_default_branch "${LLAMA_REPO}")
+    LLAMA_HASH=$(get_latest_commit "${LLAMA_REPO}" "${LLAMA_BRANCH}") || true
+    if [[ -z "${LLAMA_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for llama.cpp" >&2
+        exit 1
+    fi
+    echo "llama.cpp: Auto-detected latest commit (${LLAMA_BRANCH}): ${LLAMA_HASH}"
+fi
+
+if [[ -n "${WHISPER_COMMIT_HASH:-}" ]]; then
+    WHISPER_HASH="${WHISPER_COMMIT_HASH}"
+    echo "whisper.cpp: Using provided commit hash: ${WHISPER_HASH}"
+else
+    WHISPER_BRANCH=$(get_default_branch "${WHISPER_REPO}")
+    WHISPER_HASH=$(get_latest_commit "${WHISPER_REPO}" "${WHISPER_BRANCH}") || true
+    if [[ -z "${WHISPER_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for whisper.cpp" >&2
+        exit 1
+    fi
+    echo "whisper.cpp: Auto-detected latest commit (${WHISPER_BRANCH}): ${WHISPER_HASH}"
+fi
+
+if [[ -n "${SD_COMMIT_HASH:-}" ]]; then
+    SD_HASH="${SD_COMMIT_HASH}"
+    echo "stable-diffusion.cpp: Using provided version: ${SD_HASH}"
+elif [[ "$BACKEND" == "vulkan" ]]; then
+    SD_HASH=$(get_latest_release_tag "leejet/stable-diffusion.cpp") || true
+    if [[ -z "${SD_HASH}" ]]; then
+        echo "ERROR: Could not determine latest release tag for stable-diffusion.cpp" >&2
+        exit 1
+    fi
+    echo "stable-diffusion.cpp: Auto-detected latest release tag: ${SD_HASH}"
+else
+    SD_BRANCH=$(get_default_branch "${SD_REPO}")
+    SD_HASH=$(get_latest_commit "${SD_REPO}" "${SD_BRANCH}") || true
+    if [[ -z "${SD_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2
+        exit 1
+    fi
+    echo "stable-diffusion.cpp: Auto-detected latest commit (${SD_BRANCH}): ${SD_HASH}"
+fi
+
+echo ""
+echo "=========================================="
+echo "Starting Docker build..."
+echo "=========================================="
+echo ""
+
+# Build the Docker image with commit hashes as build args
+# Build context is the repository root (..) so the Dockerfile can access Go source
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+BUILD_ARGS=(
+    --build-arg "BACKEND=${BACKEND}"
+    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
+    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
+    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
+    -t "${DOCKER_IMAGE_TAG}"
+    -f "${SCRIPT_DIR}/${DOCKERFILE}"
+)
+
+if [[ "$NO_CACHE" == true ]]; then
+    BUILD_ARGS+=(--no-cache)
+    echo "Note: Building without cache"
+fi
+
+# Use docker buildx with a custom builder for parallelism control
+# The legacy DOCKER_BUILDKIT=1 docker build doesn't respect BUILDKIT_MAX_PARALLELISM env var
+# We need to use a custom builder with a buildkitd.toml config file
+BUILDER_NAME="llama-swap-builder"
+
+# Check if our custom builder exists with the right config, create/update if needed
+if ! docker buildx inspect "$BUILDER_NAME" >/dev/null 2>&1; then
+    echo "Creating custom buildx builder with max-parallelism=1..."
+    
+    # Create buildkitd.toml config file
+    cat > buildkitd.toml << 'BUILDKIT_EOF'
+[worker.oci]
+  max-parallelism = 1
+BUILDKIT_EOF
+    
+    # Create the builder with the config
+    docker buildx create --name "$BUILDER_NAME" \
+        --driver docker-container \
+        --buildkitd-config buildkitd.toml \
+        --use
+else
+    # Switch to our builder
+    docker buildx use "$BUILDER_NAME"
+fi
+
+echo "Building with sequential stages (one at a time), each using all CPU cores..."
+echo "Using builder: $BUILDER_NAME"
+
+# Use docker buildx build with --load to load the image into Docker
+# The --builder flag ensures we use our custom builder with max-parallelism=1
+# Build context is the repository root so we can access Go source files
+docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" "${REPO_ROOT}"
+
+echo ""
+echo "=========================================="
+echo "Verifying build artifacts..."
+echo "=========================================="
+echo ""
+
+# Verify all expected binaries exist in the image
+MISSING_BINARIES=()
+
+for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do
+    if ! docker run --rm "${DOCKER_IMAGE_TAG}" which "${binary}" >/dev/null 2>&1; then
+        MISSING_BINARIES+=("${binary}")
+    fi
+done
+
+if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
+    echo "ERROR: Build succeeded but the following binaries are missing from the image:"
+    for binary in "${MISSING_BINARIES[@]}"; do
+        echo "  - ${binary}"
+    done
+    echo ""
+    echo "This usually indicates a build stage failure. Try running with --no-cache flag:"
+    echo "  ./build-image.sh --vulkan --no-cache"
+    exit 1
+fi
+
+echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
+
+echo ""
+echo "=========================================="
+echo "Build complete!"
+echo "=========================================="
+echo ""
+echo "Image tag: ${DOCKER_IMAGE_TAG}"
+echo ""
+echo "Built with:"
+echo "  llama.cpp:           ${LLAMA_HASH}"
+echo "  whisper.cpp:         ${WHISPER_HASH}"
+echo "  stable-diffusion.cpp: ${SD_HASH}"
+echo "  llama-swap:          $(docker run --rm "${DOCKER_IMAGE_TAG}" cat /versions.txt | grep llama-swap | cut -d' ' -f2-)"
+echo ""
+if [[ "$BACKEND" == "vulkan" ]]; then
+    echo "Run with:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
+    echo ""
+    echo "Note: For AMD GPUs, you may also need to mount render devices:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
+else
+    echo "Run with:"
+    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
+fi
@@ -0,0 +1,33 @@
+healthCheckTimeout: 300
+logRequests: true
+metricsMaxInMemory: 1000
+
+models:
+  "qwen2.5":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+      --port 9999
+
+  "smollm2":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
+      --port 9999
+
+  z-image:
+    # proxy is stated explicitly, like the sibling models, so requests and
+    # the health check go to the port hard-coded in --listen-port below.
+    proxy: "http://127.0.0.1:9999"
+    checkEndpoint: /
+    cmd: |
+      /app/sd-server
+      --listen-port 9999
+      --diffusion-fa
+      --diffusion-model /models/z_image_turbo-Q8_0.gguf
+      --vae /models/ae.safetensors
+      --llm /models/qwen3-4b-instruct-2507-q8_0.gguf
+      --offload-to-cpu
+      --cfg-scale 1.0
+      --height 512 --width 512
+      --steps 8
+    aliases: [gpt-image-1,dall-e-2,dall-e-3,gpt-image-1-mini,gpt-image-1.5]
@@ -0,0 +1,11 @@
+ARG SD_IMAGE=ghcr.io/leejet/stable-diffusion.cpp
+ARG SD_TAG=master-vulkan
+ARG BASE=llama-swap:latest
+
+FROM ${SD_IMAGE}:${SD_TAG} AS sd-source
+FROM ${BASE}
+
+ARG UID=10001
+ARG GID=10001
+
+COPY --from=sd-source --chown=${UID}:${GID} /sd-server /app/sd-server
@@ -0,0 +1,49 @@
+ARG BASE_IMAGE=ghcr.io/ggml-org/llama.cpp
+ARG BASE_TAG=server-cuda
+FROM ${BASE_IMAGE}:${BASE_TAG}
+
+ARG LS_VER=170
+ARG LS_REPO=mostlygeek/llama-swap
+
+# Set default UID/GID arguments
+ARG UID=10001
+ARG GID=10001
+ARG USER_HOME=/app
+
+# Add user/group
+ENV HOME=$USER_HOME
+RUN if [ $UID -ne 0 ]; then \
+      if [ $GID -ne 0 ]; then \
+        groupadd --system --gid $GID app; \
+      fi; \
+      useradd --system --uid $UID --gid $GID \
+      --home $USER_HOME app; \
+    fi
+
+# Handle paths
+RUN mkdir --parents $HOME /app
+RUN chown --recursive $UID:$GID $HOME /app
+
+# Switch user
+USER $UID:$GID
+
+WORKDIR /app
+
+# Add /app to PATH
+ENV PATH="/app:${PATH}"
+
+RUN \
+    set -eux; \
+    case "$(uname -m)" in \
+        x86_64)  ARCH=amd64 ;; \
+        aarch64) ARCH=arm64 ;; \
+        *) echo "unsupported arch: $(uname -m)" >&2; exit 1 ;; \
+    esac; \
+    curl --fail -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_${ARCH}.tar.gz" && \
+    tar -zxf "llama-swap_${LS_VER}_linux_${ARCH}.tar.gz" && \
+    rm "llama-swap_${LS_VER}_linux_${ARCH}.tar.gz"
+
+COPY --chown=$UID:$GID config.example.yaml /app/config.yaml
+
+HEALTHCHECK CMD curl -f http://localhost:8080/ || exit 1
+ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ]
@@ -0,0 +1,207 @@
+# Unified multi-stage Dockerfile for AI inference tools
+# Supports CUDA and Vulkan backends via BACKEND build arg
+#
+# Usage:
+#   docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
+#   docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
+#   docker buildx build --build-arg BACKEND=cuda --build-arg CMAKE_CUDA_ARCHITECTURES="86;89" -t llama-swap:unified-cuda .
+#
+# Each project has its own install script that handles cloning, building,
+# and installing binaries. Build stages are independent for cache efficiency.
+
+# Global build arg (declared before the first FROM): it is expanded in the FROM
+# lines that select the backend-specific builder, ik-llama and runtime stages.
+ARG BACKEND=cuda
+
+# ── Builder bases ──────────────────────────────────────────────────────
+
+# CUDA toolchain image used by all CUDA build stages
+FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder-base-cuda
+
+# Exported as an environment variable so the install-*.sh scripts can read it
+ARG CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
+# ccache settings; /ccache is provided as a cache mount by the build stages
+ENV CCACHE_DIR=/ccache
+ENV CCACHE_MAXSIZE=2G
+ENV PATH="/usr/lib/ccache:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git python3 python3-pip libssl-dev \
+    curl ca-certificates ccache make wget \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+# ──
+
+# Vulkan toolchain image: plain Ubuntu plus the Vulkan/SPIR-V development packages
+FROM ubuntu:24.04 AS builder-base-vulkan
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CCACHE_DIR=/ccache
+ENV CCACHE_MAXSIZE=2G
+ENV PATH="/usr/lib/ccache:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git python3 python3-pip libssl-dev \
+    curl ca-certificates ccache make wget software-properties-common \
+    libvulkan-dev glslang-tools spirv-tools vulkan-validationlayers glslc \
+    spirv-headers \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+# ── Select builder base by BACKEND ────────────────────────────────────
+
+# Alias so later stages can say "builder-base" without knowing the backend
+FROM builder-base-${BACKEND} AS builder-base
+
+# ── Build whisper.cpp (fastest build, run first) ──────────────────────
+
+# Each build stage re-declares BACKEND so it is visible inside the stage, mounts
+# a shared ccache plus a per-project build-directory cache, and runs the
+# project's install script with the requested commit.
+FROM builder-base AS whisper-build
+ARG BACKEND=cuda
+ARG WHISPER_COMMIT_HASH=master
+COPY install-whisper.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=whisper-${BACKEND},target=/src/whisper.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-whisper.sh "${WHISPER_COMMIT_HASH}"
+
+# ── Build stable-diffusion.cpp ────────────────────────────────────────
+
+FROM builder-base AS sd-build
+ARG BACKEND=cuda
+ARG SD_COMMIT_HASH=master
+COPY install-sd.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=sd-${BACKEND},target=/src/stable-diffusion.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-sd.sh "${SD_COMMIT_HASH}"
+
+# ── Build llama.cpp (slowest build, run last) ─────────────────────────
+
+FROM builder-base AS llama-build
+ARG BACKEND=cuda
+ARG LLAMA_COMMIT_HASH=master
+COPY install-llama.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
+
+# ── Build ik_llama.cpp (CUDA only) ────────────────────────────────────
+#
+# Two named stages allow ARG BACKEND to select at build time:
+#   - ik-llama-cuda  : real build (from builder-base-cuda)
+#   - ik-llama-vulkan: no-op (empty /install/bin, skips CUDA pull entirely)
+# BuildKit only evaluates the selected branch, so vulkan builds never
+# pull nvidia/cuda:*-devel or compile ik_llama.cpp.
+
+# Vulkan placeholder: only creates the directory the runtime stage copies from
+FROM builder-base-vulkan AS ik-llama-vulkan
+RUN mkdir -p /install/bin
+
+FROM builder-base-cuda AS ik-llama-cuda
+ARG IK_LLAMA_COMMIT_HASH=main
+COPY install-ik-llama.sh /build/
+RUN --mount=type=cache,id=ccache-cuda,target=/ccache \
+    --mount=type=cache,id=ik-llama-cuda,target=/src/ik_llama.cpp/build \
+    bash /build/install-ik-llama.sh "${IK_LLAMA_COMMIT_HASH}"
+
+# NOTE(review): this ARG is declared inside the ik-llama-cuda stage, not globally.
+# The FROM below appears to rely on the global BACKEND arg declared before the
+# first FROM — confirm whether this line is still needed.
+ARG BACKEND=cuda
+FROM ik-llama-${BACKEND} AS ik-llama-build
+
+# ── Download llama-swap release binary ────────────────────────────────
+
+# LS_VERSION is passed straight to install-llama-swap.sh, which leaves the binary
+# in /install/bin and the resolved version in /install/llama-swap-version
+FROM builder-base AS llama-swap-download
+ARG LS_VERSION=latest
+COPY install-llama-swap.sh /build/
+RUN bash /build/install-llama-swap.sh "${LS_VERSION}"
+
+# ── Runtime bases ─────────────────────────────────────────────────────
+
+# CUDA runtime image (no compiler toolchain)
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS runtime-cuda
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+ENV PATH="/usr/local/bin:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgomp1 python3 curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# CUDA stub drivers for container compatibility
+# The same stub file is copied twice, once under each of the two library names.
+COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so
+COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# ──
+
+# Vulkan runtime image: Ubuntu with the Vulkan loader and Mesa Vulkan drivers
+FROM ubuntu:24.04 AS runtime-vulkan
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PATH="/usr/local/bin:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgomp1 libvulkan1 mesa-vulkan-drivers \
+    python3 curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── Select runtime base by BACKEND ────────────────────────────────────
+
+# Final image: backend-specific runtime base plus the binaries from every build stage
+FROM runtime-${BACKEND} AS runtime
+
+# The *_COMMIT_HASH args are only written to /versions.txt in this stage.
+# RUN_UID selects the user the container runs as (0 keeps root).
+ARG BACKEND=cuda
+ARG LLAMA_COMMIT_HASH=unknown
+ARG WHISPER_COMMIT_HASH=unknown
+ARG SD_COMMIT_HASH=unknown
+ARG IK_LLAMA_COMMIT_HASH=unknown
+ARG RUN_UID=0
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-numpy python3-sentencepiece python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user when RUN_UID != 0
+# The config directory is created and chowned in both cases; RUN_UID is used
+# as the group id as well.
+RUN if [ "$RUN_UID" != "0" ]; then \
+      groupadd --system --gid $RUN_UID llama-swap && \
+      useradd --system --uid $RUN_UID --gid $RUN_UID \
+        --home /app --shell /sbin/nologin llama-swap; \
+    fi && \
+    mkdir -p /etc/llama-swap/config && \
+    chown -R ${RUN_UID}:${RUN_UID} /etc/llama-swap
+
+WORKDIR /app
+
+# Copy whisper.cpp binaries and libraries
+COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/
+COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/
+COPY --from=whisper-build /install/lib/ /usr/local/lib/
+
+# Copy stable-diffusion.cpp binaries and libraries
+COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
+COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
+COPY --from=sd-build /install/lib/ /usr/local/lib/
+
+# Copy llama.cpp binaries (statically linked)
+COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
+COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
+
+# Copy ik-llama-server (CUDA only; empty copy for vulkan)
+COPY --from=ik-llama-build /install/bin/ /usr/local/bin/
+
+# Install uv
+RUN pip install uv --break-system-packages
+
+# Copy llama-swap binary
+# The version file is read when /versions.txt is written below.
+COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
+COPY --from=llama-swap-download /install/llama-swap-version /tmp/
+
+# Refresh the dynamic linker cache after the libraries were copied into /usr/local/lib
+RUN ldconfig
+
+# Default configuration used by the CMD below
+COPY config.example.yaml /etc/llama-swap/config/config.yaml
+
+# Version tracking
+RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
+    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
+    echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
+    echo "ik_llama.cpp: ${IK_LLAMA_COMMIT_HASH}" >> /versions.txt && \
+    echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
+    echo "backend: ${BACKEND}" >> /versions.txt && \
+    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
+
+# /models is the working directory and is owned by the runtime user
+RUN mkdir -p /models && chown ${RUN_UID}:${RUN_UID} /models
+WORKDIR /models
+USER ${RUN_UID}
+ENTRYPOINT ["llama-swap"]
+CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
@@ -0,0 +1,8 @@
+# Unified Docker Container
+
+These scripts create a custom llama-swap container that contains:
+
+- llama-server for LLMs, rerank and embedding model support
+- sd-server (stable-diffusion.cpp) for image generation
+- whisper.cpp for ASR
+
@@ -0,0 +1,303 @@
+#!/bin/bash
+#
+# Build script for unified container with version pinning
+#
+# Usage:
+#   ./build-image.sh --cuda                              # Build CUDA image
+#   ./build-image.sh --vulkan                            # Build Vulkan image
+#   ./build-image.sh --cuda --no-cache                   # Build without cache
+#   LLAMA_REF=b1234 ./build-image.sh --vulkan            # Pin llama.cpp to a commit hash
+#   LLAMA_REF=v1.2.3 ./build-image.sh --cuda             # Pin llama.cpp to a tag
+#   WHISPER_REF=v1.0.0 ./build-image.sh --vulkan         # Pin whisper.cpp to a tag
+#   SD_REF=master ./build-image.sh --cuda                # Pin stable-diffusion.cpp to a branch
+#   LS_VERSION=170 ./build-image.sh --cuda               # Override llama-swap version
+#   IK_LLAMA_REF=main ./build-image.sh --cuda            # Pin ik_llama.cpp to main branch (CUDA only)
+#
+
+set -euo pipefail
+
+BACKEND=""
+NO_CACHE=false
+
+for arg in "$@"; do
+    case $arg in
+        --cuda)
+            BACKEND="cuda"
+            ;;
+        --vulkan)
+            BACKEND="vulkan"
+            ;;
+        --no-cache)
+            NO_CACHE=true
+            ;;
+        --help|-h)
+            echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+            echo ""
+            echo "Options:"
+            echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
+            echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
+            echo "  --no-cache  Force rebuild without using Docker cache"
+            echo "  --help, -h  Show this help message"
+            echo ""
+            echo "Environment variables:"
+            echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:unified-cuda or llama-swap:unified-vulkan)"
+            echo "  LLAMA_REF            Pin llama.cpp to a commit, tag, or branch"
+            echo "  WHISPER_REF          Pin whisper.cpp to a commit, tag, or branch"
+            echo "  SD_REF               Pin stable-diffusion.cpp to a commit, tag, or branch"
+            echo "  IK_LLAMA_REF         Pin ik_llama.cpp to a commit, tag, or branch (CUDA only)"
+            echo "  LS_VERSION           Override llama-swap version (e.g., '170' or 'latest')"
+            exit 0
+            ;;
+    esac
+done
+
+if [[ -z "$BACKEND" ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    echo ""
+    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+    exit 1
+fi
+
+DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-llama-swap:unified-${BACKEND}}"
+
+# Git repository URLs
+LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
+WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
+SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
+LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
+IK_LLAMA_REPO="https://github.com/ikawrakow/ik_llama.cpp.git"
+
+# Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
+# Requires only: git, network access to the remote.
+#
+# Arguments:
+#   $1  repository URL
+#   $2  ref to resolve
+# Output:
+#   prints the resolved hash on stdout and returns 0; on failure prints an
+#   explanation on stderr and returns 1.
+resolve_ref() {
+    local repo_url="$1"
+    local ref="$2"
+
+    # Full 40-char SHA — use as-is
+    if [[ "${ref}" =~ ^[0-9a-f]{40}$ ]]; then
+        echo "${ref}"
+        return
+    fi
+
+    # Exact tag or branch name. When both exist, the first line printed by
+    # git ls-remote wins (head -1). git's own error output is discarded.
+    # NOTE(review): for an annotated tag this is presumably the tag object's
+    # hash rather than the commit's — confirm this is acceptable downstream.
+    local hash
+    hash=$(git ls-remote "${repo_url}" "refs/tags/${ref}" "refs/heads/${ref}" 2>/dev/null | head -1 | cut -f1)
+    if [[ -n "${hash}" ]]; then
+        echo "${hash}"
+        return
+    fi
+
+    # Short hash (7+ chars): scan all refs for a SHA with this prefix
+    # Only hashes that git ls-remote lists can match here.
+    if [[ "${ref}" =~ ^[0-9a-f]{7,}$ ]]; then
+        hash=$(git ls-remote "${repo_url}" 2>/dev/null | grep "^${ref}" | head -1 | cut -f1)
+        if [[ -n "${hash}" ]]; then
+            echo "${hash}"
+            return
+        fi
+    fi
+
+    # Nothing matched: explain why on stderr and report failure to the caller
+    echo "ERROR: Could not resolve ref '${ref}' for ${repo_url}" >&2
+    if [[ "${ref}" =~ ^[0-9a-f]+$ && ${#ref} -lt 7 ]]; then
+        echo "  Short hashes must be at least 7 characters (got ${#ref})." >&2
+    else
+        echo "  Tried: tag, branch, git ls-remote prefix match" >&2
+    fi
+    echo "  Use a full 40-char SHA, a tag name, a branch name, or a 7+ char short hash." >&2
+    return 1
+}
+
+# Resolve HEAD of a repo without needing to know the default branch name.
+# Prints the commit hash, or nothing when the lookup fails, and always returns 0.
+# The script runs under `set -euo pipefail`, so without the `|| true` a failing
+# `git ls-remote` would abort a plain `VAR=$(get_latest_hash ...)` assignment
+# before the caller's empty-result check could print its error message.
+get_latest_hash() {
+    git ls-remote "${1}" HEAD 2>/dev/null | head -1 | cut -f1 || true
+}
+
+echo "=========================================="
+echo "llama-swap Unified Build (${BACKEND})"
+echo "=========================================="
+echo ""
+
+# Resolve llama.cpp ref
+# Pattern shared by all components below: when <NAME>_REF is set it is resolved
+# with resolve_ref (the script exits on failure); otherwise the repository HEAD
+# is used and an empty result is treated as an error.
+if [[ -n "${LLAMA_REF:-}" ]]; then
+    LLAMA_HASH=$(resolve_ref "${LLAMA_REPO}" "${LLAMA_REF}") || exit 1
+    echo "llama.cpp: ${LLAMA_REF} -> ${LLAMA_HASH}"
+else
+    LLAMA_HASH=$(get_latest_hash "${LLAMA_REPO}")
+    if [[ -z "${LLAMA_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for llama.cpp" >&2
+        exit 1
+    fi
+    echo "llama.cpp: latest HEAD: ${LLAMA_HASH}"
+fi
+
+# Resolve whisper.cpp ref
+if [[ -n "${WHISPER_REF:-}" ]]; then
+    WHISPER_HASH=$(resolve_ref "${WHISPER_REPO}" "${WHISPER_REF}") || exit 1
+    echo "whisper.cpp: ${WHISPER_REF} -> ${WHISPER_HASH}"
+else
+    WHISPER_HASH=$(get_latest_hash "${WHISPER_REPO}")
+    if [[ -z "${WHISPER_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for whisper.cpp" >&2
+        exit 1
+    fi
+    echo "whisper.cpp: latest HEAD: ${WHISPER_HASH}"
+fi
+
+# Resolve stable-diffusion.cpp ref
+if [[ -n "${SD_REF:-}" ]]; then
+    SD_HASH=$(resolve_ref "${SD_REPO}" "${SD_REF}") || exit 1
+    echo "stable-diffusion.cpp: ${SD_REF} -> ${SD_HASH}"
+else
+    SD_HASH=$(get_latest_hash "${SD_REPO}")
+    if [[ -z "${SD_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2
+        exit 1
+    fi
+    echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}"
+fi
+
+# Resolve ik_llama.cpp ref (CUDA only)
+# Vulkan builds skip the lookup and record the placeholder "n/a" instead.
+if [[ "$BACKEND" == "cuda" ]]; then
+    if [[ -n "${IK_LLAMA_REF:-}" ]]; then
+        IK_LLAMA_HASH=$(resolve_ref "${IK_LLAMA_REPO}" "${IK_LLAMA_REF}") || exit 1
+        echo "ik_llama.cpp: ${IK_LLAMA_REF} -> ${IK_LLAMA_HASH}"
+    else
+        IK_LLAMA_HASH=$(get_latest_hash "${IK_LLAMA_REPO}")
+        if [[ -z "${IK_LLAMA_HASH}" ]]; then
+            echo "ERROR: Could not determine latest commit for ik_llama.cpp" >&2
+            exit 1
+        fi
+        echo "ik_llama.cpp: latest HEAD: ${IK_LLAMA_HASH}"
+    fi
+else
+    IK_LLAMA_HASH="n/a"
+    echo "ik_llama.cpp: skipped (vulkan build)"
+fi
+
+# Resolve llama-swap ref
+if [[ -n "${LS_VERSION:-}" ]]; then
+    LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
+    echo "llama-swap: ${LS_VERSION} -> ${LS_HASH}"
+else
+    LS_HASH=$(get_latest_hash "${LLAMA_SWAP_REPO}")
+    if [[ -z "${LS_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for llama-swap" >&2
+        exit 1
+    fi
+    echo "llama-swap: latest HEAD: ${LS_HASH}"
+fi
+
+echo ""
+echo "=========================================="
+echo "Starting Docker build..."
+echo "=========================================="
+echo ""
+
+# Directory containing this script; used as the Dockerfile location and build context
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Arguments for docker buildx build: the resolved hashes are passed as build args.
+# LS_VERSION receives the resolved llama-swap hash, not the user-supplied value.
+BUILD_ARGS=(
+    --build-arg "BACKEND=${BACKEND}"
+    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
+    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
+    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
+    --build-arg "IK_LLAMA_COMMIT_HASH=${IK_LLAMA_HASH}"
+    --build-arg "LS_VERSION=${LS_HASH}"
+    -t "${DOCKER_IMAGE_TAG}"
+    -f "${SCRIPT_DIR}/Dockerfile"
+)
+
+# Cache policy: --no-cache wins; otherwise a registry cache is used when
+# GITHUB_ACTIONS is "true" and ACT is not "true". Any other run adds no cache flags.
+if [[ "$NO_CACHE" == true ]]; then
+    BUILD_ARGS+=(--no-cache)
+    echo "Note: Building without cache"
+elif [[ "${GITHUB_ACTIONS:-}" == "true" && "${ACT:-}" != "true" ]]; then
+    CACHE_REF="ghcr.io/mostlygeek/llama-swap:unified-${BACKEND}-cache"
+    BUILD_ARGS+=(
+        --cache-from "type=registry,ref=${CACHE_REF}"
+        --cache-to "type=registry,ref=${CACHE_REF},mode=max"
+    )
+    echo "Note: Using registry cache (${CACHE_REF})"
+fi
+
+# --load makes the built image available to the docker run checks that follow
+DOCKER_BUILDKIT=1 docker buildx build --load "${BUILD_ARGS[@]}" "${SCRIPT_DIR}"
+
+echo ""
+echo "=========================================="
+echo "Verifying build artifacts..."
+echo "=========================================="
+echo ""
+
+# Binaries that must be on PATH inside the built image; ik-llama-server is
+# only expected for CUDA builds.
+EXPECTED_BINARIES=(llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap)
+if [[ "$BACKEND" == "cuda" ]]; then
+    EXPECTED_BINARIES+=(ik-llama-server)
+fi
+
+# Run `which <binary>` inside the image for each expected binary and collect the failures
+MISSING_BINARIES=()
+for binary in "${EXPECTED_BINARIES[@]}"; do
+    if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then
+        MISSING_BINARIES+=("${binary}")
+    fi
+done
+
+# Any missing binary fails the script with a hint to rebuild without cache
+if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
+    echo "ERROR: Build succeeded but the following binaries are missing:"
+    for binary in "${MISSING_BINARIES[@]}"; do
+        echo "  - ${binary}"
+    done
+    echo ""
+    echo "Try running with --no-cache flag:"
+    echo "  ./build-image.sh --${BACKEND} --no-cache"
+    exit 1
+fi
+
+# Human-readable list for the success message; kept by hand in step with EXPECTED_BINARIES
+VERIFIED_LIST="llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
+if [[ "$BACKEND" == "cuda" ]]; then
+    VERIFIED_LIST="${VERIFIED_LIST}, ik-llama-server"
+fi
+echo "All expected binaries verified: ${VERIFIED_LIST}"
+
+echo ""
+echo "=========================================="
+echo "Building rootless image..."
+echo "=========================================="
+echo ""
+
+ROOTLESS_TAG="${DOCKER_IMAGE_TAG}-rootless"
+docker buildx build --load -t "${ROOTLESS_TAG}" - <<EOF
+FROM ${DOCKER_IMAGE_TAG}
+USER root
+RUN groupadd --system --gid 10001 llama-swap && \\
+    useradd --system --uid 10001 --gid 10001 \\
+      --home /app --shell /sbin/nologin llama-swap && \\
+    chown -R 10001:10001 /etc/llama-swap /models
+USER 10001
+EOF
+
+echo "Rootless image built: ${ROOTLESS_TAG}"
+
+echo ""
+echo "=========================================="
+echo "Build complete!"
+echo "=========================================="
+echo ""
+echo "Image tags:"
+echo "  ${DOCKER_IMAGE_TAG}"
+echo "  ${ROOTLESS_TAG}"
+echo ""
+echo "Built with:"
+echo "  llama.cpp:            ${LLAMA_HASH}"
+echo "  whisper.cpp:          ${WHISPER_HASH}"
+echo "  stable-diffusion.cpp: ${SD_HASH}"
+if [[ "$BACKEND" == "cuda" ]]; then
+    echo "  ik_llama.cpp:         ${IK_LLAMA_HASH}"
+fi
+echo "  llama-swap:           $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
+echo ""
+if [[ "$BACKEND" == "vulkan" ]]; then
+    echo "Run with:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
+    echo ""
+    echo "Note: For AMD GPUs, you may also need:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
+else
+    echo "Run with:"
+    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
+fi
@@ -0,0 +1,33 @@
+# placeholder example configuration
+#
+# All binaries referenced below are installed on PATH (/usr/local/bin) in the
+# unified image, and every server listens on the ${PORT} that llama-swap
+# assigns to the model.
+healthCheckTimeout: 300
+logRequests: true
+
+models:
+  "llama":
+    cmd: >
+      llama-server
+      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+      --port ${PORT}
+
+  "whisper":
+    checkEndpoint: /v1/audio/transcriptions/
+    # whisper-server takes the model path via -m / --model
+    cmd: >
+      whisper-server
+      --port ${PORT}
+      -m /models/whisper.bin
+      --flash-attn
+      --request-path /v1/audio/transcriptions --inference-path ""
+
+  "image":
+    checkEndpoint: /
+    # sd-server must listen on ${PORT}: no proxy override is configured here,
+    # so a hard-coded port would not match the port llama-swap proxies to.
+    cmd: |
+      sd-server
+      --listen-port ${PORT}
+      --diffusion-fa
+      --diffusion-model /models/z_image_turbo-Q8_0.gguf
+      --vae /models/ae.safetensors
+      --llm /models/qwen3-4b-instruct-2507-q8_0.gguf
+      --offload-to-cpu
+      --cfg-scale 1.0
+      --height 512 --width 512
+      --steps 8
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Install ik_llama.cpp - clone, build, and install binaries
+# Usage: ./install-ik-llama.sh <commit_hash>
+# Note: CUDA only; always built against builder-base-cuda
+#
+# Requires CMAKE_CUDA_ARCHITECTURES in the environment (the script aborts if it
+# is unset). Produces /install/bin/ik-llama-server.
+set -e
+
+# Ref to build; defaults to the main branch when no argument is given
+COMMIT_HASH="${1:-main}"
+
+mkdir -p /install/bin
+
+# Clone and checkout (init-based so cache-mounted build dir doesn't break clone)
+echo "=== Cloning ik_llama.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/ik_llama.cpp
+cd /src/ik_llama.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ikawrakow/ik_llama.cpp.git
+fi
+# Shallow-fetch only the requested ref and check out exactly what was fetched
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+# Release build with static libraries, ccache as compiler launcher, CUDA
+# enabled, and the CUDA stubs directory on the link path
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DBUILD_SHARED_LIBS=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DGGML_CUDA=ON
+    "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+    "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+    "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda -Wl,--allow-shlib-undefined"
+)
+
+# Drop the CMake cache from a reused build directory so cmake configures from
+# scratch; the rest of the directory is left in place
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building ik_llama.cpp ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target llama-server
+
+# Fail loudly if the expected binary was not produced
+if [ ! -f "build/bin/llama-server" ]; then
+    echo "FATAL: llama-server not found in build/bin/" >&2
+    exit 1
+fi
+
+# Install as ik-llama-server to avoid collision with llama.cpp's llama-server
+cp "build/bin/llama-server" "/install/bin/ik-llama-server"
+echo "=== ik_llama.cpp build complete ==="
+ls -la /install/bin/
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Install llama-swap - download latest release binary from GitHub
+# Usage: ./install-llama-swap.sh [version]
+#   version: release version number (e.g., "170") or "latest" (default)
+#            a full 40-character commit hash is also accepted (see below)
+#
+# Produces /install/bin/llama-swap and writes the resolved version number to
+# /install/llama-swap-version.
+set -e
+
+VERSION="${1:-latest}"
+REPO="mostlygeek/llama-swap"
+
+mkdir -p /install/bin
+
+# If a full commit hash is given, find the release tag that points to it
+# Falls back to "latest" when no tag line starts with that hash.
+if echo "${VERSION}" | grep -qE '^[0-9a-f]{40}$'; then
+    echo "=== Resolving commit ${VERSION:0:7} to release tag ==="
+    TAG=$(git ls-remote --tags "https://github.com/${REPO}.git" 2>/dev/null \
+        | grep "^${VERSION}" | sed 's|.*refs/tags/||' | grep -v '\^{}' | head -1)
+    if [ -n "${TAG}" ]; then
+        echo "Resolved to tag: ${TAG}"
+        VERSION="${TAG#v}"
+    else
+        echo "No release tag found for commit ${VERSION:0:7}, using latest"
+        VERSION="latest"
+    fi
+fi
+
+# Strip leading 'v' prefix so both "198" and "v198" work
+VERSION="${VERSION#v}"
+
+# Resolve "latest" to actual version number
+# The tag name is read from the GitHub releases API response.
+if [ "$VERSION" = "latest" ]; then
+    echo "=== Resolving latest llama-swap release ==="
+    VERSION=$(curl -fsSL "https://api.github.com/repos/${REPO}/releases/latest" \
+        | grep '"tag_name"' | head -1 | cut -d'"' -f4 | sed 's/^v//')
+    if [ -z "$VERSION" ]; then
+        echo "FATAL: Could not determine latest release version" >&2
+        exit 1
+    fi
+    echo "Latest version: ${VERSION}"
+fi
+
+
+# Map the machine architecture to the suffix used in release asset names
+ARCH=$(uname -m)
+case "$ARCH" in
+    x86_64) ARCH="amd64" ;;
+    aarch64|arm64) ARCH="arm64" ;;
+    *) echo "FATAL: Unsupported architecture: $ARCH" >&2; exit 1 ;;
+esac
+
+# Download and extract
+# The whole archive is unpacked into /install/bin.
+URL="https://github.com/${REPO}/releases/download/v${VERSION}/llama-swap_${VERSION}_linux_${ARCH}.tar.gz"
+echo "=== Downloading llama-swap v${VERSION} ==="
+echo "URL: $URL"
+curl -fSL -o /tmp/llama-swap.tar.gz "$URL"
+tar -xzf /tmp/llama-swap.tar.gz -C /install/bin/
+rm /tmp/llama-swap.tar.gz
+
+# Validate
+if [ ! -x "/install/bin/llama-swap" ]; then
+    echo "FATAL: llama-swap binary not found or not executable" >&2
+    ls -la /install/bin/ >&2
+    exit 1
+fi
+
+# Record the resolved version number for later stages to read
+echo "$VERSION" > /install/llama-swap-version
+
+echo "=== llama-swap v${VERSION} installed ==="
+ls -la /install/bin/llama-swap
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Install llama.cpp - clone, build, and install binaries
+# Usage: BACKEND=cuda|vulkan ./install-llama.sh <commit_hash>
+#
+# Produces /install/bin/llama-cli and /install/bin/llama-server.
+# CUDA builds require CMAKE_CUDA_ARCHITECTURES in the environment.
+set -e
+
+# Ref to build (default: master) and GPU backend (default: cuda)
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin
+
+# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
+echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/llama.cpp
+cd /src/llama.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ggml-org/llama.cpp.git
+fi
+# Shallow-fetch only the requested ref and check out exactly what was fetched
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DBUILD_SHARED_LIBS=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DLLAMA_BUILD_TESTS=OFF
+)
+
+# Backend-specific flags. Any BACKEND value other than cuda or vulkan adds
+# nothing here, so only the common flags above are used.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+    )
+fi
+
+# Only these targets are built and installed
+TARGETS=(llama-cli llama-server)
+
+# Drop the CMake cache from a reused build directory so cmake configures from scratch
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building llama.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+# Verify each target exists before copying it into /install/bin
+for bin in "${TARGETS[@]}"; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+echo "=== llama.cpp build complete ==="
+ls -la /install/bin/
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Install stable-diffusion.cpp - clone, build, and install binaries and library
+# Usage: BACKEND=cuda|vulkan ./install-sd.sh <commit_hash>
+#
+# Produces /install/bin/sd-cli and /install/bin/sd-server, plus any shared
+# libraries found in the build tree under /install/lib.
+# CUDA builds require CMAKE_CUDA_ARCHITECTURES in the environment.
+set -e
+
+# Ref to build (default: master) and GPU backend (default: cuda)
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin /install/lib
+
+# Clone and checkout (init-based so cache-mounted /src/stable-diffusion.cpp/build dir doesn't break clone)
+echo "=== Cloning stable-diffusion.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/stable-diffusion.cpp
+cd /src/stable-diffusion.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/leejet/stable-diffusion.cpp.git
+fi
+# Shallow-fetch the requested ref, check it out, then shallow-fetch its submodules
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+git submodule update --init --recursive --depth=1
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DSD_BUILD_EXAMPLES=ON
+)
+
+# Backend-specific flags. Any BACKEND value other than cuda or vulkan adds
+# nothing here, so only the common flags above are used.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        -DSD_CUDA=ON
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+        -DSD_VULKAN=ON
+    )
+fi
+
+# The library target is built along with the two executables
+TARGETS=(stable-diffusion sd-cli sd-server)
+
+# Drop the CMake cache from a reused build directory so cmake configures from scratch
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building stable-diffusion.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+# Only the executables are checked and copied here; libraries are handled below
+for bin in sd-cli sd-server; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+# Copy every regular file matching *.so* from the build tree into /install/lib
+find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
+
+echo "=== stable-diffusion.cpp build complete ==="
+ls -la /install/bin/ /install/lib/
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Install whisper.cpp - clone, build, and install binaries
+# Usage: BACKEND=cuda|vulkan ./install-whisper.sh <commit_hash>
+#
+# Produces /install/bin/whisper-cli and /install/bin/whisper-server, plus any
+# shared libraries found in the build tree under /install/lib.
+# CUDA builds require CMAKE_CUDA_ARCHITECTURES in the environment.
+set -e
+
+# Ref to build (default: master) and GPU backend (default: cuda)
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin /install/lib
+
+# Clone and checkout (init-based so cache-mounted /src/whisper.cpp/build dir doesn't break clone)
+echo "=== Cloning whisper.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/whisper.cpp
+cd /src/whisper.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ggml-org/whisper.cpp.git
+fi
+# Shallow-fetch only the requested ref and check out exactly what was fetched
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+)
+
+# Backend-specific flags. Any BACKEND value other than cuda or vulkan adds
+# nothing here, so only the common flags above are used.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+    )
+fi
+
+# Only these targets are built and installed
+TARGETS=(whisper-cli whisper-server)
+
+# Drop the CMake cache from a reused build directory so cmake configures from scratch
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building whisper.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+# Verify each target exists before copying it into /install/bin
+for bin in "${TARGETS[@]}"; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+# Copy every regular file matching *.so* from the build tree into /install/lib
+find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
+
+echo "=== whisper.cpp build complete ==="
+ls -la /install/bin/
@@ -0,0 +1,594 @@
+# config.yaml
+
+llama-swap is designed to be very simple: one binary, one configuration file.
+
+## minimal viable config
+
+```yaml
+models:
+  model1:
+    cmd: llama-server --port ${PORT} --model /path/to/model.gguf
+```
+
+This is enough to launch `llama-server` to serve `model1`. Of course, llama-swap is about making it possible to serve many models:
+
+```yaml
+models:
+  model1:
+    cmd: llama-server --port ${PORT} -m /path/to/model.gguf
+  model2:
+    cmd: llama-server --port ${PORT} -m /path/to/another_model.gguf
+  model3:
+    cmd: llama-server --port ${PORT} -m /path/to/third_model.gguf
+```
+
+With this configuration models will be hot swapped and loaded on demand. The special `${PORT}` macro provides a unique port per model which is useful if you want to run multiple models at the same time with the `matrix` feature.
+
+## Advanced control with `cmd`
+
+llama-swap is also about customizability. You can use any CLI flag available:
+
+```yaml
+models:
+  model1:
+    cmd: | # support for multi-line
+      llama-server --port ${PORT} -m /path/to/model.gguf
+      --ctx-size 8192
+      --jinja
+      --cache-type-k q8_0
+      --cache-type-v q8_0
+```
+
+## Support for any OpenAI API compatible server
+
+llama-swap supports any OpenAI API compatible server. If you can run it on the CLI llama-swap will be able to manage it. Even if it's run in Docker or Podman containers.
+
+```yaml
+models:
+  "Q3-30B-CODER-VLLM":
+    name: "Qwen3 30B Coder vllm AWQ (Q3-30B-CODER-VLLM)"
+    # cmdStop provides a reliable way to stop containers
+    cmdStop: docker stop vllm-coder
+    cmd: |
+      docker run --init --rm --name vllm-coder
+        --runtime=nvidia --gpus '"device=2,3"'
+        --shm-size=16g
+        -v /mnt/nvme/vllm-cache:/root/.cache
+        -v /mnt/ssd-extra/models:/models -p ${PORT}:8000
+        vllm/vllm-openai:v0.10.0
+        --model "/models/cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ"
+        --served-model-name "Q3-30B-CODER-VLLM"
+        --enable-expert-parallel
+        --swap-space 16
+        --max-num-seqs 512
+        --max-model-len 65536
+        --max-seq-len-to-capture 65536
+        --gpu-memory-utilization 0.9
+        --tensor-parallel-size 2
+        --trust-remote-code
+```
+
+## Many more features..
+
+llama-swap supports many more features to customize how you want to manage your environment.
+
+| Feature   | Description                                    |
+| --------- | ---------------------------------------------- |
+| `ttl`     | automatic unloading of models after a timeout  |
+| `macros`  | reusable snippets to use in configurations     |
+| `matrix`  | run multiple models at a time                  |
+| `hooks`   | event driven functionality                     |
+| `env`     | define environment variables per model         |
+| `aliases` | serve a model with different names             |
+| `filters` | modify requests before sending to the upstream |
+| `...`     | And many more tweaks                           |
+
+## Full Configuration Example
+
+> [!NOTE]
+> Always check [config.example.yaml](https://github.com/mostlygeek/llama-swap/blob/main/config.example.yaml) for the most up to date reference for all example configurations.
+
+```yaml
+# add this modeline for validation in vscode
+# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
+#
+# llama-swap YAML configuration example
+# -------------------------------------
+#
+# 💡 Tip - Use an LLM with this file!
+# ====================================
+#  This example configuration is written to be LLM friendly. Try
+#  copying this file into an LLM and asking it to explain or generate
+#  sections for you.
+# ====================================
+
+# Usage notes:
+# - Below are all the available configuration options for llama-swap.
+# - Settings noted as "required" must be in your configuration file
+# - Settings noted as "optional" can be omitted
+
+# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
+# - optional, default: 120
+# - minimum value is 15 seconds, anything less will be set to this value
+healthCheckTimeout: 500
+
+# logLevel: sets the logging value
+# - optional, default: info
+# - Valid log levels: debug, info, warn, error
+logLevel: info
+
+# logTimeFormat: enables and sets the logging timestamp format
+# - optional, default (disabled): ""
+# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
+#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
+#   "stamp", "stampmilli", "stampmicro", and "stampnano".
+# - For more info, read: https://pkg.go.dev/time#pkg-constants
+logTimeFormat: ""
+
+# logToStdout: controls what is logged to stdout
+# - optional, default: "proxy"
+# - valid values:
+#   - "proxy": logs generated by llama-swap when swapping models,
+#      handling requests, etc.
+#   - "upstream": a copy of the upstream process's stdout logs
+#   - "both": both the proxy and upstream logs interleaved together
+#   - "none": no logs are ever written to stdout
+logToStdout: "proxy"
+
+# metricsMaxInMemory: maximum number of metrics to keep in memory
+# - optional, default: 1000
+# - controls how many metrics are stored in memory before older ones are discarded
+# - useful for limiting memory usage when processing large volumes of metrics
+metricsMaxInMemory: 1000
+
+# captureBuffer: how many MBs to allocate for storing request/response captures
+# - optional, default: 10
+# - set to 0 to disable
+captureBuffer: 15
+
+# performance: configuration for system monitoring statistics
+# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
+performance:
+  # disabled: boolean
+  # - default: false
+  enable: true
+
+  # every: delay between polling for new performance statistics
+  # - default: 5s
+  # - minimum duration 5s
+  every: 5s
+
+# startPort: sets the starting port number for the automatic ${PORT} macro.
+# - optional, default: 5800
+# - the ${PORT} macro can be used in model.cmd and model.proxy settings
+# - it is automatically incremented for every model that uses it
+startPort: 10001
+
+# sendLoadingState: inject loading status updates into the reasoning (thinking)
+# field
+# - optional, default: false
+# - when true, a stream of loading messages will be sent to the client in the
+#   reasoning field so chat UIs can show that loading is in progress.
+# - see #366 for more details
+sendLoadingState: true
+
+# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
+# - optional, default: false
+# - when true, model aliases will be output to the API model listing duplicating
+#   all fields except for Id so chat UIs can use the alias equivalent to the original.
+includeAliasesInList: false
+
+# globalTTL: the default TTL in seconds before unloading a model
+# - optional, default: 0 (never automatically unload)
+# - must be >= 0
+globalTTL: 0
+
+# macros: a dictionary of string substitutions
+# - optional, default: empty dictionary
+# - macros are reusable snippets
+# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
+# - useful for reducing common configuration settings
+# - macro names are strings and must be less than 64 characters
+# - macro names must match the regex ^[a-zA-Z0-9_-]+$
+# - macro names must not be a reserved name: PORT or MODEL_ID
+# - macro values can be numbers, bools, or strings
+# - macros can contain other macros, but they must be defined before they are used
+# - environment variables can be referenced with ${env.VAR_NAME} syntax
+#   - env macros are substituted first, before regular macros
+#   - if the env var is not set, config loading will fail with an error
+macros:
+  # Example of a multi-line macro
+  "latest-llama": >
+    /path/to/llama-server/llama-server-ec9e0301 --port ${PORT}
+
+  "default_ctx": 4096
+
+  # Example of macro-in-macro usage. macros can contain other macros
+  # but they must be previously declared.
+  "default_args": "--ctx-size ${default_ctx}"
+
+  # Example of environment variable macros
+  # - ${env.VAR_NAME} pulls the value from the system environment
+  # - useful for paths, secrets, or machine-specific configuration
+  "models_dir": "${env.HOME}/models"
+
+# apiKeys: require an API key when making requests to inference endpoints
+# - optional, default: []
+# - when empty (the default) authorization will not be checked as llama-swap is default-allow
+# - each key is a non-empty string
+apiKeys:
+  - "sk-hunter2"
+  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
+  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
+
+  # use environment variable macros to keep secrets out of the config
+  - "${env.API_KEY_1}"
+  - "${env.API_KEY_2}"
+
+# models: a dictionary of model configurations
+# - required
+# - each key is the model's ID, used in API requests
+# - model settings have default values that are used if they are not defined here
+# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
+# - below are examples of all the settings a model can have
+models:
+  # keys are the model names used in API requests
+  "gpt-oss-120b":
+    # macros: a dictionary of string substitutions specific to this model
+    # - optional, default: empty dictionary
+    # - macros defined here override macros defined in the global macros section
+    # - model level macros follow the same rules as global macros
+    macros:
+      "default_ctx": 16384
+      "temp": 0.7
+
+    # cmd: the command to run to start the inference server.
+    # - required
+    # - it is just a string, similar to what you would run on the CLI
+    # - using `|` allows for comments in the command, these will be parsed out
+    # - macros can be used within cmd
+    cmd: |
+      # ${latest-llama} is a macro that is defined above
+      ${latest-llama}
+      --model path/to/gpt-oss-120B.gguf
+      --ctx-size ${default_ctx}
+      --temperature ${temp}
+
+    # name: a display name for the model
+    # - optional, default: empty string
+    # - if set, it will be used in the v1/models API response
+    # - if not set, it will be omitted in the JSON model record
+    name: "gpt-oss 120B"
+
+    # description: a description for the model
+    # - optional, default: empty string
+    # - if set, it will be used in the v1/models API response
+    # - if not set, it will be omitted in the JSON model record
+    description: "A thinking model from OpenAI"
+
+    # env: define an array of environment variables to inject into cmd's environment
+    # - optional, default: empty array
+    # - each value is a single string
+    # - in the format: ENV_NAME=value
+    env:
+      - "CUDA_VISIBLE_DEVICES=0,1,2"
+
+    # proxy: the URL where llama-swap routes API requests
+    # - optional, default: http://localhost:${PORT}
+    # - if you used ${PORT} in cmd this can be omitted
+    # - if you use a custom port in cmd this *must* be set
+    proxy: http://127.0.0.1:8999
+
+    # checkEndpoint: URL path to check if the server is ready
+    # - optional, default: /health
+    # - endpoint is expected to return an HTTP 200 response
+    # - all requests wait until the endpoint is ready or fails
+    # - use "none" to skip endpoint health checking
+    checkEndpoint: /custom-endpoint
+
+    # ttl: automatically unload the model after ttl seconds
+    # - optional, default: -1 (use global default)
+    # - ttl values must be -1 or a value greater than or equal to 0
+    # - a ttl of -1 will use the global TTL value as the default
+    # - a ttl of 0 will mean never unload
+    # - a value of 0 disables automatic unloading of the model
+    ttl: 60
+
+    # useModelName: override the model name that is sent to upstream server
+    # - optional, default: ""
+    # - useful for when the upstream server expects a specific model name that
+    #   is different from the model's ID
+    useModelName: "openai/gpt-oss-120B"
+
+    # filters: a dictionary of filter settings
+    # - optional, default: empty dictionary
+    # - same capabilities as peer filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for server side enforcement of sampling parameters
+      # - the `model` parameter can never be removed
+      # - can be any JSON key in the request body
+      # - recommended to stick to sampling parameters
+      stripParams: "temperature, top_p, top_k"
+
+      # setParams: a dictionary of parameters to set/override in requests
+      # - optional, default: empty dictionary
+      # - useful for enforcing specific parameter values
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - always runs for the model
+      setParams:
+        # Example: enforce specific sampling parameters
+        temperature: 0.7
+        top_p: 0.9
+
+      # setParamsByID: a dictionary of parameters to set based on the model ID
+      # - optional, default: empty dictionary
+      # - combine with aliases to create variant behaviour without reloading the model
+      # - parameters are set in the request body JSON
+      # - run after setParams so it will override any settings
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - model aliases will be automatically created for each key
+      setParamsByID:
+        "${MODEL_ID}":
+          chat_template_kwargs:
+            reasoning_effort: medium
+        "${MODEL_ID}:high":
+          chat_template_kwargs:
+            reasoning_effort: high
+        "${MODEL_ID}:low":
+          chat_template_kwargs:
+            reasoning_effort: low
+
+    # aliases: alternative model names that this model configuration is used for
+    # - optional, default: empty array
+    # - aliases must be unique globally
+    # - useful for impersonating a specific model
+    aliases:
+      - "gpt-4o-mini"
+
+    # metadata: a dictionary of arbitrary values that are included in /v1/models
+    # - optional, default: empty dictionary
+    # - while metadata can contain complex types it is recommended to keep it simple
+    # - metadata is only passed through in /v1/models responses
+    metadata:
+      # port will remain an integer
+      port: ${PORT}
+
+      # the ${temp} macro will remain a float
+      temperature: ${temp}
+      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
+        context=${default_ctx}"
+
+      a_list:
+        - 1
+        - 1.23
+        - "macros are OK in list and dictionary types: ${MODEL_ID}"
+
+      an_obj:
+        a: "1"
+        b: 2
+        # objects can contain complex types with macro substitution
+        # becomes: c: [0.7, false, "model: llama"]
+        c: ["${temp}", false, "model: ${MODEL_ID}"]
+
+    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
+    # - optional, default: 0
+    # - useful for limiting the number of active parallel requests a model can process
+    # - must be set per model
+    # - any number greater than 0 will override the internal default value of 10
+    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
+    # - recommended to be omitted and the default used
+    concurrencyLimit: 0
+
+    # sendLoadingState: overrides the global sendLoadingState setting for this model
+    # - optional, default: undefined (use global setting)
+    sendLoadingState: false
+
+    # timeouts: configure proxy connection timeouts for this model
+    # - optional, defaults shown below
+    # - useful for models running on slower hardware that need longer timeouts
+    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
+    # - keepalive: TCP connection keepalive timeout, default: 30 seconds
+    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
+    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
+    # - idleConn: idle connection timeout in seconds, default: 90 seconds
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 0
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+  # Unlisted model example:
+  "qwen-unlisted":
+    # unlisted: boolean, true or false
+    # - optional, default: false
+    # - unlisted models do not show up in /v1/models api requests
+    # - can be requested as normal through all apis
+    unlisted: true
+    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+
+  # Docker example:
+  # container runtimes like Docker and Podman can be used reliably with
+  # a combination of cmd, cmdStop, and ${MODEL_ID}
+  "docker-llama":
+    proxy: "http://127.0.0.1:${PORT}"
+    cmd: |
+      docker run --name ${MODEL_ID}
+      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggml-org/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
+
+    # cmdStop: command to run to stop the model gracefully
+    # - optional, default: ""
+    # - useful for stopping commands managed by another system
+    # - the upstream's process id is available in the ${PID} macro
+    #
+    # When empty, llama-swap has this default behaviour:
+    # - on POSIX systems: a SIGTERM signal is sent
+    # - on Windows, calls taskkill to stop the process
+    # - processes have 5 seconds to shutdown until forceful termination is attempted
+    cmdStop: docker stop ${MODEL_ID}
+
+# =============================================================================
+# matrix: run concurrent models with a solver-based swap DSL
+# =============================================================================
+#
+# Note:
+# A config must use either a matrix or legacy groups, not both. A configuration error
+# will occur if both are defined. Configuration examples for legacy Groups can be found:
+# https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396
+#
+# The matrix declares valid combinations of models that can run concurrently.
+# When a model is requested, the solver finds the cheapest way to make it
+# available by evicting as few (and least costly) running models as possible.
+#
+# Solver behavior:
+#   1. Request arrives for model X
+#   2. If X is already running, forward immediately. Done.
+#   3. Find all sets containing X
+#   4. For each candidate set, compute cost: sum of evict_costs for
+#      every running model NOT in that set
+#   5. Pick lowest cost candidate. Ties broken by definition order.
+#   6. Evict what needs to stop. Start X. Forward request.
+#
+# Subset semantics: a set [a, b, c] means any subset is valid.
+# Only the requested model is started — others are not preloaded.
+#
+# A model not appearing in any set can only run alone.
+#
+matrix:
+  # vars: short names for models (alphanumeric, 1-8 chars)
+  # - required for sets and evict_costs settings
+  # - each entry is a short name to a real model ID. Do not use an alias
+  # - used to keep set DSL logic short and easier to read
+  # - sets and evict_costs only use identifiers defined in vars
+  vars:
+    g: gemma-model
+    q: qwen-model
+    m: mistral-model
+    v: voxtral-model
+    e: reranker-model
+    L: llama-70B
+    sd: stable-diffusion
+
+  # evict_costs: relative cost of losing a running model (default: 1)
+  evict_costs:
+    v: 50 # vllm backend, slow cold start
+    L: 30 # 70B weights, slow to load
+
+  # sets: named sets of concurrent model combinations
+  # Values are DSL strings with operators:
+  #   &     AND (models run together)
+  #   |     OR  (alternatives)
+  #   ()    grouping
+  #   +ref  inline another set's expression
+  #
+  # Expansion examples:
+  #   "L"                  → [L]
+  #   "a & b"              → [a, b]
+  #   "a | b"              → [a], [b]
+  #   "(a | b) & c"        → [a, c], [b, c]
+  #   "(a | b) & (c | d)"  → [a,c], [a,d], [b,c], [b,d]
+  #   "+llms & v"          → expands llms inline, then applies & v
+  sets:
+    # LLM + TTS: switching between g/q/m won't evict v
+    # expands to: [g,v], [q,v], [m,v]
+    standard: "(g | q | m) & v"
+
+    # LLM + TTS + reranker
+    # expands to: [g,v,e], [q,v,e]
+    with_rerank: "(g | q) & v & e"
+
+    # LLM + image generation, no TTS
+    # expands to: [g,sd], [q,sd]
+    creative: "(g | q) & sd"
+
+    # 70B model uses all GPUs, can only run alone
+    # expands to: [L]
+    full: "L"
+
+# hooks: a dictionary of event triggers and actions
+# - optional, default: empty dictionary
+# - the only supported hook is on_startup
+hooks:
+  # on_startup: a dictionary of actions to perform on startup
+  # - optional, default: empty dictionary
+  # - the only supported action is preload
+  on_startup:
+    # preload: a list of model ids to load on startup
+    # - optional, default: empty list
+    # - model names must match keys in the models sections
+    # - when preloading multiple models at once, define a group
+    #   otherwise models will be loaded and swapped out
+    preload:
+      - "llama"
+
+# peers: a dictionary of remote peers and models they provide
+# - optional, default empty dictionary
+# - peers can be another llama-swap
+# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
+peers:
+  # each key is the peer's ID
+  llama-swap-peer:
+    # proxy: a valid base URL to proxy requests to
+    # - required
+    # - requested path to llama-swap will be appended to the end of the proxy value
+    proxy: http://192.168.1.23
+    # models: a list of models served by the peer
+    # - required
+    models:
+      - model_a
+      - model_b
+      - embeddings/model_c
+  openrouter:
+    proxy: https://openrouter.ai/api
+    # apiKey: a string key to be injected into the request
+    # - optional, default: ""
+    # - if blank, no key will be added to the request
+    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
+    # - can be a string or a macro
+    apiKey: ${env.OPENROUTER_API_KEY}
+    models:
+      - meta-llama/llama-3.1-8b-instruct
+      - qwen/qwen3-235b-a22b-2507
+      - deepseek/deepseek-v3.2
+      - z-ai/glm-4.7
+      - moonshotai/kimi-k2-0905
+      - minimax/minimax-m2.1
+    # timeouts: configure proxy connection timeouts for this peer
+    # - optional, defaults shown below
+    # - useful when the peer runs on slower hardware
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 30
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+    # filters: a dictionary of filter settings for peer requests
+    # - optional, default: empty dictionary
+    # - same capabilities as model filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for removing parameters that the peer doesn't support
+      # - the `model` parameter can never be removed
+      stripParams: "temperature, top_p"
+
+      # setParams: a dictionary of parameters to set/override in requests to this peer
+      # - optional, default: empty dictionary
+      # - useful for injecting provider-specific settings like data retention policies
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      setParams:
+        # Example: enforce zero-data-retention for OpenRouter
+        provider:
+          data_collection: "deny"
+          zdr: true
+```
@@ -0,0 +1,9 @@
+## Container Security
+
+For convenience, the default container images use the **root** user within the container. This permits simplified access to host resources including volume mounts and hardware devices under `/dev/dri` (_for Vulkan support_). But this can widen the attack surface to privilege escalation exploits.
+
+Alternative images, tagged as `non-root`, are also available. For example, `llama-swap:cpu-non-root` uses the unprivileged **app** user by default. Depending on deployment requirements, additional configuration may be necessary to ensure that the container retains access to required host resources. This might entail customizing host filesystem permissions/ownership appropriately or injecting host group membership into the container.
+
+Docker offers a [system-wide option enabling user namespace remapping](https://docs.docker.com/engine/security/userns-remap/) to accommodate situations where a **root** container user is required but also mentions that _"The best way to prevent privilege-escalation attacks from within a container is to configure your container's applications to run as unprivileged users."_ Podman offers similar capability, per-container, to [set UID/GID mapping in a new user namespace](https://docs.podman.io/en/latest/markdown/podman-run.1.html#set-uid-gid-mapping-in-a-new-user-namespace).
+
+The Large Language Model (_LLM/AI_) ecosystem is rapidly evolving and [serious security vulnerabilities have surfaced in the past](https://huggingface.co/docs/hub/security-pickle). These alternative _non-root_ images could reduce the impact of future unknown problems. However, proper planning and configuration is recommended to utilize them.
@@ -0,0 +1,153 @@
+# aider, QwQ, Qwen-Coder 2.5 and llama-swap
+
+This guide shows how to use aider and llama-swap to get a 100% local coding co-pilot setup. The focus is on the trickiest part, which is configuring aider, llama-swap and llama-server to work together.
+
+## Here's what you need:
+
+- aider - [installation docs](https://aider.chat/docs/install.html)
+- llama-server - [download latest release](https://github.com/ggml-org/llama.cpp/releases)
+- llama-swap - [download latest release](https://github.com/mostlygeek/llama-swap/releases)
+- [QwQ 32B](https://huggingface.co/bartowski/Qwen_QwQ-32B-GGUF) and [Qwen Coder 2.5 32B](https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF) models
+- 24GB VRAM video card
+
+## Running aider
+
+The goal is getting this command line to work:
+
+```sh
+aider --architect \
+    --no-show-model-warnings \
+    --model openai/QwQ \
+    --editor-model openai/qwen-coder-32B \
+    --model-settings-file aider.model.settings.yml \
+    --openai-api-key "sk-na" \
+    --openai-api-base "http://10.0.1.24:8080/v1"
+```
+
+Set `--openai-api-base` to the IP and port where your llama-swap is running.
+
+## Create an aider model settings file
+
+```yaml
+# aider.model.settings.yml
+
+#
+# !!! important: model names must match llama-swap configuration names !!!
+#
+
+- name: "openai/QwQ"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.95
+    top_k: 40
+    presence_penalty: 0.1
+    repetition_penalty: 1
+    num_ctx: 16384
+  use_temperature: 0.6
+  reasoning_tag: think
+  weak_model_name: "openai/qwen-coder-32B"
+  editor_model_name: "openai/qwen-coder-32B"
+
+- name: "openai/qwen-coder-32B"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.8
+    top_k: 20
+    repetition_penalty: 1.05
+  use_temperature: 0.6
+  reasoning_tag: think
+  editor_edit_format: editor-diff
+  editor_model_name: "openai/qwen-coder-32B"
+```
+
+## llama-swap configuration
+
+```yaml
+# config.yaml
+
+# The parameters are tweaked to fit model+context into 24GB VRAM GPUs
+models:
+  "qwen-coder-32B":
+    proxy: "http://127.0.0.1:8999"
+    cmd: >
+      /path/to/llama-server
+      --host 127.0.0.1 --port 8999 --flash-attn --slots
+      --ctx-size 16000
+      --cache-type-k q8_0 --cache-type-v q8_0
+       -ngl 99
+      --model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
+
+  "QwQ":
+    proxy: "http://127.0.0.1:9503"
+    cmd: >
+      /path/to/llama-server
+      --host 127.0.0.1 --port 9503 --flash-attn --metrics --slots
+      --cache-type-k q8_0 --cache-type-v q8_0
+      --ctx-size 32000
+      --samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
+      --temp 0.6 --repeat-penalty 1.1 --dry-multiplier 0.5
+      --min-p 0.01 --top-k 40 --top-p 0.95
+      -ngl 99
+      --model /mnt/nvme/models/bartowski/Qwen_QwQ-32B-Q4_K_M.gguf
+```
+
+## Advanced, Dual GPU Configuration
+
+If you have _dual 24GB GPUs_ you can use llama-swap profiles to avoid swapping between QwQ and Qwen Coder.
+
+In llama-swap's configuration file:
+
+1. add a `profiles` section with `aider` as the profile name
+2. use the `env` field to specify the GPU IDs for each model
+
+```yaml
+# config.yaml
+
+# Add a profile for aider
+profiles:
+  aider:
+    - qwen-coder-32B
+    - QwQ
+
+models:
+  "qwen-coder-32B":
+    # manually set the GPU to run on
+    env:
+      - "CUDA_VISIBLE_DEVICES=0"
+    proxy: "http://127.0.0.1:8999"
+    cmd: /path/to/llama-server ...
+
+  "QwQ":
+    # manually set the GPU to run on
+    env:
+      - "CUDA_VISIBLE_DEVICES=1"
+    proxy: "http://127.0.0.1:9503"
+    cmd: /path/to/llama-server ...
+```
+
+Append the profile tag, `aider:`, to the model names in the model settings file
+
+```yaml
+# aider.model.settings.yml
+- name: "openai/aider:QwQ"
+  weak_model_name: "openai/aider:qwen-coder-32B"
+  editor_model_name: "openai/aider:qwen-coder-32B"
+
+- name: "openai/aider:qwen-coder-32B"
+  editor_model_name: "openai/aider:qwen-coder-32B"
+```
+
+Run aider with:
+
+```sh
+$ aider --architect \
+    --no-show-model-warnings \
+    --model openai/aider:QwQ \
+    --editor-model openai/aider:qwen-coder-32B \
+    --config aider.conf.yml \
+    --model-settings-file aider.model.settings.yml \
+    --openai-api-key "sk-na" \
+    --openai-api-base "http://10.0.1.24:8080/v1"
+```
@@ -0,0 +1,28 @@
+# this makes use of llama-swap's profile feature to
+# keep the architect and editor models in VRAM on different GPUs
+
+- name: "openai/aider:QwQ"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.95
+    top_k: 40
+    presence_penalty: 0.1
+    repetition_penalty: 1
+    num_ctx: 16384
+  use_temperature: 0.6
+  reasoning_tag: think
+  weak_model_name: "openai/aider:qwen-coder-32B"
+  editor_model_name: "openai/aider:qwen-coder-32B"
+
+- name: "openai/aider:qwen-coder-32B"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.8
+    top_k: 20
+    repetition_penalty: 1.05
+  use_temperature: 0.6
+  reasoning_tag: think
+  editor_edit_format: editor-diff
+  editor_model_name: "openai/aider:qwen-coder-32B"
@@ -0,0 +1,26 @@
+- name: "openai/QwQ"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.95
+    top_k: 40
+    presence_penalty: 0.1
+    repetition_penalty: 1
+    num_ctx: 16384
+  use_temperature: 0.6
+  reasoning_tag: think
+  weak_model_name: "openai/qwen-coder-32B"
+  editor_model_name: "openai/qwen-coder-32B"
+
+- name: "openai/qwen-coder-32B"
+  edit_format: diff
+  extra_params:
+    max_tokens: 16384
+    top_p: 0.8
+    top_k: 20
+    repetition_penalty: 1.05
+  use_temperature: 0.6
+  reasoning_tag: think
+  editor_edit_format: editor-diff
+  editor_model_name: "openai/qwen-coder-32B"
+
@@ -0,0 +1,49 @@
+healthCheckTimeout: 300
+logLevel: debug
+
+profiles:
+    aider:
+      - qwen-coder-32B
+      - QwQ
+
+models:
+  "qwen-coder-32B":
+    env:
+      - "CUDA_VISIBLE_DEVICES=0"
+    aliases:
+      - coder
+    proxy: "http://127.0.0.1:8999"
+
+    # set appropriate paths for your environment
+    cmd: >
+      /path/to/llama-server
+      --host 127.0.0.1 --port 8999 --flash-attn --slots
+      --ctx-size 16000
+      --ctx-size-draft 16000
+      --model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
+      --model-draft /path/to/Qwen2.5-Coder-1.5B-Instruct-Q8_0.gguf
+      -ngl 99 -ngld 99
+      --draft-max 16 --draft-min 4 --draft-p-min 0.4
+      --cache-type-k q8_0 --cache-type-v q8_0
+  "QwQ":
+    env:
+      - "CUDA_VISIBLE_DEVICES=1"
+    proxy: "http://127.0.0.1:9503"
+
+    # set appropriate paths for your environment
+    cmd: >
+      /path/to/llama-server
+      --host 127.0.0.1 --port 9503
+      --flash-attn --metrics
+      --slots
+      --model /path/to/Qwen_QwQ-32B-Q4_K_M.gguf
+      --cache-type-k q8_0 --cache-type-v q8_0
+      --ctx-size 32000
+      --samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
+      --temp 0.6
+      --repeat-penalty 1.1
+      --dry-multiplier 0.5
+      --min-p 0.01
+      --top-k 40
+      --top-p 0.95
+      -ngl 99 -ngld 99
@@ -0,0 +1,51 @@
+# Restart llama-swap on config change
+
+Sometimes editing the configuration file can take a bit of trial and error to get a model configuration tuned just right. The `watch-and-restart.sh` script can be used to watch `config.yaml` for changes and restart `llama-swap` when it detects a change.
+
+```bash
+#!/bin/bash
+#
+# A simple watch and restart llama-swap when its configuration
+# file changes. Useful for trying out configuration changes
+# without manually restarting the server each time.
+if [ -z "$1" ]; then
+    echo "Usage: $0 <path to config.yaml>"
+    exit 1
+fi
+
+while true; do
+    # Start the process again
+    ./llama-swap-linux-amd64 -config "$1" -listen :1867 &
+    PID=$!
+    echo "Started llama-swap with PID $PID"
+
+    # Wait for modifications in the specified directory or file
+    inotifywait -e modify "$1"
+
+    # Check if process exists before sending signal
+    if kill -0 $PID 2>/dev/null; then
+        echo "Sending SIGTERM to $PID"
+        kill -SIGTERM $PID
+        wait $PID
+    else
+        echo "Process $PID no longer exists"
+    fi
+    sleep 1
+done
+```
+
+## Usage and output example
+
+```bash
+$ ./watch-and-restart.sh config.yaml
+Started llama-swap with PID 495455
+Setting up watches.
+Watches established.
+llama-swap listening on :1867
+Sending SIGTERM to 495455
+Shutting down llama-swap
+Started llama-swap with PID 495486
+Setting up watches.
+Watches established.
+llama-swap listening on :1867
+```
@@ -0,0 +1,264 @@
+# New Router Migration TODO
+
+This document tracks the work needed for [cmd/newrouter/main.go](../cmd/newrouter/main.go) and [internal/router/](../internal/router/) to reach feature parity with the legacy entrypoint at [llama-swap.go](../llama-swap.go) plus [proxy/proxymanager.go](../proxy/proxymanager.go).
+
+The work is split into phases so each can land and be tested independently. Earlier phases unblock later ones.
+
+## Current state (newrouter)
+
+`cmd/newrouter` already supports:
+
+- Loading config via `-config`
+- Selecting Matrix vs Group router based on config
+- Peer routing fallback
+- Plain HTTP listen (`-listen`)
+- Graceful shutdown on `SIGINT` / `SIGTERM`
+- Model extraction from JSON body, query string, and form bodies (see [router.go:88](../internal/router/router.go#L88))
+- `Server.ServeHTTP` dispatches a single request to peer or local router based on the requested model
+
+Everything below is missing or only partially implemented.
+
+---
+
+## Phase 1 — Package relocation -- Completed.
+
+Goal: move shared infrastructure packages out from under `proxy/` so the new router does not depend on the legacy proxy tree. This is a prerequisite for retiring `proxy/` in Phase 8.
+
+---
+
+## Phase 2 — Server lifecycle parity -- Completed.
+
+Goal: make `cmd/newrouter` a drop-in replacement for the legacy binary's process model, _without_ yet adding any extra HTTP endpoints.
+
+---
+
+## Phase 3 — `internal/chain` package -- Completed.
+
+API: `chain.New(mws...).Then(final)` for ServeMux registration; `Append` returns an extended Chain without mutating the receiver, so a base stack (auth/CORS) can be reused across many routes with per-route additions.
+
+---
+
+## Phase 4 — `internal/server` package scaffolding (ProxyManager replacement) -- Completed.
+
+Goal: build the [internal/server](../internal/server/) package so it can stand in for [proxy.ProxyManager](../proxy/proxymanager.go#L67) — the mux, lifecycle, model dispatch, custom endpoints, request filters, auth/CORS, and upstream passthrough. After this phase, `cmd/newrouter/main.go` constructs a `server.Server` instead of a bare `router.Server`.
+
+The legacy `ProxyManager` collapses three concerns into one struct: the HTTP mux, the model→process router, and the cross-cutting services (loggers, metrics, perf, inflight counter, version). The new layout keeps the `router.Router` implementations focused on model dispatch and lets `internal/server.Server` own the mux and all cross-cutting middleware. `server.Server` builds the `local` and `peer` routers directly and dispatches between them itself, so it fully **supersedes `internal/router.Server`** — see the cleanup item below.
+
+The phase is split into sub-phases that can land and be tested independently:
+
+| Sub-phase | Scope                                                                      |
+| --------- | -------------------------------------------------------------------------- |
+| 4a        | package scaffolding — struct, `New`, `ServeHTTP`, `Shutdown`, model routes |
+| 4b        | custom (non-model-dispatched) HTTP endpoints                               |
+| 4c        | request-body filter middleware                                             |
+| 4d        | auth & CORS middleware                                                     |
+| 4e        | upstream passthrough                                                       |
+
+The package is split by concern across stub files already in place:
+
+| File         | Responsibility                                  | Filled in by           |
+| ------------ | ----------------------------------------------- | ---------------------- |
+| `server.go`  | `Server` struct, `New`, `ServeHTTP`, `Shutdown` | 4a                     |
+| `log.go`     | `muxlog` combined logger; `/logs` handlers      | 4a                     |
+| `auth.go`    | `CreateAuthMiddleware`                          | 4d                     |
+| `filters.go` | request-body filter middleware                  | 4c                     |
+| `api.go`     | llama-swap-specific API handlers                | 4b / Phase 5 / Phase 6 |
+| `ui.go`      | embedded UI serving                             | Phase 7                |
+
+### Phase 4a — package scaffolding -- Completed.
+
+`server.Server` owns the mux, the `local`/`peer` routers, `muxlog`, and a
+shutdown context. `New` builds the routers, registers all model-dispatched
+routes on a stdlib `http.ServeMux`, and wraps the mux with the global CORS
+middleware. `localPeerHandler` resolves the model once via `router.FetchModel`
+and dispatches to `local` or `peer`. `Shutdown` stops both routers in parallel
+and is idempotent. `cmd/newrouter/main.go` now constructs `server.New(...)`;
+`internal/router/server.go` and `server_test.go` were removed as dead code.
+
+### Phase 4b — Custom HTTP endpoints -- Completed.
+
+`GET /v1/models` (local + peer models, aliases, metadata), `GET /health`,
+`GET /wol-health`, and `GET /` → `/ui` are registered. `GET /favicon.ico` is
+deferred to Phase 7 since it requires the embedded UI filesystem.
+
+### Phase 4c — Request-body filters -- Completed.
+
+`CreateFilterMiddleware` (in `filters.go`) applies `UseModelName`,
+`StripParams`, `SetParams`, and `SetParamsByID` to JSON requests, then
+re-attaches the body with `Content-Length` / `Transfer-Encoding` cleanup.
+
+### Phase 4d — Auth & CORS -- Completed.
+
+`CreateAuthMiddleware` validates API keys (Bearer / Basic / `x-api-key`) and
+strips the headers before upstream. `CreateCORSMiddleware` answers OPTIONS
+preflight; `/v1/models` echoes the `Origin`.
+
+### Phase 4e — Upstream passthrough -- Completed.
+
+`GET /upstream` → `/ui/models`, and `/upstream/<model>/<path>` proxies to the
+resolved model with multi-segment name resolution, canonical-form redirect
+(301/308), and prefix stripping.
+
+---
+
+## Phase 5 — Operations endpoints -- Completed.
+
+A new `router.LocalRouter` interface embeds `Router` and adds `RunningModels()`
+and `Unload(timeout, models...)`, both implemented once on `baseRouter` so
+`Group` and `Matrix` share them — the legacy matrix/group divergence at
+[proxymanager.go:1167](../proxy/proxymanager.go#L1167) collapses since
+`baseRouter` already unifies process storage. `Peer` does not implement it;
+`Server.local` is typed `LocalRouter`, `Server.peer` stays `Router`.
+
+`GET /unload` stops every local process; `GET /running` lists non-stopped
+processes joined against config for `cmd`/`proxy`/`ttl`/`name`/`description`.
+`startPreload` fires a background `GET /` at each `Hooks.OnStartup.Preload`
+model and emits `shared.ModelPreloadedEvent`.
+
+---
+
+## Phase 6 — Metrics, perf, and SSE -- Completed.
+
+`perf.Monitor` is created and started in `cmd/newrouter/main.go` (it outlives
+config reloads via `UpdateConfig`) and passed into `server.New`. `GET /metrics`
+serves `perf.Monitor.MetricsHandler()` output, 503 when disabled.
+
+`internal/process` emits `shared.ProcessStateChangeEvent` from `setState`.
+`server.inflightCounter` (atomic) + `CreateInflightMiddleware` track
+model-dispatched requests and emit `InFlightRequestsEvent`. `metricsMonitor`
+(in `metrics.go`) parses token usage from upstream responses via
+`CreateMetricsMiddleware`.
+
+The `/api` group (API-key protected) is registered: `POST /api/models/unload`,
+`POST /api/models/unload/{model...}`, `GET /api/events` (SSE: `modelStatus` /
+`logData` / `metrics` / `inflight`), `GET /api/metrics`, `GET /api/performance`
+(`?after=` RFC3339 filter), `GET /api/version`. `GET /api/captures/{id}`
+returns 501 until 6f.
+
+### Phase 6f — Request/response captures -- Completed.
+
+`proxy/cache` moved to `internal/cache`. `metricsMonitor` stores zstd+CBOR
+`ReqRespCapture` records in a sized `cache.Cache` (`captureBuffer` MB, 0
+disables). `CreateMetricsMiddleware` buffers request body/headers before
+dispatch; `record` builds the capture per a `captureFieldsByPath` table
+(`captures.go`) that trims large audio/image payloads, defaulting JSON routes
+to `captureAll`. `GET /api/captures/{id}` decompresses and returns the capture;
+`getMetrics` resolves `HasCapture` against the cache.
+
+---
+
+## Phase 7 — UI serving -- Completed.
+
+`internal/server/ui.go` embeds `ui_dist` and serves it. `GET /ui/` is
+brotli/gzip-aware via `serveCompressedFile`; unknown paths without a file
+extension fall back to `index.html` for SPA routing. `GET /favicon.ico` serves
+from the same embedded FS. The Makefile `ui` target copies the vite build into
+`internal/server/ui_dist`; a committed `placeholder.txt` keeps the embed valid
+before a build runs.
+
+---
+
+## Phase 8a - Review Part I
+
+- [x] All functionality from the proxy package has been migrated in the above phases — with the remaining gaps listed in Phase 8b
+- [x] Test coverage meets or exceeds the level from the proxy package — `internal/server` now at 76.6% vs 73.9% (`proxy`)
+
+### Findings
+
+**Gap 1 — Request logging middleware missing -- Resolved.**
+
+`CreateRequestLogMiddleware` ([log.go](../internal/server/log.go)) records one
+access-log line per request to `s.proxylog` in the legacy format
+`clientIP "METHOD PATH PROTO" status bodySize "UA" duration`, skipping
+`/wol-health`, `/api/performance`, and `/metrics`. A `statusRecorder` captures
+the status/body size (forwarding `Flush` for SSE) and `clientIP` honours
+`X-Forwarded-For` / `X-Real-IP`. It is wired as the outermost middleware in
+`routes()`, wrapping the CORS layer.
+
+**Gap 2 — Per-model log streaming not supported -- Resolved.**
+
+`Server.getLogger` ([log.go:50](../internal/server/log.go#L50)) previously only handled `""`, `"proxy"`, and `"upstream"`. The legacy `ProxyManager.getLogger` ([proxymanager_loghandlers.go:92](../proxy/proxymanager_loghandlers.go#L92)) additionally resolves a model ID against the active process groups / matrix and returns that process's logger. Before the fix tracked in Phase 8b, callers of `GET /logs/stream/<modelID>` got a 400 instead of the model's live log stream.
+
+**Gap 3 — `UseModelName` not applied to multipart form endpoints -- Resolved.**
+
+`CreateFormFilterMiddleware` ([filters.go](../internal/server/filters.go)) parses
+`multipart/form-data` requests, rewrites the `model` field with `UseModelName`,
+reconstructs the body via `rewriteMultipartModel`, and re-attaches it with
+`Content-Type` / `Content-Length` cleanup. It runs in `modelChain` after the
+JSON `filterMW`; each is a no-op for the other's Content-Type. Audio
+transcription (`/v1/audio/transcriptions`) and image edit (`/v1/images/edits`)
+now honour `use_model_name`.
+
+**Coverage gaps (0 % functions) -- Resolved.**
+
+The functions previously at 0 % (`handleListModels`, `handleMetrics`,
+`handleRootRedirect`, `handleUpstreamRedirect`, `handleUpstream`,
+`findModelInPath`, `handleAPICapture`, `handleAPIUnloadAll`,
+`handleAPIUnloadModel`, `CreateAuthMiddleware`, `extractAPIKey`,
+`handleLogStream`, `applyFilters`, `decompressBody`, `filterAcceptEncoding`,
+`handleUI`, `handleFavicon`) now have tests across `auth_test.go`, `api_test.go`,
+`filters_test.go`, `log_test.go`, and `extras_test.go`.
+
+---
+
+## Phase 8b - Fill gaps discovered in Phase 8a
+
+- [x] **Add request-log middleware** — `CreateRequestLogMiddleware` ([log.go](../internal/server/log.go)) records `clientIP "METHOD PATH PROTO" status bodySize "UA" duration` to `s.proxylog`, skips `/wol-health` / `/api/performance` / `/metrics`, and is wired as the outermost middleware in `routes()`.
+- [x] **Extend `getLogger` with model-ID resolution** — add a `default:` branch to `Server.getLogger` ([log.go:50](../internal/server/log.go#L50)) that resolves the ID via `s.local` (using a new `LocalRouter.GetProcess(name)` method or equivalent) and returns that process's `Logger()`. Match the fallback behaviour: return a 400 with `"invalid logger. Use 'proxy', 'upstream' or a model's ID"` when not found.
+- [x] **`UseModelName` rewrite for multipart endpoints** — `CreateFormFilterMiddleware` parses `multipart/form-data`, rewrites the `model` field according to `UseModelName`, reconstructs the body, and updates `Content-Type` / `Content-Length`. It is wired into `modelChain` after the JSON filter.
+- [x] **Raise test coverage to ≥ 74 %** — `internal/server` now at 76.1%; tests added for every 0 % function across `auth_test.go`, `api_test.go`, `filters_test.go`, `log_test.go`, and `extras_test.go`.
+
+---
+
+## Phase 8c - Review Part II (entrypoint comparison)
+
+A second pass comparing [cmd/newrouter/main.go](../cmd/newrouter/main.go) against
+the legacy [llama-swap.go](../llama-swap.go) + [proxy.New](../proxy/proxymanager.go#L104)
+surfaced four more gaps, all in logger setup.
+
+**Gap 4 — `LogToStdout` config ignored -- Resolved.**
+
+`cmd/newrouter/main.go` previously hardcoded `proxyLog` / `upstreamLog` to
+`os.Stdout`, and the old `muxlog()` helper built a Monitor that nothing wrote
+into — so `logToStdout` had no effect and `/logs` (combined history) was always
+empty. `server.NewLoggers` ([log.go](../internal/server/log.go)) now replicates
+the legacy switch: `proxy` / `upstream` monitors feed `muxLog` (or `io.Discard`)
+per `none` / `both` / `upstream` / `proxy`, so `muxLog` accumulates the combined
+history. `server.New` takes `muxlog` as a parameter. The loggers outlive config
+reloads, so a `LogToStdout` change requires a restart to take effect.
+
+**Gap 5 — `LogTimeFormat` config ignored -- Resolved.**
+
+`cmd/newrouter/main.go` now maps `cfg.LogTimeFormat` to a Go time layout via the
+`logTimeFormats` table and applies it (alongside log level) to the proxy and
+upstream monitors in `applyLogSettings`, re-applied on config reload.
+
+**Gap 6 — `LogRequests` deprecation warning missing.**
+
+The legacy [proxymanager.go:127](../proxy/proxymanager.go#L127) warns when the
+deprecated `logRequests` config key is set. `cmd/newrouter` does not. Low
+priority — left open.
+
+**Gap 7 — PID debug log missing -- Resolved.**
+
+`cmd/newrouter/main.go` now logs `PID: %d` at debug level after `applyLogSettings`,
+matching [llama-swap.go:71](../llama-swap.go#L71).
+
+---
+
+## Phase X (tbd) — Cutover
+
+- [ ] Swap `llama-swap.go` to delegate to `cmd/newrouter` (or rename newrouter to be the primary entrypoint)
+- [ ] Update `Makefile` build targets
+- [ ] Update docs / README references to the legacy binary
+- [ ] Remove `proxy/proxymanager*.go` and `gin-gonic` dependency once nothing imports them
+- [ ] Run `make test-all` and confirm concurrency suite still passes against the new entrypoint
+
+---
+
+## Cross-cutting concerns to keep in mind
+
+- **Single body read**: legacy and newrouter both buffer the request body once. When adding filters (Phase 4c), make sure the buffered bytes flow through `Content-Length` / `transfer-encoding` cleanup as in [proxymanager.go:872](../proxy/proxymanager.go#L872).
+- **Streaming flag in context**: legacy stashes `streaming` and `model` under `proxyCtxKey`. The new router uses `ModelKey` / `ModelIDKey` — pick one set of keys and use them consistently for metrics + log handlers.
+- **Matrix vs Group divergence**: any handler that calls `swapProcessGroup` or `findGroupByModelName` in the legacy needs a matrix branch too. The new router's `Router` interface already abstracts this — preserve that abstraction rather than reintroducing the branch in every handler.
+- **Shutdown ordering**: `httpServer.Shutdown` must drain inflight requests _before_ `Server.Shutdown` tears down processes, otherwise inflight requests 502. Current newrouter ordering at [main.go:87](../cmd/newrouter/main.go#L87) is correct — keep it.
@@ -1,40 +1,77 @@
 module github.com/mostlygeek/llama-swap

-go 1.23.0
+go 1.26.1

 require (
-	github.com/stretchr/testify v1.9.0
+	github.com/billziss-gh/golib v0.2.0
+	github.com/charmbracelet/bubbles v1.0.0
+	github.com/charmbracelet/bubbletea v1.3.10
+	github.com/charmbracelet/lipgloss v1.1.0
+	github.com/fxamacker/cbor/v2 v2.9.1
+	github.com/gin-gonic/gin v1.10.0
+	github.com/google/jsonschema-go v0.4.3
+	github.com/klauspost/compress v1.18.5
+	github.com/shirou/gopsutil/v4 v4.26.4
+	github.com/stretchr/testify v1.11.1
+	github.com/tidwall/gjson v1.18.0
+	github.com/tidwall/sjson v1.2.5
+	golang.org/x/sync v0.20.0
+	golang.org/x/sys v0.41.0
 	gopkg.in/yaml.v3 v3.0.1
 )

 require (
+	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
 	github.com/bytedance/sonic v1.11.6 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
+	github.com/charmbracelet/colorprofile v0.4.1 // indirect
+	github.com/charmbracelet/x/ansi v0.11.6 // indirect
+	github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
+	github.com/charmbracelet/x/term v0.2.2 // indirect
+	github.com/clipperhouse/displaywidth v0.9.0 // indirect
+	github.com/clipperhouse/stringish v0.1.1 // indirect
+	github.com/clipperhouse/uax29/v2 v2.5.0 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/ebitengine/purego v0.10.0 // indirect
+	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
 	github.com/gabriel-vasile/mimetype v1.4.3 // indirect
 	github.com/gin-contrib/sse v0.1.0 // indirect
-	github.com/gin-gonic/gin v1.10.0 // indirect
+	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-playground/locales v0.14.1 // indirect
 	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-playground/validator/v10 v10.20.0 // indirect
 	github.com/goccy/go-json v0.10.2 // indirect
-	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
+	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
+	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-localereader v0.0.1 // indirect
+	github.com/mattn/go-runewidth v0.0.19 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
+	github.com/muesli/cancelreader v0.2.2 // indirect
+	github.com/muesli/termenv v0.16.0 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/tidwall/match v1.1.1 // indirect
+	github.com/tidwall/pretty v1.2.1 // indirect
+	github.com/tklauser/go-sysconf v0.3.16 // indirect
+	github.com/tklauser/numcpus v0.11.0 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
+	github.com/x448/float16 v0.8.4 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	golang.org/x/arch v0.8.0 // indirect
-	golang.org/x/crypto v0.23.0 // indirect
-	golang.org/x/net v0.25.0 // indirect
-	golang.org/x/sys v0.20.0 // indirect
-	golang.org/x/text v0.15.0 // indirect
+	golang.org/x/crypto v0.45.0 // indirect
+	golang.org/x/net v0.47.0 // indirect
+	golang.org/x/text v0.31.0 // indirect
 	google.golang.org/protobuf v1.34.1 // indirect
 )
@@ -1,7 +1,31 @@
+github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
+github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
+github.com/billziss-gh/golib v0.2.0 h1:NyvcAQdfvM8xokKkKotiligKjKXzuQD4PPykg1nKc/8=
+github.com/billziss-gh/golib v0.2.0/go.mod h1:mZpUYANXZkDKSnyYbX9gfnyxwe0ddRhUtfXcsD5r8dw=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
+github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
+github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
+github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
+github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
+github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
+github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
+github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
+github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
+github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8=
+github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ=
+github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
+github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
+github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
+github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
+github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA=
+github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA=
+github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs=
+github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA=
+github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
+github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
 github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
 github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
 github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
@@ -9,12 +33,22 @@ github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQ
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU=
+github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
+github.com/fxamacker/cbor/v2 v2.9.1 h1:2rWm8B193Ll4VdjsJY28jxs70IdDsHRWgQYAI80+rMQ=
+github.com/fxamacker/cbor/v2 v2.9.1/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
 github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
 github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
 github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
 github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
 github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
 github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
+github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
+github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
 github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
 github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
@@ -23,28 +57,53 @@ github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBEx
 github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
 github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
 github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
-github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
+github.com/google/jsonschema-go v0.4.3 h1:/DBOLZTfDow7pe2GmaJNhltueGTtDKICi8V8p+DQPd0=
+github.com/google/jsonschema-go v0.4.3/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
+github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
 github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
 github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
+github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
+github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
+github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
+github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
+github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
+github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
+github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
+github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
+github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
 github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/shirou/gopsutil/v4 v4.26.4 h1:B4SXVbcwTyrocPHEmWBC4uCYr4Xcu3MK1TXqbprAOWY=
+github.com/shirou/gopsutil/v4 v4.26.4/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
@@ -55,25 +114,54 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
-github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
+github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
+github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
+github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
 github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
+github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
+github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
+github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
 golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
-golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI=
-golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
-golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
-golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
+golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
+golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
+golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
-golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
-golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
+golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
 google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
@@ -0,0 +1,102 @@
+package cache
+
+import (
+	"errors"
+	"sync"
+)
+
+// Sentinel errors returned by Cache methods.
+var (
+	// ErrExceedsMaxSize is returned by Add when a single item is larger than
+	// the cache's maximum size, so it could never fit even in an empty cache.
+	ErrExceedsMaxSize = errors.New("item exceeds maximum cache size")
+	// ErrNotFound is returned by Get when nothing is stored under the id.
+	ErrNotFound       = errors.New("item not found")
+)
+
+// Cache is a size-bounded store of []byte values keyed by int. When an Add
+// would push the total stored bytes past maxSize, the entries that were added
+// earliest are evicted first. Every method locks mu for its whole duration.
+type Cache struct {
+	// mu guards all fields below.
+	mu      sync.Mutex
+	// items maps an id to its stored bytes.
+	items   map[int][]byte
+	// order lists the stored ids from oldest (index 0) to newest.
+	order   []int
+	// size is the sum of len() over all stored values.
+	size    int
+	// maxSize is the upper bound for size, in bytes.
+	maxSize int
+}
+
+// New returns an empty Cache whose stored values may total at most maxBytes
+// bytes.
+func New(maxBytes int) *Cache {
+	c := new(Cache)
+	c.items = map[int][]byte{}
+	c.order = []int{}
+	c.maxSize = maxBytes
+	return c
+}
+
+// Add stores data under id, replacing any value already stored there.
+//
+// It returns ErrExceedsMaxSize, leaving the cache untouched, when data alone
+// is larger than the cache's maximum size. Otherwise the oldest entries are
+// evicted until data fits, and the new entry becomes the newest one. The
+// slice is stored as given; it is not copied.
+func (c *Cache) Add(id int, data []byte) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	incoming := len(data)
+	if incoming > c.maxSize {
+		return ErrExceedsMaxSize
+	}
+
+	// Replacing an entry: give back its bytes and drop its queue position so
+	// the id is re-queued as the newest entry below.
+	if previous, ok := c.items[id]; ok {
+		c.size -= len(previous)
+		c.removeOrder(id)
+	}
+
+	// Pop ids off the front of the queue (oldest first) until data fits.
+	for len(c.order) > 0 && c.size+incoming > c.maxSize {
+		victim := c.order[0]
+		c.order = c.order[1:]
+		dropped, ok := c.items[victim]
+		if !ok {
+			continue
+		}
+		c.size -= len(dropped)
+		delete(c.items, victim)
+	}
+
+	c.size += incoming
+	c.order = append(c.order, id)
+	c.items[id] = data
+	return nil
+}
+
+// removeOrder deletes the first occurrence of id from c.order, keeping the
+// relative order of the remaining ids. It does nothing when id is absent.
+// It does not lock c.mu itself; Add calls it while already holding the lock.
+func (c *Cache) removeOrder(id int) {
+	for pos := 0; pos < len(c.order); pos++ {
+		if c.order[pos] != id {
+			continue
+		}
+		c.order = append(c.order[:pos], c.order[pos+1:]...)
+		return
+	}
+}
+
+// Get returns the bytes stored under id, or ErrNotFound when the id is not
+// in the cache. The returned slice is the stored slice itself, not a copy.
+func (c *Cache) Get(id int) ([]byte, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if stored, ok := c.items[id]; ok {
+		return stored, nil
+	}
+	return nil, ErrNotFound
+}
+
+// Has reports whether a value is currently stored under id.
+func (c *Cache) Has(id int) bool {
+	c.mu.Lock()
+	_, present := c.items[id]
+	c.mu.Unlock()
+	return present
+}
+
+// Size returns the total number of bytes currently stored in the cache.
+func (c *Cache) Size() int {
+	c.mu.Lock()
+	total := c.size
+	c.mu.Unlock()
+	return total
+}
+
+// Clear removes every entry and resets the byte count to zero. The queue
+// slice is truncated in place, so its backing array is reused.
+func (c *Cache) Clear() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.size = 0
+	c.order = c.order[:0]
+	c.items = map[int][]byte{}
+}
@@ -0,0 +1,130 @@
+package cache
+
+import (
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestCache_Add covers Add: a store/fetch round trip, rejection of an item
+// larger than the cache, oldest-first eviction, and overwriting a key.
+func TestCache_Add(t *testing.T) {
+	t.Run("adds and retrieves item", func(t *testing.T) {
+		cache := New(1024)
+		payload := []byte("hello")
+		require.NoError(t, cache.Add(1, payload))
+
+		stored, err := cache.Get(1)
+		require.NoError(t, err)
+		assert.Equal(t, payload, stored)
+	})
+
+	t.Run("returns error for oversized item", func(t *testing.T) {
+		tooBig := make([]byte, 20)
+		assert.ErrorIs(t, New(10).Add(1, tooBig), ErrExceedsMaxSize)
+	})
+
+	t.Run("evicts oldest items to make room", func(t *testing.T) {
+		cache := New(100)
+
+		// Three 40-byte items do not fit in 100 bytes, so storing the third
+		// must push out the first.
+		for id := 1; id <= 3; id++ {
+			require.NoError(t, cache.Add(id, make([]byte, 40)))
+		}
+
+		assert.False(t, cache.Has(1))
+		assert.True(t, cache.Has(2))
+		assert.True(t, cache.Has(3))
+	})
+
+	t.Run("overwrites existing key", func(t *testing.T) {
+		cache := New(100)
+		for _, value := range []string{"old", "new"} {
+			require.NoError(t, cache.Add(1, []byte(value)))
+		}
+
+		stored, err := cache.Get(1)
+		require.NoError(t, err)
+		assert.Equal(t, []byte("new"), stored)
+		// Only the replacement's three bytes may be counted.
+		assert.Equal(t, 3, cache.Size())
+	})
+}
+
+// TestCache_Get checks that looking up an id that was never added fails
+// with ErrNotFound.
+func TestCache_Get(t *testing.T) {
+	t.Run("returns ErrNotFound for missing key", func(t *testing.T) {
+		_, lookupErr := New(100).Get(99)
+		assert.ErrorIs(t, lookupErr, ErrNotFound)
+	})
+}
+
+// TestCache_Has checks Has for both a stored id and an id that was never
+// added.
+func TestCache_Has(t *testing.T) {
+	t.Run("returns true for existing key", func(t *testing.T) {
+		cache := New(100)
+		require.NoError(t, cache.Add(1, []byte("data")))
+		assert.True(t, cache.Has(1))
+	})
+
+	t.Run("returns false for missing key", func(t *testing.T) {
+		assert.False(t, New(100).Has(1))
+	})
+}
+
+// TestCache_Size checks that Size follows the stored byte total as items
+// are added and as an eviction takes place.
+func TestCache_Size(t *testing.T) {
+	t.Run("tracks byte usage", func(t *testing.T) {
+		cache := New(1000)
+		assert.Equal(t, 0, cache.Size())
+
+		first := make([]byte, 100)
+		require.NoError(t, cache.Add(1, first))
+		assert.Equal(t, 100, cache.Size())
+
+		second := make([]byte, 200)
+		require.NoError(t, cache.Add(2, second))
+		assert.Equal(t, 300, cache.Size())
+	})
+
+	t.Run("updates on eviction", func(t *testing.T) {
+		cache := New(150)
+		for id := 1; id <= 2; id++ {
+			require.NoError(t, cache.Add(id, make([]byte, 100)))
+		}
+
+		// Two 100-byte items exceed 150 bytes, so item 1 is evicted and
+		// only item 2's 100 bytes remain.
+		assert.Equal(t, 100, cache.Size())
+	})
+}
+
+// TestCache_Clear checks that Clear drops every entry and zeroes the size.
+func TestCache_Clear(t *testing.T) {
+	t.Run("removes all items and resets size", func(t *testing.T) {
+		cache := New(1000)
+		require.NoError(t, cache.Add(1, []byte("a")))
+		require.NoError(t, cache.Add(2, []byte("b")))
+
+		cache.Clear()
+
+		assert.Equal(t, 0, cache.Size())
+		for _, id := range []int{1, 2} {
+			assert.False(t, cache.Has(id))
+		}
+	})
+}
+
+// TestCache_Concurrent drives Add, Get, Has and Size from ten goroutines at
+// once, each on its own range of keys. It makes no assertions, so it only
+// fails on a panic or deadlock.
+// NOTE(review): presumably intended to be run under the race detector.
+func TestCache_Concurrent(t *testing.T) {
+	t.Run("concurrent operations are safe", func(t *testing.T) {
+		const (
+			workers      = 10
+			opsPerWorker = 100
+		)
+		cache := New(10000)
+
+		var wg sync.WaitGroup
+		wg.Add(workers)
+		for w := 0; w < workers; w++ {
+			go func(worker int) {
+				defer wg.Done()
+				// Each worker owns the keys [worker*100, worker*100+99].
+				first := worker * opsPerWorker
+				for key := first; key < first+opsPerWorker; key++ {
+					_ = cache.Add(key, []byte("data"))
+					_, _ = cache.Get(key)
+					_ = cache.Has(key)
+					_ = cache.Size()
+				}
+			}(w)
+		}
+		wg.Wait()
+	})
+}
@@ -0,0 +1,63 @@
+// Package chain composes http.Handler middleware into a single handler.
+//
+// A Middleware wraps a downstream http.Handler and may run logic before or
+// after delegating to it, or short-circuit by not calling next at all
+// (e.g. auth failure, CORS preflight).
+package chain
+
+import "net/http"
+
+// Middleware wraps an http.Handler with cross-cutting behavior. It receives
+// the next handler in the chain and returns a handler that may call next,
+// modify the request/response around it, or short-circuit.
+//
+// Chain.Then calls each Middleware once, while building the handler, not
+// once per request; per-request work belongs in the handler it returns.
+type Middleware func(next http.Handler) http.Handler
+
+// Chain is a reusable middleware stack. Build it once with New (and optionally
+// extend per-route with Append), then call Then to wrap each terminal handler
+// when registering routes against an http.ServeMux:
+//
+//	api := chain.New(authMW, corsMW)
+//	mux.Handle("/v1/chat/completions", api.Then(dispatch))
+//	mux.Handle("/v1/embeddings",       api.Append(filters).Then(dispatch))
+//
+// Middlewares execute left-to-right: mws[0] runs first and may call into
+// mws[1], and so on, with the terminal handler invoked last. A middleware
+// that does not call next short-circuits the remainder of the chain.
+// A zero Chain is valid and applies no middleware.
+type Chain struct {
+	// mws holds the middleware in execution order; mws[0] is the outermost
+	// wrapper. New and Append each store a freshly allocated slice here.
+	mws []Middleware
+}
+
+// New builds a Chain from mws. The middleware run left-to-right around
+// whatever terminal handler is later given to Then. The arguments are copied,
+// so changing the caller's slice afterwards does not affect the Chain.
+func New(mws ...Middleware) Chain {
+	own := make([]Middleware, 0, len(mws))
+	return Chain{mws: append(own, mws...)}
+}
+
+// Append returns a new Chain made of the receiver's middleware followed by
+// mws. The result has its own slice, so the receiver is left untouched and a
+// base Chain can be extended differently for each route.
+func (c Chain) Append(mws ...Middleware) Chain {
+	combined := make([]Middleware, len(c.mws)+len(mws))
+	n := copy(combined, c.mws)
+	copy(combined[n:], mws)
+	return Chain{mws: combined}
+}
+
+// Then wraps final in the chain's middleware and returns the result, ready
+// to be registered with http.ServeMux.Handle. An empty chain returns final
+// as-is.
+func (c Chain) Then(final http.Handler) http.Handler {
+	wrapped := final
+	// Wrap from the last middleware inwards so that mws[0] ends up outermost
+	// and therefore runs first on each request.
+	for remaining := len(c.mws); remaining > 0; remaining-- {
+		wrapped = c.mws[remaining-1](wrapped)
+	}
+	return wrapped
+}
+
+// ThenFunc is shorthand for Then(http.HandlerFunc(f)).
+func (c Chain) ThenFunc(f http.HandlerFunc) http.Handler {
+	return c.Then(f)
+}
@@ -0,0 +1,205 @@
+package chain
+
+import (
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// recordingMiddleware returns a Middleware that appends tag to *log before
+// calling next, and "after-"+tag once next has returned.
+func recordingMiddleware(tag string, log *[]string) Middleware {
+	record := func(entry string) { *log = append(*log, entry) }
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			record(tag)
+			next.ServeHTTP(w, r)
+			record("after-" + tag)
+		})
+	}
+}
+
+// TestChain_HandlersExecuteInDeclaredOrder checks that middleware run in the
+// order they were passed to New on the way in, and in reverse on the way out.
+func TestChain_HandlersExecuteInDeclaredOrder(t *testing.T) {
+	var log []string
+	final := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		log = append(log, "final")
+	})
+
+	var stack []Middleware
+	for _, tag := range []string{"a", "b", "c"} {
+		stack = append(stack, recordingMiddleware(tag, &log))
+	}
+	handler := New(stack...).Then(final)
+
+	handler.ServeHTTP(httptest.NewRecorder(), httptest.NewRequest(http.MethodGet, "/", nil))
+
+	want := []string{"a", "b", "c", "final", "after-c", "after-b", "after-a"}
+	if !equal(log, want) {
+		t.Fatalf("execution order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_ShortCircuitsWhenMiddlewareDoesNotCallNext checks that a
+// middleware which never calls next stops everything after it (the inner
+// middleware and the terminal handler), while the outer middleware still
+// finishes normally.
+func TestChain_ShortCircuitsWhenMiddlewareDoesNotCallNext(t *testing.T) {
+	var log []string
+
+	// gate answers 401 itself and deliberately ignores next.
+	gate := func(http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			log = append(log, "gate")
+			w.WriteHeader(http.StatusUnauthorized)
+		})
+	}
+	final := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		log = append(log, "final")
+	})
+
+	stack := New(
+		recordingMiddleware("outer", &log),
+		gate,
+		recordingMiddleware("inner", &log),
+	)
+
+	rec := httptest.NewRecorder()
+	stack.Then(final).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+
+	if rec.Code != http.StatusUnauthorized {
+		t.Fatalf("status: got %d, want %d", rec.Code, http.StatusUnauthorized)
+	}
+	want := []string{"outer", "gate", "after-outer"}
+	if !equal(log, want) {
+		t.Fatalf("short-circuit order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_EarlyWritesAreVisibleToLaterMiddleware checks that every layer
+// shares one ResponseWriter: a header and body bytes written by the outer
+// middleware are seen by the inner middleware and end up in the response.
+func TestChain_EarlyWritesAreVisibleToLaterMiddleware(t *testing.T) {
+	// outer sets a header and writes the first body chunk before delegating.
+	outer := func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("X-Set-By", "outer")
+			_, _ = io.WriteString(w, "outer:")
+			next.ServeHTTP(w, r)
+		})
+	}
+	// inner flags a missing header in the body, then writes its own chunk.
+	inner := func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if w.Header().Get("X-Set-By") != "outer" {
+				_, _ = io.WriteString(w, "missing-header;")
+			}
+			_, _ = io.WriteString(w, "inner:")
+			next.ServeHTTP(w, r)
+		})
+	}
+	final := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = io.WriteString(w, "final")
+	})
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/", nil)
+	New(outer, inner).Then(final).ServeHTTP(rec, req)
+
+	raw, _ := io.ReadAll(rec.Body)
+	if got := string(raw); !strings.Contains(got, "outer:inner:final") {
+		t.Fatalf("body: got %q, want it to contain %q", got, "outer:inner:final")
+	}
+	if got := rec.Header().Get("X-Set-By"); got != "outer" {
+		t.Fatalf("header X-Set-By: got %q, want %q", got, "outer")
+	}
+}
+
+// TestChain_ReusableAcrossRoutesViaThen registers one base Chain on two mux
+// routes and checks that each request runs the full middleware stack around
+// its own handler.
+func TestChain_ReusableAcrossRoutesViaThen(t *testing.T) {
+	var log []string
+	base := New(
+		recordingMiddleware("auth", &log),
+		recordingMiddleware("cors", &log),
+	)
+
+	routes := []string{"a", "b"}
+	mux := http.NewServeMux()
+	for _, route := range routes {
+		entry := "handler-" + route
+		mux.Handle("/"+route, base.ThenFunc(func(http.ResponseWriter, *http.Request) {
+			log = append(log, entry)
+		}))
+	}
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+
+	// Requests are issued one after another, so the log order is fixed.
+	for _, route := range routes {
+		path := "/" + route
+		resp, err := http.Get(srv.URL + path)
+		if err != nil {
+			t.Fatalf("GET %s: %v", path, err)
+		}
+		resp.Body.Close()
+	}
+
+	want := []string{
+		"auth", "cors", "handler-a", "after-cors", "after-auth",
+		"auth", "cors", "handler-b", "after-cors", "after-auth",
+	}
+	if !equal(log, want) {
+		t.Fatalf("reusable chain order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_AppendDoesNotMutateReceiver checks that extending a Chain with
+// Append leaves the original Chain's middleware list as it was.
+func TestChain_AppendDoesNotMutateReceiver(t *testing.T) {
+	var log []string
+	base := New(recordingMiddleware("base", &log))
+	extended := base.Append(recordingMiddleware("extra", &log))
+
+	final := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		log = append(log, "final")
+	})
+
+	// The extended chain is served first so that any slice shared with base
+	// would show up in base's run.
+	for _, c := range []Chain{extended, base} {
+		rec := httptest.NewRecorder()
+		c.Then(final).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+	}
+
+	want := []string{
+		"base", "extra", "final", "after-extra", "after-base",
+		"base", "final", "after-base",
+	}
+	if !equal(log, want) {
+		t.Fatalf("Append must not mutate the receiver:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_ZeroValueAndEmptyThenAreIdentity checks that both a zero Chain
+// and New() with no arguments hand back the terminal handler unwrapped.
+func TestChain_ZeroValueAndEmptyThenAreIdentity(t *testing.T) {
+	final := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusTeapot)
+	})
+
+	cases := map[string]Chain{
+		"zero":  {},
+		"empty": New(),
+	}
+	for name, c := range cases {
+		t.Run(name, func(t *testing.T) {
+			handler := c.Then(final)
+			// A wrapped handler would no longer have final's concrete type.
+			if _, isFunc := handler.(http.HandlerFunc); !isFunc {
+				t.Fatalf("expected http.HandlerFunc identity, got %T", handler)
+			}
+
+			rec := httptest.NewRecorder()
+			handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+			if rec.Code != http.StatusTeapot {
+				t.Fatalf("status: got %d, want %d", rec.Code, http.StatusTeapot)
+			}
+		})
+	}
+}
+
+// equal reports whether a and b have the same length and the same string at
+// every index. A nil slice and an empty slice compare equal.
+func equal(a, b []string) bool {
+	n := len(a)
+	if n != len(b) {
+		return false
+	}
+	for n > 0 {
+		n--
+		if a[n] != b[n] {
+			return false
+		}
+	}
+	return true
+}
@@ -0,0 +1,924 @@
+package config
+
+import (
+	"fmt"
+	"io"
+	"net/url"
+	"os"
+	"regexp"
+	"runtime"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/billziss-gh/golib/shlex"
+	"gopkg.in/yaml.v3"
+)
+
+// DEFAULT_GROUP_ID is the identifier of the built-in default group.
+// NOTE(review): presumably the group for models that belong to no configured
+// group; confirm against the code that uses it.
+const DEFAULT_GROUP_ID = "(default)"
+
+// Accepted values for Config.LogToStdout. LoadConfigFromReader rejects any
+// other value.
+const (
+	LogToStdoutProxy    = "proxy"
+	LogToStdoutUpstream = "upstream"
+	LogToStdoutBoth     = "both"
+	LogToStdoutNone     = "none"
+)
+
+// MacroEntry is a single macro definition: its name and the value decoded
+// from YAML for it.
+type MacroEntry struct {
+	Name  string
+	Value any
+}
+
+// MacroList holds macro definitions in the order they appear in the YAML
+// mapping; a slice is used because a Go map would lose that order.
+type MacroList []MacroEntry
+
+// UnmarshalYAML decodes a YAML mapping into the list, one entry per key, in
+// the order the keys appear in the document. It returns an error when the
+// node is not a mapping or when a key or value fails to decode.
+func (ml *MacroList) UnmarshalYAML(value *yaml.Node) error {
+	if value.Kind != yaml.MappingNode {
+		return fmt.Errorf("macros must be a mapping")
+	}
+
+	// A mapping node stores its children flat: key, value, key, value, ...
+	pairs := value.Content
+	parsed := make([]MacroEntry, 0, len(pairs)/2)
+	for k := 0; k < len(pairs); k += 2 {
+		nameNode, valNode := pairs[k], pairs[k+1]
+
+		var entry MacroEntry
+		if err := nameNode.Decode(&entry.Name); err != nil {
+			return fmt.Errorf("failed to decode macro name: %w", err)
+		}
+		if err := valNode.Decode(&entry.Value); err != nil {
+			return fmt.Errorf("failed to decode macro value for '%s': %w", entry.Name, err)
+		}
+
+		parsed = append(parsed, entry)
+	}
+
+	*ml = parsed
+	return nil
+}
+
+// Get looks up a macro by name and returns its value and true. When several
+// entries share the name, the first one wins. It returns (nil, false) when
+// no entry has the name.
+func (ml MacroList) Get(name string) (any, bool) {
+	for idx := range ml {
+		if ml[idx].Name != name {
+			continue
+		}
+		return ml[idx].Value, true
+	}
+	return nil, false
+}
+
+// ToMap returns the macros as a name-to-value map (for backward
+// compatibility if needed). Definition order is lost, and when a name occurs
+// more than once the last entry wins.
+func (ml MacroList) ToMap() map[string]any {
+	byName := make(map[string]any, len(ml))
+	for idx := 0; idx < len(ml); idx++ {
+		macro := ml[idx]
+		byName[macro.Name] = macro.Value
+	}
+	return byName
+}
+
+// GroupConfig is the YAML configuration of one model group. Its
+// UnmarshalYAML method supplies the defaults used when a key is omitted:
+// swap=true, exclusive=true, persistent=false and an empty members list.
+type GroupConfig struct {
+	Swap       bool     `yaml:"swap"`
+	Exclusive  bool     `yaml:"exclusive"`
+	Persistent bool     `yaml:"persistent"`
+	Members    []string `yaml:"members"`
+}
+
+var (
+	// macroNameRegex matches a whole string made only of ASCII letters,
+	// digits, '_' and '-'.
+	macroNameRegex    = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
+	// macroPatternRegex finds ${name} references and captures the name.
+	// LoadConfigFromReader uses it to detect macros left unsubstituted.
+	macroPatternRegex = regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
+	// envMacroRegex finds ${env.NAME} references and captures NAME, which
+	// must start with a letter or '_'.
+	envMacroRegex     = regexp.MustCompile(`\$\{env\.([a-zA-Z_][a-zA-Z0-9_]*)\}`)
+)
+
+// UnmarshalYAML decodes a group while applying defaults for omitted keys:
+// swap and exclusive default to true, persistent to false, and members to an
+// empty list. On a decode error the receiver is left unchanged.
+func (c *GroupConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// rawGroupConfig has GroupConfig's fields but none of its methods, so
+	// decoding into it does not call this method again.
+	type rawGroupConfig GroupConfig
+
+	decoded := rawGroupConfig{Members: []string{}}
+	decoded.Swap = true
+	decoded.Exclusive = true
+
+	err := unmarshal(&decoded)
+	if err == nil {
+		*c = GroupConfig(decoded)
+	}
+	return err
+}
+
+// HooksConfig is the `hooks` section of the configuration.
+type HooksConfig struct {
+	OnStartup HookOnStartup `yaml:"on_startup"`
+}
+
+// HookOnStartup is the `hooks.on_startup` section.
+// NOTE(review): Preload presumably lists model IDs to load at startup;
+// confirm against the code that consumes it.
+type HookOnStartup struct {
+	Preload []string `yaml:"preload"`
+}
+
+// Config is the top-level configuration decoded from YAML.
+// LoadConfigFromReader fills in defaults before decoding and validates the
+// result afterwards, so values should be obtained through LoadConfig or
+// LoadConfigFromReader rather than constructed by hand.
+type Config struct {
+	HealthCheckTimeout int                    `yaml:"healthCheckTimeout"`
+	LogRequests        bool                   `yaml:"logRequests"`
+	LogLevel           string                 `yaml:"logLevel"`
+	LogTimeFormat      string                 `yaml:"logTimeFormat"`
+	LogToStdout        string                 `yaml:"logToStdout"`
+	MetricsMaxInMemory int                    `yaml:"metricsMaxInMemory"`
+	CaptureBuffer      int                    `yaml:"captureBuffer"`
+	Performance        PerformanceConfig      `yaml:"performance"`
+	GlobalTTL          int                    `yaml:"globalTTL"`
+	Models             map[string]ModelConfig `yaml:"models"` /* key is model ID */
+	Profiles           map[string][]string    `yaml:"profiles"`
+
+	// routing is the canonical source for swap/scheduling configuration.
+	// New code must read Routing, never the backwards-compat fields below.
+	Routing RoutingConfig `yaml:"routing"`
+
+	// Groups and Matrix are permanent backwards-compat input fields for the
+	// legacy top-level `groups:`/`matrix:` keys. They are normalized into
+	// Routing by LoadConfigFromReader. New code must not read them directly.
+	Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
+	Matrix *MatrixConfig          `yaml:"matrix"`
+
+	// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
+	Macros MacroList `yaml:"macros"`
+
+	// map aliases to actual model IDs
+	// (unexported: built by LoadConfigFromReader from each model's Aliases,
+	// never read from YAML)
+	aliases map[string]string
+
+	// automatic port assignments
+	StartPort int `yaml:"startPort"`
+
+	// hooks, see: #209
+	Hooks HooksConfig `yaml:"hooks"`
+
+	// send loading state in reasoning
+	SendLoadingState bool `yaml:"sendLoadingState"`
+
+	// present aliases to /v1/models OpenAI API listing
+	IncludeAliasesInList bool `yaml:"includeAliasesInList"`
+
+	// support API keys, see issue #433, #50, #251
+	RequiredAPIKeys []string `yaml:"apiKeys"`
+
+	// support remote peers, see issue #433, #296
+	Peers PeerDictionaryConfig `yaml:"peers"`
+
+	// upstream controls behaviour of the /upstream passthrough endpoint
+	Upstream UpstreamConfig `yaml:"upstream"`
+}
+
+// RoutingConfig is the canonical, normalized routing/scheduling configuration.
+// It is the `routing` section of the YAML and groups the scheduler and router
+// settings.
+type RoutingConfig struct {
+	Scheduler SchedulerConfig `yaml:"scheduler"`
+	Router    RouterConfig    `yaml:"router"`
+}
+
+// SchedulerConfig is the `routing.scheduler` section: Use names the
+// scheduler and Settings carries the per-scheduler options.
+type SchedulerConfig struct {
+	Use      string            `yaml:"use"` // default "fifo"
+	Settings SchedulerSettings `yaml:"settings"`
+}
+
+// SchedulerSettings holds the options of each scheduler, keyed by scheduler
+// name in the YAML.
+type SchedulerSettings struct {
+	Fifo FifoConfig `yaml:"fifo"`
+}
+
+// FifoConfig holds the options of the "fifo" scheduler.
+type FifoConfig struct {
+	Priority map[string]int `yaml:"priority"` // model ID -> priority, default 0
+}
+
+// RouterConfig is the `routing.router` section: Use names the router and
+// Settings carries the per-router options.
+type RouterConfig struct {
+	Use      string         `yaml:"use"` // "group" (default) | "matrix"
+	Settings RouterSettings `yaml:"settings"`
+}
+
+// RouterSettings holds the options of each router: Groups (keyed by group
+// ID) for the "group" router and Matrix for the "matrix" router.
+type RouterSettings struct {
+	Groups map[string]GroupConfig `yaml:"groups"`
+	Matrix *MatrixConfig          `yaml:"matrix"`
+}
+
+// RealModelName resolves search to a configured model ID. A name that is
+// itself a model ID is returned unchanged; otherwise it is looked up as an
+// alias. The boolean is false, with an empty name, when search is neither.
+func (c *Config) RealModelName(search string) (string, bool) {
+	// A direct model ID takes precedence over an alias of the same name.
+	if _, isModel := c.Models[search]; isModel {
+		return search, true
+	}
+	if target, isAlias := c.aliases[search]; isAlias {
+		return target, true
+	}
+	return "", false
+}
+
+// FindConfig looks up a model by ID or alias and returns its ModelConfig,
+// the resolved model ID, and true. For an unknown name it returns a zero
+// ModelConfig, an empty ID, and false.
+func (c *Config) FindConfig(modelName string) (ModelConfig, string, bool) {
+	realName, found := c.RealModelName(modelName)
+	if !found {
+		return ModelConfig{}, "", false
+	}
+	return c.Models[realName], realName, true
+}
+
+// LoadConfig opens the file at path and parses it with LoadConfigFromReader.
+// It returns a zero Config and the error when the file cannot be opened;
+// the file is closed before returning.
+func LoadConfig(path string) (Config, error) {
+	f, openErr := os.Open(path)
+	if openErr != nil {
+		return Config{}, openErr
+	}
+	defer f.Close()
+
+	return LoadConfigFromReader(f)
+}
+
+func LoadConfigFromReader(r io.Reader) (Config, error) {
+	data, err := io.ReadAll(r)
+	if err != nil {
+		return Config{}, err
+	}
+	yamlStr := string(data)
+
+	// Phase 1: Substitute all ${env.VAR} macros at string level
+	// This is safe because env values are simple strings without YAML formatting
+	yamlStr, err = substituteEnvMacros(yamlStr)
+	if err != nil {
+		return Config{}, err
+	}
+
+	// Unmarshal into full Config with defaults
+	config := Config{
+		HealthCheckTimeout: 120,
+		StartPort:          5800,
+		LogLevel:           "info",
+		LogTimeFormat:      "",
+		LogToStdout:        LogToStdoutProxy,
+		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		GlobalTTL:          0,
+	}
+	if err = yaml.Unmarshal([]byte(yamlStr), &config); err != nil {
+		return Config{}, err
+	}
+
+	if config.HealthCheckTimeout < 15 {
+		config.HealthCheckTimeout = 15
+	}
+
+	// Apply defaults for performance config when section is missing
+	if config.Performance.Every == 0 {
+		config.Performance.Every = 5 * time.Second
+	}
+	if err = config.Performance.Validate(); err != nil {
+		return Config{}, fmt.Errorf("performance: %w", err)
+	}
+
+	if config.StartPort < 1 {
+		return Config{}, fmt.Errorf("startPort must be greater than 1")
+	}
+
+	if config.GlobalTTL < 0 {
+		return Config{}, fmt.Errorf("globalTTL must be >= 0")
+	}
+
+	// Apply default for upstream.ignorePaths when not specified. The default
+	// matches common static-asset suffixes so they do not trigger a swap.
+	if len(config.Upstream.IgnorePaths) == 0 {
+		config.Upstream.IgnorePaths = DefaultUpstreamIgnorePaths()
+	}
+
+	switch config.LogToStdout {
+	case LogToStdoutProxy, LogToStdoutUpstream, LogToStdoutBoth, LogToStdoutNone:
+	default:
+		return Config{}, fmt.Errorf("logToStdout must be one of: proxy, upstream, both, none")
+	}
+
+	// Populate the aliases map
+	config.aliases = make(map[string]string)
+	for modelName, modelConfig := range config.Models {
+		for _, alias := range modelConfig.Aliases {
+			if _, found := config.aliases[alias]; found {
+				return Config{}, fmt.Errorf("duplicate alias %s found in model: %s", alias, modelName)
+			}
+			config.aliases[alias] = modelName
+		}
+	}
+
+	// Validate global macros
+	for _, macro := range config.Macros {
+		if err = validateMacro(macro.Name, macro.Value); err != nil {
+			return Config{}, err
+		}
+	}
+
+	// Get and sort all model IDs for consistent port assignment
+	modelIds := make([]string, 0, len(config.Models))
+	for modelId := range config.Models {
+		modelIds = append(modelIds, modelId)
+	}
+	sort.Strings(modelIds)
+
+	nextPort := config.StartPort
+	for _, modelId := range modelIds {
+		modelConfig := config.Models[modelId]
+		modelConfig.HealthCheckTimeout = config.HealthCheckTimeout
+
+		// Strip comments from command fields
+		modelConfig.Cmd = StripComments(modelConfig.Cmd)
+		modelConfig.CmdStop = StripComments(modelConfig.CmdStop)
+
+		// set model TTL to globalTTL if it is still the default value
+		if modelConfig.UnloadAfter == MODEL_CONFIG_DEFAULT_TTL {
+			modelConfig.UnloadAfter = config.GlobalTTL
+		}
+
+		if modelConfig.UnloadAfter < 0 {
+			return Config{}, fmt.Errorf("model %s: invalid TTL value %d", modelId, modelConfig.UnloadAfter)
+		}
+
+		// Validate model macros
+		for _, macro := range modelConfig.Macros {
+			if err = validateMacro(macro.Name, macro.Value); err != nil {
+				return Config{}, fmt.Errorf("model %s: %s", modelId, err.Error())
+			}
+		}
+
+		// Build merged macro list: MODEL_ID + global macros + model macros (model overrides global)
+		mergedMacros := make(MacroList, 0, len(config.Macros)+len(modelConfig.Macros)+1)
+		mergedMacros = append(mergedMacros, MacroEntry{Name: "MODEL_ID", Value: modelId})
+		mergedMacros = append(mergedMacros, config.Macros...)
+
+		// Add model macros (override globals with same name)
+		for _, entry := range modelConfig.Macros {
+			found := false
+			for i, existing := range mergedMacros {
+				if existing.Name == entry.Name {
+					mergedMacros[i] = entry
+					found = true
+					break
+				}
+			}
+			if !found {
+				mergedMacros = append(mergedMacros, entry)
+			}
+		}
+
+		// Substitute remaining macros in model fields (LIFO order)
+		for i := len(mergedMacros) - 1; i >= 0; i-- {
+			entry := mergedMacros[i]
+			macroSlug := fmt.Sprintf("${%s}", entry.Name)
+			macroStr := fmt.Sprintf("%v", entry.Value)
+
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
+			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroStr)
+			modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroStr)
+			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
+			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)
+
+			// Substitute macros in SetParamsByID keys and values
+			if len(modelConfig.Filters.SetParamsByID) > 0 {
+				newSetParamsByID := make(map[string]map[string]any, len(modelConfig.Filters.SetParamsByID))
+				for key, paramMap := range modelConfig.Filters.SetParamsByID {
+					newKey := strings.ReplaceAll(key, macroSlug, macroStr)
+					newValAny, err := substituteMacroInValue(any(paramMap), entry.Name, entry.Value)
+					if err != nil {
+						return Config{}, fmt.Errorf("model %s filters.setParamsByID: %s", modelId, err.Error())
+					}
+					newParamMap, ok := newValAny.(map[string]any)
+					if !ok {
+						return Config{}, fmt.Errorf("model %s filters.setParamsByID: unexpected type after macro substitution", modelId)
+					}
+					newSetParamsByID[newKey] = newParamMap
+				}
+				modelConfig.Filters.SetParamsByID = newSetParamsByID
+			}
+
+			// Substitute in metadata (type-preserving)
+			if len(modelConfig.Metadata) > 0 {
+				result, err := substituteMacroInValue(modelConfig.Metadata, entry.Name, entry.Value)
+				if err != nil {
+					return Config{}, fmt.Errorf("model %s metadata: %s", modelId, err.Error())
+				}
+				modelConfig.Metadata = result.(map[string]any)
+			}
+		}
+
+		// Handle PORT macro - only allocate if cmd uses it
+		cmdHasPort := strings.Contains(modelConfig.Cmd, "${PORT}")
+		proxyHasPort := strings.Contains(modelConfig.Proxy, "${PORT}")
+		if cmdHasPort || proxyHasPort {
+			if !cmdHasPort && proxyHasPort {
+				return Config{}, fmt.Errorf("model %s: proxy uses ${PORT} but cmd does not - ${PORT} is only available when used in cmd", modelId)
+			}
+
+			macroSlug := "${PORT}"
+			macroStr := fmt.Sprintf("%v", nextPort)
+
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
+			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
+			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)
+
+			if len(modelConfig.Metadata) > 0 {
+				result, err := substituteMacroInValue(modelConfig.Metadata, "PORT", nextPort)
+				if err != nil {
+					return Config{}, fmt.Errorf("model %s metadata: %s", modelId, err.Error())
+				}
+				modelConfig.Metadata = result.(map[string]any)
+			}
+
+			nextPort++
+		}
+
+		// Validate no unknown macros remain
+		fieldMap := map[string]string{
+			"cmd":                 modelConfig.Cmd,
+			"cmdStop":             modelConfig.CmdStop,
+			"proxy":               modelConfig.Proxy,
+			"checkEndpoint":       modelConfig.CheckEndpoint,
+			"filters.stripParams": modelConfig.Filters.StripParams,
+			"name":                modelConfig.Name,
+			"description":         modelConfig.Description,
+		}
+
+		for fieldName, fieldValue := range fieldMap {
+			matches := macroPatternRegex.FindAllStringSubmatch(fieldValue, -1)
+			for _, match := range matches {
+				macroName := match[1]
+				if macroName == "PID" && fieldName == "cmdStop" {
+					continue // replaced at runtime
+				}
+				if macroName == "PORT" || macroName == "MODEL_ID" {
+					return Config{}, fmt.Errorf("macro '${%s}' should have been substituted in %s.%s", macroName, modelId, fieldName)
+				}
+				return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
+			}
+		}
+
+		if len(modelConfig.Metadata) > 0 {
+			if err := validateNestedForUnknownMacros(modelConfig.Metadata, fmt.Sprintf("model %s metadata", modelId)); err != nil {
+				return Config{}, err
+			}
+		}
+
+		if err = modelConfig.Capabilities.Validate(); err != nil {
+			return Config{}, fmt.Errorf("model %s: %w", modelId, err)
+		}
+
+		// Validate SetParamsByID keys and values
+		for key, paramMap := range modelConfig.Filters.SetParamsByID {
+			if matches := macroPatternRegex.FindAllStringSubmatch(key, -1); len(matches) > 0 {
+				return Config{}, fmt.Errorf("unknown macro '${%s}' found in model %s filters.setParamsByID key", matches[0][1], modelId)
+			}
+			if err := validateNestedForUnknownMacros(any(paramMap), fmt.Sprintf("model %s filters.setParamsByID[%s]", modelId, key)); err != nil {
+				return Config{}, err
+			}
+		}
+
+		// Auto-register setParamsByID keys as aliases (skip the model's own ID)
+		for key := range modelConfig.Filters.SetParamsByID {
+			if key == modelId {
+				continue
+			}
+			if _, exists := config.Models[key]; exists {
+				return Config{}, fmt.Errorf("model %s filters.setParamsByID: key '%s' conflicts with an existing model ID", modelId, key)
+			}
+			if existingModel, exists := config.aliases[key]; exists {
+				if existingModel != modelId {
+					return Config{}, fmt.Errorf("duplicate alias '%s' in model %s filters.setParamsByID, already used by model %s", key, modelId, existingModel)
+				}
+				continue // already registered as explicit alias for this model
+			}
+			config.aliases[key] = modelId
+			modelConfig.Aliases = append(modelConfig.Aliases, key)
+		}
+
+		if _, err := url.Parse(modelConfig.Proxy); err != nil {
+			return Config{}, fmt.Errorf("model %s: invalid proxy URL: %w", modelId, err)
+		}
+
+		if modelConfig.SendLoadingState == nil {
+			v := config.SendLoadingState
+			modelConfig.SendLoadingState = &v
+		}
+
+		config.Models[modelId] = modelConfig
+	}
+
+	// Normalize routing config. The legacy top-level `matrix`/`groups` keys and
+	// the new `routing.router` block are mutually exclusive: a config may use
+	// either style, never both.
+	hasTopLevel := config.Matrix != nil || len(config.Groups) > 0
+	rtr := config.Routing.Router
+	hasRouting := rtr.Use != "" || rtr.Settings.Matrix != nil || len(rtr.Settings.Groups) > 0
+
+	if hasTopLevel && hasRouting {
+		return Config{}, fmt.Errorf("config uses both the legacy top-level 'matrix'/'groups' keys and the new 'routing.router' block; please migrate the top-level keys into 'routing.router' and remove them")
+	}
+
+	if !hasTopLevel {
+		// Both groups and matrix may be defined under routing.router.settings;
+		// routing.router.use selects which one is active, so there is no conflict.
+		rs := config.Routing.Router.Settings
+		switch config.Routing.Router.Use {
+		case "matrix":
+			if rs.Matrix == nil {
+				return Config{}, fmt.Errorf("routing.router.use is 'matrix' but routing.router.settings.matrix is not set")
+			}
+			config.Matrix = rs.Matrix
+		case "group", "":
+			config.Groups = rs.Groups
+		default:
+			return Config{}, fmt.Errorf("routing.router.use: unknown router %q (valid: group, matrix)", config.Routing.Router.Use)
+		}
+	}
+
+	// groups XOR matrix
+	if config.Matrix != nil && len(config.Groups) > 0 {
+		return Config{}, fmt.Errorf("config cannot use both 'groups' and 'matrix'")
+	}
+
+	if config.Matrix != nil {
+		expandedSets, err := ValidateMatrix(*config.Matrix, config.Models)
+		if err != nil {
+			return Config{}, fmt.Errorf("matrix: %w", err)
+		}
+		config.Matrix.ExpandedSets = expandedSets
+	} else {
+		config = AddDefaultGroupToConfig(config)
+
+		// Validate group members
+		memberUsage := make(map[string]string)
+		for groupID, groupConfig := range config.Groups {
+			prevSet := make(map[string]bool)
+			for _, member := range groupConfig.Members {
+				if _, found := prevSet[member]; found {
+					return Config{}, fmt.Errorf("duplicate model member %s found in group: %s", member, groupID)
+				}
+				prevSet[member] = true
+
+				if existingGroup, exists := memberUsage[member]; exists {
+					return Config{}, fmt.Errorf("model member %s is used in multiple groups: %s and %s", member, existingGroup, groupID)
+				}
+				memberUsage[member] = groupID
+			}
+		}
+	}
+
+	// Build the canonical Config.Routing from the effective result. Both legacy
+	// and new-style configs converge here. The Matrix pointer is shared so
+	// ExpandedSets stays in one place.
+	if config.Matrix != nil {
+		config.Routing.Router.Use = "matrix"
+	} else {
+		config.Routing.Router.Use = "group"
+	}
+	config.Routing.Router.Settings.Matrix = config.Matrix
+	config.Routing.Router.Settings.Groups = config.Groups
+
+	if config.Routing.Scheduler.Use == "" {
+		config.Routing.Scheduler.Use = "fifo"
+	}
+	if config.Routing.Scheduler.Use != "fifo" {
+		return Config{}, fmt.Errorf("routing.scheduler.use: unknown scheduler %q (valid: fifo)", config.Routing.Scheduler.Use)
+	}
+	for modelID := range config.Routing.Scheduler.Settings.Fifo.Priority {
+		if _, found := config.RealModelName(modelID); !found {
+			return Config{}, fmt.Errorf("routing.scheduler.settings.fifo.priority references unknown model %q", modelID)
+		}
+	}
+
+	// Clean up hooks preload
+	if len(config.Hooks.OnStartup.Preload) > 0 {
+		var toPreload []string
+		for _, modelID := range config.Hooks.OnStartup.Preload {
+			modelID = strings.TrimSpace(modelID)
+			if modelID == "" {
+				continue
+			}
+			if real, found := config.RealModelName(modelID); found {
+				toPreload = append(toPreload, real)
+			}
+		}
+		config.Hooks.OnStartup.Preload = toPreload
+	}
+
+	// Validate API keys (env macros already substituted at string level)
+	for i, apikey := range config.RequiredAPIKeys {
+		if apikey == "" {
+			return Config{}, fmt.Errorf("empty api key found in apiKeys")
+		}
+		if strings.Contains(apikey, " ") {
+			return Config{}, fmt.Errorf("api key cannot contain spaces: `%s`", apikey)
+		}
+		config.RequiredAPIKeys[i] = apikey
+	}
+
+	// Process peers with global macro substitution
+	for peerName, peerConfig := range config.Peers {
+		// Substitute global macros (LIFO order)
+		for i := len(config.Macros) - 1; i >= 0; i-- {
+			entry := config.Macros[i]
+			macroSlug := fmt.Sprintf("${%s}", entry.Name)
+			macroStr := fmt.Sprintf("%v", entry.Value)
+
+			peerConfig.ApiKey = strings.ReplaceAll(peerConfig.ApiKey, macroSlug, macroStr)
+			peerConfig.Filters.StripParams = strings.ReplaceAll(peerConfig.Filters.StripParams, macroSlug, macroStr)
+
+			// Substitute in setParams (type-preserving)
+			if len(peerConfig.Filters.SetParams) > 0 {
+				result, err := substituteMacroInValue(peerConfig.Filters.SetParams, entry.Name, entry.Value)
+				if err != nil {
+					return Config{}, fmt.Errorf("peers.%s.filters.setParams: %w", peerName, err)
+				}
+				peerConfig.Filters.SetParams = result.(map[string]any)
+			}
+		}
+
+		// Validate no unknown macros remain
+		if matches := macroPatternRegex.FindAllStringSubmatch(peerConfig.ApiKey, -1); len(matches) > 0 {
+			return Config{}, fmt.Errorf("peers.%s.apiKey: unknown macro '${%s}'", peerName, matches[0][1])
+		}
+		if matches := macroPatternRegex.FindAllStringSubmatch(peerConfig.Filters.StripParams, -1); len(matches) > 0 {
+			return Config{}, fmt.Errorf("peers.%s.filters.stripParams: unknown macro '${%s}'", peerName, matches[0][1])
+		}
+		if len(peerConfig.Filters.SetParams) > 0 {
+			if err := validateNestedForUnknownMacros(peerConfig.Filters.SetParams, fmt.Sprintf("peers.%s.filters.setParams", peerName)); err != nil {
+				return Config{}, err
+			}
+		}
+		config.Peers[peerName] = peerConfig
+	}
+
+	return config, nil
+}
+
+// AddDefaultGroupToConfig returns config with a DEFAULT_GROUP_ID group holding
+// every model that is not listed as a member of any group.
+//
+// The default group is created with Swap and Exclusive set to true and its
+// members sorted for deterministic output. A nil config.Groups map is
+// initialised first; a non-nil map is written to in place. Any entry already
+// stored under DEFAULT_GROUP_ID is replaced, and the models it listed count as
+// grouped, so they are not carried over into the replacement.
+func AddDefaultGroupToConfig(config Config) Config {
+	if config.Groups == nil {
+		config.Groups = make(map[string]GroupConfig)
+	}
+
+	// Gather every model name that some group already claims.
+	grouped := make(map[string]struct{})
+	for _, group := range config.Groups {
+		for _, member := range group.Members {
+			grouped[member] = struct{}{}
+		}
+	}
+
+	// Whatever is left is an orphan. With no groups defined nothing is
+	// claimed, so every model lands here.
+	orphans := []string{}
+	for modelName := range config.Models {
+		if _, claimed := grouped[modelName]; !claimed {
+			orphans = append(orphans, modelName)
+		}
+	}
+	sort.Strings(orphans) // make consistent ordering for testing
+
+	config.Groups[DEFAULT_GROUP_ID] = GroupConfig{
+		Swap:      true,
+		Exclusive: true,
+		Members:   orphans,
+	}
+
+	return config
+}
+
+// SanitizeCommand turns a (possibly multi-line) command string into an
+// argument list.
+//
+// Lines whose first non-space character is '#' are dropped. A line ending in
+// a backslash is trimmed, loses the backslash and gets a trailing space. All
+// other lines are kept verbatim. The remaining text is split with the Windows
+// lexer when running on Windows and with the POSIX lexer otherwise.
+//
+// Returns an error when the resulting argument list is empty.
+func SanitizeCommand(cmdStr string) ([]string, error) {
+	var kept []string
+	for _, raw := range strings.Split(cmdStr, "\n") {
+		stripped := strings.TrimSpace(raw)
+		switch {
+		case strings.HasPrefix(stripped, "#"):
+			// comment line: drop it entirely
+		case strings.HasSuffix(stripped, "\\"):
+			// line continuation: swap the trailing backslash for a space
+			kept = append(kept, strings.TrimSuffix(stripped, "\\")+" ")
+		default:
+			kept = append(kept, raw)
+		}
+	}
+
+	joined := strings.Join(kept, "\n")
+
+	// Tokenise using the quoting rules of the host platform
+	var args []string
+	switch runtime.GOOS {
+	case "windows":
+		args = shlex.Windows.Split(joined)
+	default:
+		args = shlex.Posix.Split(joined)
+	}
+
+	if len(args) == 0 {
+		return nil, fmt.Errorf("empty command")
+	}
+	return args, nil
+}
+
+// StripComments removes every line whose first non-space character is '#'
+// and returns the remaining lines, unmodified, joined by newlines.
+func StripComments(cmdStr string) string {
+	lines := strings.Split(cmdStr, "\n")
+
+	// Filter in place: kept shares the backing array of lines and never
+	// grows past the index currently being read.
+	kept := lines[:0]
+	for _, line := range lines {
+		if !strings.HasPrefix(strings.TrimSpace(line), "#") {
+			kept = append(kept, line)
+		}
+	}
+	return strings.Join(kept, "\n")
+}
+
+// validateMacro checks a macro definition and returns the first problem found.
+//
+// Checks run in this order:
+//  1. name is shorter than 64 characters
+//  2. name matches macroNameRegex
+//  3. value is a string, integer, float or bool; a string value must not
+//     contain a reference to the macro itself
+//  4. name is not one of the reserved names PORT or MODEL_ID
+func validateMacro(name string, value any) error {
+	switch {
+	case len(name) >= 64:
+		return fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", name)
+	case !macroNameRegex.MatchString(name):
+		return fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", name)
+	}
+
+	switch typed := value.(type) {
+	case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64, float32, float64, bool:
+		// numeric and boolean scalars need no further checks
+	case string:
+		// a macro that expands to itself could never be resolved
+		if strings.Contains(typed, "${"+name+"}") {
+			return fmt.Errorf("macro '%s' contains self-reference", name)
+		}
+	default:
+		return fmt.Errorf("macro '%s' has invalid type %T, must be a scalar type (string, int, float, or bool)", name, value)
+	}
+
+	if name == "PORT" || name == "MODEL_ID" {
+		return fmt.Errorf("macro name '%s' is reserved", name)
+	}
+
+	return nil
+}
+
+// validateNestedForUnknownMacros walks value (strings, map[string]any and
+// []any, nested to any depth) and reports the first leftover macro reference.
+//
+// For each string a match of macroPatternRegex is reported as an unknown
+// macro; otherwise a match of envMacroRegex is reported as an unset
+// environment variable. context prefixes the error message. Values of any
+// other type are accepted without inspection.
+func validateNestedForUnknownMacros(value any, context string) error {
+	switch v := value.(type) {
+	case string:
+		if m := macroPatternRegex.FindStringSubmatch(v); m != nil {
+			return fmt.Errorf("%s: unknown macro '${%s}'", context, m[1])
+		}
+		// Check for unsubstituted env macros
+		if m := envMacroRegex.FindStringSubmatch(v); m != nil {
+			return fmt.Errorf("%s: environment variable '%s' not set", context, m[1])
+		}
+
+	case map[string]any:
+		for _, child := range v {
+			if err := validateNestedForUnknownMacros(child, context); err != nil {
+				return err
+			}
+		}
+
+	case []any:
+		for _, child := range v {
+			if err := validateNestedForUnknownMacros(child, context); err != nil {
+				return err
+			}
+		}
+	}
+
+	// Nothing found, or a scalar type that cannot contain macros
+	return nil
+}
+
+// substituteMacroInValue returns a copy of value in which every occurrence of
+// ${macroName} inside strings has been replaced.
+//
+// A string that consists of nothing but the macro reference is replaced by
+// macroValue itself, so its original type is kept. A string that merely
+// contains the reference has it replaced by the default text form of
+// macroValue. map[string]any and []any are rebuilt recursively; map keys are
+// left untouched. All other values are returned unchanged.
+//
+// The function handles a single macro per call so callers control the
+// substitution order.
+func substituteMacroInValue(value any, macroName string, macroValue any) (any, error) {
+	switch v := value.(type) {
+	case string:
+		slug := "${" + macroName + "}"
+		if v == slug {
+			// whole-value reference: hand back the typed macro value
+			return macroValue, nil
+		}
+		// interpolation; a string without the slug comes back unchanged
+		return strings.ReplaceAll(v, slug, fmt.Sprint(macroValue)), nil
+
+	case map[string]any:
+		out := make(map[string]any, len(v))
+		for key, child := range v {
+			replaced, err := substituteMacroInValue(child, macroName, macroValue)
+			if err != nil {
+				return nil, err
+			}
+			out[key] = replaced
+		}
+		return out, nil
+
+	case []any:
+		out := make([]any, len(v))
+		for i, child := range v {
+			replaced, err := substituteMacroInValue(child, macroName, macroValue)
+			if err != nil {
+				return nil, err
+			}
+			out[i] = replaced
+		}
+		return out, nil
+	}
+
+	// Scalars other than strings cannot hold a macro reference
+	return value, nil
+}
+
+// substituteEnvMacros replaces ${env.VAR_NAME} references in s with the value
+// of the named environment variable.
+//
+// To avoid acting on references that only appear in YAML comments, s is
+// round-tripped through the YAML parser and only the comment-free result is
+// scanned for references; the substitution itself is applied to the original
+// text. If s cannot be parsed or re-encoded, s itself is scanned instead so
+// the caller sees the environment variable error rather than a YAML one.
+//
+// Errors come from substituteEnvMacrosInString.
+func substituteEnvMacros(s string) (string, error) {
+	// Default to scanning the raw input; upgrade to the comment-free form
+	// only when both YAML steps succeed.
+	scan := s
+
+	var parsed any
+	if err := yaml.Unmarshal([]byte(s), &parsed); err == nil {
+		if stripped, err := yaml.Marshal(parsed); err == nil {
+			scan = string(stripped)
+		}
+	}
+
+	return substituteEnvMacrosInString(s, scan)
+}
+
+// substituteEnvMacrosInString looks for ${env.VAR} references in scanStr and
+// replaces each one found, wherever it occurs, in target.
+//
+// Keeping the two strings separate lets the caller scan a comment-free copy
+// of a document while rewriting the original.
+//
+// Returns an error if a referenced variable is not set or if its value is
+// rejected by sanitizeEnvValueForYAML.
+func substituteEnvMacrosInString(target, scanStr string) (string, error) {
+	out := target
+	for _, m := range envMacroRegex.FindAllStringSubmatch(scanStr, -1) {
+		placeholder, envName := m[0], m[1] // ${env.VAR_NAME}, VAR_NAME
+
+		raw, ok := os.LookupEnv(envName)
+		if !ok {
+			return "", fmt.Errorf("environment variable '%s' is not set", envName)
+		}
+
+		// Make the value safe to splice into YAML text
+		safe, err := sanitizeEnvValueForYAML(raw, envName)
+		if err != nil {
+			return "", err
+		}
+
+		out = strings.ReplaceAll(out, placeholder, safe)
+	}
+	return out, nil
+}
+
+// sanitizeEnvValueForYAML prepares the value of environment variable varName
+// for textual insertion into a YAML document.
+//
+// Values containing a newline, carriage return or NUL byte are rejected with
+// an error. Otherwise every backslash is doubled and every double quote is
+// prefixed with a backslash, which is the escaping a double-quoted YAML
+// string expects. In an unquoted position those escapes show up literally.
+func sanitizeEnvValueForYAML(value, varName string) (string, error) {
+	// These characters break YAML structure regardless of quoting context
+	if strings.IndexAny(value, "\n\r\x00") >= 0 {
+		return "", fmt.Errorf("environment variable '%s' contains newlines or null bytes which are not allowed in YAML substitution", varName)
+	}
+
+	// One pass over the input: the backslash added in front of a quote is
+	// never itself doubled.
+	escaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`)
+	return escaper.Replace(value), nil
+}
@@ -0,0 +1,290 @@
+//go:build !windows
+
+package config
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestConfig_SanitizeCommand checks that a multi-line command using line
+// continuations, both quote styles, comment lines and blank lines is split
+// into the expected arguments, and that an empty command is rejected.
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// command with spaces, newlines, comments and nested quotes
+	input := `python model1.py \
+		-a "double quotes" \
+		--arg2 'single quotes'
+		-s
+		# comment 1
+		--arg3 123 \
+
+		  # comment 2
+		--arg4 '"string in string"'
+
+
+		# this will get stripped out as well as the white space above
+		-c "'single quoted'"
+		`
+	want := []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"--arg2", "single quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", `"string in string"`,
+		"-c", `'single quoted'`,
+	}
+
+	got, err := SanitizeCommand(input)
+	assert.NoError(t, err)
+	assert.Equal(t, want, got)
+
+	// an empty command must produce an error and no arguments
+	got, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, got)
+}
+
+// TestConfig_DefaultValuesPosix loads a minimal single-model configuration and
+// verifies the defaults filled in for the global settings, the default group
+// and the model itself.
+func TestConfig_DefaultValuesPosix(t *testing.T) {
+	yamlSrc := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(yamlSrc))
+	assert.NoError(t, err)
+
+	// global defaults
+	assert.Equal(t, 120, cfg.HealthCheckTimeout)
+	assert.Equal(t, 5800, cfg.StartPort)
+	assert.Equal(t, "info", cfg.LogLevel)
+	assert.Equal(t, "", cfg.LogTimeFormat)
+
+	// the default group is created and contains the only model
+	group, ok := cfg.Groups["(default)"]
+	assert.True(t, ok, "default group should exist")
+	if assert.NotNil(t, group, "default group should not be nil") {
+		assert.True(t, group.Swap)
+		assert.True(t, group.Exclusive)
+		assert.False(t, group.Persistent)
+		assert.Equal(t, []string{"model1"}, group.Members)
+	}
+
+	// per-model defaults
+	model, ok := cfg.Models["model1"]
+	assert.True(t, ok, "model1 should exist")
+	if assert.NotNil(t, model, "model1 should not be nil") {
+		assert.Equal(t, "path/to/cmd --port 5800", model.Cmd) // has the port replaced
+		assert.Equal(t, "", model.CmdStop)
+		assert.Equal(t, "http://localhost:5800", model.Proxy)
+		assert.Equal(t, "/health", model.CheckEndpoint)
+		assert.Equal(t, []string{}, model.Aliases)
+		assert.Equal(t, []string{}, model.Env)
+		assert.Equal(t, 0, model.UnloadAfter)
+		assert.False(t, model.Unlisted)
+		assert.Equal(t, "", model.UseModelName)
+		assert.Equal(t, 0, model.ConcurrencyLimit)
+	}
+
+	// default empty filter exists
+	assert.Equal(t, "", model.Filters.StripParams)
+}
+
+// TestConfig_LoadPosix writes a representative configuration to disk, loads it
+// with LoadConfig and compares the result against a fully spelled-out Config,
+// covering macro expansion, aliases, group defaults and routing defaults.
+func TestConfig_LoadPosix(t *testing.T) {
+	// t.TempDir creates a unique directory and removes it when the test
+	// finishes, so no manual cleanup is required.
+	tempFile := filepath.Join(t.TempDir(), "config.yaml")
+	content := `
+macros:
+  svr-path: "path/to/server"
+hooks:
+  on_startup:
+    preload: ["model1", "model2"]
+models:
+  model1:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8080"
+    name: "Model 1"
+    description: "This is model 1"
+    aliases:
+      - "m1"
+      - "model-one"
+    env:
+      - "VAR1=value1"
+      - "VAR2=value2"
+    checkEndpoint: "/health"
+  model2:
+    cmd: ${svr-path} --arg1 one
+    proxy: "http://localhost:8081"
+    aliases:
+      - "m2"
+    checkEndpoint: "/"
+  model3:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8081"
+    aliases:
+      - "mthree"
+    checkEndpoint: "/"
+  model4:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8082"
+    checkEndpoint: "/"
+
+healthCheckTimeout: 15
+profiles:
+  test:
+    - model1
+    - model2
+groups:
+  group1:
+    swap: true
+    exclusive: false
+    members: ["model2"]
+  forever:
+    exclusive: false
+    persistent: true
+    members:
+      - "model4"
+`
+
+	if err := os.WriteFile(tempFile, []byte(content), 0644); err != nil {
+		t.Fatalf("Failed to write temporary file: %v", err)
+	}
+
+	// Load the config and verify
+	config, err := LoadConfig(tempFile)
+	if err != nil {
+		t.Fatalf("Failed to load config: %v", err)
+	}
+
+	// every model is expected to point at its own copy of this value
+	modelLoadingState := false
+
+	defaultTimeout := TimeoutsConfig{
+		Connect:        30,
+		KeepAlive:      30,
+		ResponseHeader: 0,
+		TLSHandshake:   10,
+		ExpectContinue: 1,
+		IdleConn:       90,
+	}
+
+	// model1 and model3 are in no declared group, so they are expected in
+	// the default group
+	expectedGroups := map[string]GroupConfig{
+		DEFAULT_GROUP_ID: {
+			Swap:      true,
+			Exclusive: true,
+			Members:   []string{"model1", "model3"},
+		},
+		"group1": {
+			Swap:      true,
+			Exclusive: false,
+			Members:   []string{"model2"},
+		},
+		"forever": {
+			Swap:       true,
+			Exclusive:  false,
+			Persistent: true,
+			Members:    []string{"model4"},
+		},
+	}
+
+	expected := Config{
+		LogLevel:      "info",
+		LogTimeFormat: "",
+		LogToStdout:   LogToStdoutProxy,
+		StartPort:     5800,
+		Macros: MacroList{
+			{"svr-path", "path/to/server"},
+		},
+		Hooks: HooksConfig{
+			OnStartup: HookOnStartup{
+				Preload: []string{"model1", "model2"},
+			},
+		},
+		SendLoadingState: false,
+		Models: map[string]ModelConfig{
+			"model1": {
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8080",
+				Aliases:            []string{"m1", "model-one"},
+				Env:                []string{"VAR1=value1", "VAR2=value2"},
+				CheckEndpoint:      "/health",
+				Name:               "Model 1",
+				Description:        "This is model 1",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model2": {
+				Cmd:                "path/to/server --arg1 one",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"m2"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model3": {
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"mthree"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model4": {
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8082",
+				CheckEndpoint:      "/",
+				Aliases:            []string{},
+				Env:                []string{},
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+		},
+		HealthCheckTimeout: 15,
+		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		Performance: PerformanceConfig{
+			Every: 5 * time.Second,
+		},
+		Profiles: map[string][]string{
+			"test": {"model1", "model2"},
+		},
+		aliases: map[string]string{
+			"m1":        "model1",
+			"model-one": "model1",
+			"m2":        "model2",
+			"mthree":    "model3",
+		},
+		Groups: expectedGroups,
+		Upstream: UpstreamConfig{
+			IgnorePaths: DefaultUpstreamIgnorePaths(),
+		},
+		Routing: RoutingConfig{
+			Router: RouterConfig{
+				Use: "group",
+				Settings: RouterSettings{
+					Groups: expectedGroups,
+				},
+			},
+			Scheduler: SchedulerConfig{
+				Use: "fifo",
+			},
+		},
+	}
+
+	assert.Equal(t, expected, config)
+
+	// an alias must resolve to the model that declared it
+	realname, found := config.RealModelName("m1")
+	assert.True(t, found)
+	assert.Equal(t, "model1", realname)
+}
@@ -0,0 +1,60 @@
+package config
+
+import (
+	"encoding/json"
+	"os"
+	"testing"
+
+	"github.com/google/jsonschema-go/jsonschema"
+	"gopkg.in/yaml.v3"
+)
+
+// TestConfig_ExampleMatchesSchema checks that config.example.yaml validates
+// against config-schema.json. Both files are read from two directories above
+// the package directory.
+func TestConfig_ExampleMatchesSchema(t *testing.T) {
+	const (
+		schemaPath  = "../../config-schema.json"
+		examplePath = "../../config.example.yaml"
+	)
+
+	// mustRead returns the file contents or aborts the test.
+	mustRead := func(path string) []byte {
+		data, err := os.ReadFile(path)
+		if err != nil {
+			t.Fatalf("reading %s: %v", path, err)
+		}
+		return data
+	}
+
+	// Load and resolve the schema
+	var schema jsonschema.Schema
+	if err := json.Unmarshal(mustRead(schemaPath), &schema); err != nil {
+		t.Fatalf("unmarshalling schema: %v", err)
+	}
+
+	resolved, err := schema.Resolve(&jsonschema.ResolveOptions{
+		BaseURI: "https://github.com/mostlygeek/llama-swap/",
+	})
+	if err != nil {
+		t.Fatalf("resolving schema: %v", err)
+	}
+
+	// Round-trip the example through JSON so numbers and keys have the
+	// representation the validator expects.
+	var parsed any
+	if err := yaml.Unmarshal(mustRead(examplePath), &parsed); err != nil {
+		t.Fatalf("unmarshalling example yaml: %v", err)
+	}
+	encoded, err := json.Marshal(parsed)
+	if err != nil {
+		t.Fatalf("converting example to json: %v", err)
+	}
+	var instance any
+	if err := json.Unmarshal(encoded, &instance); err != nil {
+		t.Fatalf("unmarshalling example json: %v", err)
+	}
+
+	if err := resolved.Validate(instance); err != nil {
+		t.Fatalf("config.example.yaml does not match config-schema.json:\n%v", err)
+	}
+}
@@ -0,0 +1,279 @@
+//go:build windows
+
+package config
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestConfig_SanitizeCommand checks that a multi-line command using line
+// continuations, quotes, comment lines and blank lines is split into the
+// expected arguments on Windows, and that an empty command is rejected.
+func TestConfig_SanitizeCommand(t *testing.T) {
+	// does not support single quoted strings like in config_posix_test.go
+	input := `python model1.py \
+
+	-a "double quotes" \
+	-s
+	--arg3 123 \
+
+	   # comment 2
+	--arg4 '"string in string"'
+
+
+
+	# this will get stripped out as well as the white space above
+	-c "'single quoted'"
+	`
+	want := []string{
+		"python", "model1.py",
+		"-a", "double quotes",
+		"-s",
+		"--arg3", "123",
+		"--arg4", "'string in string'", // this is a little weird but the lexer says so...?
+		"-c", `'single quoted'`,
+	}
+
+	got, err := SanitizeCommand(input)
+	assert.NoError(t, err)
+	assert.Equal(t, want, got)
+
+	// an empty command must produce an error and no arguments
+	got, err = SanitizeCommand("")
+	assert.Error(t, err)
+	assert.Nil(t, got)
+}
+
+// TestConfig_DefaultValuesWindows loads a minimal single-model configuration
+// and verifies the defaults filled in for the global settings, the default
+// group and the model itself, including the taskkill-based CmdStop.
+func TestConfig_DefaultValuesWindows(t *testing.T) {
+	yamlSrc := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(yamlSrc))
+	assert.NoError(t, err)
+
+	// global defaults
+	assert.Equal(t, 120, cfg.HealthCheckTimeout)
+	assert.Equal(t, 5800, cfg.StartPort)
+	assert.Equal(t, "info", cfg.LogLevel)
+	assert.Equal(t, "", cfg.LogTimeFormat)
+
+	// the default group is created and contains the only model
+	group, ok := cfg.Groups["(default)"]
+	assert.True(t, ok, "default group should exist")
+	if assert.NotNil(t, group, "default group should not be nil") {
+		assert.True(t, group.Swap)
+		assert.True(t, group.Exclusive)
+		assert.False(t, group.Persistent)
+		assert.Equal(t, []string{"model1"}, group.Members)
+	}
+
+	// per-model defaults
+	model, ok := cfg.Models["model1"]
+	assert.True(t, ok, "model1 should exist")
+	if assert.NotNil(t, model, "model1 should not be nil") {
+		assert.Equal(t, "path/to/cmd --port 5800", model.Cmd) // has the port replaced
+		assert.Equal(t, "taskkill /f /t /pid ${PID}", model.CmdStop)
+		assert.Equal(t, "http://localhost:5800", model.Proxy)
+		assert.Equal(t, "/health", model.CheckEndpoint)
+		assert.Equal(t, []string{}, model.Aliases)
+		assert.Equal(t, []string{}, model.Env)
+		assert.Equal(t, 0, model.UnloadAfter)
+		assert.False(t, model.Unlisted)
+		assert.Equal(t, "", model.UseModelName)
+		assert.Equal(t, 0, model.ConcurrencyLimit)
+	}
+
+	// default empty filter exists
+	assert.Equal(t, "", model.Filters.StripParams)
+}
+
+// TestConfig_LoadWindows writes a representative configuration to disk, loads
+// it with LoadConfig and compares the result against a fully spelled-out
+// Config, covering macro expansion, aliases, group defaults, routing defaults
+// and the taskkill-based CmdStop expected for every model.
+func TestConfig_LoadWindows(t *testing.T) {
+	// t.TempDir creates a unique directory and removes it when the test
+	// finishes, so no manual cleanup is required.
+	tempFile := filepath.Join(t.TempDir(), "config.yaml")
+	content := `
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8080"
+    aliases:
+      - "m1"
+      - "model-one"
+    env:
+      - "VAR1=value1"
+      - "VAR2=value2"
+    checkEndpoint: "/health"
+  model2:
+    cmd: ${svr-path} --arg1 one
+    proxy: "http://localhost:8081"
+    aliases:
+      - "m2"
+    checkEndpoint: "/"
+  model3:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8081"
+    aliases:
+      - "mthree"
+    checkEndpoint: "/"
+  model4:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8082"
+    checkEndpoint: "/"
+
+healthCheckTimeout: 15
+profiles:
+  test:
+    - model1
+    - model2
+groups:
+  group1:
+    swap: true
+    exclusive: false
+    members: ["model2"]
+  forever:
+    exclusive: false
+    persistent: true
+    members:
+      - "model4"
+`
+
+	if err := os.WriteFile(tempFile, []byte(content), 0644); err != nil {
+		t.Fatalf("Failed to write temporary file: %v", err)
+	}
+
+	// Load the config and verify
+	config, err := LoadConfig(tempFile)
+	if err != nil {
+		t.Fatalf("Failed to load config: %v", err)
+	}
+
+	// every model is expected to point at its own copy of this value
+	modelLoadingState := false
+
+	defaultTimeout := TimeoutsConfig{
+		Connect:        30,
+		KeepAlive:      30,
+		ResponseHeader: 0,
+		TLSHandshake:   10,
+		ExpectContinue: 1,
+		IdleConn:       90,
+	}
+
+	// model1 and model3 are in no declared group, so they are expected in
+	// the default group
+	expectedGroups := map[string]GroupConfig{
+		DEFAULT_GROUP_ID: {
+			Swap:      true,
+			Exclusive: true,
+			Members:   []string{"model1", "model3"},
+		},
+		"group1": {
+			Swap:      true,
+			Exclusive: false,
+			Members:   []string{"model2"},
+		},
+		"forever": {
+			Swap:       true,
+			Exclusive:  false,
+			Persistent: true,
+			Members:    []string{"model4"},
+		},
+	}
+
+	expected := Config{
+		LogLevel:      "info",
+		LogTimeFormat: "",
+		LogToStdout:   LogToStdoutProxy,
+		StartPort:     5800,
+		Macros: MacroList{
+			{"svr-path", "path/to/server"},
+		},
+		SendLoadingState: false,
+		Models: map[string]ModelConfig{
+			"model1": {
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8080",
+				Aliases:            []string{"m1", "model-one"},
+				Env:                []string{"VAR1=value1", "VAR2=value2"},
+				CheckEndpoint:      "/health",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model2": {
+				Cmd:                "path/to/server --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"m2"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model3": {
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"mthree"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+			"model4": {
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8082",
+				CheckEndpoint:      "/",
+				Aliases:            []string{},
+				Env:                []string{},
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
+			},
+		},
+		HealthCheckTimeout: 15,
+		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		Performance: PerformanceConfig{
+			Every: 5 * time.Second,
+		},
+		Profiles: map[string][]string{
+			"test": {"model1", "model2"},
+		},
+		aliases: map[string]string{
+			"m1":        "model1",
+			"model-one": "model1",
+			"m2":        "model2",
+			"mthree":    "model3",
+		},
+		Groups: expectedGroups,
+		Upstream: UpstreamConfig{
+			IgnorePaths: DefaultUpstreamIgnorePaths(),
+		},
+		Routing: RoutingConfig{
+			Router: RouterConfig{
+				Use: "group",
+				Settings: RouterSettings{
+					Groups: expectedGroups,
+				},
+			},
+			Scheduler: SchedulerConfig{
+				Use: "fifo",
+			},
+		},
+	}
+
+	assert.Equal(t, expected, config)
+
+	// an alias must resolve to the model that declared it
+	realname, found := config.RealModelName("m1")
+	assert.True(t, found)
+	assert.Equal(t, "model1", realname)
+}
@@ -0,0 +1,114 @@
+package config
+
+import (
+	"slices"
+	"sort"
+	"strings"
+)
+
+// ProtectedParams lists the request parameters that filters may neither set nor strip.
+// They are protected to prevent breaking the proxy's ability to route requests correctly.
+// The Sanitized* methods on Filters skip every name found in this slice.
+var ProtectedParams = []string{"model"}
+
+// Filters contains filter settings for modifying request parameters.
+// Used by both models and peers.
+type Filters struct {
+	// StripParams is a comma-separated list of parameters to remove from requests.
+	// The "model" parameter can never be removed. SanitizedStripParams returns the
+	// trimmed, de-duplicated and sorted form of this list.
+	StripParams string `yaml:"stripParams"`
+
+	// SetParams is a dictionary of parameters to set/override in requests.
+	// Protected params (like "model") cannot be set; SanitizedSetParams drops them.
+	SetParams map[string]any `yaml:"setParams"`
+
+	// SetParamsByID maps requested model IDs to parameters to set/override in requests.
+	// Useful with aliases: a single loaded model can behave differently depending on
+	// which alias the client used. Applied after SetParams, so it can override those values.
+	// Protected params (like "model") cannot be set; SanitizedSetParamsByID drops them.
+	SetParamsByID map[string]map[string]any `yaml:"setParamsByID"`
+}
+
+// SanitizedStripParams splits StripParams on commas and returns the resulting
+// names trimmed of whitespace, de-duplicated and sorted. Empty entries and
+// protected params are dropped. It returns nil when nothing is left to strip.
+func (f Filters) SanitizedStripParams() []string {
+	if f.StripParams == "" {
+		return nil
+	}
+
+	fields := strings.Split(f.StripParams, ",")
+	visited := make(map[string]struct{}, len(fields))
+	names := make([]string, 0, len(fields))
+
+	for _, field := range fields {
+		name := strings.TrimSpace(field)
+		switch {
+		case name == "":
+			// Blank entry, e.g. from a trailing comma.
+			continue
+		case slices.Contains(ProtectedParams, name):
+			// Protected params may never be stripped.
+			continue
+		}
+		if _, dup := visited[name]; dup {
+			continue
+		}
+		visited[name] = struct{}{}
+		names = append(names, name)
+	}
+
+	if len(names) == 0 {
+		return nil
+	}
+
+	slices.Sort(names)
+	return names
+}
+
+// SanitizedSetParamsByID looks up the params configured for requestedModelID and
+// returns them with protected params removed, together with the remaining keys
+// in sorted order so callers can iterate deterministically.
+// Both results are nil if the ID has no entry or all its params are protected.
+func (f Filters) SanitizedSetParamsByID(requestedModelID string) (map[string]any, []string) {
+	// Indexing a nil or empty map yields a nil entry, which the length check covers.
+	overrides := f.SetParamsByID[requestedModelID]
+	if len(overrides) == 0 {
+		return nil, nil
+	}
+
+	keys := make([]string, 0, len(overrides))
+	for name := range overrides {
+		if !slices.Contains(ProtectedParams, name) {
+			keys = append(keys, name)
+		}
+	}
+	if len(keys) == 0 {
+		return nil, nil
+	}
+	sort.Strings(keys)
+
+	allowed := make(map[string]any, len(keys))
+	for _, name := range keys {
+		allowed[name] = overrides[name]
+	}
+	return allowed, keys
+}
+
+// SanitizedSetParams returns a copy of SetParams without protected params,
+// plus the remaining keys in sorted order for consistent iteration.
+// Both results are nil when SetParams is empty or holds only protected params.
+func (f Filters) SanitizedSetParams() (map[string]any, []string) {
+	if len(f.SetParams) == 0 {
+		return nil, nil
+	}
+
+	// Collect the permitted keys first, then build the copy from them.
+	keys := make([]string, 0, len(f.SetParams))
+	for name := range f.SetParams {
+		if !slices.Contains(ProtectedParams, name) {
+			keys = append(keys, name)
+		}
+	}
+	if len(keys) == 0 {
+		return nil, nil
+	}
+	sort.Strings(keys)
+
+	allowed := make(map[string]any, len(keys))
+	for _, name := range keys {
+		allowed[name] = f.SetParams[name]
+	}
+	return allowed, keys
+}
@@ -0,0 +1,285 @@
+package config
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestFilters_SanitizedStripParams feeds raw stripParams strings through
+// SanitizedStripParams and checks the trimmed, de-duplicated, sorted result.
+func TestFilters_SanitizedStripParams(t *testing.T) {
+	cases := []struct {
+		name     string
+		raw      string
+		expected []string
+	}{
+		{name: "empty string", raw: "", expected: nil},
+		{name: "single param", raw: "temperature", expected: []string{"temperature"}},
+		// Output is sorted, so top_k precedes top_p.
+		{name: "multiple params", raw: "temperature, top_p, top_k", expected: []string{"temperature", "top_k", "top_p"}},
+		{name: "model param filtered", raw: "model, temperature, top_p", expected: []string{"temperature", "top_p"}},
+		{name: "only model param", raw: "model", expected: nil},
+		{name: "duplicates removed", raw: "temperature, top_p, temperature", expected: []string{"temperature", "top_p"}},
+		{name: "extra whitespace", raw: "  temperature  ,  top_p  ", expected: []string{"temperature", "top_p"}},
+		{name: "empty values filtered", raw: "temperature,,top_p,", expected: []string{"temperature", "top_p"}},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := Filters{StripParams: tc.raw}.SanitizedStripParams()
+			assert.Equal(t, tc.expected, got)
+		})
+	}
+}
+
+// TestFilters_SanitizedSetParams checks that SanitizedSetParams drops protected
+// params, returns the remaining keys sorted, and returns nil for both results
+// when nothing is left.
+func TestFilters_SanitizedSetParams(t *testing.T) {
+	tests := []struct {
+		name       string
+		setParams  map[string]any
+		wantParams map[string]any
+		wantKeys   []string
+	}{
+		{
+			name:       "empty setParams",
+			setParams:  nil,
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name:       "empty map",
+			setParams:  map[string]any{},
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name: "normal params",
+			setParams: map[string]any{
+				"temperature": 0.7,
+				"top_p":       0.9,
+			},
+			wantParams: map[string]any{
+				"temperature": 0.7,
+				"top_p":       0.9,
+			},
+			wantKeys: []string{"temperature", "top_p"},
+		},
+		{
+			name: "protected model param filtered",
+			setParams: map[string]any{
+				"model":       "should-be-filtered",
+				"temperature": 0.7,
+			},
+			wantParams: map[string]any{
+				"temperature": 0.7,
+			},
+			wantKeys: []string{"temperature"},
+		},
+		{
+			name: "only protected param",
+			setParams: map[string]any{
+				"model": "should-be-filtered",
+			},
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name: "complex nested values",
+			setParams: map[string]any{
+				"provider": map[string]any{
+					"data_collection": "deny",
+					"allow_fallbacks": false,
+				},
+				"transforms": []string{"middle-out"},
+			},
+			wantParams: map[string]any{
+				"provider": map[string]any{
+					"data_collection": "deny",
+					"allow_fallbacks": false,
+				},
+				"transforms": []string{"middle-out"},
+			},
+			wantKeys: []string{"provider", "transforms"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := Filters{SetParams: tt.setParams}
+			gotParams, gotKeys := f.SanitizedSetParams()
+
+			if tt.wantParams == nil {
+				assert.Nil(t, gotParams, "expected nil params")
+				assert.Nil(t, gotKeys, "expected nil keys")
+				return
+			}
+
+			// Compare whole values rather than element by element: this also
+			// verifies nested maps and slices, and cannot index past the end of
+			// wantKeys when more keys than expected are returned.
+			assert.Equal(t, tt.wantKeys, gotKeys)
+			assert.Equal(t, tt.wantParams, gotParams)
+		})
+	}
+}
+
+// TestFilters_SanitizedSetParamsByID checks the per-model-ID lookup: missing IDs
+// and fully protected entries yield nil, otherwise protected params are removed
+// and the keys come back sorted.
+func TestFilters_SanitizedSetParamsByID(t *testing.T) {
+	type byID = map[string]map[string]any
+
+	// wantParams and wantKeys are left at their nil zero value for nil-result cases.
+	cases := []struct {
+		name       string
+		overrides  byID
+		modelID    string
+		wantParams map[string]any
+		wantKeys   []string
+	}{
+		{
+			name:      "empty SetParamsByID returns nil",
+			overrides: nil,
+			modelID:   "model1",
+		},
+		{
+			name:      "empty map returns nil",
+			overrides: byID{},
+			modelID:   "model1",
+		},
+		{
+			name:      "non-matching model ID returns nil",
+			overrides: byID{"model2": {"temperature": 0.9}},
+			modelID:   "model1",
+		},
+		{
+			name: "matching model ID returns correct params",
+			overrides: byID{
+				"model1": {"temperature": 0.7, "top_p": 0.9},
+				"model2": {"temperature": 0.5},
+			},
+			modelID:    "model1",
+			wantParams: map[string]any{"temperature": 0.7, "top_p": 0.9},
+			wantKeys:   []string{"temperature", "top_p"},
+		},
+		{
+			name: "protected param model is filtered out",
+			overrides: byID{
+				"model1": {"model": "should-be-filtered", "temperature": 0.7},
+			},
+			modelID:    "model1",
+			wantParams: map[string]any{"temperature": 0.7},
+			wantKeys:   []string{"temperature"},
+		},
+		{
+			name: "only protected param returns nil",
+			overrides: byID{
+				"model1": {"model": "should-be-filtered"},
+			},
+			modelID: "model1",
+		},
+		{
+			name: "keys are sorted",
+			overrides: byID{
+				"model1": {"z_param": "z", "a_param": "a", "m_param": "m"},
+			},
+			modelID:    "model1",
+			wantParams: map[string]any{"z_param": "z", "a_param": "a", "m_param": "m"},
+			wantKeys:   []string{"a_param", "m_param", "z_param"},
+		},
+		{
+			name: "alias style key lookup",
+			overrides: byID{
+				"model1:high": {"reasoning_effort": "high"},
+				"model1:low":  {"reasoning_effort": "low"},
+			},
+			modelID:    "model1:high",
+			wantParams: map[string]any{"reasoning_effort": "high"},
+			wantKeys:   []string{"reasoning_effort"},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			filters := Filters{SetParamsByID: tc.overrides}
+			params, keys := filters.SanitizedSetParamsByID(tc.modelID)
+
+			if tc.wantParams != nil {
+				assert.Equal(t, tc.wantKeys, keys)
+				assert.Equal(t, tc.wantParams, params)
+				return
+			}
+
+			assert.Nil(t, params)
+			assert.Nil(t, keys)
+		})
+	}
+}
+
+// TestProtectedParams guards the invariant that "model" stays protected.
+func TestProtectedParams(t *testing.T) {
+	const mustProtect = "model"
+	assert.Contains(t, ProtectedParams, mustProtect)
+}
@@ -0,0 +1,179 @@
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// A macro whose value references another macro is fully substituted in cmd.
+func TestConfig_MacroInMacroBasic(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "A": "value-A"
+  "B": "prefix-${A}-suffix"
+
+models:
+  test:
+    cmd: echo ${B}
+    proxy: http://localhost:8080
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	gotCmd := cfg.Models["test"].Cmd
+	assert.Equal(t, "echo prefix-value-A-suffix", gotCmd)
+}
+
+// A chain of three macros (full -> path -> base) resolves completely (LIFO substitution order).
+func TestConfig_MacroInMacroLIFOOrder(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "base": "/models"
+  "path": "${base}/llama"
+  "full": "${path}/model.gguf"
+
+models:
+  test:
+    cmd: load ${full}
+    proxy: http://localhost:8080
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	gotCmd := cfg.Models["test"].Cmd
+	assert.Equal(t, "load /models/llama/model.gguf", gotCmd)
+}
+
+// ${MODEL_ID} inside a global macro resolves to the ID of the model using it.
+func TestConfig_ModelIdInGlobalMacro(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "podman-llama": "podman run --name ${MODEL_ID} ghcr.io/ggml-org/llama.cpp:server-cuda"
+
+models:
+  my-model:
+    cmd: ${podman-llama} -m model.gguf
+    proxy: http://localhost:8080
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	gotCmd := cfg.Models["my-model"].Cmd
+	assert.Equal(t, "podman run --name my-model ghcr.io/ggml-org/llama.cpp:server-cuda -m model.gguf", gotCmd)
+}
+
+// A model-level macro takes precedence over a global macro of the same name,
+// even when referenced indirectly through another global macro.
+func TestConfig_ModelMacroOverridesGlobal(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "tag": "global"
+  "msg": "value-${tag}"
+
+models:
+  test:
+    macros:
+      "tag": "model-level"
+    cmd: echo ${msg}
+    proxy: http://localhost:8080
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	gotCmd := cfg.Models["test"].Cmd
+	assert.Equal(t, "echo value-model-level", gotCmd)
+}
+
+// A macro that references itself must be rejected with an error that names
+// the macro and mentions the self-reference.
+func TestConfig_SelfReferenceDetection(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "recursive": "value-${recursive}"
+
+models:
+  test:
+    cmd: echo ${recursive}
+    proxy: http://localhost:8080
+`
+
+	_, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.Error(t, loadErr)
+	for _, fragment := range []string{"recursive", "self-reference"} {
+		assert.Contains(t, loadErr.Error(), fragment)
+	}
+}
+
+// Macros are substituted in the name and description fields of a model.
+func TestConfig_MacroInNameAndDescription(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "VARIANT": "Q4_K_M"
+  "FAMILY": "llama"
+
+models:
+  my-model:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "${FAMILY} ${VARIANT}"
+    description: "A ${FAMILY} model in ${VARIANT} format"
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	model := cfg.Models["my-model"]
+	assert.Equal(t, "llama Q4_K_M", model.Name)
+	assert.Equal(t, "A llama model in Q4_K_M format", model.Description)
+}
+
+// ${MODEL_ID} is substituted in the name and description fields.
+func TestConfig_ModelIDInNameAndDescription(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+models:
+  llama-3b:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "Model: ${MODEL_ID}"
+    description: "Running ${MODEL_ID}"
+`
+
+	cfg, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, loadErr)
+
+	model := cfg.Models["llama-3b"]
+	assert.Equal(t, "Model: llama-3b", model.Name)
+	assert.Equal(t, "Running llama-3b", model.Description)
+}
+
+// An unknown macro in the name field fails loading with an error naming the macro.
+func TestConfig_UnknownMacroInNameDescription(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+models:
+  test:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "Model ${UNDEFINED}"
+`
+
+	_, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.Error(t, loadErr)
+
+	const missingMacro = "UNDEFINED"
+	assert.Contains(t, loadErr.Error(), missingMacro)
+}
+
+// A macro whose value references an undefined macro fails loading with an
+// error naming the undefined macro.
+func TestConfig_UndefinedMacroReference(t *testing.T) {
+	const yamlDoc = `
+startPort: 10000
+macros:
+  "A": "value-${UNDEFINED}"
+
+models:
+  test:
+    cmd: echo ${A}
+    proxy: http://localhost:8080
+`
+
+	_, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.Error(t, loadErr)
+
+	const missingMacro = "UNDEFINED"
+	assert.Contains(t, loadErr.Error(), missingMacro)
+}
@@ -0,0 +1,229 @@
+package config
+
+import (
+	"fmt"
+	"regexp"
+	"sort"
+
+	"gopkg.in/yaml.v3"
+)
+
+// varKeyPattern matches a valid matrix var key: 1 to 8 ASCII letters or digits.
+// ValidateMatrix rejects var keys that do not match.
+var varKeyPattern = regexp.MustCompile(`^[a-zA-Z0-9]{1,8}$`)
+
+// MatrixConfig represents the swap matrix configuration block.
+type MatrixConfig struct {
+	// Var maps a short var ID (validated against varKeyPattern) to a model name.
+	// Note the yaml key is the plural "vars".
+	Var        map[string]string `yaml:"vars"`
+	// EvictCosts maps a var ID to a positive eviction cost.
+	EvictCosts map[string]int    `yaml:"evict_costs"`
+	// Sets holds the named DSL expressions in their YAML definition order.
+	Sets       OrderedSets       `yaml:"sets"`
+
+	// ExpandedSets is not settable from yaml. It is meant to hold the result of
+	// ValidateMatrix; ValidateMatrix itself receives the config by value and only
+	// returns the slice, so the caller is responsible for storing it here.
+	ExpandedSets []ExpandedSet `yaml:"-"`
+}
+
+// SetEntry is a single named set with its DSL expression.
+type SetEntry struct {
+	// Name is the YAML mapping key of the set; other sets refer to it as +Name.
+	Name string
+	// DSL is the unparsed expression, e.g. "(a | b) & c".
+	DSL  string
+}
+
+// OrderedSets preserves YAML definition order of sets (used for tie-breaking).
+type OrderedSets []SetEntry
+
+// UnmarshalYAML decodes a YAML mapping of set name -> DSL string into a slice,
+// keeping the entries in the order they appear in the document. Any node that
+// is not a mapping is rejected.
+func (sets *OrderedSets) UnmarshalYAML(value *yaml.Node) error {
+	if value.Kind != yaml.MappingNode {
+		return fmt.Errorf("sets must be a mapping")
+	}
+
+	// A mapping node stores its content as alternating key and value nodes.
+	pairs := value.Content
+	parsed := make([]SetEntry, 0, len(pairs)/2)
+
+	for idx := 0; idx < len(pairs); idx += 2 {
+		keyNode, dslNode := pairs[idx], pairs[idx+1]
+
+		var entry SetEntry
+		if err := keyNode.Decode(&entry.Name); err != nil {
+			return fmt.Errorf("failed to decode set name: %w", err)
+		}
+		if err := dslNode.Decode(&entry.DSL); err != nil {
+			return fmt.Errorf("failed to decode DSL for set %q: %w", entry.Name, err)
+		}
+
+		parsed = append(parsed, entry)
+	}
+
+	*sets = parsed
+	return nil
+}
+
+// ExpandedSet is one valid combination of concurrent models (real model names).
+// ValidateMatrix emits one ExpandedSet per combination produced by a set's DSL.
+type ExpandedSet struct {
+	// SetName is the name of the set this combination was expanded from.
+	SetName string
+	// DSL is the original expression of that set.
+	DSL     string
+	Models  []string // real model names, sorted
+}
+
+// ValidateMatrix validates the matrix config and returns all expanded sets.
+//
+// Validation steps, in order:
+//  1. at least one set and at least one var must be defined;
+//  2. every var key must match varKeyPattern and reference a model in models;
+//  3. every evict_costs entry must be a positive integer keyed by a known var ID;
+//  4. every +ref must name a defined set and the reference graph must be acyclic;
+//  5. each set's DSL is expanded in dependency order and its var IDs are
+//     resolved to model names (sorted within each combination).
+//
+// The total number of combinations across all sets is capped at maxDSLExpansions.
+// vars and evict_costs are checked in sorted key order so that the reported
+// error is the same on every run when several entries are invalid.
+func ValidateMatrix(matrix MatrixConfig, models map[string]ModelConfig) ([]ExpandedSet, error) {
+	if len(matrix.Sets) == 0 {
+		return nil, fmt.Errorf("matrix must define at least one set")
+	}
+
+	if len(matrix.Var) == 0 {
+		return nil, fmt.Errorf("matrix must define at least one var")
+	}
+
+	// Validate var entries (sorted: Go map iteration order is randomized)
+	varIDs := make([]string, 0, len(matrix.Var))
+	for id := range matrix.Var {
+		varIDs = append(varIDs, id)
+	}
+	sort.Strings(varIDs)
+	for _, id := range varIDs {
+		modelName := matrix.Var[id]
+		if !varKeyPattern.MatchString(id) {
+			return nil, fmt.Errorf("var key %q must be alphanumeric and 1-8 characters", id)
+		}
+		if _, exists := models[modelName]; !exists {
+			return nil, fmt.Errorf("var key %q references unknown model %q", id, modelName)
+		}
+	}
+
+	// Validate evict_costs (sorted for the same reason)
+	costKeys := make([]string, 0, len(matrix.EvictCosts))
+	for key := range matrix.EvictCosts {
+		costKeys = append(costKeys, key)
+	}
+	sort.Strings(costKeys)
+	for _, key := range costKeys {
+		cost := matrix.EvictCosts[key]
+		if cost <= 0 {
+			return nil, fmt.Errorf("evict_cost for %q must be a positive integer, got %d", key, cost)
+		}
+		if _, ok := matrix.Var[key]; !ok {
+			return nil, fmt.Errorf("evict_costs: unknown var ID %q", key)
+		}
+	}
+
+	// Build dependency graph for +ref topological sort
+	setNames := make(map[string]bool)
+	for _, entry := range matrix.Sets {
+		setNames[entry.Name] = true
+	}
+
+	deps := make(map[string][]string) // setName -> set names it depends on
+	for _, entry := range matrix.Sets {
+		refs, err := extractRefs(entry.DSL)
+		if err != nil {
+			return nil, fmt.Errorf("set %q: %w", entry.Name, err)
+		}
+		for _, ref := range refs {
+			if !setNames[ref] {
+				return nil, fmt.Errorf("set %q references undefined set %q", entry.Name, ref)
+			}
+		}
+		deps[entry.Name] = refs
+	}
+
+	// Topological sort with cycle detection
+	order, err := topologicalSort(matrix.Sets, deps)
+	if err != nil {
+		return nil, err
+	}
+
+	// Expand sets in topological order
+	resolvedRefs := make(map[string][][]string) // set name -> expanded alias-level combos
+	var allExpanded []ExpandedSet
+	totalCombinations := 0
+
+	// Build a name -> DSL map for efficient lookup
+	setDSL := make(map[string]string)
+	for _, entry := range matrix.Sets {
+		setDSL[entry.Name] = entry.DSL
+	}
+
+	for _, name := range order {
+		dsl := setDSL[name]
+		combos, err := ParseAndExpandDSL(dsl, resolvedRefs)
+		if err != nil {
+			return nil, fmt.Errorf("set %q: %w", name, err)
+		}
+
+		// Record the var-ID-level expansion so later sets can inline it via +ref.
+		resolvedRefs[name] = combos
+
+		// Resolve var IDs to real model names
+		for _, combo := range combos {
+			resolved := make([]string, len(combo))
+			for i, ident := range combo {
+				realName, ok := matrix.Var[ident]
+				if !ok {
+					return nil, fmt.Errorf("set %q: unknown var ID %q", name, ident)
+				}
+				resolved[i] = realName
+			}
+			sort.Strings(resolved)
+			allExpanded = append(allExpanded, ExpandedSet{
+				SetName: name,
+				DSL:     dsl,
+				Models:  resolved,
+			})
+		}
+
+		totalCombinations += len(combos)
+		if totalCombinations > maxDSLExpansions {
+			return nil, fmt.Errorf("total expanded combinations (%d) exceed limit of %d", totalCombinations, maxDSLExpansions)
+		}
+	}
+
+	return allExpanded, nil
+}
+
+// topologicalSort orders set names so that every set appears after the sets it
+// depends on. Sets are visited in definition order, which keeps the output
+// deterministic. An error is returned when the dependencies contain a cycle.
+func topologicalSort(sets OrderedSets, deps map[string][]string) ([]string, error) {
+	const (
+		unvisited = iota
+		inProgress
+		done
+	)
+
+	marks := make(map[string]int)
+	var sorted []string
+
+	var walk func(current string) error
+	walk = func(current string) error {
+		if marks[current] == inProgress {
+			// Reaching a set that is still being expanded means we looped back.
+			return fmt.Errorf("circular reference detected involving set %q", current)
+		}
+		if marks[current] == done {
+			return nil
+		}
+
+		marks[current] = inProgress
+		for _, dependency := range deps[current] {
+			if err := walk(dependency); err != nil {
+				return err
+			}
+		}
+		marks[current] = done
+
+		sorted = append(sorted, current)
+		return nil
+	}
+
+	for _, entry := range sets {
+		if marks[entry.Name] != unvisited {
+			continue
+		}
+		if err := walk(entry.Name); err != nil {
+			return nil, err
+		}
+	}
+
+	return sorted, nil
+}
+
+// ResolvedEvictCosts returns a map of real model name -> evict cost, resolving
+// var IDs through m.Var. A key that is not a var ID is kept unchanged.
+//
+// Only models that appear in EvictCosts are present in the result: this method
+// does not insert a default cost, so callers must apply their own default for
+// models that are missing from the map.
+//
+// Keys are processed in sorted order so the result is deterministic even when
+// several keys resolve to the same model (the alphabetically last key wins).
+func (m *MatrixConfig) ResolvedEvictCosts() map[string]int {
+	costs := make(map[string]int, len(m.EvictCosts))
+
+	keys := make([]string, 0, len(m.EvictCosts))
+	for key := range m.EvictCosts {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+
+	for _, key := range keys {
+		target := key
+		// Resolve var ID if present
+		if realName, ok := m.Var[key]; ok {
+			target = realName
+		}
+		costs[target] = m.EvictCosts[key]
+	}
+	return costs
+}
@@ -0,0 +1,376 @@
+package config
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// maxDSLExpansions caps the number of combinations a DSL expansion may produce.
+// expand enforces it within a single expression, and ValidateMatrix enforces it
+// on the total across all sets.
+const maxDSLExpansions = 1000
+
+// Token types for the DSL lexer.
+// The numeric value of a tokenType shows up in parser.expect error messages.
+type tokenType int
+
+const (
+	tokIdent  tokenType = iota // model alias or name
+	tokAnd                     // &
+	tokOr                      // |
+	tokLParen                  // (
+	tokRParen                  // )
+	tokRef                     // +setName
+	tokEOF                     // end of input; always the last token emitted by tokenize
+)
+
+// token is a single lexical element of a DSL expression.
+type token struct {
+	typ tokenType
+	// val is the token text; for tokRef it is the set name without the leading '+',
+	// and for tokEOF it is empty.
+	val string
+}
+
+// tokenize converts a DSL string into a token slice terminated by tokEOF.
+// Whitespace is skipped. A '+' must be followed directly by a set name, and any
+// character that is neither an operator, a parenthesis nor an identifier
+// character is an error. Positions in error messages are rune indexes.
+func tokenize(input string) ([]token, error) {
+	runes := []rune(input)
+	var out []token
+
+	// scanIdent returns the index just past the run of identifier characters
+	// that starts at from.
+	scanIdent := func(from int) int {
+		end := from
+		for end < len(runes) && isIdentChar(runes[end]) {
+			end++
+		}
+		return end
+	}
+
+	for pos := 0; pos < len(runes); {
+		r := runes[pos]
+
+		switch {
+		case unicode.IsSpace(r):
+			pos++
+		case r == '&':
+			out = append(out, token{tokAnd, "&"})
+			pos++
+		case r == '|':
+			out = append(out, token{tokOr, "|"})
+			pos++
+		case r == '(':
+			out = append(out, token{tokLParen, "("})
+			pos++
+		case r == ')':
+			out = append(out, token{tokRParen, ")"})
+			pos++
+		case r == '+':
+			// +ref: the set name starts right after the '+'
+			nameStart := pos + 1
+			nameEnd := scanIdent(nameStart)
+			if nameEnd == nameStart {
+				return nil, fmt.Errorf("expected set name after '+' at position %d", nameStart)
+			}
+			out = append(out, token{tokRef, string(runes[nameStart:nameEnd])})
+			pos = nameEnd
+		case isIdentChar(r):
+			end := scanIdent(pos)
+			out = append(out, token{tokIdent, string(runes[pos:end])})
+			pos = end
+		default:
+			return nil, fmt.Errorf("unexpected character %q at position %d", r, pos)
+		}
+	}
+
+	return append(out, token{tokEOF, ""}), nil
+}
+
+// isIdentChar reports whether ch may appear in an identifier or set name:
+// any Unicode letter or digit, or one of '_', '-', '.'.
+func isIdentChar(ch rune) bool {
+	switch ch {
+	case '_', '-', '.':
+		return true
+	}
+	return unicode.IsLetter(ch) || unicode.IsDigit(ch)
+}
+
+// AST node types.
+// dslNode is implemented by andNode, orNode, leafNode and refNode; the
+// unexported marker method restricts implementations to this package.
+type dslNode interface {
+	dslNode()
+}
+
+// andNode combines its children with '&'; expand produces the cartesian
+// product of the children's combinations.
+type andNode struct {
+	children []dslNode
+}
+
+// orNode combines its children with '|'; expand produces the union of the
+// children's combinations.
+type orNode struct {
+	children []dslNode
+}
+
+// leafNode is a single identifier; it expands to one combination holding name.
+type leafNode struct {
+	name string
+}
+
+// refNode is a +setName reference to a previously expanded set.
+type refNode struct {
+	setName string
+}
+
+// Marker methods that make the node types satisfy dslNode.
+func (andNode) dslNode()  {}
+func (orNode) dslNode()   {}
+func (leafNode) dslNode() {}
+func (refNode) dslNode()  {}
+
+// parser holds the token stream and the read position for recursive-descent parsing.
+type parser struct {
+	tokens []token
+	pos    int
+}
+
+// peek returns the current token without consuming it, or an EOF token once
+// the position is past the end of the stream.
+func (p *parser) peek() token {
+	if p.pos >= len(p.tokens) {
+		return token{tokEOF, ""}
+	}
+	return p.tokens[p.pos]
+}
+
+// next returns the current token and advances past it. The EOF token is never
+// consumed, so repeated calls at the end keep returning EOF.
+func (p *parser) next() token {
+	current := p.peek()
+	if current.typ == tokEOF {
+		return current
+	}
+	p.pos++
+	return current
+}
+
+// expect consumes the next token and returns an error, together with the
+// consumed token, when its type is not typ.
+func (p *parser) expect(typ tokenType) (token, error) {
+	got := p.next()
+	if got.typ == typ {
+		return got, nil
+	}
+	return got, fmt.Errorf("expected token type %d, got %q", typ, got.val)
+}
+
+// Grammar (as implemented by parseOrExpr, parseAndExpr and parseAtom):
+//
+//	expr    = orExpr
+//	orExpr  = andExpr ('|' andExpr)*
+//	andExpr = atom ('&' atom)*
+//	atom    = ident | '+' ident | '(' expr ')'
+//
+// & binds tighter than |, so "a | b & c" means "a | (b & c)"
+//
+// parse builds the AST for a complete token stream and returns an error when
+// the expression is malformed or when tokens remain after the expression.
+func parse(tokens []token) (dslNode, error) {
+	p := &parser{tokens: tokens}
+	node, err := p.parseExpr()
+	if err != nil {
+		return nil, err
+	}
+	// Anything left before EOF (e.g. a stray ')') is a syntax error.
+	if p.peek().typ != tokEOF {
+		return nil, fmt.Errorf("unexpected token %q after expression", p.peek().val)
+	}
+	return node, nil
+}
+
+// parseExpr parses a full expression. It delegates to parseOrExpr, the
+// lowest-precedence rule, and is also used for the contents of parentheses.
+func (p *parser) parseExpr() (dslNode, error) {
+	return p.parseOrExpr()
+}
+
+// parseOrExpr parses one or more and-expressions separated by '|'.
+// A lone operand is returned as-is; two or more are wrapped in an orNode.
+func (p *parser) parseOrExpr() (dslNode, error) {
+	first, err := p.parseAndExpr()
+	if err != nil {
+		return nil, err
+	}
+
+	if p.peek().typ != tokOr {
+		return first, nil
+	}
+
+	alternatives := []dslNode{first}
+	for p.peek().typ == tokOr {
+		p.next() // consume |
+		operand, err := p.parseAndExpr()
+		if err != nil {
+			return nil, err
+		}
+		alternatives = append(alternatives, operand)
+	}
+	return orNode{children: alternatives}, nil
+}
+
+// parseAndExpr parses one or more atoms separated by '&'.
+// A lone atom is returned as-is; two or more are wrapped in an andNode.
+func (p *parser) parseAndExpr() (dslNode, error) {
+	first, err := p.parseAtom()
+	if err != nil {
+		return nil, err
+	}
+
+	if p.peek().typ != tokAnd {
+		return first, nil
+	}
+
+	factors := []dslNode{first}
+	for p.peek().typ == tokAnd {
+		p.next() // consume &
+		operand, err := p.parseAtom()
+		if err != nil {
+			return nil, err
+		}
+		factors = append(factors, operand)
+	}
+	return andNode{children: factors}, nil
+}
+
+// parseAtom parses an identifier, a +ref, or a parenthesised expression.
+// Any other token is reported as unexpected.
+func (p *parser) parseAtom() (dslNode, error) {
+	tok := p.peek()
+
+	if tok.typ == tokIdent {
+		p.next()
+		return leafNode{name: tok.val}, nil
+	}
+
+	if tok.typ == tokRef {
+		p.next()
+		return refNode{setName: tok.val}, nil
+	}
+
+	if tok.typ != tokLParen {
+		return nil, fmt.Errorf("unexpected token %q", tok.val)
+	}
+
+	p.next() // consume (
+	inner, err := p.parseExpr()
+	if err != nil {
+		return nil, err
+	}
+	if _, err := p.expect(tokRParen); err != nil {
+		return nil, fmt.Errorf("missing closing parenthesis")
+	}
+	return inner, nil
+}
+
+// expand walks the AST and returns every combination it describes.
+// resolvedRefs holds the already expanded sets that +ref nodes resolve against.
+// An error is returned for an unknown +ref, an unknown node type, or when the
+// number of combinations exceeds maxDSLExpansions.
+func expand(node dslNode, resolvedRefs map[string][][]string) ([][]string, error) {
+	switch current := node.(type) {
+	case leafNode:
+		// A single identifier is one combination containing just that name.
+		single := []string{current.name}
+		return [][]string{single}, nil
+
+	case refNode:
+		prior, known := resolvedRefs[current.setName]
+		if !known {
+			return nil, fmt.Errorf("unknown set reference +%s", current.setName)
+		}
+		// Return a deep copy so the stored expansion is not shared with the result.
+		cloned := make([][]string, 0, len(prior))
+		for _, combo := range prior {
+			dup := make([]string, len(combo))
+			copy(dup, combo)
+			cloned = append(cloned, dup)
+		}
+		return cloned, nil
+
+	case orNode:
+		// Union: concatenate the expansions of all alternatives.
+		var union [][]string
+		for _, alternative := range current.children {
+			combos, err := expand(alternative, resolvedRefs)
+			if err != nil {
+				return nil, err
+			}
+			union = append(union, combos...)
+			if len(union) > maxDSLExpansions {
+				return nil, fmt.Errorf("DSL expansion exceeded %d combinations", maxDSLExpansions)
+			}
+		}
+		return union, nil
+
+	case andNode:
+		// Cartesian product, seeded with one empty combination.
+		product := [][]string{{}}
+		for _, factor := range current.children {
+			combos, err := expand(factor, resolvedRefs)
+			if err != nil {
+				return nil, err
+			}
+			if product, err = cartesianProduct(product, combos, maxDSLExpansions); err != nil {
+				return nil, err
+			}
+		}
+		return product, nil
+	}
+
+	return nil, fmt.Errorf("unknown node type %T", node)
+}
+
+// cartesianProduct computes the cartesian product of two sets of combinations:
+// every combination in left is concatenated with every combination in right,
+// iterating left in the outer loop.
+//
+// It returns an error, without allocating the result, if the product would
+// contain more than limit combinations. The parameter is named limit rather
+// than cap so that it does not shadow the predeclared cap function.
+func cartesianProduct(left, right [][]string, limit int) ([][]string, error) {
+	// Multiply as int64 so the size check itself cannot overflow a 32-bit int.
+	if int64(len(left))*int64(len(right)) > int64(limit) {
+		return nil, fmt.Errorf("DSL expansion exceeded %d combinations", limit)
+	}
+	result := make([][]string, 0, len(left)*len(right))
+	for _, l := range left {
+		for _, r := range right {
+			// Build each combination in a fresh slice so results never alias the inputs.
+			combo := make([]string, 0, len(l)+len(r))
+			combo = append(combo, l...)
+			combo = append(combo, r...)
+			result = append(result, combo)
+		}
+	}
+	return result, nil
+}
+
+// ParseAndExpandDSL runs the full pipeline on a DSL string: tokenize, parse,
+// expand. resolvedRefs supplies the previously expanded sets used to inline
+// +ref tokens. Each returned combination is de-duplicated and sorted.
+// A blank expression is an error.
+func ParseAndExpandDSL(dsl string, resolvedRefs map[string][][]string) ([][]string, error) {
+	trimmed := strings.TrimSpace(dsl)
+	if trimmed == "" {
+		return nil, fmt.Errorf("empty DSL expression")
+	}
+
+	tokens, err := tokenize(trimmed)
+	if err != nil {
+		return nil, fmt.Errorf("tokenize: %w", err)
+	}
+
+	ast, err := parse(tokens)
+	if err != nil {
+		return nil, fmt.Errorf("parse: %w", err)
+	}
+
+	combos, err := expand(ast, resolvedRefs)
+	if err != nil {
+		return nil, err
+	}
+
+	// Normalize every combination in place: unique entries, alphabetical order.
+	for idx := range combos {
+		combos[idx] = dedupAndSort(combos[idx])
+	}
+
+	return combos, nil
+}
+
+// dedupAndSort returns the distinct entries of items in alphabetical order.
+// The input slice is left untouched; the result is nil when items is empty.
+func dedupAndSort(items []string) []string {
+	known := make(map[string]struct{}, len(items))
+	var distinct []string
+	for _, entry := range items {
+		if _, dup := known[entry]; dup {
+			continue
+		}
+		known[entry] = struct{}{}
+		distinct = append(distinct, entry)
+	}
+	sort.Strings(distinct)
+	return distinct
+}
+
+// extractRefs tokenizes a DSL string and returns the distinct +ref set names in
+// order of first appearance, without building an AST.
+// Used for building the dependency graph for topological sorting.
+func extractRefs(dsl string) ([]string, error) {
+	tokens, err := tokenize(dsl)
+	if err != nil {
+		return nil, err
+	}
+
+	recorded := make(map[string]bool)
+	var names []string
+	for _, tok := range tokens {
+		if tok.typ != tokRef || recorded[tok.val] {
+			continue
+		}
+		recorded[tok.val] = true
+		names = append(names, tok.val)
+	}
+	return names, nil
+}
@@ -0,0 +1,300 @@
+package config
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDSL_Tokenize(t *testing.T) {
+	tests := []struct {
+		name   string
+		input  string
+		expect []token
+		errMsg string
+	}{
+		{
+			name:  "single identifier",
+			input: "abc",
+			expect: []token{
+				{tokIdent, "abc"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "identifier with hyphens and dots",
+			input: "model-name.v2",
+			expect: []token{
+				{tokIdent, "model-name.v2"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "and expression",
+			input: "a & b",
+			expect: []token{
+				{tokIdent, "a"},
+				{tokAnd, "&"},
+				{tokIdent, "b"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "or expression",
+			input: "a | b",
+			expect: []token{
+				{tokIdent, "a"},
+				{tokOr, "|"},
+				{tokIdent, "b"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "parentheses",
+			input: "(a | b) & c",
+			expect: []token{
+				{tokLParen, "("},
+				{tokIdent, "a"},
+				{tokOr, "|"},
+				{tokIdent, "b"},
+				{tokRParen, ")"},
+				{tokAnd, "&"},
+				{tokIdent, "c"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "ref token",
+			input: "+llms & v",
+			expect: []token{
+				{tokRef, "llms"},
+				{tokAnd, "&"},
+				{tokIdent, "v"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:  "no whitespace",
+			input: "(a|b)&c",
+			expect: []token{
+				{tokLParen, "("},
+				{tokIdent, "a"},
+				{tokOr, "|"},
+				{tokIdent, "b"},
+				{tokRParen, ")"},
+				{tokAnd, "&"},
+				{tokIdent, "c"},
+				{tokEOF, ""},
+			},
+		},
+		{
+			name:   "empty ref",
+			input:  "+",
+			errMsg: "expected set name after '+'",
+		},
+		{
+			name:   "invalid character",
+			input:  "a @ b",
+			errMsg: "unexpected character",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tokens, err := tokenize(tt.input)
+			if tt.errMsg != "" {
+				require.Error(t, err)
+				assert.Contains(t, err.Error(), tt.errMsg)
+			} else {
+				require.NoError(t, err)
+				assert.Equal(t, tt.expect, tokens)
+			}
+		})
+	}
+}
+
+func TestDSL_ParseAndExpand(t *testing.T) {
+	tests := []struct {
+		name   string
+		dsl    string
+		refs   map[string][][]string
+		expect [][]string
+		errMsg string
+	}{
+		{
+			name:   "single model",
+			dsl:    "L",
+			expect: [][]string{{"L"}},
+		},
+		{
+			name:   "two models with AND",
+			dsl:    "a & b",
+			expect: [][]string{{"a", "b"}},
+		},
+		{
+			name:   "two models with OR",
+			dsl:    "a | b",
+			expect: [][]string{{"a"}, {"b"}},
+		},
+		{
+			name:   "three models with OR",
+			dsl:    "a | b | c",
+			expect: [][]string{{"a"}, {"b"}, {"c"}},
+		},
+		{
+			name: "cartesian product (a|b) & (c|d)",
+			dsl:  "(a | b) & (c | d)",
+			expect: [][]string{
+				{"a", "c"},
+				{"a", "d"},
+				{"b", "c"},
+				{"b", "d"},
+			},
+		},
+		{
+			name: "three-way AND",
+			dsl:  "a & b & c",
+			expect: [][]string{
+				{"a", "b", "c"},
+			},
+		},
+		{
+			name: "(g | q | m) & v",
+			dsl:  "(g | q | m) & v",
+			expect: [][]string{
+				{"g", "v"},
+				{"q", "v"},
+				{"m", "v"},
+			},
+		},
+		{
+			name: "(g | q) & v & e",
+			dsl:  "(g | q) & v & e",
+			expect: [][]string{
+				{"e", "g", "v"},
+				{"e", "q", "v"},
+			},
+		},
+		{
+			name: "precedence: a | b & c means a | (b & c)",
+			dsl:  "a | b & c",
+			expect: [][]string{
+				{"a"},
+				{"b", "c"},
+			},
+		},
+		{
+			name: "+ref inlining",
+			dsl:  "+llms & v",
+			refs: map[string][][]string{
+				"llms": {{"g"}, {"q"}, {"m"}},
+			},
+			expect: [][]string{
+				{"g", "v"},
+				{"q", "v"},
+				{"m", "v"},
+			},
+		},
+		{
+			name: "+ref chained",
+			dsl:  "+with_tts & e",
+			refs: map[string][][]string{
+				"with_tts": {{"g", "v"}, {"q", "v"}, {"m", "v"}},
+			},
+			expect: [][]string{
+				{"e", "g", "v"},
+				{"e", "q", "v"},
+				{"e", "m", "v"},
+			},
+		},
+		{
+			name: "dedup within combination",
+			dsl:  "a & a",
+			expect: [][]string{
+				{"a"},
+			},
+		},
+		{
+			name:   "empty expression",
+			dsl:    "",
+			errMsg: "empty DSL expression",
+		},
+		{
+			name:   "unmatched open paren",
+			dsl:    "(a | b",
+			errMsg: "missing closing parenthesis",
+		},
+		{
+			name:   "unmatched close paren",
+			dsl:    "a | b)",
+			errMsg: "unexpected token",
+		},
+		{
+			name:   "unknown ref",
+			dsl:    "+unknown",
+			errMsg: "unknown set reference +unknown",
+		},
+		{
+			name:   "empty parens",
+			dsl:    "()",
+			errMsg: "unexpected token",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			refs := tt.refs
+			if refs == nil {
+				refs = map[string][][]string{}
+			}
+			result, err := ParseAndExpandDSL(tt.dsl, refs)
+			if tt.errMsg != "" {
+				require.Error(t, err)
+				assert.Contains(t, err.Error(), tt.errMsg)
+			} else {
+				require.NoError(t, err)
+				assert.Equal(t, tt.expect, result)
+			}
+		})
+	}
+}
+
+// TestDSL_ExpansionCap verifies that an oversized expansion is rejected.
+func TestDSL_ExpansionCap(t *testing.T) {
+	// Two OR-groups of 32 alternatives each, AND-ed together, would expand to
+	// 32 * 32 = 1024 combinations, which would exceed 1000 combinations.
+	const groupSize = 32
+	var left, right []string
+	for n := 0; n < groupSize; n++ {
+		left = append(left, fmt.Sprintf("a%d", n))
+		right = append(right, fmt.Sprintf("b%d", n))
+	}
+
+	expr := "(" + join(left, " | ") + ") & (" + join(right, " | ") + ")"
+	_, err := ParseAndExpandDSL(expr, map[string][][]string{})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "exceeded")
+}
+
+func TestDSL_ExtractRefs(t *testing.T) {
+	refs, err := extractRefs("+llms & v & +other")
+	require.NoError(t, err)
+	assert.Equal(t, []string{"llms", "other"}, refs)
+
+	refs, err = extractRefs("a & b")
+	require.NoError(t, err)
+	assert.Empty(t, refs)
+}
+
+// join concatenates items, placing sep between consecutive elements.
+// An empty items slice yields the empty string.
+func join(items []string, sep string) string {
+	if len(items) == 0 {
+		return ""
+	}
+	joined := items[0]
+	for _, next := range items[1:] {
+		joined += sep + next
+	}
+	return joined
+}
@@ -0,0 +1,307 @@
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// makeModels builds a model map for tests: one entry per name, each with
+// Cmd set to "echo <name>".
+func makeModels(names ...string) map[string]ModelConfig {
+	models := make(map[string]ModelConfig)
+	for i := range names {
+		id := names[i]
+		models[id] = ModelConfig{Cmd: "echo " + id}
+	}
+	return models
+}
+
+// TestValidateMatrix_Basic checks that ValidateMatrix expands each set, keeps
+// the sets in their declared order, and reports real model names rather than
+// var IDs.
+func TestValidateMatrix_Basic(t *testing.T) {
+	models := makeModels("gemma", "qwen", "mistral", "voxtral", "llama70B")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{
+			"g": "gemma",
+			"q": "qwen",
+			"m": "mistral",
+			"v": "voxtral",
+			"L": "llama70B",
+		},
+		EvictCosts: map[string]int{
+			"L": 30,
+			"v": 50,
+		},
+		Sets: OrderedSets{
+			{Name: "standard", DSL: "(g | q | m) & v"},
+			{Name: "full", DSL: "L"},
+		},
+	}
+
+	expanded, err := ValidateMatrix(matrix, models)
+	require.NoError(t, err)
+
+	// standard expands to [gemma,voxtral], [qwen,voxtral], [mistral,voxtral]
+	// full expands to [llama70B]
+	// require (not assert): the indexing below would panic on a shorter
+	// slice instead of reporting a clean test failure.
+	require.Len(t, expanded, 4)
+
+	assert.Equal(t, "standard", expanded[0].SetName)
+	assert.Equal(t, []string{"gemma", "voxtral"}, expanded[0].Models)
+
+	assert.Equal(t, "standard", expanded[1].SetName)
+	assert.Equal(t, []string{"qwen", "voxtral"}, expanded[1].Models)
+
+	assert.Equal(t, "standard", expanded[2].SetName)
+	assert.Equal(t, []string{"mistral", "voxtral"}, expanded[2].Models)
+
+	assert.Equal(t, "full", expanded[3].SetName)
+	assert.Equal(t, []string{"llama70B"}, expanded[3].Models)
+}
+
+// TestValidateMatrix_WithRef checks that +ref tokens inline previously
+// defined sets, including a reference to a set that itself uses a reference.
+func TestValidateMatrix_WithRef(t *testing.T) {
+	models := makeModels("gemma", "qwen", "mistral", "voxtral", "reranker")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{
+			"g": "gemma",
+			"q": "qwen",
+			"m": "mistral",
+			"v": "voxtral",
+			"e": "reranker",
+		},
+		Sets: OrderedSets{
+			{Name: "llms", DSL: "g | q | m"},
+			{Name: "with_tts", DSL: "+llms & v"},
+			{Name: "mega", DSL: "+with_tts & e"},
+		},
+	}
+
+	expanded, err := ValidateMatrix(matrix, models)
+	require.NoError(t, err)
+
+	// llms: [gemma], [qwen], [mistral]
+	// with_tts: [gemma,voxtral], [qwen,voxtral], [mistral,voxtral]
+	// mega: [gemma,reranker,voxtral], [qwen,reranker,voxtral], [mistral,reranker,voxtral]
+	assert.Len(t, expanded, 9)
+
+	// Check mega entries
+	megaEntries := filterBySetName(expanded, "mega")
+	// require (not assert): megaEntries[0] below would panic on an empty
+	// slice instead of reporting a clean test failure.
+	require.Len(t, megaEntries, 3)
+	assert.Equal(t, []string{"gemma", "reranker", "voxtral"}, megaEntries[0].Models)
+}
+
+func TestValidateMatrix_MapIDRequired(t *testing.T) {
+	// DSL cannot use real model names directly — must use var IDs
+	models := makeModels("gemma", "voxtral")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{"g": "gemma"},
+		Sets: OrderedSets{
+			{Name: "combo", DSL: "g & voxtral"},
+		},
+	}
+
+	_, err := ValidateMatrix(matrix, models)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown var ID")
+}
+
+func TestValidateMatrix_InvalidAliasKey(t *testing.T) {
+	models := makeModels("gemma")
+
+	tests := []struct {
+		name   string
+		alias  string
+		errMsg string
+	}{
+		{"too long", "abcdefghi", "alphanumeric and 1-8 characters"},
+		{"has underscore", "a_b", "alphanumeric and 1-8 characters"},
+		{"has hyphen", "a-b", "alphanumeric and 1-8 characters"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			matrix := MatrixConfig{
+				Var:  map[string]string{tt.alias: "gemma"},
+				Sets: OrderedSets{{Name: "s", DSL: tt.alias}},
+			}
+			_, err := ValidateMatrix(matrix, models)
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tt.errMsg)
+		})
+	}
+}
+
+func TestValidateMatrix_AliasReferencesUnknownModel(t *testing.T) {
+	models := makeModels("gemma")
+
+	matrix := MatrixConfig{
+		Var:  map[string]string{"x": "nonexistent"},
+		Sets: OrderedSets{{Name: "s", DSL: "x"}},
+	}
+
+	_, err := ValidateMatrix(matrix, models)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown model")
+}
+
+func TestValidateMatrix_EvictCostInvalid(t *testing.T) {
+	models := makeModels("gemma")
+
+	t.Run("zero cost", func(t *testing.T) {
+		matrix := MatrixConfig{
+			Var:        map[string]string{"g": "gemma"},
+			EvictCosts: map[string]int{"g": 0},
+			Sets:       OrderedSets{{Name: "s", DSL: "g"}},
+		}
+		_, err := ValidateMatrix(matrix, models)
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "positive integer")
+	})
+
+	t.Run("negative cost", func(t *testing.T) {
+		matrix := MatrixConfig{
+			Var:        map[string]string{"g": "gemma"},
+			EvictCosts: map[string]int{"g": -1},
+			Sets:       OrderedSets{{Name: "s", DSL: "g"}},
+		}
+		_, err := ValidateMatrix(matrix, models)
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "positive integer")
+	})
+
+	t.Run("unknown var ID in evict_costs", func(t *testing.T) {
+		matrix := MatrixConfig{
+			Var:        map[string]string{"g": "gemma"},
+			EvictCosts: map[string]int{"unknown": 5},
+			Sets:       OrderedSets{{Name: "s", DSL: "g"}},
+		}
+		_, err := ValidateMatrix(matrix, models)
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "unknown var ID")
+	})
+}
+
+func TestValidateMatrix_CycleDetection(t *testing.T) {
+	models := makeModels("gemma")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{"g": "gemma"},
+		Sets: OrderedSets{
+			{Name: "a", DSL: "+b"},
+			{Name: "b", DSL: "+a"},
+		},
+	}
+
+	_, err := ValidateMatrix(matrix, models)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "circular reference")
+}
+
+func TestValidateMatrix_UndefinedRefTarget(t *testing.T) {
+	models := makeModels("gemma")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{"g": "gemma"},
+		Sets: OrderedSets{
+			{Name: "a", DSL: "+nonexistent"},
+		},
+	}
+
+	_, err := ValidateMatrix(matrix, models)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "references undefined set")
+}
+
+func TestValidateMatrix_NoSets(t *testing.T) {
+	_, err := ValidateMatrix(MatrixConfig{}, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "at least one set")
+}
+
+func TestValidateMatrix_UnknownMapIDInDSL(t *testing.T) {
+	models := makeModels("gemma")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{"g": "gemma"},
+		Sets: OrderedSets{
+			{Name: "s", DSL: "g & nonexistent"},
+		},
+	}
+
+	_, err := ValidateMatrix(matrix, models)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown var ID")
+}
+
+func TestValidateMatrix_ResolvedEvictCosts(t *testing.T) {
+	mc := &MatrixConfig{
+		Var: map[string]string{
+			"g": "gemma",
+			"L": "llama70B",
+		},
+		EvictCosts: map[string]int{
+			"L": 30,
+			"g": 5,
+		},
+	}
+
+	costs := mc.ResolvedEvictCosts()
+	assert.Equal(t, 30, costs["llama70B"])
+	assert.Equal(t, 5, costs["gemma"])
+}
+
+func TestValidateMatrix_ConfigXOR(t *testing.T) {
+	// groups and matrix both defined
+	yaml := `
+models:
+  model1:
+    cmd: echo model1
+    proxy: http://localhost:8080
+groups:
+  group1:
+    members:
+      - model1
+matrix:
+  sets:
+    s: "model1"
+`
+	_, err := LoadConfigFromReader(strings.NewReader(yaml))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "cannot use both")
+}
+
+func TestValidateMatrix_ConfigMatrixOnly(t *testing.T) {
+	yaml := `
+models:
+  gemma:
+    cmd: echo gemma
+    proxy: http://localhost:8080
+  qwen:
+    cmd: echo qwen
+    proxy: http://localhost:8081
+matrix:
+  vars:
+    g: gemma
+    q: qwen
+  sets:
+    combo: "g | q"
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
+	require.NoError(t, err)
+	assert.NotNil(t, cfg.Matrix)
+	assert.Len(t, cfg.Matrix.ExpandedSets, 2)
+	assert.Equal(t, "matrix", cfg.Routing.Router.Use)
+	assert.Len(t, cfg.Routing.Router.Settings.Matrix.ExpandedSets, 2)
+	// Groups should be empty when matrix is used
+	assert.Empty(t, cfg.Groups)
+}
+
+// filterBySetName returns, in their original order, the entries of sets whose
+// SetName equals name. The result is nil when nothing matches.
+func filterBySetName(sets []ExpandedSet, name string) []ExpandedSet {
+	var matches []ExpandedSet
+	for i := range sets {
+		if sets[i].SetName != name {
+			continue
+		}
+		matches = append(matches, sets[i])
+	}
+	return matches
+}
@@ -0,0 +1,184 @@
+package config
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+)
+
+const (
+	MODEL_CONFIG_DEFAULT_TTL = -1
+)
+
+var validModalities = map[string]struct{}{
+	"text":  {},
+	"audio": {},
+	"image": {},
+}
+
+// ModelCapConfig defines what modalities and features a model supports.
+// Used in /v1/models to inform clients. An empty block (all zero values) is
+// treated as not configured.
+type ModelCapConfig struct {
+	// In and Out list modality names; Validate accepts only "text", "audio"
+	// and "image" in either list.
+	In       []string `yaml:"in"`
+	Out      []string `yaml:"out"`
+	Tools    bool     `yaml:"tools"`
+	Reranker bool     `yaml:"reranker"`
+	// Context must be >= 0 to pass Validate; 0 is the unset value.
+	Context  int      `yaml:"context"`
+}
+
+// Empty reports whether the block carries no information, i.e. every field
+// still holds its zero value (nil and zero-length lists count as unset).
+func (c ModelCapConfig) Empty() bool {
+	if len(c.In) > 0 || len(c.Out) > 0 {
+		return false
+	}
+	if c.Tools || c.Reranker {
+		return false
+	}
+	return c.Context == 0
+}
+
+// Validate rejects unrecognised modality names in In and Out and a negative
+// Context. In is checked first, then Out, then Context; the first problem
+// found is returned. A nil result means the block is valid.
+func (c ModelCapConfig) Validate() error {
+	for _, modality := range c.In {
+		_, known := validModalities[modality]
+		if known {
+			continue
+		}
+		return fmt.Errorf("capabilities.in: invalid modality %q, must be one of: text, audio, image", modality)
+	}
+	for _, modality := range c.Out {
+		_, known := validModalities[modality]
+		if known {
+			continue
+		}
+		return fmt.Errorf("capabilities.out: invalid modality %q, must be one of: text, audio, image", modality)
+	}
+	if c.Context >= 0 {
+		return nil
+	}
+	return errors.New("capabilities.context: must be >= 0")
+}
+
+// TimeoutsConfig holds timeout settings for proxy connections.
+// A value of 0 means no timeout.
+// NOTE(review): values are presumably whole seconds (the defaults used by
+// ModelConfig and PeerConfig are described as matching http.DefaultTransport)
+// — confirm where the transport is built.
+type TimeoutsConfig struct {
+	Connect        int `yaml:"connect"`
+	KeepAlive      int `yaml:"keepalive"`
+	ResponseHeader int `yaml:"responseHeader"`
+	TLSHandshake   int `yaml:"tlsHandshake"`
+	ExpectContinue int `yaml:"expectContinue"`
+	IdleConn       int `yaml:"idleConn"`
+}
+
+type ModelConfig struct {
+	Cmd           string   `yaml:"cmd"`
+	CmdStop       string   `yaml:"cmdStop"`
+	Proxy         string   `yaml:"proxy"`
+	Aliases       []string `yaml:"aliases"`
+	Env           []string `yaml:"env"`
+	CheckEndpoint string   `yaml:"checkEndpoint"`
+	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
+	UseModelName  string   `yaml:"useModelName"`
+
+	// #179 for /v1/models
+	Name        string `yaml:"name"`
+	Description string `yaml:"description"`
+
+	// Limit concurrency of HTTP requests to process
+	ConcurrencyLimit int `yaml:"concurrencyLimit"`
+
+	// Model filters see issue #174
+	Filters ModelFilters `yaml:"filters"`
+
+	// Macros: see #264
+	// Model level macros take precedence over the global macros
+	Macros MacroList `yaml:"macros"`
+
+	// Metadata: see #264
+	// Arbitrary metadata that can be exposed through the API
+	Metadata map[string]any `yaml:"metadata"`
+
+	// override global setting
+	SendLoadingState *bool `yaml:"sendLoadingState"`
+
+	// Timeout settings for proxy connections
+	Timeouts TimeoutsConfig `yaml:"timeouts"`
+
+	// Capabilities defines what modalities and features the model supports.
+	Capabilities ModelCapConfig `yaml:"capabilities"`
+
+	// Copy of HealthCheckTimeout from global config
+	HealthCheckTimeout int `yaml:"healthCheckTimeout"`
+}
+
+// UnmarshalYAML decodes a model entry on top of a set of defaults, so keys
+// missing from the YAML keep the default value instead of the Go zero value.
+// Decoding goes through the local rawModelConfig type, which has the same
+// fields but not this method, so the call does not recurse.
+func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	type rawModelConfig ModelConfig
+	defaults := rawModelConfig{
+		Cmd:              "",
+		CmdStop:          "",
+		Proxy:            "http://localhost:${PORT}",
+		Aliases:          []string{},
+		Env:              []string{},
+		CheckEndpoint:    "/health",
+		UnloadAfter:      MODEL_CONFIG_DEFAULT_TTL, // use GlobalTTL
+		Unlisted:         false,
+		UseModelName:     "",
+		ConcurrencyLimit: 0,
+		Name:             "",
+		Description:      "",
+
+		// matches http.DefaultTransport
+		Timeouts: TimeoutsConfig{
+			Connect:        30,
+			KeepAlive:      30,
+			ResponseHeader: 0,
+			TLSHandshake:   10,
+			ExpectContinue: 1,
+			IdleConn:       90,
+		},
+	}
+
+	// On Windows, cmdStop defaults to: taskkill /f /t /pid ${PID}
+	if runtime.GOOS == "windows" {
+		defaults.CmdStop = "taskkill /f /t /pid ${PID}"
+	}
+
+	// Values present in the YAML overwrite the defaults set above.
+	if err := unmarshal(&defaults); err != nil {
+		return err
+	}
+
+	*m = ModelConfig(defaults)
+	return nil
+}
+
+func (m *ModelConfig) SanitizedCommand() ([]string, error) {
+	return SanitizeCommand(m.Cmd)
+}
+
+// ModelFilters embeds Filters and adds legacy support for strip_params field
+// See issue #174
+type ModelFilters struct {
+	Filters `yaml:",inline"`
+}
+
+// UnmarshalYAML decodes the filters block and, when StripParams is still
+// empty afterwards, falls back to the legacy "strip_params" key.
+//
+// Decoding goes through the local rawModelFilters type, which has the same
+// fields but not this method, so the call does not recurse. A failure while
+// reading the legacy key is returned wrapped, so callers can still inspect
+// the underlying error with errors.Is / errors.As.
+func (m *ModelFilters) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	type rawModelFilters ModelFilters
+	defaults := rawModelFilters{}
+
+	if err := unmarshal(&defaults); err != nil {
+		return err
+	}
+
+	// Try to unmarshal with the old field name for backwards compatibility
+	if defaults.StripParams == "" {
+		var legacy struct {
+			StripParams string `yaml:"strip_params"`
+		}
+		if legacyErr := unmarshal(&legacy); legacyErr != nil {
+			// %w keeps the message text identical while preserving the chain.
+			return fmt.Errorf("failed to unmarshal legacy filters.strip_params: %w", legacyErr)
+		}
+		defaults.StripParams = legacy.StripParams
+	}
+
+	*m = ModelFilters(defaults)
+	return nil
+}
+
+// SanitizedStripParams wraps Filters.SanitizedStripParams for backwards compatibility
+// Returns ([]string, error) to match existing API
+func (f ModelFilters) SanitizedStripParams() ([]string, error) {
+	return f.Filters.SanitizedStripParams(), nil
+}
@@ -0,0 +1,336 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_ModelConfigSanitizedCommand(t *testing.T) {
+	config := &ModelConfig{
+		Cmd: `python model1.py \
+    --arg1 value1 \
+    --arg2 value2`,
+	}
+
+	args, err := config.SanitizedCommand()
+	assert.NoError(t, err)
+	assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args)
+}
+
+func TestConfig_ModelFilters(t *testing.T) {
+	content := `
+macros:
+  default_strip: "temperature, top_p"
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      # macros inserted and list is cleaned of duplicates and empty strings
+      stripParams: "model, top_k, top_k, temperature, ${default_strip}, , ,"
+  # check for strip_params (legacy field name) compatibility
+  legacy:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      strip_params: "model, top_k, top_k, temperature, ${default_strip}, , ,"
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	for modelId, modelConfig := range config.Models {
+		t.Run(fmt.Sprintf("Testing macros in filters for model %s", modelId), func(t *testing.T) {
+			assert.Equal(t, "model, top_k, top_k, temperature, temperature, top_p, , ,", modelConfig.Filters.StripParams)
+			sanitized, err := modelConfig.Filters.SanitizedStripParams()
+			if assert.NoError(t, err) {
+				// model has been removed
+				// empty strings have been removed
+				// duplicates have been removed
+				assert.Equal(t, []string{"temperature", "top_k", "top_p"}, sanitized)
+			}
+		})
+	}
+}
+
+func TestConfig_ModelSendLoadingState(t *testing.T) {
+	content := `
+sendLoadingState: true
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    sendLoadingState: false
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	assert.True(t, config.SendLoadingState)
+	if assert.NotNil(t, config.Models["model1"].SendLoadingState) {
+		assert.False(t, *config.Models["model1"].SendLoadingState)
+	}
+	if assert.NotNil(t, config.Models["model2"].SendLoadingState) {
+		assert.True(t, *config.Models["model2"].SendLoadingState)
+	}
+}
+
+func TestConfig_SetParamsByIDAutoAlias(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:high":
+          reasoning_effort: high
+        "${MODEL_ID}:low":
+          reasoning_effort: low
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	// Keys (other than the model's own ID) should be registered as aliases
+	realName, found := cfg.RealModelName("model1:high")
+	assert.True(t, found, "model1:high should be an auto-registered alias")
+	assert.Equal(t, "model1", realName)
+
+	realName, found = cfg.RealModelName("model1:low")
+	assert.True(t, found, "model1:low should be an auto-registered alias")
+	assert.Equal(t, "model1", realName)
+
+	// Auto-aliases should also appear in modelConfig.Aliases
+	aliases := cfg.Models["model1"].Aliases
+	assert.Contains(t, aliases, "model1:high")
+	assert.Contains(t, aliases, "model1:low")
+}
+
+func TestConfig_SetParamsByIDAutoAliasConflictWithModelID(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        model2:
+          reasoning_effort: high
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.ErrorContains(t, err, "conflicts with an existing model ID")
+}
+
+func TestConfig_SetParamsByIDAutoAliasConflictWithOtherModel(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "shared-alias":
+          reasoning_effort: high
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "shared-alias":
+          reasoning_effort: low
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.ErrorContains(t, err, "duplicate alias")
+}
+
+func TestConfig_ModelFiltersWithSetParams(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      stripParams: "top_k"
+      setParams:
+        temperature: 0.7
+        top_p: 0.9
+        stop:
+          - "<|end|>"
+          - "<|stop|>"
+ `
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	modelConfig := config.Models["model1"]
+
+	// Check stripParams
+	stripParams, err := modelConfig.Filters.SanitizedStripParams()
+	assert.NoError(t, err)
+	assert.Equal(t, []string{"top_k"}, stripParams)
+
+	// Check setParams
+	setParams, keys := modelConfig.Filters.SanitizedSetParams()
+	assert.NotNil(t, setParams)
+	assert.Equal(t, []string{"stop", "temperature", "top_p"}, keys)
+	assert.Equal(t, 0.7, setParams["temperature"])
+	assert.Equal(t, 0.9, setParams["top_p"])
+}
+
+func TestConfig_ModelCapabilities(t *testing.T) {
+	t.Run("all fields", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      in:
+        - text
+        - audio
+        - image
+      out:
+        - text
+        - audio
+        - image
+      tools: true
+      context: 32000
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.False(t, mc.Capabilities.Empty())
+		assert.Equal(t, []string{"text", "audio", "image"}, mc.Capabilities.In)
+		assert.Equal(t, []string{"text", "audio", "image"}, mc.Capabilities.Out)
+		assert.True(t, mc.Capabilities.Tools)
+		assert.Equal(t, 32000, mc.Capabilities.Context)
+	})
+
+	t.Run("partial fields", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      tools: true
+      context: 8192
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.False(t, mc.Capabilities.Empty())
+		assert.Nil(t, mc.Capabilities.In)
+		assert.Nil(t, mc.Capabilities.Out)
+		assert.True(t, mc.Capabilities.Tools)
+		assert.Equal(t, 8192, mc.Capabilities.Context)
+	})
+
+	t.Run("not set", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.True(t, mc.Capabilities.Empty())
+	})
+
+	t.Run("tools false is empty", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      tools: false
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.True(t, mc.Capabilities.Empty())
+	})
+
+	t.Run("reranker true is not empty", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      reranker: true
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.False(t, mc.Capabilities.Empty())
+		assert.True(t, mc.Capabilities.Reranker)
+	})
+
+	t.Run("reranker false is empty", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      reranker: false
+`
+		config, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.NoError(t, err)
+
+		mc := config.Models["model1"]
+		assert.True(t, mc.Capabilities.Empty())
+	})
+}
+
+func TestConfig_ModelCapabilities_Validate(t *testing.T) {
+	t.Run("valid_modalities", func(t *testing.T) {
+		caps := ModelCapConfig{
+			In:      []string{"text", "image"},
+			Out:     []string{"text", "audio"},
+			Tools:   true,
+			Context: 100000,
+		}
+		assert.NoError(t, caps.Validate())
+	})
+
+	t.Run("empty_is_valid", func(t *testing.T) {
+		caps := ModelCapConfig{}
+		assert.NoError(t, caps.Validate())
+	})
+
+	t.Run("invalid_in_modality", func(t *testing.T) {
+		caps := ModelCapConfig{In: []string{"video"}}
+		err := caps.Validate()
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "capabilities.in")
+		assert.Contains(t, err.Error(), "video")
+	})
+
+	t.Run("invalid_out_modality", func(t *testing.T) {
+		caps := ModelCapConfig{Out: []string{"video"}}
+		err := caps.Validate()
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "capabilities.out")
+		assert.Contains(t, err.Error(), "video")
+	})
+
+	t.Run("negative_context", func(t *testing.T) {
+		caps := ModelCapConfig{Context: -1}
+		err := caps.Validate()
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "capabilities.context")
+	})
+
+	t.Run("rejects_invalid_at_load", func(t *testing.T) {
+		content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    capabilities:
+      in:
+        - text
+        - video
+`
+		_, err := LoadConfigFromReader(strings.NewReader(content))
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "video")
+	})
+}
@@ -0,0 +1,63 @@
+package config
+
+import (
+	"fmt"
+	"net/url"
+)
+
+// PeerDictionaryConfig maps a string key to that peer's configuration.
+type PeerDictionaryConfig map[string]PeerConfig
+
+// PeerConfig describes one peer. UnmarshalYAML requires Proxy to be
+// non-empty and parseable as a URL, and Models to be non-empty.
+type PeerConfig struct {
+	Proxy    string   `yaml:"proxy"`
+	// ProxyURL is never read from YAML (tag "-"); UnmarshalYAML fills it with
+	// the parsed form of Proxy.
+	ProxyURL *url.URL `yaml:"-"`
+	ApiKey   string   `yaml:"apiKey"`
+	Models   []string `yaml:"models"`
+	Filters  Filters  `yaml:"filters"`
+
+	// Timeout settings for proxy connections
+	Timeouts TimeoutsConfig `yaml:"timeouts"`
+}
+
+// UnmarshalYAML decodes a peer entry on top of default timeouts and then
+// validates it. Checks run in this order: proxy present, proxy parses as a
+// URL, models non-empty. The receiver is only written once all checks pass.
+func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	type rawPeerConfig PeerConfig
+
+	// mostly matches http.DefaultTransport but with a 60s ResponseHeader timeout
+	// to match the pre PR #619 functionality
+	defaultTimeouts := TimeoutsConfig{
+		Connect:        30,
+		KeepAlive:      30,
+		ResponseHeader: 60,
+		TLSHandshake:   10,
+		ExpectContinue: 1,
+		IdleConn:       90,
+	}
+
+	raw := rawPeerConfig{
+		Proxy:    "",
+		ApiKey:   "",
+		Models:   []string{},
+		Filters:  Filters{},
+		Timeouts: defaultTimeouts,
+	}
+
+	err := unmarshal(&raw)
+	if err != nil {
+		return err
+	}
+
+	// A proxy address is mandatory.
+	if len(raw.Proxy) == 0 {
+		return fmt.Errorf("proxy is required")
+	}
+
+	// The proxy must parse as a URL; the parsed value is kept alongside it.
+	parsed, parseErr := url.Parse(raw.Proxy)
+	if parseErr != nil {
+		return fmt.Errorf("invalid peer proxy URL (%s): %w", raw.Proxy, parseErr)
+	}
+	raw.ProxyURL = parsed
+
+	// At least one model must be listed.
+	if len(raw.Models) == 0 {
+		return fmt.Errorf("peer models can not be empty")
+	}
+
+	*c = PeerConfig(raw)
+	return nil
+}
@@ -0,0 +1,209 @@
+package config
+
+import (
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+func TestPeerConfig_UnmarshalYAML(t *testing.T) {
+	tests := []struct {
+		name    string
+		yaml    string
+		wantErr string
+	}{
+		{
+			name: "valid config",
+			yaml: `
+proxy: http://192.168.1.23
+models:
+  - model_a
+  - model_b
+`,
+			wantErr: "",
+		},
+		{
+			name: "valid config with apiKey",
+			yaml: `
+proxy: https://openrouter.ai/api
+apiKey: sk-test-key
+models:
+  - meta-llama/llama-3.1-8b-instruct
+`,
+			wantErr: "",
+		},
+		{
+			name: "missing proxy",
+			yaml: `
+models:
+  - model_a
+`,
+			wantErr: "proxy is required",
+		},
+		{
+			name: "empty proxy",
+			yaml: `
+proxy: ""
+models:
+  - model_a
+`,
+			wantErr: "proxy is required",
+		},
+		{
+			name: "invalid proxy URL",
+			yaml: `
+proxy: "://invalid"
+models:
+  - model_a
+`,
+			wantErr: "invalid peer proxy URL",
+		},
+		{
+			name: "missing models",
+			yaml: `
+proxy: http://localhost:8080
+`,
+			wantErr: "peer models can not be empty",
+		},
+		{
+			name: "empty models",
+			yaml: `
+proxy: http://localhost:8080
+models: []
+`,
+			wantErr: "peer models can not be empty",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var config PeerConfig
+			err := yaml.Unmarshal([]byte(tt.yaml), &config)
+
+			if tt.wantErr == "" {
+				if err != nil {
+					t.Errorf("unexpected error: %v", err)
+				}
+			} else {
+				if err == nil {
+					t.Errorf("expected error containing %q, got nil", tt.wantErr)
+				} else if !contains(err.Error(), tt.wantErr) {
+					t.Errorf("expected error containing %q, got %q", tt.wantErr, err.Error())
+				}
+			}
+		})
+	}
+}
+
+func TestPeerConfig_ProxyURL(t *testing.T) {
+	yamlData := `
+proxy: http://192.168.1.23:8080/api
+apiKey: sk-test
+models:
+  - model_a
+`
+	var config PeerConfig
+	err := yaml.Unmarshal([]byte(yamlData), &config)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if config.ProxyURL == nil {
+		t.Fatal("ProxyURL should not be nil")
+	}
+
+	if config.ProxyURL.Host != "192.168.1.23:8080" {
+		t.Errorf("expected host %q, got %q", "192.168.1.23:8080", config.ProxyURL.Host)
+	}
+
+	if config.ProxyURL.Scheme != "http" {
+		t.Errorf("expected scheme %q, got %q", "http", config.ProxyURL.Scheme)
+	}
+
+	if config.ProxyURL.Path != "/api" {
+		t.Errorf("expected path %q, got %q", "/api", config.ProxyURL.Path)
+	}
+}
+
+// contains reports whether substr occurs anywhere in s. A substr longer than
+// s is rejected up front; otherwise the scan is delegated to searchSubstring.
+func contains(s, substr string) bool {
+	if len(substr) > len(s) {
+		return false
+	}
+	return searchSubstring(s, substr)
+}
+
+// searchSubstring compares substr against every window of the same length in
+// s, left to right, and reports whether any window matches. An empty substr
+// always matches; a substr longer than s never does.
+func searchSubstring(s, substr string) bool {
+	width := len(substr)
+	for start := 0; start+width <= len(s); start++ {
+		window := s[start : start+width]
+		if window == substr {
+			return true
+		}
+	}
+	return false
+}
+
+func TestPeerConfig_WithFilters(t *testing.T) {
+	yamlData := `
+proxy: https://openrouter.ai/api
+apiKey: sk-test
+models:
+  - model_a
+filters:
+  setParams:
+    temperature: 0.7
+    provider:
+      data_collection: deny
+`
+	var config PeerConfig
+	err := yaml.Unmarshal([]byte(yamlData), &config)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if config.Filters.SetParams == nil {
+		t.Fatal("Filters.SetParams should not be nil")
+	}
+
+	if config.Filters.SetParams["temperature"] != 0.7 {
+		t.Errorf("expected temperature 0.7, got %v", config.Filters.SetParams["temperature"])
+	}
+
+	provider, ok := config.Filters.SetParams["provider"].(map[string]any)
+	if !ok {
+		t.Fatal("provider should be a map")
+	}
+	if provider["data_collection"] != "deny" {
+		t.Errorf("expected data_collection deny, got %v", provider["data_collection"])
+	}
+}
+
+// TestPeerConfig_WithBothFilters checks that stripParams and setParams can be
+// configured together on a peer and are both decoded.
+func TestPeerConfig_WithBothFilters(t *testing.T) {
+	yamlData := `
+proxy: https://openrouter.ai/api
+apiKey: sk-test
+models:
+  - model_a
+filters:
+  stripParams: "temperature, top_p"
+  setParams:
+    max_tokens: 1000
+`
+	var config PeerConfig
+	err := yaml.Unmarshal([]byte(yamlData), &config)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// Check stripParams
+	stripParams := config.Filters.SanitizedStripParams()
+	// Fatalf (not Errorf): the indexing below would panic on a shorter slice.
+	if len(stripParams) != 2 {
+		t.Fatalf("expected 2 strip params, got %d", len(stripParams))
+	}
+	if stripParams[0] != "temperature" || stripParams[1] != "top_p" {
+		t.Errorf("unexpected strip params: %v", stripParams)
+	}
+
+	// Check setParams
+	if config.Filters.SetParams == nil {
+		t.Fatal("Filters.SetParams should not be nil")
+	}
+	if config.Filters.SetParams["max_tokens"] != 1000 {
+		t.Errorf("expected max_tokens 1000, got %v", config.Filters.SetParams["max_tokens"])
+	}
+}
@@ -0,0 +1,34 @@
+package config
+
+import (
+	"fmt"
+	"time"
+)
+
+// PerformanceConfig holds configuration for system performance monitoring
+type PerformanceConfig struct {
+	// Disabled is read from the "disabled" YAML key; the zero value (false)
+	// leaves monitoring enabled.
+	Disabled bool          `yaml:"disabled"`
+	// Every is read from the "every" YAML key. UnmarshalYAML pre-populates it
+	// with 5s and Validate rejects anything shorter than 5s.
+	Every    time.Duration `yaml:"every"`
+}
+
+// UnmarshalYAML decodes the performance section on top of the built-in
+// defaults, so any key the YAML omits keeps its default value.
+func (p *PerformanceConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// The alias type carries no UnmarshalYAML method, which keeps the decoder
+	// from recursing back into this function.
+	type plain PerformanceConfig
+
+	decoded := plain{Every: 5 * time.Second}
+	err := unmarshal(&decoded)
+	if err != nil {
+		return err
+	}
+
+	*p = PerformanceConfig(decoded)
+	return nil
+}
+
+// Validate returns an error when Every is shorter than the 5s minimum and nil
+// otherwise.
+func (p *PerformanceConfig) Validate() error {
+	if p.Every >= 5*time.Second {
+		return nil
+	}
+	return fmt.Errorf("every must be at least 5s, got %v", p.Every)
+}
@@ -0,0 +1,98 @@
+package config
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestPerformanceConfig_Defaults(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	// When performance section is missing, defaults should be applied
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 5*time.Second, config.Performance.Every)
+}
+
+// TestPerformanceConfig_CustomValues sets both performance keys explicitly and
+// checks that they are decoded as written.
+func TestPerformanceConfig_CustomValues(t *testing.T) {
+	// PerformanceConfig only defines the "disabled" and "every" keys. The
+	// fixture previously used a non-existent "enable" key, which the decoder
+	// ignored, so the boolean was never actually set by the YAML.
+	content := `
+performance:
+  disabled: false
+  every: 30s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 30*time.Second, config.Performance.Every)
+}
+
+func TestPerformanceConfig_Disabled(t *testing.T) {
+	content := `
+performance:
+  disabled: true
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.True(t, config.Performance.Disabled)
+	// Duration defaults should still apply
+	assert.Equal(t, 5*time.Second, config.Performance.Every)
+}
+
+// TestPerformanceConfig_PartialValues sets only "every" and checks that the
+// omitted "disabled" key keeps its default.
+func TestPerformanceConfig_PartialValues(t *testing.T) {
+	content := `
+performance:
+  every: 10s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	// disabled should default to false
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 10*time.Second, config.Performance.Every)
+}
+
+func TestPerformanceConfig_InvalidEvery(t *testing.T) {
+	content := `
+performance:
+  every: 4s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "every must be at least 5s")
+}
+
+func TestPerformanceConfig_ComplexDurations(t *testing.T) {
+	content := `
+performance:
+  every: 1m30s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.Equal(t, 90*time.Second, config.Performance.Every)
+}
@@ -0,0 +1,55 @@
+package config
+
+import (
+	"fmt"
+	"regexp"
+
+	"gopkg.in/yaml.v3"
+)
+
+// DefaultUpstreamIgnorePathsPattern is the default regular expression applied
+// to upstream.ignorePaths when the section is empty or absent from the config.
+// It matches common static-asset suffixes so requests for .js/.css/.png/etc.
+// files do not trigger a model swap. The trailing `$` anchors the suffix to
+// the end of the path; the match is case-sensitive.
+const DefaultUpstreamIgnorePathsPattern = `.*\.(js|json|css|png|gif|jpg|jpeg|ico|txt)$`
+
+// DefaultUpstreamIgnorePaths compiles DefaultUpstreamIgnorePathsPattern and
+// returns it as a one-element slice. Every call builds a new slice, so a
+// caller may modify the result without affecting any other config.
+func DefaultUpstreamIgnorePaths() []*regexp.Regexp {
+	compiled := regexp.MustCompile(DefaultUpstreamIgnorePathsPattern)
+	return []*regexp.Regexp{compiled}
+}
+
+// UpstreamConfig controls behaviour of the /upstream passthrough endpoint.
+type UpstreamConfig struct {
+	// IgnorePaths is a slice of compiled regular expressions. Any request to
+	// /upstream/<model>/<path> whose remaining path matches any of these
+	// expressions will be ignored and not trigger a swap. When the config
+	// does not specify any patterns, DefaultUpstreamIgnorePaths is applied.
+	//
+	// The field is tagged `yaml:"-"` because it is not decoded directly:
+	// UnmarshalYAML reads the patterns as strings and compiles them.
+	IgnorePaths []*regexp.Regexp `yaml:"-"`
+}
+
+// rawUpstreamConfig is the intermediate form used to unmarshal the YAML into
+// plain strings, which are then compiled into *regexp.Regexp.
+type rawUpstreamConfig struct {
+	// IgnorePaths holds the uncompiled patterns from the "ignorePaths" key.
+	IgnorePaths []string `yaml:"ignorePaths"`
+}
+
+// UnmarshalYAML compiles each ignorePaths entry into a *regexp.Regexp. If any
+// entry fails to compile, an error is returned.
+func (u *UpstreamConfig) UnmarshalYAML(value *yaml.Node) error {
+	var raw rawUpstreamConfig
+	if err := value.Decode(&raw); err != nil {
+		return err
+	}
+	patterns := make([]*regexp.Regexp, 0, len(raw.IgnorePaths))
+	for _, p := range raw.IgnorePaths {
+		re, err := regexp.Compile(p)
+		if err != nil {
+			return fmt.Errorf("upstream.ignorePaths: invalid regular expression %q: %w", p, err)
+		}
+		patterns = append(patterns, re)
+	}
+	u.IgnorePaths = patterns
+	return nil
+}
@@ -0,0 +1,88 @@
+package config
+
+import (
+	"regexp"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// upstreamConfigHeader is the models section shared by the upstream tests in
+// this file; each test concatenates it with its own upstream section.
+const upstreamConfigHeader = `
+models:
+  model1:
+    cmd: path/to/cmd --arg1 one
+    proxy: "http://localhost:8080"
+`
+
+// TestConfig_UpstreamIgnorePaths_DefaultWhenAbsent loads a config with no
+// upstream section and expects exactly the default pattern to be installed.
+func TestConfig_UpstreamIgnorePaths_DefaultWhenAbsent(t *testing.T) {
+	cfg, err := LoadConfigFromReader(strings.NewReader(upstreamConfigHeader))
+	require.NoError(t, err)
+	require.Len(t, cfg.Upstream.IgnorePaths, 1)
+
+	def := cfg.Upstream.IgnorePaths[0]
+	assert.IsType(t, &regexp.Regexp{}, def)
+	assert.Equal(t, DefaultUpstreamIgnorePathsPattern, def.String())
+
+	// Static-asset suffixes are matched by the default.
+	staticAssets := []string{
+		"/foo.js",
+		"/bar/baz.json",
+		"/static/img.png",
+		"/notes.txt",
+		"/favicon.ico",
+	}
+	for _, path := range staticAssets {
+		assert.True(t, def.MatchString(path))
+	}
+
+	// Inference API paths are not.
+	apiPaths := []string{
+		"/v1/chat/completions",
+		"/v1/models",
+		"/health",
+	}
+	for _, path := range apiPaths {
+		assert.False(t, def.MatchString(path))
+	}
+}
+
+// TestConfig_UpstreamIgnorePaths_DefaultWhenSectionEmpty covers an upstream
+// section that is present but has no ignorePaths key: the default pattern
+// must still be applied.
+func TestConfig_UpstreamIgnorePaths_DefaultWhenSectionEmpty(t *testing.T) {
+	doc := `upstream: {}` + "\n" + upstreamConfigHeader
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(doc))
+	require.NoError(t, err)
+
+	patterns := cfg.Upstream.IgnorePaths
+	require.Len(t, patterns, 1)
+	assert.Equal(t, DefaultUpstreamIgnorePathsPattern, patterns[0].String())
+}
+
+// TestConfig_UpstreamIgnorePaths_Compiles supplies two custom patterns and
+// checks that each one is compiled and matches the expected paths.
+func TestConfig_UpstreamIgnorePaths_Compiles(t *testing.T) {
+	doc := `
+upstream:
+  ignorePaths:
+    - ".*\\.(js|json|css|png|gif|jpg|jpeg|txt)$"
+    - "^/static/.*"
+` + upstreamConfigHeader
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(doc))
+	require.NoError(t, err)
+	require.Len(t, cfg.Upstream.IgnorePaths, 2)
+
+	suffixRe := cfg.Upstream.IgnorePaths[0]
+	staticRe := cfg.Upstream.IgnorePaths[1]
+
+	// First pattern: asset suffixes.
+	assert.True(t, suffixRe.MatchString("/foo.js"))
+	assert.True(t, suffixRe.MatchString("/bar/baz.json"))
+	assert.False(t, suffixRe.MatchString("/v1/chat/completions"))
+
+	// Second pattern: everything under /static/.
+	assert.True(t, staticRe.MatchString("/static/foo.png"))
+	assert.False(t, staticRe.MatchString("/v1/chat/completions"))
+
+	// Every entry must have the *regexp.Regexp type promised by the API.
+	for _, compiled := range cfg.Upstream.IgnorePaths {
+		assert.IsType(t, &regexp.Regexp{}, compiled)
+	}
+}
+
+// TestConfig_UpstreamIgnorePaths_InvalidRegexReturnsError expects loading to
+// fail, with a message naming upstream.ignorePaths, when a pattern does not
+// compile.
+func TestConfig_UpstreamIgnorePaths_InvalidRegexReturnsError(t *testing.T) {
+	doc := `
+upstream:
+  ignorePaths:
+    - "[invalid("
+` + upstreamConfigHeader
+
+	_, loadErr := LoadConfigFromReader(strings.NewReader(doc))
+	require.Error(t, loadErr)
+
+	message := loadErr.Error()
+	assert.Contains(t, message, "upstream.ignorePaths")
+	assert.Contains(t, message, "invalid regular expression")
+}
@@ -0,0 +1,3 @@
+The code in `event` was originally a part of https://github.com/kelindar/event (v1.5.2)
+
+The original code uses a `time.Ticker` to process the event queue which caused a large increase in CPU usage ([#189](https://github.com/mostlygeek/llama-swap/issues/189)). This code was ported to remove the ticker and instead be more event driven.
@@ -0,0 +1,30 @@
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"context"
+)
+
+// Default is the package-level in-process dispatcher used by On, OnType and
+// Emit. It is created with a maximum queue size of 25000 per consumer.
+var Default = NewDispatcherConfig(25000)
+
+// On subscribes handler to an event on the default dispatcher. The event type
+// is inferred by calling Type() on the zero value of T, so Type() must return
+// a constant for this to work. It behaves like Subscribe() with Default as
+// the dispatcher; calling the returned function unsubscribes the handler.
+func On[T Event](handler func(T)) context.CancelFunc {
+	return Subscribe(Default, handler)
+}
+
+// OnType subscribes handler to the given event type on the default
+// dispatcher. It behaves like SubscribeTo() with Default as the dispatcher;
+// calling the returned function unsubscribes the handler.
+func OnType[T Event](eventType uint32, handler func(T)) context.CancelFunc {
+	return SubscribeTo(Default, eventType, handler)
+}
+
+// Emit publishes an event on the default dispatcher. It behaves like
+// Publish() with Default as the dispatcher.
+func Emit[T Event](ev T) {
+	Publish(Default, ev)
+}
@@ -0,0 +1,54 @@
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+/*
+cpu: 13th Gen Intel(R) Core(TM) i7-13700K
+BenchmarkSubscribeConcurrent-24    	 1826686	       606.3 ns/op	    1648 B/op	       5 allocs/op
+*/
+// BenchmarkSubscribeConcurrent measures one subscribe followed immediately by
+// its unsubscribe, run from parallel goroutines against a single dispatcher.
+func BenchmarkSubscribeConcurrent(b *testing.B) {
+	d := NewDispatcher()
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			unsub := Subscribe(d, func(ev MyEvent1) {})
+			unsub()
+		}
+	})
+}
+
+// TestDefaultPublish exercises On, OnType and Emit against the package-level
+// Default dispatcher.
+func TestDefaultPublish(t *testing.T) {
+	var wg sync.WaitGroup
+
+	// Subscribe
+	var count int64
+	defer On(func(ev MyEvent1) {
+		atomic.AddInt64(&count, 1)
+		wg.Done()
+	})()
+
+	defer OnType(TypeEvent1, func(ev MyEvent1) {
+		atomic.AddInt64(&count, 1)
+		wg.Done()
+	})()
+
+	// Publish: two subscribers each receive both events, hence 4 deliveries.
+	wg.Add(4)
+	Emit(MyEvent1{})
+	Emit(MyEvent1{})
+
+	// Wait and check
+	wg.Wait()
+	assert.Equal(t, int64(4), count)
+}
@@ -0,0 +1,324 @@
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// Event represents an event contract. Type returns the numeric identifier
+// under which the event is registered and looked up in a Dispatcher.
+type Event interface {
+	Type() uint32
+}
+
+// registry holds an immutable sorted array of event mappings. keys and grps
+// are parallel slices: grps[i] is the subscriber group for keys[i]. A registry
+// is never modified once stored; SubscribeTo builds a new one and swaps it in.
+type registry struct {
+	keys []uint32 // Event types (sorted)
+	grps []any    // Corresponding subscribers
+}
+
+// ------------------------------------- Dispatcher -------------------------------------
+
+// Dispatcher represents an event dispatcher. Lookups read the registry through
+// the atomic pointer without locking; SubscribeTo takes mu while it builds and
+// stores a replacement registry.
+type Dispatcher struct {
+	subs     atomic.Pointer[registry] // Atomic pointer to immutable array
+	done     chan struct{}            // Cancellation
+	maxQueue int                      // Maximum queue size per consumer
+	mu       sync.Mutex               // Only for writes (subscribe/unsubscribe)
+}
+
+// NewDispatcher creates a new dispatcher of events with a maximum queue size
+// of 50000 per consumer.
+func NewDispatcher() *Dispatcher {
+	return NewDispatcherConfig(50000)
+}
+
+// NewDispatcherConfig creates a new dispatcher whose consumer queues hold at
+// most maxQueue events before publishers block. The dispatcher starts with an
+// empty registry.
+func NewDispatcherConfig(maxQueue int) *Dispatcher {
+	d := new(Dispatcher)
+	d.done = make(chan struct{})
+	d.maxQueue = maxQueue
+
+	// Install an empty registry so lookups never see a nil pointer.
+	empty := &registry{
+		keys: make([]uint32, 0, 16),
+		grps: make([]any, 0, 16),
+	}
+	d.subs.Store(empty)
+	return d
+}
+
+// Close closes the dispatcher; once closed, SubscribeTo panics with errClosed.
+// Close is idempotent: a repeated call is a no-op rather than a
+// "close of closed channel" panic. It always returns nil.
+func (d *Dispatcher) Close() error {
+	// Serialize concurrent Close calls so that only one of them closes the
+	// channel.
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if !d.isClosed() {
+		close(d.done)
+	}
+	return nil
+}
+
+// isClosed reports whether the done channel has been closed. It never blocks.
+func (d *Dispatcher) isClosed() bool {
+	closed := false
+	select {
+	case <-d.done:
+		closed = true
+	default:
+		// still open
+	}
+	return closed
+}
+
+// findGroup looks up the subscriber group registered for eventType without
+// taking any lock. It returns nil when the type has no group.
+func (d *Dispatcher) findGroup(eventType uint32) any {
+	reg := d.subs.Load()
+
+	// Hand-written lower-bound binary search over the sorted keys (kept
+	// inline, as in the original, rather than going through sort.Search).
+	lo, hi := 0, len(reg.keys)
+	for lo < hi {
+		mid := lo + (hi-lo)/2
+		if reg.keys[mid] >= eventType {
+			hi = mid
+			continue
+		}
+		lo = mid + 1
+	}
+
+	// lo is the first index whose key is >= eventType, if there is one.
+	if lo == len(reg.keys) || reg.keys[lo] != eventType {
+		return nil
+	}
+	return reg.grps[lo]
+}
+
+// Subscribe subscribes to an event, the type of the event will be automatically
+// inferred from the provided type. Must be constant for this to work.
+//
+// The event type is obtained by calling Type() on the zero value of T, so
+// Type() must not depend on the event's field values. Calling the returned
+// function unsubscribes the handler.
+func Subscribe[T Event](broker *Dispatcher, handler func(T)) context.CancelFunc {
+	var event T
+	return SubscribeTo(broker, event.Type(), handler)
+}
+
+// SubscribeTo subscribes to an event with the specified event type.
+//
+// It panics with errClosed when the dispatcher is already closed, and panics
+// with a conflict message when eventType is already registered for a
+// different event type T. Calling the returned function removes the handler
+// from its group.
+func SubscribeTo[T Event](broker *Dispatcher, eventType uint32, handler func(T)) context.CancelFunc {
+	if broker.isClosed() {
+		panic(errClosed)
+	}
+
+	// Writers are serialized; readers keep using the registry they loaded.
+	broker.mu.Lock()
+	defer broker.mu.Unlock()
+
+	// Check if group already exists
+	if existing := broker.findGroup(eventType); existing != nil {
+		grp := groupOf[T](eventType, existing)
+		sub := grp.Add(handler)
+		return func() {
+			grp.Del(sub)
+		}
+	}
+
+	// Create new group
+	grp := &group[T]{cond: sync.NewCond(new(sync.Mutex)), maxQueue: broker.maxQueue}
+	sub := grp.Add(handler)
+
+	// Copy-on-write: insert new entry in sorted position
+	old := broker.subs.Load()
+	idx := sort.Search(len(old.keys), func(i int) bool {
+		return old.keys[i] >= eventType
+	})
+
+	// Create new arrays with space for one more element
+	newKeys := make([]uint32, len(old.keys)+1)
+	newGrps := make([]any, len(old.grps)+1)
+
+	// Copy elements before insertion point
+	copy(newKeys[:idx], old.keys[:idx])
+	copy(newGrps[:idx], old.grps[:idx])
+
+	// Insert new element
+	newKeys[idx] = eventType
+	newGrps[idx] = grp
+
+	// Copy elements after insertion point
+	copy(newKeys[idx+1:], old.keys[idx:])
+	copy(newGrps[idx+1:], old.grps[idx:])
+
+	// Atomically store the new registry (mutex ensures no concurrent writers)
+	newReg := &registry{keys: newKeys, grps: newGrps}
+	broker.subs.Store(newReg)
+
+	return func() {
+		grp.Del(sub)
+	}
+}
+
+// Publish delivers ev to every subscriber registered for its event type. It
+// does nothing when the type has no group, and panics when the registered
+// group was created for a different event type.
+func Publish[T Event](broker *Dispatcher, ev T) {
+	eventType := ev.Type()
+
+	registered := broker.findGroup(eventType)
+	if registered == nil {
+		return
+	}
+	groupOf[T](eventType, registered).Broadcast(ev)
+}
+
+// count returns the number of subscribers registered for eventType, or 0 when
+// the type has no group. This is for testing only.
+func (d *Dispatcher) count(eventType uint32) int {
+	grp := d.findGroup(eventType)
+	if grp == nil {
+		return 0
+	}
+
+	// The group's element type is not known here, so reach Count through an
+	// anonymous interface instead of a concrete *group[T].
+	counter := grp.(interface{ Count() int })
+	return counter.Count()
+}
+
+// groupOf asserts that subs is a *group[T] and returns it. It panics with a
+// conflict message when the stored group belongs to a different event type.
+func groupOf[T Event](eventType uint32, subs any) *group[T] {
+	typed, ok := subs.(*group[T])
+	if !ok {
+		panic(errConflict[T](eventType, subs))
+	}
+	return typed
+}
+
+// ------------------------------------- Subscriber -------------------------------------
+
+// consumer represents a consumer with a message queue. Both fields are
+// accessed while holding the owning group's cond.L lock.
+type consumer[T Event] struct {
+	queue []T  // Current work queue
+	stop  bool // Stop signal
+}
+
+// Listen listens to the event queue and processes events. It loops until the
+// consumer is stopped and its queue is empty: it waits on c for work, takes
+// the whole queue in one swap, then calls fn for each event with the lock
+// released.
+func (s *consumer[T]) Listen(c *sync.Cond, fn func(T)) {
+	pending := make([]T, 0, 128)
+
+	for {
+		c.L.Lock()
+		for len(s.queue) == 0 {
+			switch {
+			case s.stop:
+				// Stopped and drained: exit the goroutine.
+				c.L.Unlock()
+				return
+			default:
+				// Releases the lock while parked; re-acquires it on wakeup.
+				c.Wait()
+			}
+		}
+
+		// Swap buffers and reset the current queue
+		temp := s.queue
+		s.queue = pending[:0]
+		pending = temp
+		c.L.Unlock()
+
+		// Outside of the critical section, process the work
+		for _, event := range pending {
+			fn(event)
+		}
+
+		// Notify potential publishers waiting due to backpressure
+		c.Broadcast()
+	}
+}
+
+// ------------------------------------- Subscriber Group -------------------------------------
+
+// group represents a consumer group: all consumers subscribed to one event
+// type. cond.L guards subs, maxLen and every consumer's queue; cond itself is
+// used both to wake consumers and to release publishers blocked on
+// backpressure.
+type group[T Event] struct {
+	cond     *sync.Cond
+	subs     []*consumer[T]
+	maxQueue int // Maximum queue size per consumer
+	maxLen   int // Current maximum queue length across all consumers
+}
+
+// Broadcast appends ev to the queue of every consumer in the group and wakes
+// them. While the longest queue has reached maxQueue, the caller blocks until
+// a consumer signals that it has drained its queue.
+func (s *group[T]) Broadcast(ev T) {
+	s.cond.L.Lock()
+	defer s.cond.L.Unlock()
+
+	// longest reports the length of the fullest consumer queue.
+	longest := func() int {
+		peak := 0
+		for _, sub := range s.subs {
+			if n := len(sub.queue); n > peak {
+				peak = n
+			}
+		}
+		return peak
+	}
+
+	// Backpressure: wait until no queue is full, re-measuring after each wakeup.
+	s.maxLen = longest()
+	for s.maxLen >= s.maxQueue {
+		s.cond.Wait()
+		s.maxLen = longest()
+	}
+
+	// Enqueue for every consumer while tracking the new longest queue.
+	peak := 0
+	for _, sub := range s.subs {
+		sub.queue = append(sub.queue, ev)
+		if n := len(sub.queue); n > peak {
+			peak = n
+		}
+	}
+	s.maxLen = peak
+
+	// Wake consumers
+	s.cond.Broadcast()
+}
+
+// Add creates a consumer for handler, registers it with the group and starts
+// its listener goroutine. The returned consumer is the handle passed to Del.
+func (s *group[T]) Add(handler func(T)) *consumer[T] {
+	sub := new(consumer[T])
+	sub.queue = make([]T, 0, 64)
+
+	// Register the consumer under the lock so Broadcast sees it.
+	s.cond.L.Lock()
+	s.subs = append(s.subs, sub)
+	s.cond.L.Unlock()
+
+	// Each consumer drains its own queue on a dedicated goroutine.
+	go sub.Listen(s.cond, handler)
+	return sub
+}
+
+// Del removes a subscriber from the list and tells its listener goroutine to
+// exit. Calling Del again for an already-removed subscriber is harmless.
+func (s *group[T]) Del(sub *consumer[T]) {
+	s.cond.L.Lock()
+	defer s.cond.L.Unlock()
+
+	// Search and remove the subscriber
+	sub.stop = true
+	for i, v := range s.subs {
+		if v == sub {
+			last := len(s.subs) - 1
+			copy(s.subs[i:], s.subs[i+1:])
+			// Clear the vacated tail slot so the backing array does not keep
+			// the removed consumer (and its queue) reachable.
+			s.subs[last] = nil
+			s.subs = s.subs[:last]
+			break
+		}
+	}
+
+	// Wake all waiters. An idle listener is parked in cond.Wait() and only
+	// re-checks its stop flag on wakeup; without this signal its goroutine
+	// would leak until the next event happened to be published. Publishers
+	// blocked on backpressure also re-evaluate, since the removed consumer
+	// may have owned the full queue.
+	s.cond.Broadcast()
+}
+
+// ------------------------------------- Debugging -------------------------------------
+
+// errClosed is the value SubscribeTo panics with when the dispatcher has
+// already been closed.
+var errClosed = fmt.Errorf("event dispatcher is closed")
+
+// Count returns the number of subscribers in this group. It takes the group
+// lock because Add and Del modify the subscriber list under that same lock,
+// possibly from other goroutines.
+func (s *group[T]) Count() int {
+	s.cond.L.Lock()
+	defer s.cond.L.Unlock()
+	return len(s.subs)
+}
+
+// String returns string representation of the type. It takes the reflected
+// type name of the group, drops everything up to and including the last "/"
+// (the import-path prefix of the type argument, when present) and drops the
+// final character (the closing bracket of the generic instantiation).
+func (s *group[T]) String() string {
+	typ := reflect.TypeOf(s).String()
+	idx := strings.LastIndex(typ, "/")
+	typ = typ[idx+1 : len(typ)-1]
+	return typ
+}
+
+// errConflict builds the panic message used when an event type is already
+// registered for a different Go type: want is the type requested by the
+// caller, existing is the group already registered, and eventType is the
+// numeric event identifier.
+func errConflict[T any](eventType uint32, existing any) string {
+	var want T
+	// The identifier carries a "0x" prefix, so it must be formatted with %x;
+	// %v would print the decimal value after a hexadecimal prefix.
+	return fmt.Sprintf(
+		"conflicting event type, want=<%T>, registered=<%s>, event=0x%x",
+		want, existing, eventType,
+	)
+}
@@ -0,0 +1,324 @@
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestPublish publishes three numbered events to one subscriber and checks
+// that they arrive in publication order.
+func TestPublish(t *testing.T) {
+	d := NewDispatcher()
+	var wg sync.WaitGroup
+
+	// Subscribe, must be received in order
+	var count int64
+	defer Subscribe(d, func(ev MyEvent1) {
+		// The running delivery count must equal the event's own number.
+		assert.Equal(t, int(atomic.AddInt64(&count, 1)), ev.Number)
+		wg.Done()
+	})()
+
+	// Publish
+	wg.Add(3)
+	Publish(d, MyEvent1{Number: 1})
+	Publish(d, MyEvent1{Number: 2})
+	Publish(d, MyEvent1{Number: 3})
+
+	// Wait and check
+	wg.Wait()
+	assert.Equal(t, int64(3), count)
+}
+
+// TestUnsubscribe checks that the subscriber count goes 0 -> 1 -> 0 across a
+// subscribe followed by its cancel function.
+func TestUnsubscribe(t *testing.T) {
+	d := NewDispatcher()
+	assert.Equal(t, 0, d.count(TypeEvent1))
+
+	// The handler does nothing; only the registration matters here.
+	cancel := Subscribe(d, func(MyEvent1) {})
+	assert.Equal(t, 1, d.count(TypeEvent1))
+
+	cancel()
+	assert.Equal(t, 0, d.count(TypeEvent1))
+}
+
+// TestConcurrent publishes one million events from a background goroutine
+// while a second, no-op subscriber is added concurrently, and checks that the
+// first subscriber receives every event.
+func TestConcurrent(t *testing.T) {
+	const max = 1000000
+	var count int64
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	d := NewDispatcher()
+	defer Subscribe(d, func(ev MyEvent1) {
+		// Signal completion once the final event has been counted.
+		if current := atomic.AddInt64(&count, 1); current == max {
+			wg.Done()
+		}
+	})()
+
+	// Asynchronously publish
+	go func() {
+		for i := 0; i < max; i++ {
+			Publish(d, MyEvent1{})
+		}
+	}()
+
+	defer Subscribe(d, func(ev MyEvent1) {
+		// Subscriber that does nothing
+	})()
+
+	wg.Wait()
+	assert.Equal(t, max, int(count))
+}
+
+// TestSubscribeDifferentType expects a panic when the same event type ID is
+// subscribed with two different Go event types.
+func TestSubscribeDifferentType(t *testing.T) {
+	d := NewDispatcher()
+	assert.Panics(t, func() {
+		SubscribeTo(d, TypeEvent1, func(ev MyEvent1) {})
+		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
+	})
+}
+
+// TestPublishDifferentType expects a panic when an event is published to a
+// type ID whose group was registered for a different Go event type.
+func TestPublishDifferentType(t *testing.T) {
+	d := NewDispatcher()
+	assert.Panics(t, func() {
+		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
+		Publish(d, MyEvent1{})
+	})
+}
+
+// TestCloseDispatcher checks that Close returns no error and that subscribing
+// to a closed dispatcher panics.
+func TestCloseDispatcher(t *testing.T) {
+	d := NewDispatcher()
+	defer SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})()
+
+	assert.NoError(t, d.Close())
+	assert.Panics(t, func() {
+		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
+	})
+}
+
+// TestMatrix runs every combination of subscriber count and topic count,
+// publishes `amount` events per topic from separate goroutines, and checks
+// that the total number of deliveries is subs * topics * amount.
+func TestMatrix(t *testing.T) {
+	const amount = 1000
+	for _, subs := range []int{1, 10, 100} {
+		for _, topics := range []int{1, 10} {
+			expected := subs * topics * amount
+			t.Run(fmt.Sprintf("%dx%d", topics, subs), func(t *testing.T) {
+				var count atomic.Int64
+				var wg sync.WaitGroup
+				wg.Add(expected)
+
+				d := NewDispatcher()
+				// Register `subs` handlers on each of the `topics` event IDs;
+				// the deferred cancels run when this subtest returns.
+				for i := 0; i < subs; i++ {
+					for id := 0; id < topics; id++ {
+						defer SubscribeTo(d, uint32(id), func(ev MyEvent3) {
+							count.Add(1)
+							wg.Done()
+						})()
+					}
+				}
+
+				// MyEvent3.Type() returns its ID, so each event goes to topic id.
+				for n := 0; n < amount; n++ {
+					for id := 0; id < topics; id++ {
+						go Publish(d, MyEvent3{ID: id})
+					}
+				}
+
+				wg.Wait()
+				assert.Equal(t, expected, int(count.Load()))
+			})
+		}
+	}
+}
+
+// TestConcurrentSubscriptionRace subscribes from many goroutines at once and
+// then verifies that every event type still has a working subscription.
+func TestConcurrentSubscriptionRace(t *testing.T) {
+	// This test specifically targets the race condition that occurs when multiple
+	// goroutines try to subscribe to different event types simultaneously.
+	// If registry writes were not serialized (SubscribeTo holds the dispatcher
+	// mutex while it swaps in a new registry), subscriptions could be lost.
+
+	const numGoroutines = 100
+	const numEventTypes = 50
+
+	d := NewDispatcher()
+	defer d.Close()
+
+	var wg sync.WaitGroup
+	var receivedCount int64
+	var subscribedTypes sync.Map // Thread-safe map
+
+	wg.Add(numGoroutines)
+
+	// Start multiple goroutines that subscribe to different event types concurrently
+	for i := 0; i < numGoroutines; i++ {
+		go func(goroutineID int) {
+			defer wg.Done()
+
+			// The modulo spreads the goroutines over numEventTypes types, so
+			// several goroutines share each event type.
+			eventType := uint32(goroutineID%numEventTypes + 1000) // Offset to avoid collision with other tests
+
+			// Subscribe to the event type
+			SubscribeTo(d, eventType, func(ev MyEvent3) {
+				atomic.AddInt64(&receivedCount, 1)
+			})
+
+			// Record that this type was subscribed
+			subscribedTypes.Store(eventType, true)
+		}(i)
+	}
+
+	// Wait for all subscriptions to complete
+	wg.Wait()
+
+	// Count the number of unique event types subscribed
+	expectedTypes := 0
+	subscribedTypes.Range(func(key, value interface{}) bool {
+		expectedTypes++
+		return true
+	})
+
+	// Small delay to ensure all subscriptions are fully processed
+	time.Sleep(10 * time.Millisecond)
+
+	// Publish events to each subscribed type
+	subscribedTypes.Range(func(key, value interface{}) bool {
+		eventType := key.(uint32)
+		Publish(d, MyEvent3{ID: int(eventType)})
+		return true
+	})
+
+	// Wait for all events to be processed
+	time.Sleep(50 * time.Millisecond)
+
+	// Verify that we received at least the expected number of events
+	// (there might be more if multiple goroutines subscribed to the same event type)
+	received := atomic.LoadInt64(&receivedCount)
+	assert.GreaterOrEqual(t, int(received), expectedTypes,
+		"Should have received at least %d events, got %d", expectedTypes, received)
+
+	// Verify that we have the expected number of unique event types
+	assert.Equal(t, numEventTypes, expectedTypes,
+		"Should have exactly %d unique event types", numEventTypes)
+}
+
+// TestConcurrentHandlerRegistration registers handlers from many goroutines
+// at once, first all on one event type and then each on its own type, and
+// checks that no registration is lost.
+func TestConcurrentHandlerRegistration(t *testing.T) {
+	const numGoroutines = 100
+
+	// Test concurrent subscriptions to the same event type
+	t.Run("SameEventType", func(t *testing.T) {
+		d := NewDispatcher()
+		var handlerCount int64
+		var wg sync.WaitGroup
+
+		// Start multiple goroutines subscribing to the same event type (0x1)
+		for i := 0; i < numGoroutines; i++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				SubscribeTo(d, uint32(0x1), func(ev MyEvent1) {
+					atomic.AddInt64(&handlerCount, 1)
+				})
+			}()
+		}
+
+		wg.Wait()
+
+		// Verify all handlers were registered by publishing an event
+		atomic.StoreInt64(&handlerCount, 0)
+		Publish(d, MyEvent1{})
+
+		// Small delay to ensure all handlers have executed
+		time.Sleep(10 * time.Millisecond)
+
+		// One published event must reach every registered handler once.
+		assert.Equal(t, int64(numGoroutines), atomic.LoadInt64(&handlerCount),
+			"Not all handlers were registered due to race condition")
+	})
+
+	// Test concurrent subscriptions to different event types
+	t.Run("DifferentEventTypes", func(t *testing.T) {
+		d := NewDispatcher()
+		var wg sync.WaitGroup
+		receivedEvents := make(map[uint32]*int64)
+
+		// Create multiple event types and subscribe concurrently
+		for i := 0; i < numGoroutines; i++ {
+			eventType := uint32(100 + i)
+			counter := new(int64)
+			// The map is filled here, before the goroutine starts, and is
+			// only read again after wg.Wait().
+			receivedEvents[eventType] = counter
+
+			wg.Add(1)
+			go func(et uint32, cnt *int64) {
+				defer wg.Done()
+				SubscribeTo(d, et, func(ev MyEvent3) {
+					atomic.AddInt64(cnt, 1)
+				})
+			}(eventType, counter)
+		}
+
+		wg.Wait()
+
+		// Publish events to all types
+		for eventType := uint32(100); eventType < uint32(100+numGoroutines); eventType++ {
+			Publish(d, MyEvent3{ID: int(eventType)})
+		}
+
+		// Small delay to ensure all handlers have executed
+		time.Sleep(10 * time.Millisecond)
+
+		// Verify all event types received their events
+		for eventType, counter := range receivedEvents {
+			assert.Equal(t, int64(1), atomic.LoadInt64(counter),
+				"Event type %d did not receive its event", eventType)
+		}
+	})
+}
+
+// TestBackpressure publishes far more events than the per-consumer queue
+// limit allows and checks that every one of them is still delivered.
+func TestBackpressure(t *testing.T) {
+	d := NewDispatcher()
+	// Must be set before subscribing: a group copies the dispatcher's
+	// maxQueue when it is created.
+	d.maxQueue = 10
+
+	var processedCount int64
+	unsub := SubscribeTo(d, uint32(0x200), func(ev MyEvent3) {
+		atomic.AddInt64(&processedCount, 1)
+	})
+	defer unsub()
+
+	const eventsToPublish = 1000
+	for i := 0; i < eventsToPublish; i++ {
+		Publish(d, MyEvent3{ID: 0x200})
+	}
+
+	// Give the consumer time to drain what is still queued.
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify all events were eventually processed
+	finalProcessed := atomic.LoadInt64(&processedCount)
+	assert.Equal(t, int64(eventsToPublish), finalProcessed)
+	t.Logf("Events processed: %d/%d", finalProcessed, eventsToPublish)
+}
+
+// ------------------------------------- Test Events -------------------------------------
+
+// Event type identifiers returned by MyEvent1 and MyEvent2.
+const (
+	TypeEvent1 = 0x1
+	TypeEvent2 = 0x2
+)
+
+// MyEvent1 is a test event with the fixed type TypeEvent1.
+type MyEvent1 struct {
+	Number int
+}
+
+// Type returns TypeEvent1.
+func (t MyEvent1) Type() uint32 { return TypeEvent1 }
+
+// MyEvent2 is a test event with the fixed type TypeEvent2.
+type MyEvent2 struct {
+	Text string
+}
+
+// Type returns TypeEvent2.
+func (t MyEvent2) Type() uint32 { return TypeEvent2 }
+
+// MyEvent3 is a test event whose type is taken from its ID field, which lets
+// one Go type stand in for many event types. Its zero value therefore has
+// type 0.
+type MyEvent3 struct {
+	ID int
+}
+
+// Type returns the event's ID converted to uint32.
+func (t MyEvent3) Type() uint32 { return uint32(t.ID) }
@@ -0,0 +1,251 @@
+package logmon
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"sync"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/event"
+)
+
+// DataEventID is the event type identifier reported by DataEvent.
+const DataEventID = 0x04
+
+// DataEvent carries one chunk of log output to Monitor subscribers.
+type DataEvent struct {
+	Data []byte
+}
+
+// Type returns DataEventID.
+func (e DataEvent) Type() uint32 {
+	return DataEventID
+}
+
+// circularBuffer is a fixed-size circular byte buffer that overwrites
+// oldest data when full. It provides O(1) writes and O(n) reads.
+type circularBuffer struct {
+	data []byte // backing storage; its length is the capacity
+	head int    // index at which the next byte will be written
+	size int    // number of valid bytes currently stored
+}
+
+// newCircularBuffer returns an empty buffer that retains at most capacity
+// bytes.
+func newCircularBuffer(capacity int) *circularBuffer {
+	cb := new(circularBuffer)
+	cb.data = make([]byte, capacity)
+	return cb
+}
+
+// Write appends p to the buffer, overwriting the oldest bytes once the
+// capacity is exceeded. When p alone is at least as large as the capacity,
+// only its trailing capacity bytes are retained.
+func (cb *circularBuffer) Write(p []byte) {
+	incoming := len(p)
+	if incoming == 0 {
+		return
+	}
+
+	capacity := len(cb.data)
+	if incoming >= capacity {
+		// p replaces the whole buffer; keep only its newest bytes.
+		copy(cb.data, p[incoming-capacity:])
+		cb.head, cb.size = 0, capacity
+		return
+	}
+
+	// Fill towards the end of the storage; copy reports how many bytes fit.
+	// Whatever is left over wraps around to the front.
+	fitted := copy(cb.data[cb.head:], p)
+	if fitted < incoming {
+		copy(cb.data, p[fitted:])
+	}
+	cb.head = (cb.head + incoming) % capacity
+
+	cb.size += incoming
+	if cb.size > capacity {
+		cb.size = capacity
+	}
+}
+
+// GetHistory returns a copy of the stored bytes, oldest first, or nil when
+// the buffer is empty.
+func (cb *circularBuffer) GetHistory() []byte {
+	if cb.size == 0 {
+		return nil
+	}
+
+	capacity := len(cb.data)
+	// The oldest byte sits size positions behind head, modulo the capacity.
+	oldest := (cb.head - cb.size + capacity) % capacity
+
+	out := make([]byte, cb.size)
+	// Copy from the oldest byte towards the end of the storage, then take the
+	// remainder (if any) from the front. The second copy is a no-op when the
+	// data did not wrap.
+	taken := copy(out, cb.data[oldest:])
+	copy(out[taken:], cb.data)
+	return out
+}
+
+// Level is a log severity. Higher values are more severe; Monitor drops
+// messages whose level is below its configured level.
+type Level int
+
+const (
+	// Severities in ascending order, starting at 0.
+	LevelDebug Level = iota
+	LevelInfo
+	LevelWarn
+	LevelError
+
+	// BufferSize is the capacity in bytes (100 KiB) of the history buffer
+	// that Monitor allocates on its first Write.
+	BufferSize = 100 * 1024
+)
+
+// Monitor is an io.Writer-style log sink: everything written to it is passed
+// to stdout, kept in a bounded history buffer and published to subscribers.
+type Monitor struct {
+	eventbus *event.Dispatcher // delivers DataEvent to OnLogData subscribers
+	mu       sync.RWMutex      // taken by the setters of level, prefix and timeFormat
+	buffer   *circularBuffer   // history; nil until the first Write and after Clear
+	bufferMu sync.RWMutex      // guards buffer
+
+	stdout io.Writer // destination that receives every write first
+
+	level      Level  // minimum level that log() emits
+	prefix     string // when non-empty, rendered as "[prefix] " in log lines
+	timeFormat string // when non-empty, time layout for the line timestamp
+}
+
+// New returns a Monitor that writes through to os.Stdout.
+func New() *Monitor {
+	return NewWriter(os.Stdout)
+}
+
+// NewWriter returns a Monitor that writes through to stdout. It logs at
+// LevelInfo, has no prefix or timestamp format, uses an event bus limited to
+// 1000 queued events per consumer, and allocates its history buffer lazily.
+func NewWriter(stdout io.Writer) *Monitor {
+	// buffer, prefix and timeFormat are left at their zero values.
+	m := new(Monitor)
+	m.eventbus = event.NewDispatcherConfig(1000)
+	m.stdout = stdout
+	m.level = LevelInfo
+	return m
+}
+
+// Write forwards p to the underlying writer, then records it in the history
+// buffer and publishes a copy to subscribers. If the underlying write fails,
+// its byte count and error are returned and p is neither recorded nor
+// published. An empty p is a no-op.
+func (w *Monitor) Write(p []byte) (n int, err error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
+
+	if n, err = w.stdout.Write(p); err != nil {
+		return n, err
+	}
+
+	// Record the bytes in the history, allocating the buffer on first use
+	// (or on first use after Clear).
+	func() {
+		w.bufferMu.Lock()
+		defer w.bufferMu.Unlock()
+		if w.buffer == nil {
+			w.buffer = newCircularBuffer(BufferSize)
+		}
+		w.buffer.Write(p)
+	}()
+
+	// Subscribers get their own copy, so the caller may reuse p afterwards.
+	snapshot := make([]byte, len(p))
+	copy(snapshot, p)
+	w.broadcast(snapshot)
+
+	return n, nil
+}
+
+// GetHistory returns a copy of the buffered log output, oldest first. It
+// returns nil when nothing has been written since creation or the last Clear.
+func (w *Monitor) GetHistory() []byte {
+	w.bufferMu.RLock()
+	defer w.bufferMu.RUnlock()
+	if w.buffer == nil {
+		return nil
+	}
+	return w.buffer.GetHistory()
+}
+
+// Clear releases the buffer memory, making it eligible for GC.
+// The buffer will be lazily re-allocated on the next Write.
+// Until then GetHistory returns nil.
+func (w *Monitor) Clear() {
+	w.bufferMu.Lock()
+	w.buffer = nil
+	w.bufferMu.Unlock()
+}
+
+// OnLogData registers callback to receive the bytes of every subsequent
+// successful Write. Calling the returned function unsubscribes the callback.
+func (w *Monitor) OnLogData(callback func(data []byte)) context.CancelFunc {
+	return event.Subscribe(w.eventbus, func(e DataEvent) {
+		callback(e.Data)
+	})
+}
+
+// broadcast publishes msg to all OnLogData subscribers as a DataEvent. The
+// slice is handed over as-is, so callers pass a copy they no longer modify.
+func (w *Monitor) broadcast(msg []byte) {
+	event.Publish(w.eventbus, DataEvent{Data: msg})
+}
+
+// SetPrefix sets the prefix that formatMessage renders as "[prefix] " in each
+// log line. An empty prefix disables it.
+func (w *Monitor) SetPrefix(prefix string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.prefix = prefix
+}
+
+// SetLogLevel sets the minimum level; messages logged below it are dropped.
+func (w *Monitor) SetLogLevel(level Level) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.level = level
+}
+
+// SetLogTimeFormat sets the time layout (as accepted by time.Time.Format)
+// used to timestamp each log line. An empty layout disables timestamps.
+func (w *Monitor) SetLogTimeFormat(timeFormat string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.timeFormat = timeFormat
+}
+
+// formatMessage renders one log line as
+// "<timestamp> [<prefix>] [<level>] <msg>\n". The timestamp and prefix parts
+// are left out when the corresponding setting is empty.
+func (w *Monitor) formatMessage(level string, msg string) []byte {
+	// SetPrefix and SetLogTimeFormat write these fields under w.mu, so take a
+	// consistent snapshot under the read lock instead of reading them
+	// unsynchronized.
+	w.mu.RLock()
+	prefix, timeFormat := w.prefix, w.timeFormat
+	w.mu.RUnlock()
+
+	if prefix != "" {
+		prefix = fmt.Sprintf("[%s] ", prefix)
+	}
+	timestamp := ""
+	if timeFormat != "" {
+		timestamp = fmt.Sprintf("%s ", time.Now().Format(timeFormat))
+	}
+	return fmt.Appendf(nil, "%s%s[%s] %s\n", timestamp, prefix, level, msg)
+}
+
+// log formats msg at the given level and writes it, unless the level is below
+// the monitor's configured minimum. A write error is intentionally ignored:
+// logging is best-effort.
+func (w *Monitor) log(level Level, msg string) {
+	// SetLogLevel writes w.level under w.mu; read it under the read lock and
+	// release the lock before doing any I/O.
+	w.mu.RLock()
+	threshold := w.level
+	w.mu.RUnlock()
+
+	if level < threshold {
+		return
+	}
+	w.Write(w.formatMessage(level.String(), msg))
+}
+
+// Debug, Info, Warn and Error log msg at the corresponding level.
+func (w *Monitor) Debug(msg string) { w.log(LevelDebug, msg) }
+func (w *Monitor) Info(msg string)  { w.log(LevelInfo, msg) }
+func (w *Monitor) Warn(msg string)  { w.log(LevelWarn, msg) }
+func (w *Monitor) Error(msg string) { w.log(LevelError, msg) }
+
+// Debugf logs a message built with fmt.Sprintf at LevelDebug. The message is
+// formatted before the level check in log.
+func (w *Monitor) Debugf(format string, args ...any) {
+	w.log(LevelDebug, fmt.Sprintf(format, args...))
+}
+
+// Infof logs a message built with fmt.Sprintf at LevelInfo.
+func (w *Monitor) Infof(format string, args ...any) {
+	w.log(LevelInfo, fmt.Sprintf(format, args...))
+}
+
+// Warnf logs a message built with fmt.Sprintf at LevelWarn.
+func (w *Monitor) Warnf(format string, args ...any) {
+	w.log(LevelWarn, fmt.Sprintf(format, args...))
+}
+
+// Errorf logs a message built with fmt.Sprintf at LevelError.
+func (w *Monitor) Errorf(format string, args ...any) {
+	w.log(LevelError, fmt.Sprintf(format, args...))
+}
+
+// String returns the upper-case name of the level, or "UNKNOWN" for a value
+// outside the declared levels.
+func (l Level) String() string {
+	// Indexed by level; the bounds check covers every undeclared value.
+	names := [...]string{
+		LevelDebug: "DEBUG",
+		LevelInfo:  "INFO",
+		LevelWarn:  "WARN",
+		LevelError: "ERROR",
+	}
+	if l < 0 || int(l) >= len(names) {
+		return "UNKNOWN"
+	}
+	return names[l]
+}
@@ -0,0 +1,250 @@
+package logmon
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestLogMonitor writes three chunks through a Monitor with two subscribers
+// and checks that both the stored history and each subscriber's accumulated
+// data equal the concatenation "123".
+func TestLogMonitor(t *testing.T) {
+	logMonitor := NewWriter(io.Discard)
+
+	var wg sync.WaitGroup
+
+	client1Messages := make([]byte, 0)
+	client2Messages := make([]byte, 0)
+
+	// OnLogData is evaluated immediately (subscribing now); only the returned
+	// cancel function is deferred until the test returns.
+	defer logMonitor.OnLogData(func(data []byte) {
+		client1Messages = append(client1Messages, data...)
+		wg.Done()
+	})()
+
+	defer logMonitor.OnLogData(func(data []byte) {
+		client2Messages = append(client2Messages, data...)
+		wg.Done()
+	})()
+
+	// One Done per subscriber per write; must be registered before the writes.
+	wg.Add(6) // 2 x 3 writes
+
+	logMonitor.Write([]byte("1"))
+	logMonitor.Write([]byte("2"))
+	logMonitor.Write([]byte("3"))
+
+	// Block until every callback has run so the slices are safe to read.
+	wg.Wait()
+
+	expectedHistory := "123"
+	history := string(logMonitor.GetHistory())
+
+	if history != expectedHistory {
+		t.Errorf("Expected history: %s, got: %s", expectedHistory, history)
+	}
+
+	c1Data := string(client1Messages)
+	if c1Data != expectedHistory {
+		t.Errorf("Client1 expected %s, got: %s", expectedHistory, c1Data)
+	}
+
+	c2Data := string(client2Messages)
+	if c2Data != expectedHistory {
+		t.Errorf("Client2 expected %s, got: %s", expectedHistory, c2Data)
+	}
+}
+
+// TestWrite_ImmutableBuffer checks that Write reports the full input length
+// and that mutating the caller's slice after the write does not alter the
+// history held by the monitor.
+func TestWrite_ImmutableBuffer(t *testing.T) {
+	monitor := NewWriter(io.Discard)
+
+	payload := []byte("Hello, World!")
+	payloadLen := len(payload)
+
+	written, err := monitor.Write(payload)
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+	if written != payloadLen {
+		t.Errorf("Expected %d bytes written but got %d", payloadLen, written)
+	}
+
+	// Scribble on the caller-owned slice; the stored copy must be unaffected.
+	payload[0] = 'B'
+
+	want := []byte("Hello, World!")
+	got := monitor.GetHistory()
+	if !bytes.Equal(got, want) {
+		t.Errorf("Expected history to be %q, got %q", want, got)
+	}
+}
+
+// TestWrite_LogTimeFormat sets an RFC 3339 time layout on the monitor, logs
+// one Info message and verifies that the first whitespace-separated token of
+// the history parses as an RFC 3339 timestamp.
+func TestWrite_LogTimeFormat(t *testing.T) {
+	monitor := NewWriter(io.Discard)
+	monitor.timeFormat = time.RFC3339
+
+	monitor.Info("Hello, World!")
+
+	tokens := strings.Fields(string(monitor.GetHistory()))
+	if len(tokens) == 0 {
+		t.Fatalf("Cannot extract string from history")
+	}
+
+	if _, err := time.Parse(time.RFC3339, tokens[0]); err != nil {
+		t.Fatalf("Cannot find timestamp: %v", err)
+	}
+}
+
+// TestCircularBuffer_WrapAround fills a 10-byte circular buffer step by step
+// and checks that GetHistory always returns the most recent bytes in order:
+// a partial fill, an exact fill, an overwrite of the oldest half, and a
+// single write longer than the buffer itself.
+func TestCircularBuffer_WrapAround(t *testing.T) {
+	ring := newCircularBuffer(10)
+
+	// Partial fill.
+	ring.Write([]byte("hello"))
+	history := string(ring.GetHistory())
+	if history != "hello" {
+		t.Errorf("Expected 'hello', got %q", history)
+	}
+
+	// Exactly full.
+	ring.Write([]byte("world"))
+	history = string(ring.GetHistory())
+	if history != "helloworld" {
+		t.Errorf("Expected 'helloworld', got %q", history)
+	}
+
+	// Oldest five bytes are overwritten.
+	ring.Write([]byte("12345"))
+	history = string(ring.GetHistory())
+	if history != "world12345" {
+		t.Errorf("Expected 'world12345', got %q", history)
+	}
+
+	// A write larger than the capacity keeps only its last ten bytes.
+	ring.Write([]byte("abcdefghijklmnop"))
+	history = string(ring.GetHistory())
+	if history != "ghijklmnop" {
+		t.Errorf("Expected 'ghijklmnop', got %q", history)
+	}
+}
+
+// TestCircularBuffer_BoundaryConditions covers the edges of a 10-byte
+// circular buffer: an empty buffer returns nil history, a single write of
+// exactly the capacity is returned whole, and two writes that together fill
+// the buffer exactly are returned concatenated.
+func TestCircularBuffer_BoundaryConditions(t *testing.T) {
+	cb := newCircularBuffer(10)
+	// Nothing written yet: history must be nil, not an empty slice.
+	if got := cb.GetHistory(); got != nil {
+		t.Errorf("Expected nil for empty buffer, got %q", got)
+	}
+
+	// One write of exactly the capacity.
+	cb.Write([]byte("1234567890"))
+	if got := string(cb.GetHistory()); got != "1234567890" {
+		t.Errorf("Expected '1234567890', got %q", got)
+	}
+
+	// Fresh buffer, filled to capacity by two writes.
+	cb = newCircularBuffer(10)
+	cb.Write([]byte("12345"))
+	cb.Write([]byte("67890"))
+	if got := string(cb.GetHistory()); got != "1234567890" {
+		t.Errorf("Expected '1234567890', got %q", got)
+	}
+}
+
+// TestLogMonitor_LazyInit verifies that a new Monitor has a nil buffer and
+// nil history until the first Write, after which the buffer is allocated and
+// the written bytes are returned by GetHistory.
+func TestLogMonitor_LazyInit(t *testing.T) {
+	lm := NewWriter(io.Discard)
+
+	// Before any write: no buffer, no history.
+	if lm.buffer != nil {
+		t.Error("Expected buffer to be nil before first write")
+	}
+
+	if got := lm.GetHistory(); got != nil {
+		t.Errorf("Expected nil history before first write, got %q", got)
+	}
+
+	lm.Write([]byte("test"))
+
+	// After the first write the buffer must exist and hold the data.
+	if lm.buffer == nil {
+		t.Error("Expected buffer to be initialized after write")
+	}
+
+	if got := string(lm.GetHistory()); got != "test" {
+		t.Errorf("Expected 'test', got %q", got)
+	}
+}
+
+// TestLogMonitor_Clear verifies that Clear drops the buffer: after writing
+// and clearing, both lm.buffer and GetHistory are nil again.
+func TestLogMonitor_Clear(t *testing.T) {
+	lm := NewWriter(io.Discard)
+
+	lm.Write([]byte("hello"))
+	if got := string(lm.GetHistory()); got != "hello" {
+		t.Errorf("Expected 'hello', got %q", got)
+	}
+
+	lm.Clear()
+
+	// Clear is expected to release the buffer, not merely empty it.
+	if lm.buffer != nil {
+		t.Error("Expected buffer to be nil after Clear")
+	}
+
+	if got := lm.GetHistory(); got != nil {
+		t.Errorf("Expected nil history after Clear, got %q", got)
+	}
+}
+
+// TestLogMonitor_ClearAndReuse verifies that a Monitor is usable after
+// Clear: only data written after the clear shows up in the history.
+func TestLogMonitor_ClearAndReuse(t *testing.T) {
+	monitor := NewWriter(io.Discard)
+
+	monitor.Write([]byte("first"))
+	monitor.Clear()
+	monitor.Write([]byte("second"))
+
+	history := string(monitor.GetHistory())
+	if history != "second" {
+		t.Errorf("Expected 'second' after clear and reuse, got %q", history)
+	}
+}
+
+// BenchmarkLogMonitorWrite measures Monitor.Write for small, medium and large
+// messages, Write with five no-op subscribers attached, and GetHistory on a
+// monitor pre-filled with 1000 medium messages. Sub-benchmark names are
+// SmallWrite, MediumWrite, LargeWrite, WithSubscribers and GetHistory.
+func BenchmarkLogMonitorWrite(b *testing.B) {
+	smallMsg := []byte("small message\n")
+	mediumMsg := []byte(strings.Repeat("medium message content ", 10) + "\n")
+	largeMsg := []byte(strings.Repeat("large message content for benchmarking ", 100) + "\n")
+
+	// The three plain-write cases differ only in payload size.
+	writeCases := []struct {
+		name string
+		msg  []byte
+	}{
+		{"SmallWrite", smallMsg},
+		{"MediumWrite", mediumMsg},
+		{"LargeWrite", largeMsg},
+	}
+	for _, tc := range writeCases {
+		// b.Run executes the closure synchronously, so capturing tc is safe.
+		b.Run(tc.name, func(b *testing.B) {
+			lm := NewWriter(io.Discard)
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				lm.Write(tc.msg)
+			}
+		})
+	}
+
+	b.Run("WithSubscribers", func(b *testing.B) {
+		lm := NewWriter(io.Discard)
+		for i := 0; i < 5; i++ {
+			// Keep the cancel function and release the subscription when the
+			// sub-benchmark ends, so repeated runs do not pile up subscribers.
+			cancel := lm.OnLogData(func(data []byte) {})
+			defer cancel()
+		}
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			lm.Write(mediumMsg)
+		}
+	})
+
+	b.Run("GetHistory", func(b *testing.B) {
+		lm := NewWriter(io.Discard)
+		// Pre-fill so GetHistory has a realistic amount of data to return.
+		for i := 0; i < 1000; i++ {
+			lm.Write(mediumMsg)
+		}
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			lm.GetHistory()
+		}
+	})
+}
@@ -0,0 +1,92 @@
+package perf
+
+// LUID is the 64-bit locally unique identifier used to address a graphics
+// adapter, split into a low unsigned and a high signed 32-bit half. It is
+// comparable and is used as a map key elsewhere in this package.
+type LUID struct {
+	LowPart  uint32
+	HighPart int32
+}
+
+// maxEnumAdapters is the capacity of the fixed adapter array handed to
+// D3DKMTEnumAdapters2.
+const maxEnumAdapters = 16
+
+// D3DKMT_ENUMADAPTERS2 is the argument block for D3DKMTEnumAdapters2. Field
+// order and types mirror the native structure and must not be changed.
+type D3DKMT_ENUMADAPTERS2 struct {
+	// NumAdapters is set to the array capacity before the call and read back
+	// as the number of entries afterwards.
+	NumAdapters uint32
+	// pAdapters is the address of the first D3DKMT_ADAPTERINFO element.
+	pAdapters uintptr
+}
+
+// D3DKMT_ADAPTERINFO is one entry filled in by D3DKMTEnumAdapters2. Only
+// AdapterLuid is consumed by the code in this package.
+type D3DKMT_ADAPTERINFO struct {
+	hAdapter                     uint32
+	AdapterLuid                  LUID
+	NumOfSources                 uint32
+	bPresentMoveRegionsPreferred int32
+}
+
+// D3DKMT_OPENADAPTERFROMLUID is the argument block for
+// D3DKMTOpenAdapterFromLuid: AdapterLuid is the input, hAdapter receives the
+// opened adapter handle.
+type D3DKMT_OPENADAPTERFROMLUID struct {
+	AdapterLuid LUID
+	hAdapter    uint32
+}
+
+// D3DKMT_CLOSEADAPTER is the argument block for D3DKMTCloseAdapter and
+// carries the handle to close.
+type D3DKMT_CLOSEADAPTER struct {
+	hAdapter uint32
+}
+
+// KMTQUERYADAPTERINFOTYPE selects which kind of information
+// D3DKMTQueryAdapterInfo returns.
+type KMTQUERYADAPTERINFOTYPE int32
+
+// Query types. The numeric values are fixed by the Windows API and must not
+// be renumbered. Only the ADAPTERPERFDATA and ADAPTERPERFDATA_CAPS values are
+// used by the Windows collector code visible alongside this file.
+const (
+	KMTQAITYPE_UMDRIVERPRIVATE          KMTQUERYADAPTERINFOTYPE = 0
+	KMTQAITYPE_ADAPTERREGISTRYINFO      KMTQUERYADAPTERINFOTYPE = 8
+	KMTQAITYPE_DRIVERVERSION            KMTQUERYADAPTERINFOTYPE = 13
+	KMTQAITYPE_PHYSICALADAPTERDEVICEIDS KMTQUERYADAPTERINFOTYPE = 31
+	KMTQAITYPE_NODEPERFDATA             KMTQUERYADAPTERINFOTYPE = 61
+	KMTQAITYPE_ADAPTERPERFDATA          KMTQUERYADAPTERINFOTYPE = 62
+	KMTQAITYPE_ADAPTERPERFDATA_CAPS     KMTQUERYADAPTERINFOTYPE = 63
+)
+
+// D3DKMT_QUERYADAPTERINFO is the argument block for D3DKMTQueryAdapterInfo.
+type D3DKMT_QUERYADAPTERINFO struct {
+	// hAdapter is a handle obtained from D3DKMTOpenAdapterFromLuid.
+	hAdapter uint32
+	// Type selects the payload written to the output buffer.
+	Type KMTQUERYADAPTERINFOTYPE
+	// pPrivateDriverData is the address of the caller-provided output buffer.
+	pPrivateDriverData uintptr
+	// PrivateDriverDataSize is the size of that buffer in bytes.
+	PrivateDriverDataSize uint32
+}
+
+// D3DKMT_ADAPTER_PERFDATA is the output buffer for a
+// KMTQAITYPE_ADAPTERPERFDATA query. The Windows collector asserts at init
+// that its size is 64 bytes on x64, so field order and types must not change.
+// Of these fields the collector reads FanRPM, Power and Temperature.
+type D3DKMT_ADAPTER_PERFDATA struct {
+	PhysicalAdapterIndex uint32
+	MemoryFrequency      uint64
+	MaxMemoryFrequency   uint64
+	MaxMemoryFrequencyOC uint64
+	MemoryBandwidth      uint64
+	PCIEBandwidth        uint64
+	FanRPM               uint32
+	// Power is divided by 10 and reported as watts by d3dkmtPowerW.
+	// NOTE(review): confirm the native unit against the Microsoft docs.
+	Power uint32
+	// Temperature is treated as tenths of a degree Celsius by d3dkmtTempC.
+	Temperature        uint32
+	PowerStateOverride byte
+}
+
+// D3DKMT_QUERYSTATISTICS_TYPE selects the scope of a D3DKMTQueryStatistics
+// request.
+type D3DKMT_QUERYSTATISTICS_TYPE int32
+
+// Statistics scopes. The numeric values are fixed by the Windows API. The
+// Windows collector uses ADAPTER, SEGMENT and NODE.
+const (
+	D3DKMT_QUERYSTATISTICS_ADAPTER             D3DKMT_QUERYSTATISTICS_TYPE = 0
+	D3DKMT_QUERYSTATISTICS_PROCESS             D3DKMT_QUERYSTATISTICS_TYPE = 1
+	D3DKMT_QUERYSTATISTICS_PROCESS_ADAPTER     D3DKMT_QUERYSTATISTICS_TYPE = 2
+	D3DKMT_QUERYSTATISTICS_SEGMENT             D3DKMT_QUERYSTATISTICS_TYPE = 3
+	D3DKMT_QUERYSTATISTICS_PROCESS_SEGMENT     D3DKMT_QUERYSTATISTICS_TYPE = 4
+	D3DKMT_QUERYSTATISTICS_NODE                D3DKMT_QUERYSTATISTICS_TYPE = 5
+	D3DKMT_QUERYSTATISTICS_PROCESS_NODE        D3DKMT_QUERYSTATISTICS_TYPE = 6
+	D3DKMT_QUERYSTATISTICS_VIDPNSOURCE         D3DKMT_QUERYSTATISTICS_TYPE = 7
+	D3DKMT_QUERYSTATISTICS_PROCESS_VIDPNSOURCE D3DKMT_QUERYSTATISTICS_TYPE = 8
+)
+
+// D3DKMT_ADAPTER_PERFDATACAPS is the output buffer for a
+// KMTQAITYPE_ADAPTERPERFDATA_CAPS query. The Windows collector asserts at
+// init that its size is 40 bytes on x64, so the layout must not change. Only
+// MaxFanRPM is read by the collector (as the denominator for fan percentage).
+type D3DKMT_ADAPTER_PERFDATACAPS struct {
+	PhysicalAdapterIndex uint32
+	MaxMemoryBandwidth   uint64
+	MaxPCIEBandwidth     uint64
+	MaxFanRPM            uint32
+	TemperatureMax       uint32
+	TemperatureWarning   uint32
+}
+
+// D3DKMT_QUERYSTATISTICS_QUERY_SEGMENT names the memory segment a SEGMENT
+// statistics query refers to.
+// NOTE(review): the visible Windows collector passes the segment index via
+// queryStatsBuffer.QueryId rather than this type — confirm it is still used.
+type D3DKMT_QUERYSTATISTICS_QUERY_SEGMENT struct {
+	SegmentId uint32
+}
+
+// D3DKMT_QUERYSTATISTICS_QUERY_NODE names the node a NODE statistics query
+// refers to.
+// NOTE(review): as above, the visible collector uses queryStatsBuffer.QueryId
+// instead — confirm this type is still needed.
+type D3DKMT_QUERYSTATISTICS_QUERY_NODE struct {
+	NodeId uint32
+}
@@ -0,0 +1,529 @@
+//go:build windows
+
+package perf
+
+import (
+	"context"
+	"encoding/binary"
+	"fmt"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"golang.org/x/sys/windows"
+)
+
+// Lazily initialised handles to gdi32.dll and the D3DKMT exports used by this
+// file. All of them are assigned exactly once inside initD3DKMT, guarded by
+// d3dkmtInitOnce; d3dkmtInitErr keeps the outcome of that one-time
+// initialisation so later callers see the same result.
+var (
+	d3dkmDLL                *windows.LazyDLL
+	procEnumAdapters2       *windows.LazyProc
+	procOpenAdapterFromLuid *windows.LazyProc
+	procCloseAdapter        *windows.LazyProc
+	procQueryAdapterInfo    *windows.LazyProc
+	procQueryStatistics     *windows.LazyProc
+	d3dkmtInitOnce          sync.Once
+	d3dkmtInitErr           error
+)
+
+// initD3DKMT lazily loads gdi32.dll and resolves the D3DKMT function pointers
+// used by this file. The work happens once (sync.Once), so it is safe to call
+// from multiple goroutines; every call returns the result of that single
+// attempt.
+//
+// Returns nil when all five exports were found, otherwise an error naming the
+// first missing export in the fixed order listed below.
+func initD3DKMT() error {
+	d3dkmtInitOnce.Do(func() {
+		d3dkmDLL = windows.NewLazySystemDLL("gdi32.dll")
+
+		procEnumAdapters2 = d3dkmDLL.NewProc("D3DKMTEnumAdapters2")
+		procOpenAdapterFromLuid = d3dkmDLL.NewProc("D3DKMTOpenAdapterFromLuid")
+		procCloseAdapter = d3dkmDLL.NewProc("D3DKMTCloseAdapter")
+		procQueryAdapterInfo = d3dkmDLL.NewProc("D3DKMTQueryAdapterInfo")
+		procQueryStatistics = d3dkmDLL.NewProc("D3DKMTQueryStatistics")
+
+		// An ordered slice (rather than a map) keeps the reported export
+		// deterministic when more than one is missing. LazyProc.Name holds
+		// the export name that was passed to NewProc.
+		required := []*windows.LazyProc{
+			procEnumAdapters2,
+			procOpenAdapterFromLuid,
+			procCloseAdapter,
+			procQueryAdapterInfo,
+			procQueryStatistics,
+		}
+		for _, p := range required {
+			// Find forces the lazy lookup so failures surface here instead of
+			// panicking on the first Call.
+			if err := p.Find(); err != nil {
+				d3dkmtInitErr = fmt.Errorf("D3DKMT %s not found: %w", p.Name, err)
+				return
+			}
+		}
+	})
+	return d3dkmtInitErr
+}
+
+// ntstatusCall invokes a D3DKMT function with a single pointer argument and
+// returns a non-nil error if the NTSTATUS result is not STATUS_SUCCESS (0).
+// The error text contains the status truncated to 32 bits, in hex.
+// The error value returned by proc.Call itself is ignored; only the
+// function's return value is inspected.
+func ntstatusCall(proc *windows.LazyProc, arg unsafe.Pointer) error {
+	// Keep the unsafe.Pointer -> uintptr conversion inside the call
+	// expression, as the unsafe.Pointer rules for system calls require.
+	ret, _, _ := proc.Call(uintptr(arg))
+	if ret != 0 {
+		return fmt.Errorf("NTSTATUS 0x%08x", uint32(ret))
+	}
+	return nil
+}
+
+// d3dkmEnumerateAdapters enumerates graphics adapters via
+// D3DKMTEnumAdapters2, using a fixed array of maxEnumAdapters entries as the
+// output buffer.
+//
+// Returns a freshly allocated slice with one D3DKMT_ADAPTERINFO per adapter
+// reported. Returns an error if the call fails or reports zero adapters.
+func d3dkmEnumerateAdapters() ([]D3DKMT_ADAPTERINFO, error) {
+	var adapters [maxEnumAdapters]D3DKMT_ADAPTERINFO
+	// NOTE(review): the buffer address is stored as a uintptr inside enum, so
+	// the garbage collector does not see it as a reference; adapters stays
+	// live only because it is read again below.
+	enum := D3DKMT_ENUMADAPTERS2{
+		NumAdapters: maxEnumAdapters,
+		pAdapters:   uintptr(unsafe.Pointer(&adapters[0])),
+	}
+	if err := ntstatusCall(procEnumAdapters2, unsafe.Pointer(&enum)); err != nil {
+		return nil, fmt.Errorf("EnumAdapters2: %w", err)
+	}
+	if enum.NumAdapters == 0 {
+		return nil, fmt.Errorf("no adapters found")
+	}
+
+	// Never trust the returned count beyond the buffer we supplied: a value
+	// larger than maxEnumAdapters would otherwise index past the array and
+	// panic.
+	count := enum.NumAdapters
+	if count > maxEnumAdapters {
+		count = maxEnumAdapters
+	}
+	result := make([]D3DKMT_ADAPTERINFO, count)
+	copy(result, adapters[:count])
+	return result, nil
+}
+
+// d3dkmOpenAdapter opens a D3DKMT adapter handle for the given LUID via
+// D3DKMTOpenAdapterFromLuid.
+//
+// Returns the handle written back into the request by the call, or 0 and a
+// wrapped error if the call fails. The caller is responsible for releasing
+// the handle with d3dkmCloseAdapter.
+func d3dkmOpenAdapter(luid LUID) (uint32, error) {
+	req := D3DKMT_OPENADAPTERFROMLUID{
+		AdapterLuid: luid,
+	}
+	if err := ntstatusCall(procOpenAdapterFromLuid, unsafe.Pointer(&req)); err != nil {
+		return 0, fmt.Errorf("OpenAdapterFromLuid: %w", err)
+	}
+	return req.hAdapter, nil
+}
+
+// d3dkmCloseAdapter closes a previously opened D3DKMT adapter handle via
+// D3DKMTCloseAdapter and returns the NTSTATUS error, if any.
+func d3dkmCloseAdapter(hAdapter uint32) error {
+	req := D3DKMT_CLOSEADAPTER{hAdapter: hAdapter}
+	return ntstatusCall(procCloseAdapter, unsafe.Pointer(&req))
+}
+
+// d3dkmGetAdapterPerfData queries per-adapter performance data (temperature,
+// fan RPM, power, bandwidth) via KMTQAITYPE_ADAPTERPERFDATA.
+//
+// hAdapter must be an open adapter handle. Returns a pointer to the filled
+// structure, or nil and a wrapped error if D3DKMTQueryAdapterInfo fails.
+func d3dkmGetAdapterPerfData(hAdapter uint32) (*D3DKMT_ADAPTER_PERFDATA, error) {
+	var data D3DKMT_ADAPTER_PERFDATA
+	// The request carries the address and byte size of data as the output
+	// buffer for the query.
+	req := D3DKMT_QUERYADAPTERINFO{
+		hAdapter:              hAdapter,
+		Type:                  KMTQAITYPE_ADAPTERPERFDATA,
+		pPrivateDriverData:    uintptr(unsafe.Pointer(&data)),
+		PrivateDriverDataSize: uint32(unsafe.Sizeof(data)),
+	}
+	if err := ntstatusCall(procQueryAdapterInfo, unsafe.Pointer(&req)); err != nil {
+		return nil, fmt.Errorf("QueryAdapterInfo(ADAPTERPERFDATA): %w", err)
+	}
+	return &data, nil
+}
+
+// d3dkmGetAdapterPerfDataCaps queries static adapter performance capabilities
+// (max fan RPM, temperature limits, max bandwidth) via KMTQAITYPE_ADAPTERPERFDATA_CAPS.
+//
+// hAdapter must be an open adapter handle. Returns a pointer to the filled
+// structure, or nil and a wrapped error if D3DKMTQueryAdapterInfo fails.
+func d3dkmGetAdapterPerfDataCaps(hAdapter uint32) (*D3DKMT_ADAPTER_PERFDATACAPS, error) {
+	var data D3DKMT_ADAPTER_PERFDATACAPS
+	// The request carries the address and byte size of data as the output
+	// buffer for the query.
+	req := D3DKMT_QUERYADAPTERINFO{
+		hAdapter:              hAdapter,
+		Type:                  KMTQAITYPE_ADAPTERPERFDATA_CAPS,
+		pPrivateDriverData:    uintptr(unsafe.Pointer(&data)),
+		PrivateDriverDataSize: uint32(unsafe.Sizeof(data)),
+	}
+	if err := ntstatusCall(procQueryAdapterInfo, unsafe.Pointer(&req)); err != nil {
+		return nil, fmt.Errorf("QueryAdapterInfo(ADAPTERPERFDATACAPS): %w", err)
+	}
+	return &data, nil
+}
+
+// queryStatsBuffer is a byte-exact stand-in for the native
+// D3DKMT_QUERYSTATISTICS structure passed to D3DKMTQueryStatistics. Results
+// are read out of _result at the qsoffset* byte offsets. The init function in
+// this file asserts the overall size (808) and the QueryId offset (804).
+type queryStatsBuffer struct {
+	Type        int32   // offset 0
+	AdapterLuid LUID    // offset 4
+	hProcess    uintptr // offset 16
+	// _result mirrors the D3DKMT_QUERYSTATISTICS_RESULT union.
+	// sizeof(D3DKMT_QUERYSTATISTICS) == 0x328 (808 bytes) on x64.
+	//
+	// The C struct layout (x64):
+	//   offset  0: Type (int32, 4 bytes)
+	//   offset  4: AdapterLuid (LUID, 8 bytes)
+	//   offset 12: 4 bytes padding (for 8-byte alignment of hProcess)
+	//   offset 16: hProcess (HANDLE, 8 bytes)
+	//   offset 24: QueryResult (union, 780 bytes — largest member is AdapterInformation)
+	//   offset 804: anonymous input union (QueryNode.NodeId / QuerySegment.SegmentId, 4 bytes)
+	//
+	// Previous bug: _result was [776]byte, placing QueryId at offset 800 instead of 804.
+	// The kernel read NodeId/SegmentId from offset 804, which in that old layout was
+	// the struct's trailing alignment padding (always zero), causing all NODE and
+	// SEGMENT queries to use index 0 regardless of the value passed in QueryId.
+	// This produced alternating behavior where only GPU util OR memory util
+	// appeared to work, depending on which test variant happened to put
+	// non-zero data near offset 804 in the result buffer.
+	_result [780]byte // offset 24, size 780 — places QueryId at offset 804
+	QueryId int32     // offset 804 — matches C anonymous union for NodeId/SegmentId
+}
+
+// init asserts at program start that the Go mirrors of the native D3DKMT
+// structures have the sizes and offsets the x64 ABI expects, and panics with
+// a descriptive message on the first mismatch.
+//
+// NOTE(review): the expected values are x64-specific while the file's build
+// constraint is just "windows" — confirm 32-bit Windows targets are never
+// built, since these checks would panic there.
+func init() {
+	var (
+		stats    queryStatsBuffer
+		perfData D3DKMT_ADAPTER_PERFDATA
+		caps     D3DKMT_ADAPTER_PERFDATACAPS
+	)
+
+	if got := unsafe.Sizeof(stats); got != 808 {
+		panic(fmt.Sprintf("queryStatsBuffer size %d != expected 808 (sizeof D3DKMT_QUERYSTATISTICS on x64)", got))
+	}
+	if got := unsafe.Offsetof(stats.QueryId); got != 804 {
+		panic(fmt.Sprintf("queryStatsBuffer.QueryId offset %d != expected 804 (C anonymous union offset)", got))
+	}
+	if got := unsafe.Sizeof(perfData); got != 64 {
+		panic(fmt.Sprintf("D3DKMT_ADAPTER_PERFDATA size %d != expected 64 on x64", got))
+	}
+	if got := unsafe.Sizeof(caps); got != 40 {
+		panic(fmt.Sprintf("D3DKMT_ADAPTER_PERFDATACAPS size %d != expected 40 on x64", got))
+	}
+}
+
+// Byte offsets into queryStatsBuffer._result. Which set applies depends on
+// the query type that filled the buffer, which is why several start at 0.
+const (
+	// ADAPTER queries (both read as uint32).
+	qsoffsetNbSegments        = 0
+	qsoffsetNodeCount         = 4
+	// SEGMENT queries (all read as uint64).
+	qsoffsetCommitLimit       = 0
+	qsoffsetBytesCommitted    = 8
+	qsoffsetBytesResident     = 16
+	// NODE queries (both read as uint64).
+	qsoffsetRunningTime       = 0
+	qsoffsetSystemRunningTime = 272
+)
+
+// d3dkmQueryAdapterStats returns the number of memory segments and compute
+// nodes for the adapter identified by luid, using a
+// D3DKMT_QUERYSTATISTICS_ADAPTER query.
+//
+// On failure both counts are 0 and err wraps the NTSTATUS error.
+func d3dkmQueryAdapterStats(luid LUID) (nbSegments uint32, nodeCount uint32, err error) {
+	buf := queryStatsBuffer{
+		Type:        int32(D3DKMT_QUERYSTATISTICS_ADAPTER),
+		AdapterLuid: luid,
+	}
+	if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
+		return 0, 0, fmt.Errorf("QueryStatistics(ADAPTER): %w", err)
+	}
+	// Both counters are little-endian uint32 values at fixed result offsets.
+	nbSegments = binary.LittleEndian.Uint32(buf._result[qsoffsetNbSegments : qsoffsetNbSegments+4])
+	nodeCount = binary.LittleEndian.Uint32(buf._result[qsoffsetNodeCount : qsoffsetNodeCount+4])
+	return nbSegments, nodeCount, nil
+}
+
+// d3dkmQuerySegmentStats returns the commit limit (total) and resident
+// (used) bytes for the given memory segment of an adapter, using a
+// D3DKMT_QUERYSTATISTICS_SEGMENT query.
+//
+// If the resident byte count is reported as zero, the committed byte count
+// is returned in its place. On failure both values are 0 and err wraps the
+// NTSTATUS error together with the segment index.
+func d3dkmQuerySegmentStats(luid LUID, segmentID uint32) (commitLimit uint64, bytesResident uint64, err error) {
+	buf := queryStatsBuffer{
+		Type:        int32(D3DKMT_QUERYSTATISTICS_SEGMENT),
+		AdapterLuid: luid,
+		QueryId:     int32(segmentID),
+	}
+	if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
+		return 0, 0, fmt.Errorf("QueryStatistics(SEGMENT %d): %w", segmentID, err)
+	}
+	commitLimit = binary.LittleEndian.Uint64(buf._result[qsoffsetCommitLimit : qsoffsetCommitLimit+8])
+	bytesResident = binary.LittleEndian.Uint64(buf._result[qsoffsetBytesResident : qsoffsetBytesResident+8])
+	// Fall back to the committed counter when no resident bytes are reported.
+	if bytesResident == 0 {
+		bytesResident = binary.LittleEndian.Uint64(buf._result[qsoffsetBytesCommitted : qsoffsetBytesCommitted+8])
+	}
+	return commitLimit, bytesResident, nil
+}
+
+// d3dkmQueryNodeStats returns the global and system running time counters
+// (in 100ns units) for the given compute node of an adapter, using a
+// D3DKMT_QUERYSTATISTICS_NODE query.
+//
+// On failure both counters are 0 and err wraps the NTSTATUS error together
+// with the node index.
+func d3dkmQueryNodeStats(luid LUID, nodeID uint32) (runningTime uint64, systemRunningTime uint64, err error) {
+	buf := queryStatsBuffer{
+		Type:        int32(D3DKMT_QUERYSTATISTICS_NODE),
+		AdapterLuid: luid,
+		QueryId:     int32(nodeID),
+	}
+	if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
+		return 0, 0, fmt.Errorf("QueryStatistics(NODE %d): %w", nodeID, err)
+	}
+	// Both counters are little-endian uint64 values at fixed result offsets.
+	runningTime = binary.LittleEndian.Uint64(buf._result[qsoffsetRunningTime : qsoffsetRunningTime+8])
+	systemRunningTime = binary.LittleEndian.Uint64(buf._result[qsoffsetSystemRunningTime : qsoffsetSystemRunningTime+8])
+	return runningTime, systemRunningTime, nil
+}
+
+// nodeRunningTimes is one sample of a node's two cumulative running-time
+// counters as returned by d3dkmQueryNodeStats.
+type nodeRunningTimes struct {
+	Global uint64
+	System uint64
+}
+
+// d3dkmtNodeUtil derives a node's utilization percentage (0..100) from two
+// counter samples.
+//
+//   - Returns -1 when either counter in curRT is smaller than in prevRT
+//     (counter wrap or reset).
+//   - When both deltas are positive, the result is globalDelta/systemDelta,
+//     capped at 100%.
+//   - When only the global delta is positive and elapsed100ns is positive,
+//     the result is globalDelta/elapsed100ns, capped at 100%.
+//   - Otherwise the node is considered idle and 0 is returned.
+func d3dkmtNodeUtil(prevRT, curRT nodeRunningTimes, elapsed100ns int64) float64 {
+	wentBackwards := curRT.Global < prevRT.Global || curRT.System < prevRT.System
+	if wentBackwards {
+		return -1
+	}
+
+	globalDelta := curRT.Global - prevRT.Global
+	if globalDelta == 0 {
+		return 0
+	}
+	systemDelta := curRT.System - prevRT.System
+
+	switch {
+	case systemDelta > 0:
+		ratio := float64(globalDelta) / float64(systemDelta)
+		if ratio > 1.0 {
+			ratio = 1.0
+		}
+		return ratio * 100.0
+	case elapsed100ns > 0:
+		pct := float64(globalDelta) / float64(elapsed100ns) * 100.0
+		if pct > 100.0 {
+			pct = 100.0
+		}
+		return pct
+	default:
+		return 0
+	}
+}
+
+// d3dkmtFanPct expresses fanRPM as a percentage of maxFanRPM, capped at 100.
+// It returns 0 when either value is zero (maximum unknown, or fan stopped).
+func d3dkmtFanPct(fanRPM, maxFanRPM uint32) float64 {
+	if maxFanRPM == 0 || fanRPM == 0 {
+		return 0
+	}
+	share := float64(fanRPM) / float64(maxFanRPM) * 100.0
+	if share > 100.0 {
+		return 100.0
+	}
+	return share
+}
+
+// d3dkmtPowerW divides the raw D3DKMT power reading by ten, treating it as
+// tenths of a watt, and returns the result in watts. A raw value of zero
+// yields 0.
+//
+// NOTE(review): Microsoft's D3DKMT_ADAPTER_PERFDATA documentation appears to
+// describe Power in tenths of a percent rather than deci-watts — confirm the
+// unit before relying on this value as watts.
+func d3dkmtPowerW(power uint32) float64 {
+	if power == 0 {
+		return 0
+	}
+	return float64(power) / 10.0
+}
+
+// d3dkmtTempC converts a temperature given in tenths of a degree Celsius (as
+// reported by D3DKMT) to whole degrees Celsius, truncating the fraction.
+func d3dkmtTempC(tempDeciC uint32) int {
+	wholeDegrees := tempDeciC / 10
+	return int(wholeDegrees)
+}
+
+// d3dkmtAdapterState is the per-adapter state kept by the tryD3DKMT sampling
+// goroutine between ticks.
+type d3dkmtAdapterState struct {
+	luid       LUID   // adapter identity, used for statistics queries and PDH lookups
+	hAdapter   uint32 // open adapter handle; closed when the goroutine exits
+	nbSegments uint32 // number of memory segments to sum over
+	nodeCount  uint32 // number of nodes to sample for utilization
+	maxFanRPM  uint32 // fan RPM ceiling from the perf caps query; 0 if unavailable
+	// prevNodeRT holds the last running-time sample per node index.
+	prevNodeRT map[uint32]nodeRunningTimes
+	// prevTime is when prevNodeRT was last refreshed.
+	prevTime time.Time
+}
+
+// tryD3DKMT attempts to start GPU monitoring using D3DKMT and optional PDH
+// counters.
+//
+// It probes every enumerated adapter once (segment/node counts and fan
+// capability), then starts a goroutine that samples all usable adapters every
+// `every` and publishes one []GpuStat per tick on the returned channel. The
+// channel has a buffer of one and sends are non-blocking, so a snapshot is
+// dropped when the previous one has not been consumed. The channel is closed
+// when ctx is cancelled or when no adapter could be reopened.
+//
+// Returns an error if `every` is not positive, if D3DKMT cannot be
+// initialised, if adapter enumeration fails, or if no adapter is usable.
+func tryD3DKMT(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	// time.NewTicker panics on a non-positive duration; that would happen in
+	// the background goroutine, after this function already reported success.
+	if every <= 0 {
+		return nil, fmt.Errorf("invalid sampling interval %s: must be greater than zero", every)
+	}
+
+	if err := initD3DKMT(); err != nil {
+		return nil, err
+	}
+
+	adapterInfos, err := d3dkmEnumerateAdapters()
+	if err != nil {
+		return nil, err
+	}
+
+	type adapterMeta struct {
+		luid       LUID
+		nbSegments uint32
+		nodeCount  uint32
+		maxFanRPM  uint32
+	}
+
+	var metaList []adapterMeta
+
+	// Probe phase: open each adapter briefly to collect static metadata.
+	// Adapters that cannot be opened or queried are skipped.
+	for i, ai := range adapterInfos {
+		hAdapter, err := d3dkmOpenAdapter(ai.AdapterLuid)
+		if err != nil {
+			logger.Debugf("adapter %d: open failed: %s", i, err.Error())
+			continue
+		}
+
+		nbSegments, nodeCount, err := d3dkmQueryAdapterStats(ai.AdapterLuid)
+		if err != nil {
+			logger.Debugf("adapter %d: query stats failed: %s", i, err.Error())
+			d3dkmCloseAdapter(hAdapter)
+			continue
+		}
+
+		// Missing perf caps are tolerated; the fan percentage then stays 0.
+		caps, err := d3dkmGetAdapterPerfDataCaps(hAdapter)
+		if err != nil {
+			logger.Debugf("adapter %d: perf caps failed: %s", i, err.Error())
+		}
+
+		d3dkmCloseAdapter(hAdapter)
+
+		var maxFanRPM uint32
+		if caps != nil {
+			maxFanRPM = caps.MaxFanRPM
+		}
+
+		metaList = append(metaList, adapterMeta{
+			luid:       ai.AdapterLuid,
+			nbSegments: nbSegments,
+			nodeCount:  nodeCount,
+			maxFanRPM:  maxFanRPM,
+		})
+		logger.Debugf("adapter %d: segments=%d nodes=%d fan_max=%d luid=%d:%d", i, nbSegments, nodeCount, maxFanRPM, ai.AdapterLuid.HighPart, ai.AdapterLuid.LowPart)
+	}
+
+	if len(metaList) == 0 {
+		return nil, fmt.Errorf("no usable D3DKMT adapters found")
+	}
+
+	// PDH is optional: without it, utilization is derived from node running
+	// times instead.
+	pdhUtil, pdhErr := initPdhGpuUtil()
+	if pdhErr != nil {
+		logger.Debugf("PDH GPU utilization not available: %s", pdhErr.Error())
+	} else {
+		logger.Info("using PDH performance counters for GPU utilization")
+	}
+
+	ch := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(ch)
+		if pdhUtil != nil {
+			defer pdhUtil.close()
+		}
+
+		// Reopen the adapters; these handles live for the whole sampling loop.
+		var adapters []d3dkmtAdapterState
+		for _, m := range metaList {
+			hAdapter, err := d3dkmOpenAdapter(m.luid)
+			if err != nil {
+				logger.Debugf("reopen adapter failed: %s", err.Error())
+				continue
+			}
+			adapters = append(adapters, d3dkmtAdapterState{
+				luid:       m.luid,
+				hAdapter:   hAdapter,
+				nbSegments: m.nbSegments,
+				nodeCount:  m.nodeCount,
+				maxFanRPM:  m.maxFanRPM,
+				prevNodeRT: make(map[uint32]nodeRunningTimes),
+			})
+		}
+
+		if len(adapters) == 0 {
+			return
+		}
+
+		defer func() {
+			for _, a := range adapters {
+				d3dkmCloseAdapter(a.hAdapter)
+			}
+		}()
+
+		// Take a baseline sample of every node so the first tick has a delta.
+		for i := range adapters {
+			a := &adapters[i]
+			for node := uint32(0); node < a.nodeCount; node++ {
+				globalRT, systemRT, err := d3dkmQueryNodeStats(a.luid, node)
+				if err != nil {
+					continue
+				}
+				a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
+			}
+			a.prevTime = time.Now()
+		}
+
+		ticker := time.NewTicker(every)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				stats := make([]GpuStat, 0, len(adapters))
+				now := time.Now()
+
+				var pdhUtilMap map[LUID]float64
+				if pdhUtil != nil {
+					pdhUtilMap = pdhUtil.collect()
+				}
+
+				for i := range adapters {
+					a := &adapters[i]
+
+					// Without perf data the adapter is left out of this tick.
+					perfData, err := d3dkmGetAdapterPerfData(a.hAdapter)
+					if err != nil {
+						logger.Debugf("adapter %d perfdata: %s", i, err.Error())
+						continue
+					}
+
+					// Memory totals are summed over all segments, each
+					// truncated to whole MiB; failing segments are skipped.
+					var memUsedMB, memTotalMB int
+					for seg := uint32(0); seg < a.nbSegments; seg++ {
+						limit, resident, err := d3dkmQuerySegmentStats(a.luid, seg)
+						if err != nil {
+							continue
+						}
+						memUsedMB += int(resident / (1024 * 1024))
+						memTotalMB += int(limit / (1024 * 1024))
+					}
+
+					// Prefer the PDH value for this adapter when there is one.
+					var gpuUtil float64
+					pdhGaveValue := false
+					if pdhUtilMap != nil {
+						if util, ok := pdhUtilMap[a.luid]; ok {
+							gpuUtil = util
+							pdhGaveValue = true
+						}
+					}
+
+					// Fallback: the busiest node's utilization since the
+					// previous sample.
+					if !pdhGaveValue && a.nodeCount > 0 {
+						elapsedNs := now.Sub(a.prevTime).Nanoseconds()
+						elapsed100ns := elapsedNs / 100
+
+						for node := uint32(0); node < a.nodeCount; node++ {
+							globalRT, systemRT, err := d3dkmQueryNodeStats(a.luid, node)
+							if err != nil {
+								continue
+							}
+
+							if prevRT, ok := a.prevNodeRT[node]; ok {
+								// Counter went backwards: re-baseline and
+								// skip this node for the current tick.
+								if globalRT < prevRT.Global || systemRT < prevRT.System {
+									a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
+									continue
+								}
+								nodeUtil := d3dkmtNodeUtil(prevRT, nodeRunningTimes{Global: globalRT, System: systemRT}, elapsed100ns)
+								if nodeUtil > gpuUtil {
+									gpuUtil = nodeUtil
+								}
+							}
+							a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
+						}
+
+						a.prevTime = now
+					}
+
+					tempC := d3dkmtTempC(perfData.Temperature)
+
+					fanSpeedPct := d3dkmtFanPct(perfData.FanRPM, a.maxFanRPM)
+					powerDrawW := d3dkmtPowerW(perfData.Power)
+
+					var memUtilPct float64
+					if memTotalMB > 0 {
+						memUtilPct = float64(memUsedMB) / float64(memTotalMB) * 100.0
+					}
+
+					stats = append(stats, GpuStat{
+						Timestamp:   now,
+						ID:          i,
+						Name:        fmt.Sprintf("GPU %d", i),
+						TempC:       tempC,
+						GpuUtilPct:  gpuUtil,
+						MemUtilPct:  memUtilPct,
+						MemUsedMB:   memUsedMB,
+						MemTotalMB:  memTotalMB,
+						FanSpeedPct: fanSpeedPct,
+						PowerDrawW:  powerDrawW,
+					})
+				}
+
+				// Non-blocking send: drop the snapshot rather than stall the
+				// sampler when the consumer is behind.
+				if len(stats) > 0 {
+					select {
+					case ch <- stats:
+					default:
+					}
+				}
+			}
+		}
+	}()
+
+	return ch, nil
+}
--- a/Show More
+++ b/Show More