Makefile,internal: fix websocket regression and other small things (#830 )

- fix websocket regression and add test to prevent in the future - fix staticheck errors - remove proxy package remnants from Makefile fix #829
internal/process,server: fix unload regression (#828 )
2026-06-09 21:37:53 -07:00 · 2026-06-09 20:49:58 -07:00 · 2026-06-06 21:00:30 -07:00 · 2026-06-04 14:26:21 -07:00 · 2026-06-04 11:00:43 -07:00 · 2026-06-03 21:51:03 -07:00
241 changed files with 36189 additions and 13079 deletions
@@ -4,12 +4,19 @@ early_access: false
 reviews:
  profile: "chill"
  request_changes_workflow: false
-  high_level_summary: true
+  high_level_summary: false
  poem: false
  review_status: true
  collapse_walkthrough: false
+  sequence_diagrams: false
+  finishing_touches:
+    docstrings:
+      enabled: false
  auto_review:
-    enabled: true
+    enabled: false
    drafts: false
 chat:
  auto_reply: true
+issue_enrichment:
+  planning:
+    enabled: false
@@ -11,13 +11,13 @@ jobs:
      issues: write
      pull-requests: write
    steps:
-      - uses: actions/stale@v9
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f #v10.2.0
        with:
-          days-before-issue-stale: 14
-          days-before-issue-close: 14
+          days-before-issue-stale: 30
+          days-before-issue-close: 30
          stale-issue-label: "stale"
-          stale-issue-message: "This issue is stale because it has been open for 2 weeks with no activity."
-          close-issue-message: "This issue was closed because it has been inactive for 2 weeks since being marked as stale."
+          stale-issue-message: "This issue is stale because it has been open without activity for 30 days. Please remove the stale label if this was an error."
+          close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
          repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -4,11 +4,15 @@ on:
  pull_request:
    paths:
      - "config-schema.json"
+      - "config.example.yaml"
+      - ".github/workflows/config-schema.yml"
  push:
    branches:
      - main
    paths:
      - "config-schema.json"
+      - "config.example.yaml"
+      - ".github/workflows/config-schema.yml"

  workflow_dispatch:

@@ -17,7 +21,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2

      - name: Validate JSON Schema
        run: |
@@ -39,3 +43,14 @@ jobs:
          fi

          echo "✓ config-schema.json is valid"
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 #v6.2.0
+        with:
+          python-version: "3.x"
+
+      - name: Install check-jsonschema
+        run: pip install check-jsonschema
+
+      - name: Validate config.example.yaml against schema
+        run: check-jsonschema --schemafile config-schema.json config.example.yaml
@@ -2,27 +2,69 @@ name: Build Containers

 on:
  # time has no specific meaning, trying to time it after
-  # the llama.cpp daily packages are published
+  # the llama.cpp daily packages have time to build and publish (~8hr after llama.cpp project's cron)
  # https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/docker.yml
  schedule:
-    - cron: "37 5 * * *"
+    - cron: "00 12,18 * * *"

  # Allows manual triggering of the workflow
  workflow_dispatch:
+    inputs:
+      dryrun:
+        description: "Run cleanup step in dry-run mode (log what would be deleted, delete nothing)"
+        type: boolean
+        default: false
+
+  # Run on workflow file changes (without pushing)
+  push:
+    paths:
+      - '.github/workflows/containers.yml'
+      - 'docker/build-container.sh'
+      - 'docker/*.Containerfile'
+
+# grant permissions on GITHUB_TOKEN to publish packages
+# ref: https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#publishing-a-package-using-an-action
+permissions:
+  contents: read
+  packages: write
+  id-token: write

 jobs:
  build-and-push:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        platform: [intel, cuda, vulkan, cpu, musa]
+        platform: [intel, cuda, cuda13, vulkan, cpu, musa, rocm]
      fail-fast: false
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Free up disk space
+        if: matrix.platform == 'rocm'
+        run: |
+          echo "Before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker system prune -af
+          echo "After cleanup:"
+          df -h
+
+      # QEMU enables arm64 cross-builds on the amd64 GitHub runner.
+      # Currently only the cpu backend goes multi-arch; the action is a
+      # no-op for amd64-only builds, so leaving it on for every matrix
+      # entry keeps the workflow simple.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a #v4.0.0
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0

      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v2
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 #v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
@@ -31,16 +73,25 @@ jobs:
      - name: Run build-container
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: ./docker/build-container.sh ${{ matrix.platform }} true
+        run: ./docker/build-container.sh ${{ matrix.platform }} ${{ github.event_name != 'push' }}

-  # note make sure mostlygeek/llama-swap has admin rights to the llama-swap package
-  # see: https://github.com/actions/delete-package-versions/issues/74
+  # actions/delete-package-versions can't see manifest lists: pushing
+  # a multi-arch image with `docker buildx --push` creates a tagged OCI
+  # index plus one untagged per-platform manifest per arch, and
+  # `delete-only-untagged-versions: true` then nukes the per-platform
+  # children, leaving the index dangling — `docker pull :cpu` 404s on
+  # the referenced digest. dataaxiom/ghcr-cleanup-action walks tagged
+  # manifest lists and excludes their children from deletion.
  delete-untagged-containers:
    needs: build-and-push
+    # Skip on forks — the delete API requires package-admin on the
+    # upstream account and would otherwise red-x every fork CI run.
+    if: github.repository == 'mostlygeek/llama-swap'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/delete-package-versions@v5
+      - uses: dataaxiom/ghcr-cleanup-action@cd0cdb900b5dbf3a6f2cc869f0dbb0b8211f50c4 # v1.0.16
        with:
-          package-name: 'llama-swap'
-          package-type: 'container'
-          delete-only-untagged-versions: 'true'
+          token: ${{ secrets.GITHUB_TOKEN }}
+          package: llama-swap
+          delete-untagged: true
+          dry-run: ${{ inputs.dryrun || false }}
@@ -3,9 +3,25 @@ name: Windows CI
 on:
  push:
    branches: [ "main" ]
+    # only run when backend source changes
+    # cmd/ is excluded because it contains utilities without tests
+    paths:
+      - '**/*.go'
+      - '!cmd/**'
+      - 'go.mod'
+      - 'go.sum'
+      - 'Makefile'
+      - '.github/workflows/go-ci-windows.yml'

  pull_request:
    branches: [ "main" ]
+    paths:
+      - '**/*.go'
+      - '!cmd/**'
+      - 'go.mod'
+      - 'go.sum'
+      - 'Makefile'
+      - '.github/workflows/go-ci-windows.yml'

  # Allows manual triggering of the workflow
  workflow_dispatch:
@@ -15,20 +31,20 @@ jobs:
  run-tests:
    runs-on: windows-latest
    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2

    - name: Set up Go
-      uses: actions/setup-go@v4
+      uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
      with:
-        go-version: '1.23'
+        go-version-file: go.mod

    # cache simple-responder to save the build time
    - name: Restore Simple Responder
      id: restore-simple-responder
-      uses: actions/cache/restore@v4
+      uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
      with:
        path: ./build
-        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}

    # necessary for testing proxy/Process swapping
    - name: Create simple-responder
@@ -40,11 +56,11 @@ jobs:
      # nothing new to save ... skip this step
      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
      id: save-simple-responder
-      uses: actions/cache/save@v4
+      uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
      with:
        path: ./build
-        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}

    - name: Test all
      shell: bash
-      run: make test-all
+      run: make test-all
@@ -2,53 +2,69 @@ name: Linux CI

 on:
  push:
-    branches: [ "main" ]
+    branches: ["main"]
+    # only run when backend source changes
+    # cmd/ is excluded because it contains utilities without tests
+    paths:
+      - "**/*.go"
+      - "!cmd/**"
+      - "go.mod"
+      - "go.sum"
+      - "Makefile"
+      - ".github/workflows/go-ci.yml"

  pull_request:
-    branches: [ "main" ]
+    branches: ["main"]
+    paths:
+      - "**/*.go"
+      - "!cmd/**"
+      - "go.mod"
+      - "go.sum"
+      - "Makefile"
+      - ".github/workflows/go-ci.yml"

  # Allows manual triggering of the workflow
  workflow_dispatch:

 jobs:
-
  run-tests:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2

-    - name: Set up Go
-      uses: actions/setup-go@v4
-      with:
-        go-version: '1.23'
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
+        with:
+          go-version-file: go.mod

-    # Only run in this linux based runner
-    - name: Check Formatting
-      run: |
-        if [ "$(gofmt -l . | grep -v 'event/.*_test.go' | wc -l)" -gt 0 ]; then
-          gofmt -l . | grep -v 'event/.*_test.go'
-          exit 1
-        fi
-    # cache simple-responder to save the build time
-    - name: Restore Simple Responder
-      id: restore-simple-responder
-      uses: actions/cache/restore@v4
-      with:
-        path: ./build
-        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+      # Only run in this linux based runner
+      - name: Check Formatting
+        run: |
+          if [ "$(gofmt -l . | wc -l)" -gt 0 ]; then
+            gofmt -l .
+            exit 1
+          fi
+      # cache simple-responder to save the build time
+      - name: Restore Simple Responder
+        id: restore-simple-responder
+        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        with:
+          path: ./build
+          key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}

-    # necessary for testing proxy/Process swapping
-    - name: Create simple-responder
-      run: make simple-responder
+      # necessary for testing proxy/Process swapping
+      - name: Create simple-responder
+        if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+        run: make simple-responder

-    - name: Save Simple Responder
-      # nothing new to save ... skip this step
-      if: steps.restore-simple-responder.outputs.cache-hit != 'true'
-      id: save-simple-responder
-      uses: actions/cache/save@v4
-      with:
-        path: ./build
-        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}
+      - name: Save Simple Responder
+        # nothing new to save ... skip this step
+        if: steps.restore-simple-responder.outputs.cache-hit != 'true'
+        id: save-simple-responder
+        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        with:
+          path: ./build
+          key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}

-    - name: Test all
-      run: make test-all
+      - name: Test all
+        run: make test-all
@@ -3,13 +3,13 @@ name: goreleaser
 on:
  push:
    tags:
-      - '*'
+      - "*"

  # Allows manual triggering of the workflow
  workflow_dispatch:
    inputs:
      tag:
-        description: 'Tag version to release (e.g. v144)'
+        description: "Tag version to release (e.g. v144)"
        required: true

 permissions:
@@ -19,35 +19,30 @@ jobs:
  goreleaser:
    runs-on: ubuntu-latest
    steps:
-      -
-        name: Checkout
-        uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.tag || github.ref }}
-      -
-        name: Set up Go
-        uses: actions/setup-go@v5
-      -
-        name: Set up Node.js
-        uses: actions/setup-node@v4
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c #6.4.0
        with:
-          node-version: '23'
-      -
-        name: Install dependencies and build UI
+          go-version-file: go.mod
+      - name: Set up Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # 6.4.0
+        with:
+          node-version: "24"
+      - name: Build UI
        run: |
-          cd ui
-          npm ci
-          npm run build
+          make ui

-      -
-        name: Run GoReleaser
-        uses: goreleaser/goreleaser-action@v6
+      - name: Run GoReleaser
+        uses: goreleaser/goreleaser-action@1a80836c5c9d9e5755a25cb59ec6f45a3b5f41a8 #7.2.1
        with:
          # either 'goreleaser' (default) or 'goreleaser-pro'
          distribution: goreleaser
          # 'latest', 'nightly', or a semver
-          version: '~> v2'
+          version: "~> v2"
          args: release --clean
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -66,7 +61,7 @@ jobs:
          fi

      - name: "Trigger tap repository update"
-        uses: peter-evans/repository-dispatch@v2
+        uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 #4.0.1
        with:
          token: ${{ secrets.TAP_REPO_PAT }}
          repository: mostlygeek/homebrew-llama-swap
@@ -76,4 +71,4 @@ jobs:
              "release": {
                "tag_name": "${{ steps.tag.outputs.tag }}"
              }
-            }
+            }
@@ -0,0 +1,33 @@
+name: UI Tests
+
+on:
+  push:
+    branches: [ "main" ]
+    paths:
+      - 'ui-svelte/**'
+      - '.github/workflows/ui-tests.yml'
+
+  pull_request:
+    branches: [ "main" ]
+    paths:
+      - 'ui-svelte/**'
+      - '.github/workflows/ui-tests.yml'
+
+  workflow_dispatch:
+
+jobs:
+
+  run-tests:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+    - name: Set up Node.js
+      uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # 6.4.0
+      with:
+        node-version: '24'
+        cache: 'npm'
+        cache-dependency-path: ui-svelte/package-lock.json
+
+    - name: Run UI tests
+      run: make test-ui
@@ -0,0 +1,136 @@
+name: Build Unified Docker Image
+
+on:
+  schedule:
+    - cron: "37 5 * * *"
+
+  workflow_dispatch:
+    inputs:
+      llama_cpp_ref:
+        description: "llama.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      whisper_ref:
+        description: "whisper.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      sd_ref:
+        description: "stable-diffusion.cpp commit hash, tag, or branch"
+        required: false
+        default: "master"
+      ik_llama_ref:
+        description: "ik_llama.cpp commit hash, tag, or branch (CUDA only)"
+        required: false
+        default: "main"
+      llama_swap_version:
+        description: "llama-swap version (e.g. v198, latest, main)"
+        required: false
+        default: "main"
+      build_cuda:
+        description: "Build CUDA image"
+        type: boolean
+        required: false
+        default: true
+      build_vulkan:
+        description: "Build Vulkan image"
+        type: boolean
+        required: false
+        default: true
+      push_to_ghcr:
+        description: "Push images to ghcr.io"
+        type: boolean
+        required: false
+        default: true
+
+permissions:
+  contents: read
+  packages: write
+
+jobs:
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        run: |
+          # Collect the backends to build and publish them as a JSON array
+          # in the `matrix` step output.
+          backends=()
+          # schedule uses defaults (build both); workflow_dispatch respects inputs
+          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_cuda }}" == "true" ]]; then
+            backends+=("cuda")
+          fi
+          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_vulkan }}" == "true" ]]; then
+            backends+=("vulkan")
+          fi
+          # Use --args so an empty selection serializes to [] (printf '%s\n'
+          # with no arguments still prints one blank line, which became [""]).
+          matrix=$(jq -nc '$ARGS.positional' --args "${backends[@]}")
+          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
+
+  build:
+    needs: setup
+    if: ${{ needs.setup.outputs.matrix != '[]' }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+
+      - name: Free up disk space
+        run: |
+          echo "Before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker system prune -af
+          echo "After cleanup:"
+          df -h
+
+      # On GitHub Actions runners, create a fresh builder.
+      # When running locally under act, skip this and reuse the existing
+      # llama-swap-builder (which has ccache warm) to avoid exhausting disk.
+      - name: Set up Docker Buildx
+        if: ${{ !env.ACT }}
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
+
+      - name: Log in to GitHub Container Registry
+        if: ${{ !env.ACT }}
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 #v4.1.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build unified Docker image (${{ matrix.backend }})
+        env:
+          LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
+          WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
+          SD_REF: ${{ inputs.sd_ref || 'master' }}
+          IK_LLAMA_REF: ${{ inputs.ik_llama_ref || 'main' }}
+          LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
+          DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}
+          # When running under act, use the local builder that has warm ccache.
+          # On GitHub Actions, BUILDX_BUILDER is unset so docker uses the builder
+          # created by setup-buildx-action above.
+          BUILDX_BUILDER: ${{ env.ACT == 'true' && 'llama-swap-builder' || '' }}
+        run: |
+          chmod +x docker/unified/build-image.sh
+          docker/unified/build-image.sh --${{ matrix.backend }}
+
+      - name: Push to GitHub Container Registry
+        if: ${{ !env.ACT && (github.event_name == 'schedule' || inputs.push_to_ghcr == true) }}
+        run: |
+          BASE_TAG="ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}"
+          DATE_TAG=$(date -u +%Y-%m-%d)
+
+          docker push "${BASE_TAG}"
+          docker tag "${BASE_TAG}" "${BASE_TAG}-${DATE_TAG}"
+          docker push "${BASE_TAG}-${DATE_TAG}"
+
+          ROOTLESS_TAG="${BASE_TAG}-rootless"
+          docker push "${ROOTLESS_TAG}"
+          docker tag "${ROOTLESS_TAG}" "${ROOTLESS_TAG}-${DATE_TAG}"
+          docker push "${ROOTLESS_TAG}-${DATE_TAG}"
@@ -5,3 +5,6 @@ dist/
 .vscode
 .DS_Store
 .dev/
+
+# UI build output; placeholder.txt is kept so the go:embed succeeds.
+internal/server/ui_dist/*
@@ -0,0 +1,53 @@
+## Project Description:
+
+llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
+
+## Tech stack
+
+- golang
+- typescript, vite and Svelte 5 for UI (located in ui-svelte/)
+
+## Workflow Tasks
+
+- when summarizing changes only include details that require further action
+- just say "Done." when there is no further action
+- use the github CLI `gh` to create pull requests and work with github
+- Rules for creating pull requests:
+  - keep them short and focused on changes.
+  - never include a test plan
+  - write the summary using the same style rules as commit messages
+
+## Testing
+
+- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
+- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
+- Run `gofmt -w <file>` before committing to fix any formatting
+- Build go binaries into the ./build/ subdirectory
+- Use `make test-dev` after running new tests for a quick overall test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
+- Use `make test-all` before completing work. This includes long running concurrency tests.
+- Use `make test-ui` after making changes to the UI in ui-svelte/
+
+### Commit message example format:
+
+```
+proxy: add new feature
+
+Add new feature that implements functionality X and Y.
+
+- key change 1
+- key change 2
+- key change 3
+
+fixes #123
+```
+
+## Code Reviews
+
+- use three levels High, Medium, Low severity
+- label each discovered issue with a label like H1, M2, L3 respectively
+- High severity are must fix issues (security, race conditions, critical bugs)
+- Medium severity are recommended improvements (coding style, missing functionality, inconsistencies)
+- Low severity are nice to have changes and nits
+- Include a suggestion with each discovered item
+- Limit your code review to three items with the highest priority first
+- Double check your discovered items and recommended remediations
@@ -1,43 +1 @@
-# Project: llama-swap
-
-## Project Description:
-
-llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-
-## Tech stack
-
- golang
- typescript, vite and react for UI (ui/)
-
-## Testing
-
- `make test-dev` - Use this when making iterative changes. Runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
- `make test-all` - runs at the end before completing work. Includes long running concurrency tests.
-
-## Workflow Tasks
-
-### Plan Improvements
-
-Work plans are located in ai-plans/. Plans written by the user may be incomplete, contain inconsistencies or errors.
-
-When the user asks to improve a plan follow these guidelines for expanding and improving it.
-
- Identify any inconsistencies.
- Expand plans out to be detailed specification of requirements and changes to be made.
- Plans should have at least these sections:
-  - Title - very short, describes changes
-  - Overview: A more detailed summary of goal and outcomes desired
-  - Design Requirements: Detailed descriptions of what needs to be done
-  - Testing Plan: Tests to be implemented
-  - Checklist: A detailed list of changes to be made
-
-Look for "plan expansion" as explicit instructions to improve a plan.
-
-### Implementation of plans
-
-When the user says "paint it", respond with "commencing automated assembly". Then implement the changes as described by the plan. Update the checklist as you complete items.
-
-## General Rules
-
- when summarizing changes only include details that require further action (action items)
- when there are no action items, just say "Done."
+@AGENTS.md
@@ -19,28 +19,25 @@ all: mac linux simple-responder
 clean:
 	rm -rf $(BUILD_DIR)

-proxy/ui_dist/placeholder.txt:
-	mkdir -p proxy/ui_dist
-	touch $@
-
 # use cached test results while developing
-test-dev: proxy/ui_dist/placeholder.txt
-	go test -short ./proxy/...
-	staticcheck ./proxy/... || true
+test-dev:
+	go test -short ./...
+	staticcheck ./... || true

-test: proxy/ui_dist/placeholder.txt
-	go test -short -count=1 ./proxy/...
+test:
+	go test -short -count=1 ./internal/...

 # for CI - full test (takes longer)
-test-all: proxy/ui_dist/placeholder.txt
-	go test -race -count=1 ./proxy/...
+test-all:
+	go test -race -count=1 ./internal/...

 ui/node_modules:
-	cd ui && npm install
+	cd ui-svelte && npm install

 # build react UI
 ui: ui/node_modules
-	cd ui && npm run build
+	cd ui-svelte && npm run build
+	touch internal/server/ui_dist/placeholder.txt

 # Build OSX binary
 mac: ui
@@ -48,9 +45,14 @@ mac: ui
 	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64

 # Build Linux binary
-linux: ui
-	@echo "Building Linux binary..."
+linux: linux-arm64 linux-amd64
+
+linux-amd64: ui
+	@echo "Building Linux AMD64 binary..."
 	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+
+linux-arm64: ui
+	@echo "Building Linux ARM64 binary..."
 	GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64

 # Build Windows binary
@@ -58,7 +60,7 @@ windows: ui
 	@echo "Building Windows binary..."
 	GOOS=windows GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-windows-amd64.exe

-# for testing proxy.Process
+# for testing with real external processes
 simple-responder:
 	@echo "Building simple responder"
 	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/simple-responder_darwin_arm64 cmd/simple-responder/simple-responder.go
@@ -92,5 +94,9 @@ wol-proxy: $(BUILD_DIR)
 	@echo "Building wol-proxy"
 	go build -o $(BUILD_DIR)/wol-proxy-$(GOOS)-$(GOARCH)-$(shell date +%Y-%m-%d) cmd/wol-proxy/wol-proxy.go

+test-ui:
+	cd ui-svelte && npm ci && npm run check && npm test
+
 # Phony targets
-.PHONY: all clean ui mac linux windows simple-responder simple-responder-windows test test-all test-dev wol-proxy
+.PHONY: all clean ui mac windows simple-responder simple-responder-windows test test-all test-dev test-ui wol-proxy
+.PHONY: linux linux-arm64 linux-amd64
@@ -5,7 +5,7 @@

 # llama-swap

-Run multiple LLM models on your machine and hot-swap between them as needed. llama-swap works with any OpenAI API-compatible server, giving you the flexibility to switch models without restarting your applications.
+Run multiple generative AI models on your machine and hot-swap between them on demand. llama-swap works with any OpenAI and Anthropic API compatible server and is used by thousands of people to power their local AI workflows.

 Built in Go for performance and simplicity, llama-swap has zero dependencies and is incredibly easy to set up. Get started in minutes - just one binary and one configuration file.

@@ -13,42 +13,75 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and

 - ✅ Easy to deploy and configure: one binary, one configuration file. no external dependencies
 - ✅ On-demand model switching
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc.)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, stable-diffusion.cpp, etc.)
  - future proof, upgrade your inference servers at any time.
 - ✅ OpenAI API supported endpoints:
  - `v1/completions`
  - `v1/chat/completions`
+  - `v1/responses`
  - `v1/embeddings`
+  - `v1/models` - list available models
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
  - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
+  - `v1/audio/voices`
+  - `v1/images/generations`
+  - `v1/images/edits`
 - ✅ Anthropic API supported endpoints:
  - `v1/messages`
+  - `v1/messages/count_tokens`
 - ✅ llama-server (llama.cpp) supported endpoints
  - `v1/rerank`, `v1/reranking`, `/rerank`
  - `/infill` - for code infilling
  - `/completion` - for completion endpoint
+- ✅ SDAPI via [stable-diffusion.cpp's server](https://github.com/leejet/stable-diffusion.cpp/tree/master/examples/server)
+  - `/sdapi/v1/txt2img`
+  - `/sdapi/v1/img2img`
+  - `/sdapi/v1/loras` - requires `model` in request body to fetch the correct loras
 - ✅ llama-swap API
  - `/ui` - web UI
  - `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
-  - `/models/unload` - manually unload running models ([#58](https://github.com/mostlygeek/llama-swap/issues/58))
  - `/running` - list currently running models ([#61](https://github.com/mostlygeek/llama-swap/issues/61))
-  - `/log` - remote log monitoring
+  - `POST /api/models/unload` - manually unload all running models ([#58](https://github.com/mostlygeek/llama-swap/issues/58))
+  - `POST /api/models/unload/:model_id` - unload a specific model
+  - `/logs` - remote log monitoring
+    - `GET /logs` returns buffered plain text logs.
+      - If `Accept: text/html` is sent, `/logs` redirects to `/ui/`.
+    - `GET /logs/stream` keeps the connection open for live log streaming.
+      - Stream endpoints send buffered history first by default; add `?no-history` to stream only new lines.
+    - `GET /logs/stream/proxy` streams proxy logs only.
+    - `GET /logs/stream/upstream` streams upstream process logs only.
+    - `GET /logs/stream/{model_id}` streams logs for one model (including IDs with slashes, like `author/model`).
  - `/health` - just returns "OK"
+  - `/metrics` - system and GPU metrics for prometheus
+- ✅ API Key support - define keys to restrict access to API endpoints
 - ✅ Customizable
-  - Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
+  - Run concurrent models with a custom DSL swap matrix ([#643](https://github.com/mostlygeek/llama-swap/issues/643))
  - Automatic unloading of models after timeout by setting a `ttl`
-  - Reliable Docker and Podman support using `cmd` and `cmdStop` together
+  - Docker and Podman support using `cmd` and `cmdStop` together
  - Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
+  - Apply filters to requests to control inference with `stripParams`, `setParams` and `setParamsByID`

 ### Web UI

-llama-swap includes a real time web interface for monitoring logs and controlling models:
+llama-swap includes a real time web interface with a playground for testing out all sorts of local models:

-<img width="1164" height="745" alt="image" src="https://github.com/user-attachments/assets/bacf3f9d-819f-430b-9ed2-1bfaa8d54579" />
+<img width="1125" height="876" alt="image" src="https://github.com/user-attachments/assets/8ee41947-97af-463d-b0f0-8e9c478fac07" />

-The Activity Page shows recent requests:
+View detailed token metrics:

-<img width="1360" height="963" alt="image" src="https://github.com/user-attachments/assets/5f3edee6-d03a-4ae5-ae06-b20ac1f135bd" />
+<img width="1111" height="515" alt="image" src="https://github.com/user-attachments/assets/64bfb280-d7a3-4126-971a-a128fd40410c" />
+
+Inspect requests and responses:
+
+<img width="1111" height="720" alt="image" src="https://github.com/user-attachments/assets/24fe4aca-1448-4d7c-b9e8-a967589bda6c" />
+
+Manually load and unload models:
+
+<img width="1109" height="719" alt="image" src="https://github.com/user-attachments/assets/02b1e1f2-abd0-4050-84ae-facd66ff01c4" />
+
+Real time log streaming:
+
+<img width="1107" height="559" alt="image" src="https://github.com/user-attachments/assets/39669a10-cff2-409e-836a-5bad8bd0140c" />

 ## Installation

@@ -62,7 +95,24 @@ llama-swap can be installed in multiple ways

 ### Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

-Nightly container images with llama-swap and llama-server are built for multiple platforms (cuda, vulkan, intel, etc.) including [non-root variants with improved security](docs/container-security.md).
+Two types of container images are built nightly for llama-swap:
+
+1. A unified container with llama-server, ik-llama-server, stable-diffusion.cpp, whisper.cpp and llama-swap built from source. This is only available for cuda and vulkan but has more capabilities. This one is recommended for use.
+2. A legacy image that is based on llama.cpp's images and llama-swap copied into the container. Use this one if you prefer to stay close to llama.cpp's container images.
+
+#### Unified container (Recommended)
+
+```shell
+$ docker pull ghcr.io/mostlygeek/llama-swap:unified-cuda
+
+# run with a custom configuration and models directory
+$ docker run -it --rm --runtime nvidia -p 9292:8080 \
+ -v /path/to/models:/models \
+ -v /path/to/custom/config.yaml:/etc/llama-swap/config/config.yaml \
+ ghcr.io/mostlygeek/llama-swap:unified-cuda
+```
+
+#### Legacy container

 ```shell
 $ docker pull ghcr.io/mostlygeek/llama-swap:cuda
@@ -72,14 +122,6 @@ $ docker run -it --rm --runtime nvidia -p 9292:8080 \
 -v /path/to/models:/models \
 -v /path/to/custom/config.yaml:/app/config.yaml \
 ghcr.io/mostlygeek/llama-swap:cuda
-
-# configuration hot reload supported with a
-# directory volume mount
-$ docker run -it --rm --runtime nvidia -p 9292:8080 \
- -v /path/to/models:/models \
- -v /path/to/custom/config.yaml:/app/config.yaml \
- -v /path/to/config:/config \
- ghcr.io/mostlygeek/llama-swap:cuda -config /config/config.yaml -watch-config
 ```

 <details>
@@ -157,7 +199,7 @@ That's all you need to get started:
 Almost all configuration settings are optional and can be added one step at a time:

 - Advanced features
-  - `groups` to run multiple models at once
+  - `matrix` to run concurrent models with a custom swap logic DSL
  - `hooks` to run things on startup
  - `macros` reusable snippets
 - Model customization
@@ -175,7 +217,7 @@ See the [configuration documentation](docs/configuration.md) for all options.

 When a request is made to an OpenAI compatible endpoint, llama-swap will extract the `model` value and load the appropriate server configuration to serve it. If the wrong upstream server is running, it will be replaced with the correct one. This is where the "swap" part comes in. The upstream server is automatically swapped to handle the request correctly.

-In the most basic configuration llama-swap handles one model at a time. For more advanced use cases, the `groups` feature allows multiple models to be loaded at the same time. You have complete control over how your system resources are used.
+In the most basic configuration llama-swap handles one model at a time. For more advanced use cases, using a `matrix` allows multiple models to be loaded at the same time. You have complete control over how your system resources are used.

 ## Reverse Proxy Configuration (nginx)

@@ -235,6 +277,6 @@ For Python based inference servers like vllm or tabbyAPI it is recommended to ru
 ## Star History

 > [!NOTE]
-> ⭐️ Star this project to help others discover it!
+> Thank you to everyone who has given this project a ⭐️!

 [![Star History Chart](https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date)](https://www.star-history.com/#mostlygeek/llama-swap&Date)
@@ -0,0 +1,183 @@
+# Improve Testability (#655)
+
+## Current Pain Points
+
+1. **Tests bypass config loading** - ~80% of tests build `config.Config` structs directly, skipping YAML parsing, env var substitution, macro expansion, and `${PORT}` assignment. Config bugs in those paths go untested.
+
+2. **simple-responder is everywhere** - Every proxy/routing test launches a real subprocess, waits for health checks (~healthCheckTimeout: 15), and manages process lifecycle just to test HTTP routing. Most of that overhead is wasted.
+
+3. **Port counter is fragile** - A global `nextTestPort` counter starting at 12000 with a mutex. Parallel tests or leftover processes can collide.
+
+## Stages
+
+### Stage 1: YAML-based test config helper
+
+**Goal:** Tests go through the real `LoadConfigFromReader` path instead of hand-building structs.
+
+**Effort:** Low | **Impact:** Config bugs caught earlier | **Risk:** None
+
+Create a test helper in `proxy/helpers_test.go`:
+
+```go
+// testConfigFromYAML substitutes simple-responder paths and loads through
+// the real config pipeline (env vars, macros, port assignment, etc.)
+func testConfigFromYAML(t *testing.T, yamlTmpl string) config.Config {
+    t.Helper()
+    yamlStr := strings.ReplaceAll(yamlTmpl, "{{RESPONDER}}", filepath.ToSlash(simpleResponderPath))
+    cfg, err := config.LoadConfigFromReader(strings.NewReader(yamlStr))
+    require.NoError(t, err)
+    return cfg
+}
+```
+
+Tests would then look like:
+
+```go
+func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
+    config := testConfigFromYAML(t, `
+healthCheckTimeout: 15
+logLevel: error
+models:
+  model1:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model1
+  model2:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model2
+`)
+    proxy := New(config)
+    // ... same assertions
+}
+```
+
+**Why this stage first:** Zero production code changes. Pure test-side refactoring. Can be done incrementally - migrate tests one at a time. Each migrated test now validates the full config pipeline.
+
+**Scope:** ~20-30 tests in `proxymanager_test.go`, `processgroup_test.go`, `peerproxy_test.go`.
+
+### Stage 2: Injected test handler (eliminate simple-responder for routing tests)
+
+**Goal:** Replace simple-responder subprocess launches with an injected `http.Handler` for tests that don't specifically test process lifecycle.
+
+**Effort:** Medium | **Impact:** 10-100x faster routing tests | **Risk:** Low (additive, no existing code broken)
+
+Add a `testHandler http.Handler` field to `Process`. When set, `ProxyRequest` delegates directly to this handler instead of going through the reverse proxy. No subprocess, no health checks, no TCP roundtrip.
+
+**2a. Add testHandler to Process:**
+
+```go
+// In Process struct (process.go):
+testHandler http.Handler  // set only in tests; bypasses subprocess and reverse proxy
+```
+
+In `Process.Start()`, skip subprocess + health check when handler is set:
+
+```go
+func (p *Process) start() error {
+    if p.testHandler != nil {
+        p.setState(StateReady)
+        return nil
+    }
+    // existing subprocess logic...
+}
+```
+
+In `Process.ProxyRequest()`, delegate directly to the handler:
+
+```go
+// Before the reverseProxy.ServeHTTP call:
+if p.testHandler != nil {
+    p.testHandler.ServeHTTP(w, r)
+    return
+}
+```
+
+**2b. Test helper to create the handler:**
+
+```go
+// newTestHandler returns an http.Handler that mimics llama.cpp's API
+// (same endpoints as simple-responder).
+func newTestHandler(respond string) http.Handler {
+    mux := http.NewServeMux()
+    mux.HandleFunc("/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) { ... })
+    mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { ... })
+    // ... other endpoints
+    return mux
+}
+```
+
+Tests for routing/auth/CORS/streaming then become:
+
+```go
+func TestProxyManager_AuthRequired(t *testing.T) {
+    handler := newTestHandler("model1")
+
+    config := testConfigFromYAML(t, `
+healthCheckTimeout: 15
+logLevel: error
+requiredAPIKeys: [test-key]
+models:
+  model1:
+    cmd: {{RESPONDER}} --port ${PORT} -silent -respond model1
+`)
+    pm := NewProxyManager(config)
+    // inject handler — skips subprocess, health check, port allocation
+    pm.processGroups["model1"].process.testHandler = handler
+}
+```
+
+**Why this matters:** The handler is called directly in-process. No subprocess spawn, no health check timeout, no port allocation, no TCP roundtrip, no reverse proxy overhead. Routing tests go from ~100ms each (process startup + health check) to ~1ms. Unlike an `httptest.Server` approach, there are zero network hops.
+
+**Why not blank-cmd + proxy URL:** A blank `cmd` with a `proxy` field pointing at `httptest.Server` still requires a real TCP roundtrip through the reverse proxy and introduces "external process" semantics to the config schema. Injecting the handler directly keeps it purely a test concern with no config changes.
+
+**Scope:** Most tests in `proxymanager_test.go` (auth, CORS, model listing, streaming, peer proxy), `peerproxy_test.go`, `metrics_monitor_test.go`.
+
+### Stage 3: Migrate tests incrementally
+
+**Goal:** Convert existing tests to use the Stage 1 + Stage 2 helpers.
+
+**Effort:** Medium | **Impact:** Cleaner, more reliable tests | **Risk:** None
+
+Priority order:
+1. `proxymanager_test.go` routing tests (highest count, most repetition)
+2. `peerproxy_test.go` (straightforward, all HTTP routing)
+3. `metrics_monitor_test.go` (capture logic doesn't need real processes)
+4. `processgroup_test.go` swap tests (keep simple-responder for actual swap lifecycle tests)
+
+Tests that **must keep simple-responder:**
+- Process lifecycle: start/stop, SIGKILL, SIGTERM, TTL expiry, health check failures, failed start counting
+- ProcessGroup swap concurrency (the port-collision test in `TestProcessGroup_ProxyRequestSwapIsTrueParallel`)
+
+**Scope:** ~60-70% of tests can drop simple-responder.
+
+### Stage 4 (optional): Process interface for ProcessGroup
+
+**Goal:** Enable pure unit tests of ProcessGroup's swap/exclusive/concurrency logic without any HTTP server at all.
+
+**Effort:** High | **Impact:** Pure unit tests possible | **Risk:** Medium (refactor core code)
+
+```go
+type ProcessController interface {
+    Start() error
+    Stop(StopStrategy)
+    ProxyRequest(http.ResponseWriter, *http.Request) error
+    CurrentState() ProcessState
+    ID() string
+    SetState(ProcessState)  // for test setup
+}
+```
+
+This requires:
+- Extracting the interface
+- A `MockProcess` implementation
+- Refactoring `ProcessGroup` to use the interface instead of `*Process`
+
+**Recommendation:** Only do this if ProcessGroup grows significantly more complex. Stages 1-3 give 80% of the benefit for 20% of the effort.
+
+## Effort/Impact Summary
+
+| Stage | Effort | Impact | Risk |
+|-------|--------|--------|------|
+| 1. YAML config helper | Low | Config bugs caught earlier | None |
+| 2. Injected test handler | Medium | 10-100x faster routing tests | Low |
+| 3. Migrate tests | Medium | Cleaner, more reliable tests | None |
+| 4. Process interface | High | Pure unit tests possible | Medium |
+
+**Recommended approach:** Do stages 1-3 in order. Each stage is independently valuable and can ship on its own. Stage 4 is deferred unless there's a specific need.
@@ -0,0 +1,306 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/tidwall/gjson"
+)
+
+var loremWords = strings.Fields(
+	"Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor " +
+		"incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis nostrud " +
+		"exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat Duis aute " +
+		"irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla " +
+		"pariatur Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia " +
+		"deserunt mollit anim id est laborum Sed ut perspiciatis unde omnis iste natus error " +
+		"sit voluptatem accusantium doloremque laudantium totam rem aperiam eaque ipsa quae " +
+		"ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo " +
+		"Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit",
+)
+
+var (
+	flagListen = flag.String("listen", "localhost:9898", "listen address")
+	flagTokens = flag.Int("tokens", 1000, "number of tokens to return")
+	flagTPS    = flag.Float64("tps", 75, "tokens per second")
+	flagLoad   = flag.String("load", "0s", "simulated load duration (e.g. 2s, 500ms)")
+)
+
+// chunkDelta is the incremental message fragment carried by a streaming
+// choice. Both fields are omitted from the JSON when empty.
+type chunkDelta struct {
+	Role    string `json:"role,omitempty"`
+	Content string `json:"content,omitempty"`
+}
+
+// chunkChoice is one entry of chatChunk.Choices. FinishReason is a pointer
+// without omitempty, so a nil value is encoded as JSON null.
+type chunkChoice struct {
+	Index        int        `json:"index"`
+	Delta        chunkDelta `json:"delta"`
+	FinishReason *string    `json:"finish_reason"`
+}
+
+// chatChunk is the JSON payload of a single streamed "data:" event.
+type chatChunk struct {
+	ID      string        `json:"id"`
+	Object  string        `json:"object"`
+	Created int64         `json:"created"`
+	Model   string        `json:"model"`
+	Choices []chunkChoice `json:"choices"`
+}
+
+// completionMessage is the full message returned in a non-streaming response.
+type completionMessage struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+// completionChoice is one entry of chatCompletion.Choices.
+type completionChoice struct {
+	Index        int               `json:"index"`
+	Message      completionMessage `json:"message"`
+	FinishReason string            `json:"finish_reason"`
+}
+
+// completionUsage holds the token counts reported with a non-streaming response.
+type completionUsage struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
+// chatCompletion is the JSON body of a non-streaming chat completion response.
+type chatCompletion struct {
+	ID      string             `json:"id"`
+	Object  string             `json:"object"`
+	Created int64              `json:"created"`
+	Model   string             `json:"model"`
+	Choices []completionChoice `json:"choices"`
+	Usage   completionUsage    `json:"usage"`
+}
+
+// loremText returns n lorem-ipsum words joined by single spaces, cycling
+// through loremWords when n exceeds its length.
+//
+// A non-positive n yields the empty string. n is fed from the -tokens flag,
+// which accepts negative values, and make would panic on a negative length.
+func loremText(n int) string {
+	if n <= 0 {
+		return ""
+	}
+	words := make([]string, n)
+	for i := range words {
+		words[i] = loremWords[i%len(loremWords)]
+	}
+	return strings.Join(words, " ")
+}
+
+// sendChunk writes one SSE "data:" event to w containing a
+// chat.completion.chunk whose single choice carries content as its delta and
+// finishReason as its finish_reason (nil encodes as JSON null).
+//
+// It returns the JSON encoding error or the write error, if any. Flushing is
+// left to the caller.
+func sendChunk(w http.ResponseWriter, content string, finishReason *string) error {
+	choice := chunkChoice{
+		Index:        0,
+		Delta:        chunkDelta{Content: content},
+		FinishReason: finishReason,
+	}
+	payload, err := json.Marshal(chatChunk{
+		ID:      "chatcmpl-fake",
+		Object:  "chat.completion.chunk",
+		Created: time.Now().Unix(),
+		Model:   "fake-model",
+		Choices: []chunkChoice{choice},
+	})
+	if err != nil {
+		return err
+	}
+	if _, werr := fmt.Fprintf(w, "data: %s\n\n", payload); werr != nil {
+		return werr
+	}
+	return nil
+}
+
+// startLoading simulates a model load. It returns a channel that is closed
+// once loadDur has elapsed; while waiting, the remaining time is logged once
+// per second.
+//
+// A zero or negative loadDur closes the channel immediately and starts no
+// goroutine, so a negative -load value never logs a negative countdown.
+func startLoading(loadDur time.Duration) <-chan struct{} {
+	ready := make(chan struct{})
+	if loadDur <= 0 {
+		close(ready)
+		return ready
+	}
+	go func() {
+		deadline := time.Now().Add(loadDur)
+		log.Printf("loading... %s remaining", loadDur.Round(time.Second))
+		ticker := time.NewTicker(time.Second)
+		defer ticker.Stop()
+		timer := time.NewTimer(loadDur)
+		defer timer.Stop()
+		for {
+			select {
+			case <-timer.C:
+				close(ready)
+				log.Printf("ready")
+				return
+			case <-ticker.C:
+				// Skip the message once rounding brings the remainder to zero.
+				if rem := time.Until(deadline).Round(time.Second); rem > 0 {
+					log.Printf("loading... %s remaining", rem)
+				}
+			}
+		}
+	}()
+	return ready
+}
+
+// healthHandler returns a handler that answers 200 OK once ready has been
+// closed and 503 Service Unavailable before that. The check is non-blocking.
+func healthHandler(ready <-chan struct{}) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		status := http.StatusServiceUnavailable
+		select {
+		case <-ready:
+			status = http.StatusOK
+		default:
+		}
+		w.WriteHeader(status)
+	}
+}
+
+// chatHandler serves a fake OpenAI-style chat completions endpoint.
+//
+// Only POST is accepted. The handler blocks until ready is closed (the
+// simulated model load) or the client disconnects. It then produces -tokens
+// lorem-ipsum words at -tps words per second:
+//   - when the request body's "stream" field is not true, a single JSON
+//     chat.completion is written after the full generation delay;
+//   - otherwise SSE chat.completion.chunk events are written one word at a
+//     time, the last one carrying finish_reason "stop", followed by
+//     "data: [DONE]".
+//
+// Flag values are sanitized per request: a negative token count is treated
+// as 0, and a non-positive or NaN tps as 1.
+func chatHandler(ready <-chan struct{}) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+
+		body, err := io.ReadAll(r.Body)
+		if err != nil {
+			http.Error(w, "failed to read body", http.StatusBadRequest)
+			return
+		}
+
+		streaming := gjson.GetBytes(body, "stream").Bool()
+		ctx := r.Context()
+
+		// Hold the request until the simulated load finishes or the client leaves.
+		select {
+		case <-ready:
+		case <-ctx.Done():
+			return
+		}
+
+		tokens := *flagTokens
+		if tokens < 0 {
+			// A negative count would make loremText's make() panic.
+			tokens = 0
+		}
+		tps := *flagTPS
+		if !(tps > 0) {
+			// Written as a negated comparison so NaN is rejected as well.
+			tps = 1
+		}
+
+		if !streaming {
+			delay := time.Duration(float64(tokens) / tps * float64(time.Second))
+			select {
+			case <-time.After(delay):
+			case <-ctx.Done():
+				return
+			}
+			text := loremText(tokens)
+			resp := chatCompletion{
+				ID:      "chatcmpl-fake",
+				Object:  "chat.completion",
+				Created: time.Now().Unix(),
+				Model:   "fake-model",
+				Choices: []completionChoice{
+					{
+						Index:        0,
+						Message:      completionMessage{Role: "assistant", Content: text},
+						FinishReason: "stop",
+					},
+				},
+				Usage: completionUsage{
+					PromptTokens:     0,
+					CompletionTokens: tokens,
+					TotalTokens:      tokens,
+				},
+			}
+			w.Header().Set("Content-Type", "application/json")
+			if err := json.NewEncoder(w).Encode(resp); err != nil {
+				log.Printf("failed to write completion: %v", err)
+			}
+			return
+		}
+
+		// Check for streaming support before committing to SSE headers so the
+		// error response is not sent with event-stream caching headers.
+		flusher, ok := w.(http.Flusher)
+		if !ok {
+			http.Error(w, "streaming not supported", http.StatusInternalServerError)
+			return
+		}
+
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.Header().Set("Cache-Control", "no-cache")
+		w.Header().Set("Connection", "keep-alive")
+
+		// Send role delta first
+		first := chatChunk{
+			ID:      "chatcmpl-fake",
+			Object:  "chat.completion.chunk",
+			Created: time.Now().Unix(),
+			Model:   "fake-model",
+			Choices: []chunkChoice{
+				{Index: 0, Delta: chunkDelta{Role: "assistant"}},
+			},
+		}
+		if data, err := json.Marshal(first); err == nil {
+			fmt.Fprintf(w, "data: %s\n\n", data)
+			flusher.Flush()
+		}
+
+		interval := time.Duration(float64(time.Second) / tps)
+		if interval <= 0 {
+			// A very large (or infinite) tps truncates to 0, and
+			// time.NewTicker panics on a non-positive duration.
+			interval = time.Nanosecond
+		}
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		stop := "stop"
+		for i := 0; i < tokens; i++ {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+
+			word := loremWords[i%len(loremWords)]
+			if i < tokens-1 {
+				// Intermediate words carry a trailing space as the separator.
+				if err := sendChunk(w, word+" ", nil); err != nil {
+					return
+				}
+			} else {
+				// The final word also carries the finish reason.
+				if err := sendChunk(w, word, &stop); err != nil {
+					return
+				}
+			}
+			flusher.Flush()
+		}
+
+		fmt.Fprintf(w, "data: [DONE]\n\n")
+		flusher.Flush()
+	}
+}
+
+// main parses the flags, starts the simulated load countdown, serves /health
+// and /v1/chat/completions on the -listen address, and shuts the server down
+// on SIGINT or SIGTERM with a 5 second grace period.
+func main() {
+	flag.Parse()
+
+	loadDur, err := time.ParseDuration(*flagLoad)
+	if err != nil {
+		log.Fatalf("invalid -load value %q: %v", *flagLoad, err)
+	}
+
+	// Both handlers share the same ready channel, closed when loading is done.
+	ready := startLoading(loadDur)
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/health", healthHandler(ready))
+	mux.HandleFunc("/v1/chat/completions", chatHandler(ready))
+
+	srv := &http.Server{
+		Addr:    *flagListen,
+		Handler: mux,
+	}
+
+	// Serve in the background so the main goroutine can wait for a signal.
+	go func() {
+		log.Printf("listening on %s (tokens=%d tps=%.1f load=%s)",
+			*flagListen, *flagTokens, *flagTPS, loadDur)
+		// http.ErrServerClosed is the normal result of srv.Shutdown below.
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.Fatalf("server error: %v", err)
+		}
+	}()
+
+	quit := make(chan os.Signal, 1)
+	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
+	<-quit
+
+	log.Println("shutting down...")
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if err := srv.Shutdown(ctx); err != nil {
+		log.Printf("shutdown error: %v", err)
+	}
+}
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/perf"
+)
+
+func printSysStat(s perf.SysStat) {
+	cores := make([]string, len(s.CpuUtilPerCore))
+	for i, v := range s.CpuUtilPerCore {
+		cores[i] = fmt.Sprintf("%.1f%%", v)
+	}
+	fmt.Printf("[SYS %s]\n", s.Timestamp.Format("15:04:05"))
+	fmt.Printf("  CPU:  %s\n", strings.Join(cores, "  "))
+	fmt.Printf("  Mem:  %d MB used / %d MB total (%d MB free)\n", s.MemUsedMB, s.MemTotalMB, s.MemFreeMB)
+	fmt.Printf("  Swap: %d MB used / %d MB total\n", s.SwapUsedMB, s.SwapTotalMB)
+	fmt.Printf("  Load: %.2f  %.2f  %.2f  (1m 5m 15m)\n", s.LoadAvg1, s.LoadAvg5, s.LoadAvg15)
+}
+
+// printGpuStats writes a four-line summary (header, utilisation, memory,
+// temperature/fan/power) to stdout for every GPU in gpus, in slice order.
+func printGpuStats(gpus []perf.GpuStat) {
+	for i := range gpus {
+		gpu := gpus[i]
+		fmt.Printf("[GPU %d %s]\n", gpu.ID, gpu.Name)
+		fmt.Printf("  Util:  GPU %.1f%%  Mem %.1f%%\n", gpu.GpuUtilPct, gpu.MemUtilPct)
+		fmt.Printf("  Mem:   %d MB used / %d MB total\n", gpu.MemUsedMB, gpu.MemTotalMB)
+		fmt.Printf("  Temp:  %d°C   Fan: %.1f%%   Power: %.1f W\n", gpu.TempC, gpu.FanSpeedPct, gpu.PowerDrawW)
+	}
+}
+
+// main prints one system sample and, when a GPU tool is available, one GPU
+// sample. With -stream it then starts a perf monitor and keeps printing
+// samples until the process is interrupted.
+//
+// Flags:
+//
+//	-stream  keep streaming stats after the initial sample
+//	-t       polling interval, clamped to the range 1s–1h
+func main() {
+	stream := flag.Bool("stream", false, "stream stats")
+	interval := flag.Duration("t", time.Second, "polling interval (clamped to 1s–1h)")
+	flag.Parse()
+
+	every := *interval
+	if every < time.Second {
+		every = time.Second
+	} else if every > time.Hour {
+		every = time.Hour
+	}
+
+	l := logmon.New()
+	l.SetLogLevel(logmon.LevelDebug)
+
+	// errors.Is is used for both sentinel checks so wrapped errors are
+	// treated the same way as in the GPU check below.
+	s, err := perf.ReadSysStats()
+	if err != nil && !errors.Is(err, perf.ErrNotImplemented) {
+		fmt.Println("Sys Error:", err)
+		return
+	}
+	printSysStat(s)
+
+	// The one-shot GPU read gives up after 5 seconds.
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	gpuCh, err := perf.GetGpuStats(ctx, every, l)
+	if err != nil && !errors.Is(err, perf.ErrNotImplemented) && !errors.Is(err, perf.ErrNoGpuTool) {
+		fmt.Println("GPU Init Error:", err)
+		return
+	}
+
+	if gpuCh != nil {
+		select {
+		case g := <-gpuCh:
+			printGpuStats(g)
+		case <-ctx.Done():
+			fmt.Println("GPU: timed out waiting for stats")
+		}
+	}
+
+	if *stream {
+		// Previously the second return value was discarded and m was used
+		// unconditionally; report the failure instead of risking a nil monitor.
+		m, err := perf.New(config.PerformanceConfig{Every: every}, l)
+		if err != nil {
+			fmt.Println("Monitor Init Error:", err)
+			return
+		}
+		m.Start()
+		defer m.Stop()
+		sysCh, gpuCh, unsub := m.Subscribe()
+		defer unsub()
+		for {
+			select {
+			case s := <-sysCh:
+				printSysStat(s)
+			case g := <-gpuCh:
+				printGpuStats(g)
+			}
+		}
+	}
+}
@@ -210,6 +210,11 @@ func main() {
 		})
 	})

+	r.GET("/v1/audio/voices", func(c *gin.Context) {
+		model := c.Query("model")
+		c.JSON(http.StatusOK, gin.H{"voices": []string{"voice1"}, "model": model})
+	})
+
 	r.GET("/slow-respond", func(c *gin.Context) {
 		echo := c.Query("echo")
 		delay := c.Query("delay")
@@ -269,6 +274,43 @@ func main() {
 		c.String(200, fmt.Sprintf("%s %s", c.Request.Method, c.Request.URL.Path))
 	})

+	// SD API endpoints
+	r.POST("/sdapi/v1/txt2img", func(c *gin.Context) {
+		body, err := io.ReadAll(c.Request.Body)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+			return
+		}
+		defer c.Request.Body.Close()
+
+		modelName := gjson.GetBytes(body, "model").String()
+		c.JSON(http.StatusOK, gin.H{
+			"model":  modelName,
+			"images": []string{},
+		})
+	})
+
+	r.POST("/sdapi/v1/img2img", func(c *gin.Context) {
+		body, err := io.ReadAll(c.Request.Body)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+			return
+		}
+		defer c.Request.Body.Close()
+
+		modelName := gjson.GetBytes(body, "model").String()
+		c.JSON(http.StatusOK, gin.H{
+			"model":  modelName,
+			"images": []string{},
+		})
+	})
+
+	r.GET("/sdapi/v1/loras", func(c *gin.Context) {
+		c.JSON(http.StatusOK, gin.H{
+			"loras": []string{},
+		})
+	})
+
 	address := "127.0.0.1:" + *port // Address with the specified port

 	srv := &http.Server{
@@ -0,0 +1,96 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"sync"
+	"time"
+
+	tea "github.com/charmbracelet/bubbletea"
+)
+
+// main parses the flags and positional arguments (<base-url> followed by one
+// or more model names), starts one streaming request goroutine per model,
+// runs the TUI until it exits, and then prints a summary.
+func main() {
+	prompt := flag.String("prompt", "Write a few sentences about the history of computing.", "user message sent to each model")
+	maxTokens := flag.Int("max-tokens", 256, "max_tokens per request")
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: %s [flags] <base-url> <model> [model...]\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, "Example: %s -max-tokens 400 http://localhost:8080 A B C D\n\n", os.Args[0])
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	// At least a base URL and one model are required.
+	args := flag.Args()
+	if len(args) < 2 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	baseURL := args[0]
+	models := args[1:]
+
+	m := newModel(models)
+	prog := tea.NewProgram(m, tea.WithAltScreen(), tea.WithMouseCellMotion())
+
+	// Chain of triggers ensures requests are sent in the order provided.
+	// Each channel has capacity 1, so handing off to the next goroutine
+	// never blocks the sender.
+	triggers := make([]chan struct{}, len(models))
+	for i := range triggers {
+		triggers[i] = make(chan struct{}, 1)
+	}
+	triggers[0] <- struct{}{}
+
+	var wg sync.WaitGroup
+	start := time.Now()
+
+	for i, name := range models {
+		wg.Add(1)
+		go func(idx int, mdl string) {
+			defer wg.Done()
+
+			// Wait for this goroutine's turn in the chain.
+			<-triggers[idx]
+
+			reqStart := time.Now()
+			prog.Send(statusMsg{idx: idx, status: statusStreaming})
+
+			// Release the next goroutine before issuing the request; the
+			// goroutines are started in order but the requests then run
+			// concurrently.
+			if idx+1 < len(triggers) {
+				triggers[idx+1] <- struct{}{}
+			}
+
+			err := sendRequest(baseURL, mdl, *prompt, *maxTokens, idx, func(i int, text string) {
+				prog.Send(deltaMsg{idx: i, text: text})
+			})
+
+			elapsed := time.Since(reqStart)
+			if err != nil {
+				prog.Send(statusMsg{idx: idx, status: statusError, elapsed: elapsed, err: err})
+			} else {
+				prog.Send(statusMsg{idx: idx, status: statusDone, elapsed: elapsed})
+			}
+		}(i, name)
+	}
+
+	if _, err := prog.Run(); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+
+	// NOTE(review): in-flight requests are not cancelled when the TUI exits,
+	// so quitting early appears to block here until every stream finishes —
+	// confirm this is intended.
+	wg.Wait()
+	printSummary(m, start)
+}
+
+// printSummary writes a "Summary:" block to stdout with one line per panel
+// (index, model name, and outcome: error with elapsed time, done with
+// elapsed time, or the bare status otherwise), followed by the total time
+// since start.
+func printSummary(m *model, start time.Time) {
+	fmt.Println("Summary:")
+	for _, p := range m.panels {
+		if p.status == statusError {
+			fmt.Printf("  [%d] %-20s ERROR   elapsed=%s err=%v\n",
+				p.idx, p.model, p.elapsed.Round(time.Millisecond), p.err)
+			continue
+		}
+		if p.status == statusDone {
+			fmt.Printf("  [%d] %-20s done    elapsed=%s\n",
+				p.idx, p.model, p.elapsed.Round(time.Millisecond))
+			continue
+		}
+		fmt.Printf("  [%d] %-20s %s\n", p.idx, p.model, p.status)
+	}
+	total := time.Since(start).Round(time.Millisecond)
+	fmt.Printf("all done in %s\n", total)
+}
@@ -0,0 +1,88 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// deltaSink receives streamed text fragments for a given model panel.
+type deltaSink func(idx int, text string)
+
+type streamDelta struct {
+	Content          string `json:"content"`
+	ReasoningContent string `json:"reasoning_content"`
+}
+
+type streamChoice struct {
+	Delta streamDelta `json:"delta"`
+}
+
+type streamChunk struct {
+	Choices []streamChoice `json:"choices"`
+}
+
+// sendRequest streams a chat completion and forwards each content/reasoning
+// delta to sink. Reasoning and assistant content are emitted into the same
+// stream so they render together.
+//
+// The request is POSTed to baseURL + "/v1/chat/completions"; trailing
+// slashes on baseURL are ignored so "http://host:8080/" does not produce a
+// double slash in the path. idx is passed through to sink unchanged.
+//
+// It returns an error when the request cannot be built or sent, when the
+// server answers with a non-200 status (the response body is included in the
+// message), or when reading the stream fails. SSE lines that are not valid
+// JSON are skipped. Reading stops at the "[DONE]" sentinel or end of stream.
+func sendRequest(baseURL, model, prompt string, maxTokens, idx int, sink deltaSink) error {
+	payload := map[string]any{
+		"model": model,
+		"messages": []map[string]string{
+			{"role": "user", "content": prompt},
+		},
+		"max_tokens": maxTokens,
+		"stream":     true,
+	}
+
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+
+	endpoint := strings.TrimRight(baseURL, "/") + "/v1/chat/completions"
+	resp, err := http.Post(endpoint, "application/json", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("status %d: %s", resp.StatusCode, strings.TrimSpace(string(b)))
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	// Allow SSE lines of up to 1 MiB; the default scanner limit is 64 KiB.
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data:") {
+			continue
+		}
+		data := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
+		if data == "[DONE]" {
+			return nil
+		}
+		if data == "" {
+			continue
+		}
+
+		var chunk streamChunk
+		if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+			// Best effort: ignore events that are not JSON chunks.
+			continue
+		}
+		for _, c := range chunk.Choices {
+			if c.Delta.ReasoningContent != "" {
+				sink(idx, c.Delta.ReasoningContent)
+			}
+			if c.Delta.Content != "" {
+				sink(idx, c.Delta.Content)
+			}
+		}
+	}
+
+	return scanner.Err()
+}
@@ -0,0 +1,343 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/charmbracelet/bubbles/viewport"
+	tea "github.com/charmbracelet/bubbletea"
+	"github.com/charmbracelet/lipgloss"
+)
+
+type panelStatus int
+
+const (
+	statusWaiting panelStatus = iota
+	statusStreaming
+	statusDone
+	statusError
+)
+
+// String returns the lowercase label for s. Any value other than
+// statusStreaming, statusDone or statusError is reported as "waiting".
+func (s panelStatus) String() string {
+	if s == statusStreaming {
+		return "streaming"
+	}
+	if s == statusDone {
+		return "done"
+	}
+	if s == statusError {
+		return "error"
+	}
+	return "waiting"
+}
+
+// deltaMsg appends streamed text to a panel.
+type deltaMsg struct {
+	idx  int
+	text string
+}
+
+// statusMsg updates a panel's lifecycle state.
+type statusMsg struct {
+	idx     int
+	status  panelStatus
+	elapsed time.Duration
+	err     error
+}
+
+type panel struct {
+	idx     int
+	model   string
+	color   lipgloss.Color
+	status  panelStatus
+	buf     strings.Builder
+	elapsed time.Duration
+	err     error
+}
+
+const (
+	minPanelWidth = 28
+	maxCols       = 3
+	panelHeight   = 9 // total box height including border + header
+)
+
+type model struct {
+	panels  []*panel
+	focused int
+	vp      viewport.Model
+	width   int
+	height  int
+	cols    int
+	pw      int // inner panel content width
+	ready   bool
+}
+
+// newModel builds the TUI model with one waiting panel per entry in models,
+// in order, and focus on the first panel. Panels that share a model name
+// share a color; colors are taken from modelPalette in order of each name's
+// first appearance, wrapping around when the palette is exhausted.
+func newModel(models []string) *model {
+	assigned := make(map[string]lipgloss.Color)
+	panels := make([]*panel, 0, len(models))
+	for idx, name := range models {
+		color, seen := assigned[name]
+		if !seen {
+			color = modelPalette[len(assigned)%len(modelPalette)]
+			assigned[name] = color
+		}
+		panels = append(panels, &panel{
+			idx:    idx,
+			model:  name,
+			color:  color,
+			status: statusWaiting,
+		})
+	}
+	return &model{panels: panels, focused: 0}
+}
+
+func (m *model) Init() tea.Cmd { return nil }
+
+// Update handles window resizes, key presses, mouse events, and the
+// deltaMsg/statusMsg messages sent by the request goroutines. It mutates m in
+// place and returns it; tea.Quit is returned for "q", "ctrl+c" and "esc".
+func (m *model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
+	switch msg := msg.(type) {
+	case tea.WindowSizeMsg:
+		// Recompute the grid for the new size and re-render the focused panel.
+		m.width = msg.Width
+		m.height = msg.Height
+		m.relayout()
+		m.refreshViewport(true)
+		return m, nil
+
+	case tea.KeyMsg:
+		switch msg.String() {
+		case "q", "ctrl+c", "esc":
+			return m, tea.Quit
+		case "tab", "right", "l":
+			m.setFocus(m.focused + 1)
+			return m, nil
+		case "shift+tab", "left", "h":
+			m.setFocus(m.focused - 1)
+			return m, nil
+		}
+		// Any other key is forwarded to the focused panel's viewport.
+		var cmd tea.Cmd
+		m.vp, cmd = m.vp.Update(msg)
+		return m, cmd
+
+	case tea.MouseMsg:
+		// A left click focuses the panel under the cursor; other mouse
+		// events are forwarded to the viewport.
+		if msg.Action == tea.MouseActionPress && msg.Button == tea.MouseButtonLeft {
+			if idx, ok := m.panelAt(msg.X, msg.Y); ok {
+				m.setFocus(idx)
+			}
+			return m, nil
+		}
+		var cmd tea.Cmd
+		m.vp, cmd = m.vp.Update(msg)
+		return m, cmd
+
+	case deltaMsg:
+		p := m.panels[msg.idx]
+		p.buf.WriteString(msg.text)
+		if msg.idx == m.focused {
+			// Keep following the tail only if the view was already at the
+			// bottom before the new text arrived.
+			atBottom := m.vp.AtBottom()
+			m.refreshViewport(false)
+			if atBottom {
+				m.vp.GotoBottom()
+			}
+		}
+		return m, nil
+
+	case statusMsg:
+		p := m.panels[msg.idx]
+		p.status = msg.status
+		p.elapsed = msg.elapsed
+		p.err = msg.err
+		if msg.err != nil {
+			// Append the error text (styled red) to the panel's output.
+			errTxt := lipgloss.NewStyle().Foreground(lipgloss.Color("196")).Render("\n" + msg.err.Error())
+			p.buf.WriteString(errTxt)
+			if msg.idx == m.focused {
+				m.refreshViewport(false)
+				m.vp.GotoBottom()
+			}
+		}
+		return m, nil
+	}
+
+	return m, nil
+}
+
+// setFocus moves focus to panel idx, wrapping to the last panel when idx is
+// negative and to the first when idx is past the end. It does nothing when
+// there are no panels or the focus would not change; otherwise the viewport
+// is refreshed for the newly focused panel and scrolled to the bottom.
+func (m *model) setFocus(idx int) {
+	count := len(m.panels)
+	if count == 0 {
+		return
+	}
+	switch {
+	case idx < 0:
+		idx = count - 1
+	case idx >= count:
+		idx = 0
+	}
+	if m.focused != idx {
+		m.focused = idx
+		m.refreshViewport(true)
+	}
+}
+
+// relayout derives the column count and inner panel width from the current
+// terminal width, replaces the viewport with a fresh one of that width, and
+// marks the model ready.
+//
+// Terminals narrower than minPanelWidth+4 get a single column. Otherwise the
+// column count is width/(minPanelWidth+2), capped at maxCols and at the
+// number of panels, and never below 1. The inner width is never below 8.
+func (m *model) relayout() {
+	cols := 1
+	if m.width >= minPanelWidth+4 {
+		cols = m.width / (minPanelWidth + 2)
+		if cols > maxCols {
+			cols = maxCols
+		}
+		if count := len(m.panels); cols > count {
+			cols = count
+		}
+		if cols < 1 {
+			cols = 1
+		}
+	}
+	m.cols = cols
+
+	// inner content width: total width / cols, minus borders+padding (4) and gap.
+	inner := m.width/m.cols - 1 - 4
+	if inner < 8 {
+		inner = 8
+	}
+	m.pw = inner
+
+	m.vp = viewport.New(m.pw, panelHeight-2)
+	m.ready = true
+}
+
+// refreshViewport reloads the viewport with the focused panel's buffer,
+// wrapped to the inner panel width, and scrolls to the bottom when reset is
+// true. It is a no-op until relayout has run or when there are no panels.
+func (m *model) refreshViewport(reset bool) {
+	if len(m.panels) == 0 || !m.ready {
+		return
+	}
+	text := m.panels[m.focused].buf.String()
+	wrapped := lipgloss.NewStyle().Width(m.pw).Render(text)
+	m.vp.SetContent(wrapped)
+	if !reset {
+		return
+	}
+	m.vp.GotoBottom()
+}
+
+// panelAt maps screen coordinates to a panel index based on the grid layout.
+//
+// The cell size is measured from a rendered panel rather than derived from
+// the terminal width and panelHeight: the drawn box is the inner width plus
+// its borders, and its height is whatever renderPanel produces, so the old
+// width/cols+1 and panelHeight estimates drifted away from what is on screen
+// for every column and row after the first.
+//
+// It returns (0, false) before the first layout, when there are no panels,
+// or when the point does not fall on a panel.
+func (m *model) panelAt(x, y int) (int, bool) {
+	if m.cols == 0 || len(m.panels) == 0 || x < 0 || y < 0 {
+		return 0, false
+	}
+	// Panels are all rendered with the same dimensions (View joins them with
+	// no gap), so the first one is representative.
+	box := m.renderPanel(m.panels[0], false)
+	boxW := lipgloss.Width(box)
+	boxH := lipgloss.Height(box)
+	if boxW <= 0 || boxH <= 0 {
+		return 0, false
+	}
+	col := x / boxW
+	row := y / boxH
+	idx := row*m.cols + col
+	if col < m.cols && idx < len(m.panels) {
+		return idx, true
+	}
+	return 0, false
+}
+
+func (m *model) View() string {
+	if !m.ready {
+		return "loading..."
+	}
+
+	rows := []string{}
+	var current []string
+	for i, p := range m.panels {
+		current = append(current, m.renderPanel(p, i == m.focused))
+		if len(current) == m.cols {
+			rows = append(rows, lipgloss.JoinHorizontal(lipgloss.Top, current...))
+			current = nil
+		}
+	}
+	if len(current) > 0 {
+		rows = append(rows, lipgloss.JoinHorizontal(lipgloss.Top, current...))
+	}
+
+	grid := lipgloss.JoinVertical(lipgloss.Left, rows...)
+	footer := lipgloss.NewStyle().Faint(true).Render(
+		"tab/click: focus panel  •  wheel/↑↓/pgup/pgdn: scroll focused  •  q: quit")
+	return grid + "\n" + footer
+}
+
+// modelPalette gives each panel a distinct, readable color for its name.
+var modelPalette = []lipgloss.Color{
+	"39",  // blue
+	"213", // magenta
+	"214", // orange
+	"45",  // cyan
+	"141", // purple
+	"203", // salmon
+	"82",  // lime
+	"227", // light yellow
+}
+
+// statusColor returns the color used for a panel's status text: yellow while
+// streaming, green when done, red on error, and gray for anything else.
+func statusColor(s panelStatus) lipgloss.Color {
+	if s == statusStreaming {
+		return lipgloss.Color("220") // yellow - active
+	}
+	if s == statusDone {
+		return lipgloss.Color("42") // green - success
+	}
+	if s == statusError {
+		return lipgloss.Color("196") // red - error
+	}
+	return lipgloss.Color("244") // gray - waiting
+}
+
+// renderPanel draws one bordered panel: a header line with the panel index
+// and model name on the left and the status (plus elapsed time, if any) on
+// the right, followed by the body. The focused panel uses a double border
+// and shows the scrollable viewport; other panels use a rounded border and
+// show the tail of their buffer.
+func (m *model) renderPanel(p *panel, focused bool) string {
+	border := lipgloss.RoundedBorder()
+	if focused {
+		border = lipgloss.DoubleBorder()
+	}
+	style := lipgloss.NewStyle().
+		Border(border).
+		BorderForeground(lipgloss.Color("240"))
+
+	statusTxt := p.status.String()
+	if p.elapsed > 0 {
+		statusTxt += " " + p.elapsed.Round(time.Millisecond).String()
+	}
+
+	// Header: model name (left, model color) + status/timer (right, status color).
+	name := fmt.Sprintf("[%d] %s", p.idx, p.model)
+	gap := m.pw - lipgloss.Width(name) - lipgloss.Width(statusTxt)
+	if gap < 1 {
+		// Not enough room: shorten the name so at least one space separates
+		// it from the status text.
+		name = truncate(name, m.pw-lipgloss.Width(statusTxt)-1)
+		gap = m.pw - lipgloss.Width(name) - lipgloss.Width(statusTxt)
+	}
+	if gap < 1 {
+		gap = 1
+	}
+	header := lipgloss.NewStyle().Bold(true).Foreground(p.color).Render(name) +
+		strings.Repeat(" ", gap) +
+		lipgloss.NewStyle().Foreground(statusColor(p.status)).Render(statusTxt)
+
+	var bodyLines string
+	if focused {
+		bodyLines = m.vp.View()
+	} else {
+		bodyLines = tailLines(p.buf.String(), m.pw, panelHeight-2)
+	}
+
+	// NOTE(review): the header line plus a panelHeight-2 line body is one
+	// line more than Height(panelHeight-2) below — confirm the rendered box
+	// height really equals panelHeight.
+	content := lipgloss.JoinVertical(lipgloss.Left, header, bodyLines)
+	return style.Width(m.pw).Height(panelHeight - 2).Render(content)
+}
+
+// truncate shortens s so that its display width does not exceed w cells.
+// It returns "" when w is not positive and s unchanged when it already fits.
+//
+// The string is first cut to at most w runes and then trimmed further, one
+// rune at a time, until the measured width fits. The second step matters for
+// runes that occupy more than one cell (e.g. CJK), where a rune-count cut
+// alone would still overflow.
+func truncate(s string, w int) string {
+	if w <= 0 {
+		return ""
+	}
+	if lipgloss.Width(s) <= w {
+		return s
+	}
+	r := []rune(s)
+	if len(r) > w {
+		r = r[:w]
+	}
+	for len(r) > 0 && lipgloss.Width(string(r)) > w {
+		r = r[:len(r)-1]
+	}
+	return string(r)
+}
+
+// tailLines wraps s to width w and returns exactly n lines joined by
+// newlines: the last n lines when the wrapped text is longer, or the wrapped
+// text padded with empty lines at the end when it is shorter.
+func tailLines(s string, w, n int) string {
+	rows := strings.Split(lipgloss.NewStyle().Width(w).Render(s), "\n")
+	if surplus := len(rows) - n; surplus > 0 {
+		rows = rows[surplus:]
+	}
+	for missing := n - len(rows); missing > 0; missing-- {
+		rows = append(rows, "")
+	}
+	return strings.Join(rows, "\n")
+}
@@ -39,6 +39,49 @@
            },
            "default": {},
            "description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
+        },
+        "timeouts": {
+            "type": "object",
+            "properties": {
+                "connect": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 30,
+                    "description": "TCP connection timeout in seconds. Set to 0 to disable."
+                },
+                "keepalive": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 30,
+                    "description": "TCP keepalive timeout in seconds. Set to 0 to disable."
+                },
+                "responseHeader": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 0,
+                    "description": "Time to wait for response headers in seconds. Set to 0 to disable."
+                },
+                "tlsHandshake": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 10,
+                    "description": "TLS handshake timeout in seconds. Set to 0 to disable."
+                },
+                "expectContinue": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 1,
+                    "description": "Expect-Continue timeout in seconds. Set to 0 to disable."
+                },
+                "idleConn": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "default": 90,
+                    "description": "Idle connection timeout in seconds. Set to 0 to disable."
+                }
+            },
+            "additionalProperties": false,
+            "description": "Timeout settings for proxy connections."
        }
    },
    "properties": {
@@ -48,6 +91,12 @@
            "default": 120,
            "description": "Number of seconds to wait for a model to be ready to serve requests."
        },
+        "globalTTL": {
+            "type": "integer",
+            "minimum": 0,
+            "default": 0,
+            "description": "Default TTL for all models in seconds, 0 means no TTL and models will never be automatically unloaded"
+        },
        "logLevel": {
            "type": "string",
            "enum": [
@@ -87,6 +136,31 @@
            "default": 1000,
            "description": "Maximum number of metrics to keep in memory. Controls how many metrics are stored before older ones are discarded."
        },
+        "captureBuffer": {
+            "type": "integer",
+            "minimum": 0,
+            "default": 5,
+            "description": "Size in megabytes of the buffer for storing request/response captures. Set to 0 to disable captures."
+        },
+        "performance": {
+            "type": "object",
+            "properties": {
+                "disabled": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Disable system performance monitoring."
+                },
+                "every": {
+                    "type": "string",
+                    "pattern": "^[-+]?(\\d+(\\.\\d+)?(ns|us|ms|s|m|h))+$",
+                    "default": "15s",
+                    "description": "Delay between polling for new performance statistics. Minimum duration is 1s. Lower values use more RAM as stats are kept in memory."
+                }
+            },
+            "additionalProperties": false,
+            "default": {},
+            "description": "Configuration for CPU, RAM and GPU monitoring statistics."
+        },
        "startPort": {
            "type": "integer",
            "default": 5800,
@@ -171,9 +245,9 @@
                    },
                    "ttl": {
                        "type": "integer",
-                        "minimum": 0,
-                        "default": 0,
-                        "description": "Automatically unload the model after ttl seconds. 0 disables unloading. Must be >0 to enable."
+                        "minimum": -1,
+                        "default": -1,
+                        "description": "Automatically unload the model after ttl seconds. -1 uses the global TTL value, 0 disables unloading. Must be >0 to enable."
                    },
                    "useModelName": {
                        "type": "string",
@@ -188,11 +262,26 @@
                                "default": "",
                                "pattern": "^[a-zA-Z0-9_, ]*$",
                                "description": "Comma separated list of parameters to remove from the request. Used for server-side enforcement of sampling parameters."
+                            },
+                            "setParams": {
+                                "type": "object",
+                                "additionalProperties": true,
+                                "default": {},
+                                "description": "Dictionary of parameters to set/override in requests. Useful for enforcing specific parameter values. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
+                            },
+                            "setParamsByID": {
+                                "type": "object",
+                                "additionalProperties": {
+                                    "type": "object",
+                                    "additionalProperties": true
+                                },
+                                "default": {},
+                                "description": "Dictionary mapping requested model IDs (or aliases) to parameters to set/override in requests. Applied after setParams and can override those values. Useful with aliases to vary behaviour depending on which alias the client used (e.g. different reasoning_effort per alias). Keys support ${MODEL_ID} macro substitution. Protected params like 'model' cannot be overridden."
                            }
                        },
                        "additionalProperties": false,
                        "default": {},
-                        "description": "Dictionary of filter settings. Only stripParams is supported."
+                        "description": "Dictionary of filter settings. Supports stripParams, setParams, and setParamsByID."
                    },
                    "metadata": {
                        "type": "object",
@@ -214,6 +303,9 @@
                        "type": "boolean",
                        "default": false,
                        "description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
+                    },
+                    "timeouts": {
+                        "$ref": "#/definitions/timeouts"
                    }
                }
            }
@@ -252,6 +344,44 @@
            },
            "description": "A dictionary of group settings. Provides advanced controls over model swapping behaviour. Model IDs must be defined in models. A model can only be a member of one group. Behaviour controlled via swap, exclusive, persistent."
        },
+        "matrix": {
+            "type": "object",
+            "description": "Solver-based alternative to groups. Declares valid combinations of concurrent models. The solver minimizes eviction cost when swapping. A config must use either groups or matrix, not both.",
+            "required": [
+                "vars",
+                "sets"
+            ],
+            "properties": {
+                "vars": {
+                    "type": "object",
+                    "description": "Short names for models. Keys must be alphanumeric, 1-8 characters. All sets and evict_costs must use these IDs.",
+                    "minProperties": 1,
+                    "additionalProperties": {
+                        "type": "string"
+                    },
+                    "propertyNames": {
+                        "pattern": "^[a-zA-Z0-9]{1,8}$"
+                    }
+                },
+                "evict_costs": {
+                    "type": "object",
+                    "description": "Relative cost of evicting a running model. Models not listed default to 1. Values must be positive integers.",
+                    "additionalProperties": {
+                        "type": "integer",
+                        "minimum": 1
+                    }
+                },
+                "sets": {
+                    "type": "object",
+                    "description": "Named sets of concurrent model combinations. Values are DSL strings using & (AND), | (OR), () (grouping), and +ref (inline another set). Definition order is used for tie-breaking.",
+                    "minProperties": 1,
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                }
+            },
+            "additionalProperties": false
+        },
        "hooks": {
            "type": "object",
            "properties": {
@@ -273,6 +403,137 @@
            },
            "additionalProperties": false,
            "description": "A dictionary of event triggers and actions. Only supported hook is on_startup."
+        },
+        "logToStdout": {
+            "type": "string",
+            "enum": [
+                "proxy",
+                "upstream",
+                "both",
+                "none"
+            ],
+            "default": "proxy",
+            "description": "Controls what is logged to stdout. 'proxy': logs generated by llama-swap, 'upstream': copy of upstream process stdout logs, 'both': both interleaved together, 'none': no logs written to stdout."
+        },
+        "apiKeys": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "minLength": 1
+            },
+            "default": [],
+            "description": "Require an API key when making requests to inference endpoints. When empty, authorization will not be checked. Each key is a non-empty string."
+        },
+        "peers": {
+            "type": "object",
+            "additionalProperties": {
+                "type": "object",
+                "required": [
+                    "proxy",
+                    "models"
+                ],
+                "properties": {
+                    "proxy": {
+                        "type": "string",
+                        "format": "uri",
+                        "description": "A valid base URL to proxy requests to. Requested path to llama-swap will be appended to the end of the proxy value."
+                    },
+                    "apiKey": {
+                        "type": "string",
+                        "default": "",
+                        "description": "A string key to be injected into the request. If blank, no key will be added. Key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>."
+                    },
+                    "models": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "minLength": 1
+                        },
+                        "description": "A list of models served by the peer."
+                    },
+                    "filters": {
+                        "type": "object",
+                        "properties": {
+                            "stripParams": {
+                                "type": "string",
+                                "default": "",
+                                "pattern": "^[a-zA-Z0-9_, ]*$",
+                                "description": "Comma separated list of parameters to remove from the request. Useful for removing parameters that the peer doesn't support."
+                            },
+                            "setParams": {
+                                "type": "object",
+                                "additionalProperties": true,
+                                "default": {},
+                                "description": "Dictionary of parameters to set/override in requests to this peer. Useful for injecting provider-specific settings. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "default": {},
+                        "description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
+                    },
+                    "timeouts": {
+                        "type": "object",
+                        "properties": {
+                            "connect": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 30,
+                                "description": "TCP connection timeout in seconds."
+                            },
+                            "keepalive": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 30,
+                                "description": "TCP keepalive connection timeout in seconds."
+                            },
+                            "responseHeader": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 0,
+                                "description": "Time to wait for response headers in seconds."
+                            },
+                            "tlsHandshake": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 10,
+                                "description": "TLS handshake timeout in seconds."
+                            },
+                            "idleConn": {
+                                "type": "integer",
+                                "minimum": 0,
+                                "default": 90,
+                                "description": "Idle connection timeout in seconds."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "description": "Timeout settings for proxy connections to this peer."
+                    }
+                }
+            },
+            "default": {},
+            "description": "A dictionary of remote peers and models they provide. Peers can be another llama-swap or any server that provides the /v1/ generative API endpoints supported by llama-swap."
        }
-    }
+    },
+    "allOf": [
+        {
+            "if": {
+                "required": ["groups"]
+            },
+            "then": {
+                "not": {
+                    "required": ["matrix"]
+                }
+            }
+        },
+        {
+            "if": {
+                "required": ["matrix"]
+            },
+            "then": {
+                "not": {
+                    "required": ["groups"]
+                }
+            }
+        }
+    ]
 }
@@ -34,12 +34,39 @@ logLevel: info
 # - For more info, read: https://pkg.go.dev/time#pkg-constants
 logTimeFormat: ""

+# logToStdout: controls what is logged to stdout
+# - optional, default: "proxy"
+# - valid values:
+#   - "proxy": logs generated by llama-swap when swapping models,
+#      handling requests, etc.
+#   - "upstream": a copy of the upstream process's stdout logs
+#   - "both": both the proxy and upstream logs interleaved together
+#   - "none": no logs are ever written to stdout
+logToStdout: "proxy"
+
 # metricsMaxInMemory: maximum number of metrics to keep in memory
 # - optional, default: 1000
 # - controls how many metrics are stored in memory before older ones are discarded
 # - useful for limiting memory usage when processing large volumes of metrics
 metricsMaxInMemory: 1000

+# captureBuffer: how many MBs to allocate for storing request/response captures
+# - optional, default: 5
+# - set to 0 to disable
+captureBuffer: 15
+
+# performance: configuration for system monitoring statistics
+# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
+performance:
+  # disabled: boolean
+  # - default: false
+  disabled: false
+
+  # every: delay between polling for new performance statistics
+  # - default: 15s
+  # - minimum duration 1s
+  every: 15s
+
 # startPort: sets the starting port number for the automatic ${PORT} macro.
 # - optional, default: 5800
 # - the ${PORT} macro can be used in model.cmd and model.proxy settings
@@ -60,6 +87,11 @@ sendLoadingState: true
 #   all fields except for Id so chat UIs can use the alias equivalent to the original.
 includeAliasesInList: false

+# globalTTL: the default TTL in seconds before unloading a model
+# - optional, default: 0 (never automatically unload)
+# - must be >= 0
+globalTTL: 0
+
 # macros: a dictionary of string substitutions
 # - optional, default: empty dictionary
 # - macros are reusable snippets
@@ -70,11 +102,13 @@ includeAliasesInList: false
 # - macro names must not be a reserved name: PORT or MODEL_ID
 # - macro values can be numbers, bools, or strings
 # - macros can contain other macros, but they must be defined before they are used
+# - environment variables can be referenced with ${env.VAR_NAME} syntax
+#   - env macros are substituted first, before regular macros
+#   - if the env var is not set, config loading will fail with an error
 macros:
  # Example of a multi-line macro
  "latest-llama": >
-    /path/to/llama-server/llama-server-ec9e0301
-    --port ${PORT}
+    /path/to/llama-server/llama-server-ec9e0301 --port ${PORT}

  "default_ctx": 4096

@@ -82,6 +116,24 @@ macros:
  # but they must be previously declared.
  "default_args": "--ctx-size ${default_ctx}"

+  # Example of environment variable macros
+  # - ${env.VAR_NAME} pulls the value from the system environment
+  # - useful for paths, secrets, or machine-specific configuration
+  "models_dir": "${env.HOME}/models"
+
+# apiKeys: require an API key when making requests to inference endpoints
+# - optional, default: []
+# - when empty (the default) authorization will not be checked as llama-swap is default-allow
+# - each key is a non-empty string
+apiKeys:
+  - "sk-hunter2"
+  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
+  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
+
+  # use environment variable macros to keep secrets out of the config
+  - "${env.API_KEY_1}"
+  - "${env.API_KEY_2}"
+
 # models: a dictionary of model configurations
 # - required
 # - each key is the model's ID, used in API requests
@@ -90,7 +142,7 @@ macros:
 # - below are examples of all the settings a model can have
 models:
  # keys are the model names used in API requests
-  "llama":
+  "gpt-oss-120b":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
@@ -107,7 +159,7 @@ models:
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
-      --model path/to/llama-8B-Q4_K_M.gguf
+      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

@@ -115,13 +167,13 @@ models:
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    name: "llama 3.1 8B"
+    name: "gpt-oss 120B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    description: "A small but capable model used for quick testing"
+    description: "A thinking model from OpenAI"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
@@ -136,14 +188,6 @@ models:
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

-    # aliases: alternative model names that this model configuration is used for
-    # - optional, default: empty array
-    # - aliases must be unique globally
-    # - useful for impersonating a specific model
-    aliases:
-      - "gpt-4o-mini"
-      - "gpt-3.5-turbo"
-
    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
@@ -152,8 +196,10 @@ models:
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
-    # - optional, default: 0
-    # - ttl values must be a value greater than 0
+    # - optional, default: -1 (use global default)
+    # - ttl values must be -1 or a value greater than or equal to 0
+    # - a ttl of -1 will use the global TTL value as the default
+    # - a ttl of 0 will mean never unload
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

@@ -161,11 +207,11 @@ models:
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
-    useModelName: "qwen:qwq"
+    useModelName: "openai/gpt-oss-120B"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
-    # - only stripParams is currently supported
+    # - supports stripParams, setParams and setParamsByID
+    #   (peer filters only support stripParams and setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
@@ -175,6 +221,43 @@ models:
      # - recommended to stick to sampling parameters
      stripParams: "temperature, top_p, top_k"

+      # setParams: a dictionary of parameters to set/override in requests
+      # - optional, default: empty dictionary
+      # - useful for enforcing specific parameter values
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - always runs for the model
+      setParams:
+        # Example: enforce specific sampling parameters
+        temperature: 0.7
+        top_p: 0.9
+
+      # setParamsByID: a dictionary of parameters to set based on the requested model ID
+      # - optional, default: empty dictionary
+      # - combine with aliases to create variant behaviour without reloading the model
+      # - parameters are set in the request body JSON
+      # - run after setParams so it will override any settings
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - model aliases will be automatically created for each key
+      setParamsByID:
+        "${MODEL_ID}":
+          chat_template_kwargs:
+            reasoning_effort: medium
+        "${MODEL_ID}:high":
+          chat_template_kwargs:
+            reasoning_effort: high
+        "${MODEL_ID}:low":
+          chat_template_kwargs:
+            reasoning_effort: low
+
+    # aliases: alternative model names that this model configuration is used for
+    # - optional, default: empty array
+    # - aliases must be unique globally
+    # - useful for impersonating a specific model
+    aliases:
+      - "gpt-4o-mini"
+
    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
     # - while metadata can contain complex types it is recommended to keep it simple
@@ -185,7 +268,8 @@ models:

      # the ${temp} macro will remain a float
      temperature: ${temp}
-      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp}, context=${default_ctx}"
+      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
+        context=${default_ctx}"

      a_list:
        - 1
@@ -212,6 +296,22 @@ models:
    # - optional, default: undefined (use global setting)
    sendLoadingState: false

+    # timeouts: configure proxy connection timeouts for this model
+    # - optional, the default for each setting is listed below
+    # - useful for models running on slower hardware that need longer timeouts
+    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
+    # - keepalive: TCP connection keepalive timeout in seconds, default: 30 seconds
+    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
+    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
+    # - expectContinue: Expect-Continue timeout in seconds, default: 1 second
+    # - idleConn: idle connection timeout in seconds, default: 90 seconds
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 0
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
@@ -243,68 +343,92 @@ models:
    # - processes have 5 seconds to shutdown until forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}

-# groups: a dictionary of group settings
-# - optional, default: empty dictionary
-# - provides advanced controls over model swapping behaviour
-# - using groups some models can be kept loaded indefinitely, while others are swapped out
-# - model IDs must be defined in the Models section
-# - a model can only be a member of one group
-# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
-# - see issue #109 for details
+# =============================================================================
+# matrix: run concurrent models with a solver-based swap DSL
+# =============================================================================
 #
-# NOTE: the example below uses model names that are not defined above for demonstration purposes
-groups:
-  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
-  # to run a time across the whole llama-swap instance
-  "group1":
-    # swap: controls the model swapping behaviour in within the group
-    # - optional, default: true
-    # - true : only one model is allowed to run at a time
-    # - false: all models can run together, no swapping
-    swap: true
+# Matrix or Groups?
+#
+# Groups are available and fully supported. The syntax may be easier to use
+# for simple use cases.
+#
+# Documentation can be found here:
+# https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396
+#
+# A config can only use a matrix (recommended) or groups. A configuration error
+# will occur if both are defined. Groups is legacy but is fully supported with
+# no plans to deprecate it.
+#
+# ~~~~~
+#
+# The matrix declares valid combinations of models that can run concurrently.
+# When a model is requested, the solver finds the cheapest way to make it
+# available by evicting as few (and least costly) running models as possible.
+#
+# Solver behavior:
+#   1. Request arrives for model X
+#   2. If X is already running, forward immediately. Done.
+#   3. Find all sets containing X
+#   4. For each candidate set, compute cost: sum of evict_costs for
+#      every running model NOT in that set
+#   5. Pick lowest cost candidate. Ties broken by definition order.
+#   6. Evict what needs to stop. Start X. Forward request.
+#
+# Subset semantics: a set [a, b, c] means any subset is valid.
+# Only the requested model is started — others are not preloaded.
+#
+# A model not appearing in any set can only run alone.
+#
+matrix:
+  # vars: short names for models (alphanumeric, 1-8 chars)
+  # - required for sets and evict_costs settings
+  # - each entry is a short name to a real model ID. Do not use an alias
+  # - used to keep set DSL logic short and easier to read
+  # - sets and evict_costs only use identifiers defined in vars
+  vars:
+    g: gemma-model
+    q: qwen-model
+    m: mistral-model
+    v: voxtral-model
+    e: reranker-model
+    L: llama-70B
+    sd: stable-diffusion

-    # exclusive: controls how the group affects other groups
-    # - optional, default: true
-    # - true: causes all other groups to unload when this group runs a model
-    # - false: does not affect other groups
-    exclusive: true
+  # evict_costs: relative cost of losing a running model (default: 1)
+  evict_costs:
+    v: 50 # vllm backend, slow cold start
+    L: 30 # 70B weights, slow to load

-    # members references the models defined above
-    # required
-    members:
-      - "llama"
-      - "qwen-unlisted"
+  # sets: named sets of concurrent model combinations
+  # Values are DSL strings with operators:
+  #   &     AND (models run together)
+  #   |     OR  (alternatives)
+  #   ()    grouping
+  #   +ref  inline another set's expression
+  #
+  # Expansion examples:
+  #   "L"                  → [L]
+  #   "a & b"              → [a, b]
+  #   "a | b"              → [a], [b]
+  #   "(a | b) & c"        → [a, c], [b, c]
+  #   "(a | b) & (c | d)"  → [a,c], [a,d], [b,c], [b,d]
+  #   "+llms & v"          → expands llms inline, then applies & v
+  sets:
+    # LLM + TTS: switching between g/q/m won't evict v
+    # expands to: [g,v], [q,v], [m,v]
+    standard: "(g | q | m) & v"

-  # Example:
-  # - in group2 all models can run at the same time
-  # - when a different group is loaded it causes all running models in this group to unload
-  "group2":
-    swap: false
+    # LLM + TTS + reranker
+    # expands to: [g,v,e], [q,v,e]
+    with_rerank: "(g | q) & v & e"

-    # exclusive: false does not unload other groups when a model in group2 is requested
-    # - the models in group2 will be loaded but will not unload any other groups
-    exclusive: false
-    members:
-      - "docker-llama"
-      - "modelA"
-      - "modelB"
+    # LLM + image generation, no TTS
+    # expands to: [g,sd], [q,sd]
+    creative: "(g | q) & sd"

-  # Example:
-  # - a persistent group, prevents other groups from unloading it
-  "forever":
-    # persistent: prevents over groups from unloading the models in this group
-    # - optional, default: false
-    # - does not affect individual model behaviour
-    persistent: true
-
-    # set swap/exclusive to false to prevent swapping inside the group
-    # and the unloading of other groups
-    swap: false
-    exclusive: false
-    members:
-      - "forever-modelA"
-      - "forever-modelB"
-      - "forever-modelc"
+    # 70B model uses all GPUs, can only run alone
+    # expands to: [L]
+    full: "L"

 # hooks: a dictionary of event triggers and actions
 # - optional, default: empty dictionary
@@ -321,3 +445,67 @@ hooks:
    #   otherwise models will be loaded and swapped out
    preload:
      - "llama"
+
+# peers: a dictionary of remote peers and models they provide
+# - optional, default empty dictionary
+# - peers can be another llama-swap
+# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
+peers:
+  # the key is the peer's ID
+  llama-swap-peer:
+    # proxy: a valid base URL to proxy requests to
+    # - required
+    # - requested path to llama-swap will be appended to the end of the proxy value
+    proxy: http://192.168.1.23
+    # models: a list of models served by the peer
+    # - required
+    models:
+      - model_a
+      - model_b
+      - embeddings/model_c
+  openrouter:
+    proxy: https://openrouter.ai/api
+    # apiKey: a string key to be injected into the request
+    # - optional, default: ""
+    # - if blank, no key will be added to the request
+    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
+    # - can be a string or a macro
+    apiKey: ${env.OPENROUTER_API_KEY}
+    models:
+      - meta-llama/llama-3.1-8b-instruct
+      - qwen/qwen3-235b-a22b-2507
+      - deepseek/deepseek-v3.2
+      - z-ai/glm-4.7
+      - moonshotai/kimi-k2-0905
+      - minimax/minimax-m2.1
+    # timeouts: configure proxy connection timeouts for this peer
+    # - optional, defaults shown below
+    # - useful when the peer runs on slower hardware
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 30
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+    # filters: a dictionary of filter settings for peer requests
+    # - optional, default: empty dictionary
+    # - same capabilities as model filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for removing parameters that the peer doesn't support
+      # - the `model` parameter can never be removed
+      stripParams: "temperature, top_p"
+
+      # setParams: a dictionary of parameters to set/override in requests to this peer
+      # - optional, default: empty dictionary
+      # - useful for injecting provider-specific settings like data retention policies
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      setParams:
+        # Example: enforce zero-data-retention for OpenRouter
+        provider:
+          data_collection: "deny"
+          zdr: true
@@ -1,56 +1,166 @@
 #!/bin/bash

+set -euo pipefail
+
 cd $(dirname "$0")

+# use this to test locally, example:
+# GITHUB_TOKEN=$(gh auth token) LOG_DEBUG=1 DEBUG_ABORT_BUILD=1 ./docker/build-container.sh rocm
+# you need read:package scope on the token. Generate a personal access token with
+# the scopes: gist, read:org, repo, write:packages
+# then: gh auth login (and copy/paste the new token)
+
+LOG_DEBUG=${LOG_DEBUG:-0}
+DEBUG_ABORT_BUILD=${DEBUG_ABORT_BUILD:-}
+
+log_debug() {
+    if [ "$LOG_DEBUG" = "1" ]; then
+        echo "[DEBUG] $*"
+    fi
+}
+
+log_info() {
+    # Always printed; all arguments are joined into a single line.
+    printf '%s\n' "[INFO] $*"
+}
+
 # Positional arguments: $1 = target architecture, $2 = "true" to push images.
 # ${1:-} keeps `set -u` from aborting with "unbound variable" when the script
 # is run without arguments, so the ALLOWED_ARCHS check below reports the error.
 ARCH=${1:-}
 PUSH_IMAGES=${2:-false}

 # List of allowed architectures
-ALLOWED_ARCHS=("intel" "vulkan" "musa" "cuda" "cpu")
+ALLOWED_ARCHS=("intel" "vulkan" "musa" "cuda" "cuda13" "cpu" "rocm")

 # Check if ARCH is in the allowed list
 if [[ ! " ${ALLOWED_ARCHS[@]} " =~ " ${ARCH} " ]]; then
-  echo "Error: ARCH must be one of the following: ${ALLOWED_ARCHS[@]}"
+  log_info "Error: ARCH must be one of the following: ${ALLOWED_ARCHS[@]}"
  exit 1
 fi

 # Check if GITHUB_TOKEN is set and not empty
-if [[ -z "$GITHUB_TOKEN" ]]; then
-  echo "Error: GITHUB_TOKEN is not set or is empty."
+if [[ -z "${GITHUB_TOKEN:-}" ]]; then
+  log_info "Error: GITHUB_TOKEN is not set or is empty."
  exit 1
 fi

 # Set llama.cpp base image, customizable using the BASE_LLAMACPP_IMAGE environment
 # variable, this permits testing with forked llama.cpp repositories
 BASE_IMAGE=${BASE_LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp}
+SD_IMAGE=${BASE_SDCPP_IMAGE:-ghcr.io/leejet/stable-diffusion.cpp}

-# Set llama-swap repository, automatically uses GITHUB_REPOSITORY variable
-# to enable easy container builds on forked repos
+# LS_REPO is the destination of the built container image — defaults to the
+# current GitHub repository so forked CI builds publish to the fork's own
+# ghcr.io namespace without code changes.
 LS_REPO=${GITHUB_REPOSITORY:-mostlygeek/llama-swap}

+# LS_BINARY_REPO is where the llama-swap release tarball is downloaded
+# from. Decoupled from LS_REPO so forks (which usually have no releases of
+# their own) can still build a container by pulling the canonical binary
+# from upstream. Override via the LS_BINARY_REPO env var when you maintain
+# fork-side releases.
+LS_BINARY_REPO=${LS_BINARY_REPO:-mostlygeek/llama-swap}
+
 # the most recent llama-swap tag
-# have to strip out the 'v' due to .tar.gz file naming
-LS_VER=$(curl -s https://api.github.com/repos/${LS_REPO}/releases/latest | jq -r .tag_name | sed 's/v//')
+# have to strip out the 'v' due to .tar.gz file naming.
+# Authenticated request — unauth'd github.com API is 60/hr per IP and GHA
+# runners share IPs, so the call regularly returns rate-limit JSON and
+# `.tag_name` then resolves to "null", producing a bogus `vnull` URL below.
+LS_VER=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+    "https://api.github.com/repos/${LS_BINARY_REPO}/releases/latest" \
+    | jq -r .tag_name | sed 's/v//')
+
+if [[ -z "$LS_VER" || "$LS_VER" == "null" ]]; then
+    log_info "Error: could not resolve latest llama-swap release tag from ${LS_BINARY_REPO}"
+    exit 1
+fi
+
+# Fetches the most recent llama.cpp tag matching the given prefix
+# Handles pagination to search beyond the first 100 results
+# $1 - tag_prefix (e.g., "server" or "server-vulkan")
+# Returns: the version number extracted from the tag
+fetch_llama_tag() {
+    local tag_prefix=$1
+    local page=1
+    local per_page=100
+
+    while true; do
+        log_debug "Fetching page $page for tag prefix: $tag_prefix"
+
+        local response=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+            "https://api.github.com/users/ggml-org/packages/container/llama.cpp/versions?per_page=${per_page}&page=${page}")
+
+        # Check for API errors
+        if echo "$response" | jq -e '.message' > /dev/null 2>&1; then
+            local error_msg=$(echo "$response" | jq -r '.message')
+            log_info "GitHub API error: $error_msg"
+            return 1
+        fi
+
+        # Check if response is empty array (no more pages)
+        if [ "$(echo "$response" | jq 'length')" -eq 0 ]; then
+            log_debug "No more pages (empty response)"
+            return 1
+        fi
+
+        # Extract matching tag from this page
+        local found_tag=$(echo "$response" | jq -r \
+            ".[] | select(.metadata.container.tags[]? | startswith(\"$tag_prefix\")) | .metadata.container.tags[] | select(startswith(\"$tag_prefix\"))" \
+            | sort -r | head -n1)
+
+        if [ -n "$found_tag" ]; then
+            log_debug "Found tag: $found_tag on page $page"
+            echo "$found_tag" | awk -F '-' '{print $NF}'
+            return 0
+        fi
+
+        page=$((page + 1))
+
+        # Safety limit to prevent infinite loops
+        if [ $page -gt 50 ]; then
+            log_info "Reached pagination safety limit (50 pages)"
+            return 1
+        fi
+    done
+}

 if [ "$ARCH" == "cpu" ]; then
-    # cpu only containers just use the server tag
-    LCPP_TAG=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
-        "https://api.github.com/users/ggml-org/packages/container/llama.cpp/versions" \
-        | jq -r '.[] | select(.metadata.container.tags[] | startswith("server")) | .metadata.container.tags[]' \
-        | sort -r | head -n1 | awk -F '-' '{print $3}')
+    # On failure clear LCPP_TAG instead of letting `set -e` kill the script
+    # here, so the explicit "Abort: Could not find ..." check below can run.
+    LCPP_TAG=$(fetch_llama_tag "server") || LCPP_TAG=""
     BASE_TAG=server-${LCPP_TAG}
 else
-    LCPP_TAG=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
-        "https://api.github.com/users/ggml-org/packages/container/llama.cpp/versions" \
-        | jq -r --arg arch "$ARCH" '.[] | select(.metadata.container.tags[] | startswith("server-\($arch)")) | .metadata.container.tags[]' \
-        | sort -r | head -n1 | awk -F '-' '{print $3}')
+    # Same failure handling as the cpu branch above.
+    LCPP_TAG=$(fetch_llama_tag "server-${ARCH}") || LCPP_TAG=""
     BASE_TAG=server-${ARCH}-${LCPP_TAG}
 fi

+SD_TAG=master-${ARCH}
+
 # Abort if LCPP_TAG is empty.
 if [[ -z "$LCPP_TAG" ]]; then
-    echo "Abort: Could not find llama-server container for arch: $ARCH"
+    log_info "Abort: Could not find llama-server container for arch: $ARCH"
    exit 1
+else
+    log_info "LCPP_TAG: $LCPP_TAG"
+fi
+
+# Debug escape hatch: stop successfully once the tags are resolved, before
+# any image is built.
+if [[ -n "$DEBUG_ABORT_BUILD" ]]; then
+    log_info "Abort: DEBUG_ABORT_BUILD set"
+    exit 0
+fi
+
+# cpu is the only backend with a multi-arch upstream base
+# (ghcr.io/ggml-org/llama.cpp:server-bXXXX ships amd64+arm64); GPU backends
+# are amd64-only and stay on the original `docker build` path so the
+# sd-server layer can still FROM the just-built image via the local
+# dockerd image store (buildx's container driver has a separate store
+# that doesn't share with dockerd, which breaks the sd build).
+if [ "$ARCH" == "cpu" ]; then
+    if [ "$PUSH_IMAGES" == "true" ]; then
+        BUILDX_FLAGS="--push --platform linux/amd64,linux/arm64"
+    else
+        # Smoke build: validate both platforms but emit no output. buildx
+        # on the docker-container driver defaults to cacheonly when
+        # neither --push nor --load is given, so each arch fully builds
+        # and a regression in either fails CI — without materializing the
+        # image or needing to --load (which is multi-arch-incompatible).
+        BUILDX_FLAGS="--platform linux/amd64,linux/arm64"
+    fi
 fi

 for CONTAINER_TYPE in non-root root; do
@@ -68,11 +178,36 @@ for CONTAINER_TYPE in non-root root; do
    USER_HOME=/app
  fi

-  echo "Building $CONTAINER_TYPE $CONTAINER_TAG $LS_VER"
-  docker build -f llama-swap.Containerfile --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
-    --build-arg LS_REPO=${LS_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} \
-    --build-arg BASE_IMAGE=${BASE_IMAGE} .
-  if [ "$PUSH_IMAGES" == "true" ]; then
+  log_info "Building $CONTAINER_TYPE $CONTAINER_TAG $LS_VER"
+  if [ "$ARCH" == "cpu" ]; then
+    docker buildx build $BUILDX_FLAGS --provenance=false \
+      -f llama-swap.Containerfile \
+      --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
+      --build-arg LS_REPO=${LS_BINARY_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} \
+      --build-arg BASE_IMAGE=${BASE_IMAGE} \
+      -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} .
+  else
+    docker build --provenance=false -f llama-swap.Containerfile \
+      --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
+      --build-arg LS_REPO=${LS_BINARY_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} \
+      -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} \
+      --build-arg BASE_IMAGE=${BASE_IMAGE} .
+  fi
+
+  # For architectures with stable-diffusion.cpp support, layer sd-server on top.
+  # Stays on `docker build` so the base resolves from local dockerd.
+  case "$ARCH" in
+    "musa" | "vulkan")
+      log_info "Adding sd-server to $CONTAINER_TAG"
+      docker build --provenance=false -f llama-swap-sd.Containerfile \
+        --build-arg BASE=${CONTAINER_TAG} \
+        --build-arg SD_IMAGE=${SD_IMAGE} --build-arg SD_TAG=${SD_TAG} \
+        --build-arg UID=${USER_UID} --build-arg GID=${USER_GID} \
+        -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} . ;;
+  esac
+
+  # cpu builds push inline via buildx --push; all other archs push here.
+  if [ "$ARCH" != "cpu" ] && [ "$PUSH_IMAGES" == "true" ]; then
    docker push ${CONTAINER_TAG}
    docker push ${CONTAINER_LATEST}
  fi
@@ -0,0 +1,305 @@
+#!/bin/bash
+#
+# Build script for llama-swap-docker with commit hash pinning
+#
+# Usage:
+#   ./build-image.sh --cuda                    # Build CUDA image
+#   ./build-image.sh --vulkan                  # Build Vulkan image
+#   ./build-image.sh --cuda --no-cache         # Build CUDA image without cache
+#   LLAMA_COMMIT_HASH=abc123 ./build-image.sh --cuda      # Override llama.cpp commit
+#   LLAMA_COMMIT_HASH=b8429 ./build-image.sh --vulkan    # Override llama.cpp release tag (vulkan uses prebuilt binaries)
+#   WHISPER_COMMIT_HASH=def456 ./build-image.sh --vulkan  # Override whisper.cpp commit
+#   SD_COMMIT_HASH=ghi789 ./build-image.sh --cuda        # Override stable-diffusion.cpp commit
+#
+# Features:
+#   - Auto-detects latest commit hashes from git repos
+#   - Builds llama-swap from local source code
+#   - Allows environment variable overrides for reproducible builds
+#   - Cache-friendly: changing commit hash busts cache appropriately
+#   - Supports both CUDA and Vulkan backends (requires explicit flag)
+#
+
+set -euo pipefail
+
+# Parse command line arguments
+BACKEND=""
+NO_CACHE=false
+
+# Prints the command line help. Shared by the "no arguments" error path and
+# by --help so the two texts cannot drift apart.
+usage() {
+    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+    echo ""
+    echo "Options:"
+    echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
+    echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
+    echo "  --no-cache  Force rebuild without using Docker cache"
+    echo "  --help, -h  Show this help message"
+    echo ""
+    echo "Environment variables:"
+    echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)"
+    echo "  LLAMA_COMMIT_HASH    Override llama.cpp commit hash"
+    echo "  WHISPER_COMMIT_HASH  Override whisper.cpp commit hash"
+    echo "  SD_COMMIT_HASH       Override stable-diffusion.cpp commit hash"
+}
+
+if [[ $# -eq 0 ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    echo ""
+    usage
+    exit 1
+fi
+
+for arg in "$@"; do
+    case $arg in
+        --cuda)
+            BACKEND="cuda"
+            ;;
+        --vulkan)
+            BACKEND="vulkan"
+            ;;
+        --no-cache)
+            NO_CACHE=true
+            ;;
+        --help|-h)
+            usage
+            exit 0
+            ;;
+        *)
+            # Still ignored (as before), but no longer silently: a mistyped
+            # flag such as --no-cahce would otherwise go unnoticed.
+            echo "Warning: ignoring unknown argument: ${arg}" >&2
+            ;;
+    esac
+done
+
+# Validate backend selection
+if [[ -z "$BACKEND" ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    exit 1
+fi
+
+# Configuration
+if [[ -n "${DOCKER_IMAGE_TAG:-}" ]]; then
+    # User provided a custom tag, use it as-is
+    :
+elif [[ "$BACKEND" == "vulkan" ]]; then
+    DOCKER_IMAGE_TAG="llama-swap:vulkan"
+else
+    DOCKER_IMAGE_TAG="llama-swap:cuda"
+fi
+DOCKER_BUILDKIT="${DOCKER_BUILDKIT:-1}"
+
+# Single unified Dockerfile, backend selected via build arg
+DOCKERFILE="Dockerfile"
+if [[ "$BACKEND" == "vulkan" ]]; then
+    echo "Building for: Vulkan (AMD GPUs and compatible hardware)"
+else
+    echo "Building for: CUDA (NVIDIA GPUs)"
+fi
+
+# Git repository URLs
+LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
+WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
+SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
+
+# Function to get the latest commit hash from a git repo's default branch
+get_latest_commit() {
+    local repo_url="$1"
+    local branch="${2:-master}"
+
+    # Try to get the latest commit hash for the specified branch
+    git ls-remote --heads "${repo_url}" "${branch}" 2>/dev/null | head -1 | cut -f1
+}
+
+# Function to get the default branch name (master or main)
+# $1 - repository URL
+# Prints "master" if that branch exists on the remote, otherwise "main" if it
+# exists, otherwise falls back to "master".
+get_default_branch() {
+    local repo_url="$1"
+
+    # --exit-code makes ls-remote exit non-zero when no ref matches; without
+    # it the command succeeds for any reachable repository, so the first
+    # branch would always win and "main" could never be detected.
+    # Check for master first
+    if git ls-remote --exit-code --heads "${repo_url}" master &>/dev/null; then
+        echo "master"
+    elif git ls-remote --exit-code --heads "${repo_url}" main &>/dev/null; then
+        echo "main"
+    else
+        echo "master"  # fallback
+    fi
+}
+
+# Prints the tag name of the latest GitHub release of a repository.
+# $1 - repository in "owner/name" form
+get_latest_release_tag() {
+    local slug="$1"
+    local api_url="https://api.github.com/repos/${slug}/releases/latest"
+
+    # Take the first "tag_name" line of the JSON and keep the quoted value
+    curl -fsSL "${api_url}" \
+        | grep '"tag_name"' \
+        | head -n 1 \
+        | cut -d '"' -f 4
+}
+
+echo "=========================================="
+echo "llama-swap-docker Build Script"
+echo "=========================================="
+echo ""
+
+# Determine commit hashes / release tags - use env vars or auto-detect
+# For vulkan builds, llama and sd use GitHub release tags (prebuilt binaries).
+# For cuda builds (or whisper on any backend), use git commit hashes.
+if [[ -n "${LLAMA_COMMIT_HASH:-}" ]]; then
+    LLAMA_HASH="${LLAMA_COMMIT_HASH}"
+    echo "llama.cpp: Using provided version: ${LLAMA_HASH}"
+elif [[ "$BACKEND" == "vulkan" ]]; then
+    LLAMA_HASH=$(get_latest_release_tag "ggml-org/llama.cpp")
+    if [[ -z "${LLAMA_HASH}" ]]; then
+        echo "ERROR: Could not determine latest release tag for llama.cpp" >&2
+        exit 1
+    fi
+    echo "llama.cpp: Auto-detected latest release tag: ${LLAMA_HASH}"
+else
+    LLAMA_BRANCH=$(get_default_branch "${LLAMA_REPO}")
+    LLAMA_HASH=$(get_latest_commit "${LLAMA_REPO}" "${LLAMA_BRANCH}")
+    if [[ -z "${LLAMA_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for llama.cpp" >&2
+        exit 1
+    fi
+    echo "llama.cpp: Auto-detected latest commit (${LLAMA_BRANCH}): ${LLAMA_HASH}"
+fi
+
+if [[ -n "${WHISPER_COMMIT_HASH:-}" ]]; then
+    WHISPER_HASH="${WHISPER_COMMIT_HASH}"
+    echo "whisper.cpp: Using provided commit hash: ${WHISPER_HASH}"
+else
+    WHISPER_BRANCH=$(get_default_branch "${WHISPER_REPO}")
+    WHISPER_HASH=$(get_latest_commit "${WHISPER_REPO}" "${WHISPER_BRANCH}")
+    if [[ -z "${WHISPER_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for whisper.cpp" >&2
+        exit 1
+    fi
+    echo "whisper.cpp: Auto-detected latest commit (${WHISPER_BRANCH}): ${WHISPER_HASH}"
+fi
+
+if [[ -n "${SD_COMMIT_HASH:-}" ]]; then
+    SD_HASH="${SD_COMMIT_HASH}"
+    echo "stable-diffusion.cpp: Using provided version: ${SD_HASH}"
+elif [[ "$BACKEND" == "vulkan" ]]; then
+    SD_HASH=$(get_latest_release_tag "leejet/stable-diffusion.cpp")
+    if [[ -z "${SD_HASH}" ]]; then
+        echo "ERROR: Could not determine latest release tag for stable-diffusion.cpp" >&2
+        exit 1
+    fi
+    echo "stable-diffusion.cpp: Auto-detected latest release tag: ${SD_HASH}"
+else
+    SD_BRANCH=$(get_default_branch "${SD_REPO}")
+    SD_HASH=$(get_latest_commit "${SD_REPO}" "${SD_BRANCH}")
+    if [[ -z "${SD_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2
+        exit 1
+    fi
+    echo "stable-diffusion.cpp: Auto-detected latest commit (${SD_BRANCH}): ${SD_HASH}"
+fi
+
+echo ""
+echo "=========================================="
+echo "Starting Docker build..."
+echo "=========================================="
+echo ""
+
+# Build the Docker image with commit hashes as build args
+# Build context is the repository root (..) so the Dockerfile can access Go source
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+BUILD_ARGS=(
+    --build-arg "BACKEND=${BACKEND}"
+    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
+    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
+    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
+    -t "${DOCKER_IMAGE_TAG}"
+    -f "${SCRIPT_DIR}/${DOCKERFILE}"
+)
+
+if [[ "$NO_CACHE" == true ]]; then
+    BUILD_ARGS+=(--no-cache)
+    echo "Note: Building without cache"
+fi
+
+# Use docker buildx with a custom builder for parallelism control
+# The legacy DOCKER_BUILDKIT=1 docker build doesn't respect BUILDKIT_MAX_PARALLELISM env var
+# We need to use a custom builder with a buildkitd.toml config file
+BUILDER_NAME="llama-swap-builder"
+
+# Check if our custom builder exists with the right config, create/update if needed
+if ! docker buildx inspect "$BUILDER_NAME" >/dev/null 2>&1; then
+    echo "Creating custom buildx builder with max-parallelism=1..."
+    
+    # Create buildkitd.toml config file
+    cat > buildkitd.toml << 'BUILDKIT_EOF'
+[worker.oci]
+  max-parallelism = 1
+BUILDKIT_EOF
+    
+    # Create the builder with the config
+    docker buildx create --name "$BUILDER_NAME" \
+        --driver docker-container \
+        --buildkitd-config buildkitd.toml \
+        --use
+else
+    # Switch to our builder
+    docker buildx use "$BUILDER_NAME"
+fi
+
+echo "Building with sequential stages (one at a time), each using all CPU cores..."
+echo "Using builder: $BUILDER_NAME"
+
+# Use docker buildx build with --load to load the image into Docker
+# The --builder flag ensures we use our custom builder with max-parallelism=1
+# Build context is the repository root so we can access Go source files
+docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" "${REPO_ROOT}"
+
+echo ""
+echo "=========================================="
+echo "Verifying build artifacts..."
+echo "=========================================="
+echo ""
+
+# Verify all expected binaries exist in the image
+MISSING_BINARIES=()
+
+for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do
+    if ! docker run --rm "${DOCKER_IMAGE_TAG}" which "${binary}" >/dev/null 2>&1; then
+        MISSING_BINARIES+=("${binary}")
+    fi
+done
+
+if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
+    echo "ERROR: Build succeeded but the following binaries are missing from the image:"
+    for binary in "${MISSING_BINARIES[@]}"; do
+        echo "  - ${binary}"
+    done
+    echo ""
+    echo "This usually indicates a build stage failure. Try running with --no-cache flag:"
+    echo "  ./build-image.sh --vulkan --no-cache"
+    exit 1
+fi
+
+echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
+
+echo ""
+echo "=========================================="
+echo "Build complete!"
+echo "=========================================="
+echo ""
+echo "Image tag: ${DOCKER_IMAGE_TAG}"
+echo ""
+echo "Built with:"
+echo "  llama.cpp:           ${LLAMA_HASH}"
+echo "  whisper.cpp:         ${WHISPER_HASH}"
+echo "  stable-diffusion.cpp: ${SD_HASH}"
+echo "  llama-swap:          $(docker run --rm "${DOCKER_IMAGE_TAG}" cat /versions.txt | grep llama-swap | cut -d' ' -f2-)"
+echo ""
+if [[ "$BACKEND" == "vulkan" ]]; then
+    echo "Run with:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
+    echo ""
+    echo "Note: For AMD GPUs, you may also need to mount render devices:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
+else
+    echo "Run with:"
+    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
+fi
@@ -15,4 +15,19 @@ models:
    cmd: >
      /app/llama-server
      -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port 9999
+
+  z-image:
+    checkEndpoint: /
+    cmd: |
+      /app/sd-server
+      --listen-port 9999
+      --diffusion-fa
+      --diffusion-model /models/z_image_turbo-Q8_0.gguf
+      --vae /models/ae.safetensors
+      --llm /models/qwen3-4b-instruct-2507-q8_0.gguf
+      --offload-to-cpu
+      --cfg-scale 1.0
+      --height 512 --width 512
+      --steps 8
+    aliases: [gpt-image-1,dall-e-2,dall-e-3,gpt-image-1-mini,gpt-image-1.5]
@@ -0,0 +1,11 @@
+# Layers the sd-server binary from a stable-diffusion.cpp image on top of an
+# existing llama-swap image.
+#
+# Build args:
+#   SD_IMAGE / SD_TAG - image (and tag) that /sd-server is copied from
+#   BASE              - image that is extended
+#   UID / GID         - owner applied to the copied binary
+ARG SD_IMAGE=ghcr.io/leejet/stable-diffusion.cpp
+ARG SD_TAG=master-vulkan
+ARG BASE=llama-swap:latest
+
+# Only used as a source for COPY --from below
+FROM ${SD_IMAGE}:${SD_TAG} AS sd-source
+FROM ${BASE}
+
+# Declared after FROM so they are in scope for the COPY instruction
+ARG UID=10001
+ARG GID=10001
+
+COPY --from=sd-source --chown=${UID}:${GID} /sd-server /app/sd-server
@@ -2,7 +2,6 @@ ARG BASE_IMAGE=ghcr.io/ggml-org/llama.cpp
 ARG BASE_TAG=server-cuda
 FROM ${BASE_IMAGE}:${BASE_TAG}

-# has to be after the FROM
 ARG LS_VER=170
 ARG LS_REPO=mostlygeek/llama-swap

@@ -34,9 +33,15 @@ WORKDIR /app
 ENV PATH="/app:${PATH}"

 RUN \
-    curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" && \
-    tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" && \
-    rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"
+    set -eux; \
+    case "$(uname -m)" in \
+        x86_64)  ARCH=amd64 ;; \
+        aarch64) ARCH=arm64 ;; \
+        *) echo "unsupported arch: $(uname -m)" >&2; exit 1 ;; \
+    esac; \
+    curl --fail -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_${ARCH}.tar.gz" && \
+    tar -zxf "llama-swap_${LS_VER}_linux_${ARCH}.tar.gz" && \
+    rm "llama-swap_${LS_VER}_linux_${ARCH}.tar.gz"

 COPY --chown=$UID:$GID config.example.yaml /app/config.yaml

@@ -0,0 +1,207 @@
+# Unified multi-stage Dockerfile for AI inference tools
+# Supports CUDA and Vulkan backends via BACKEND build arg
+#
+# Usage:
+#   docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
+#   docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
+#   docker buildx build --build-arg BACKEND=cuda --build-arg CMAKE_CUDA_ARCHITECTURES="86;89" -t llama-swap:unified-cuda .
+#
+# Each project has its own install script that handles cloning, building,
+# and installing binaries. Build stages are independent for cache efficiency.
+
+ARG BACKEND=cuda
+
+# ── Builder bases ──────────────────────────────────────────────────────
+
+FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder-base-cuda
+
+ARG CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
+ENV CCACHE_DIR=/ccache
+ENV CCACHE_MAXSIZE=2G
+ENV PATH="/usr/lib/ccache:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git python3 python3-pip libssl-dev \
+    curl ca-certificates ccache make wget \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+# ──
+
+FROM ubuntu:24.04 AS builder-base-vulkan
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CCACHE_DIR=/ccache
+ENV CCACHE_MAXSIZE=2G
+ENV PATH="/usr/lib/ccache:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake git python3 python3-pip libssl-dev \
+    curl ca-certificates ccache make wget software-properties-common \
+    libvulkan-dev glslang-tools spirv-tools vulkan-validationlayers glslc \
+    spirv-headers \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+# ── Select builder base by BACKEND ────────────────────────────────────
+
+FROM builder-base-${BACKEND} AS builder-base
+
+# ── Build whisper.cpp (fastest build, run first) ──────────────────────
+
+FROM builder-base AS whisper-build
+ARG BACKEND=cuda
+ARG WHISPER_COMMIT_HASH=master
+COPY install-whisper.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=whisper-${BACKEND},target=/src/whisper.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-whisper.sh "${WHISPER_COMMIT_HASH}"
+
+# ── Build stable-diffusion.cpp ────────────────────────────────────────
+
+FROM builder-base AS sd-build
+ARG BACKEND=cuda
+ARG SD_COMMIT_HASH=master
+COPY install-sd.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=sd-${BACKEND},target=/src/stable-diffusion.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-sd.sh "${SD_COMMIT_HASH}"
+
+# ── Build llama.cpp (slowest build, run last) ─────────────────────────
+
+FROM builder-base AS llama-build
+ARG BACKEND=cuda
+ARG LLAMA_COMMIT_HASH=master
+COPY install-llama.sh /build/
+RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
+    --mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
+    BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
+
+# ── Build ik_llama.cpp (CUDA only) ────────────────────────────────────
+#
+# Two named stages allow ARG BACKEND to select at build time:
+#   - ik-llama-cuda  : real build (from builder-base-cuda)
+#   - ik-llama-vulkan: no-op (empty /install/bin, skips CUDA pull entirely)
+# BuildKit only evaluates the selected branch, so vulkan builds never
+# pull nvidia/cuda:*-devel or compile ik_llama.cpp.
+
+FROM builder-base-vulkan AS ik-llama-vulkan
+RUN mkdir -p /install/bin
+
+# Real ik_llama.cpp build; runs install-ik-llama.sh at the requested ref with
+# the ccache and build directories kept in BuildKit cache mounts.
+FROM builder-base-cuda AS ik-llama-cuda
+ARG IK_LLAMA_COMMIT_HASH=main
+COPY install-ik-llama.sh /build/
+RUN --mount=type=cache,id=ccache-cuda,target=/ccache \
+    --mount=type=cache,id=ik-llama-cuda,target=/src/ik_llama.cpp/build \
+    bash /build/install-ik-llama.sh "${IK_LLAMA_COMMIT_HASH}"
+
+# NOTE(review): this ARG sits inside the ik-llama-cuda stage. The FROM below
+# presumably resolves ${BACKEND} from the global `ARG BACKEND` declared before
+# the first FROM, which would make this line redundant — confirm.
+ARG BACKEND=cuda
+# Alias whichever ik-llama-<backend> stage was selected as ik-llama-build
+FROM ik-llama-${BACKEND} AS ik-llama-build
+
+# ── Download llama-swap release binary ────────────────────────────────
+
+FROM builder-base AS llama-swap-download
+ARG LS_VERSION=latest
+COPY install-llama-swap.sh /build/
+RUN bash /build/install-llama-swap.sh "${LS_VERSION}"
+
+# ── Runtime bases ─────────────────────────────────────────────────────
+
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS runtime-cuda
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+ENV PATH="/usr/local/bin:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgomp1 python3 curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# CUDA stub drivers for container compatibility
+COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so
+COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# ──
+
+FROM ubuntu:24.04 AS runtime-vulkan
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PATH="/usr/local/bin:${PATH}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgomp1 libvulkan1 mesa-vulkan-drivers \
+    python3 curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── Select runtime base by BACKEND ────────────────────────────────────
+
+FROM runtime-${BACKEND} AS runtime
+
+ARG BACKEND=cuda
+ARG LLAMA_COMMIT_HASH=unknown
+ARG WHISPER_COMMIT_HASH=unknown
+ARG SD_COMMIT_HASH=unknown
+ARG IK_LLAMA_COMMIT_HASH=unknown
+ARG RUN_UID=0
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-numpy python3-sentencepiece python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user when RUN_UID != 0
+RUN if [ "$RUN_UID" != "0" ]; then \
+      groupadd --system --gid $RUN_UID llama-swap && \
+      useradd --system --uid $RUN_UID --gid $RUN_UID \
+        --home /app --shell /sbin/nologin llama-swap; \
+    fi && \
+    mkdir -p /etc/llama-swap/config && \
+    chown -R ${RUN_UID}:${RUN_UID} /etc/llama-swap
+
+WORKDIR /app
+
+# Copy whisper.cpp binaries and libraries
+COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/
+COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/
+COPY --from=whisper-build /install/lib/ /usr/local/lib/
+
+# Copy stable-diffusion.cpp binaries and libraries
+COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
+COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
+COPY --from=sd-build /install/lib/ /usr/local/lib/
+
+# Copy llama.cpp binaries (statically linked)
+COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
+COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
+
+# Copy ik-llama-server (CUDA only; empty copy for vulkan)
+COPY --from=ik-llama-build /install/bin/ /usr/local/bin/
+
+# Install uv
+RUN pip install uv --break-system-packages
+
+# Copy llama-swap binary
+COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
+COPY --from=llama-swap-download /install/llama-swap-version /tmp/
+
+RUN ldconfig
+
+COPY config.example.yaml /etc/llama-swap/config/config.yaml
+
+# Version tracking
+RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
+    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
+    echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
+    echo "ik_llama.cpp: ${IK_LLAMA_COMMIT_HASH}" >> /versions.txt && \
+    echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
+    echo "backend: ${BACKEND}" >> /versions.txt && \
+    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
+
+RUN mkdir -p /models && chown ${RUN_UID}:${RUN_UID} /models
+WORKDIR /models
+USER ${RUN_UID}
+ENTRYPOINT ["llama-swap"]
+CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
@@ -0,0 +1,8 @@
+# Unified Docker Container
+
+These scripts create a custom llama-swap container that contains:
+
+- llama-server for LLMs, rerank and embedding model support
+- sd-server (stable-diffusion.cpp) for image generation
+- whisper-server (whisper.cpp) for automatic speech recognition (ASR)
+
@@ -0,0 +1,303 @@
+#!/bin/bash
+#
+# Build script for unified container with version pinning
+#
+# Usage:
+#   ./build-image.sh --cuda                              # Build CUDA image
+#   ./build-image.sh --vulkan                            # Build Vulkan image
+#   ./build-image.sh --cuda --no-cache                   # Build without cache
+#   LLAMA_REF=0123abc ./build-image.sh --vulkan          # Pin llama.cpp to a commit hash (full SHA or 7+ hex chars)
+#   LLAMA_REF=v1.2.3 ./build-image.sh --cuda             # Pin llama.cpp to a tag
+#   WHISPER_REF=v1.0.0 ./build-image.sh --vulkan         # Pin whisper.cpp to a tag
+#   SD_REF=master ./build-image.sh --cuda                # Pin stable-diffusion.cpp to a branch
+#   LS_VERSION=170 ./build-image.sh --cuda               # Override llama-swap version
+#   IK_LLAMA_REF=main ./build-image.sh --cuda            # Pin ik_llama.cpp to main branch (CUDA only)
+#
+
+set -euo pipefail
+
+BACKEND=""
+NO_CACHE=false
+
+# Parse command-line flags. Flags may appear in any order; if both backend
+# flags are given, the last one wins. --help prints usage and exits 0.
+for arg in "$@"; do
+    case "$arg" in
+        --cuda)
+            BACKEND="cuda"
+            ;;
+        --vulkan)
+            BACKEND="vulkan"
+            ;;
+        --no-cache)
+            NO_CACHE=true
+            ;;
+        --help|-h)
+            echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+            echo ""
+            echo "Options:"
+            echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
+            echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
+            echo "  --no-cache  Force rebuild without using Docker cache"
+            echo "  --help, -h  Show this help message"
+            echo ""
+            echo "Environment variables:"
+            echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:unified-cuda or llama-swap:unified-vulkan)"
+            echo "  LLAMA_REF            Pin llama.cpp to a commit, tag, or branch"
+            echo "  WHISPER_REF          Pin whisper.cpp to a commit, tag, or branch"
+            echo "  SD_REF               Pin stable-diffusion.cpp to a commit, tag, or branch"
+            echo "  IK_LLAMA_REF         Pin ik_llama.cpp to a commit, tag, or branch (CUDA only)"
+            echo "  LS_VERSION           Override llama-swap version (e.g., '170' or 'latest')"
+            exit 0
+            ;;
+        *)
+            # Unknown arguments used to be dropped without a trace, so a typo
+            # such as "--nocache" quietly produced a cached build. The build
+            # still proceeds (existing invocations keep working), but the
+            # mistake is now visible on stderr.
+            echo "Warning: ignoring unrecognized argument '${arg}' (see --help)" >&2
+            ;;
+    esac
+done
+
+if [[ -z "$BACKEND" ]]; then
+    echo "Error: No backend specified. Please use --cuda or --vulkan."
+    echo ""
+    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
+    exit 1
+fi
+
+DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-llama-swap:unified-${BACKEND}}"
+
+# Git repository URLs
+LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
+WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
+SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
+LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
+IK_LLAMA_REPO="https://github.com/ikawrakow/ik_llama.cpp.git"
+
+# Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
+# Requires only: git, network access to the remote.
+#
+# Arguments:
+#   $1 - repository URL handed to `git ls-remote`
+#   $2 - ref to resolve: a full 40-char SHA, a tag name, a branch name, or a
+#        short hash of 7 or more lowercase hex characters
+# Output:
+#   The resolved hash on stdout.
+# Returns:
+#   0 on success; 1 after printing an explanation to stderr when nothing matched.
+resolve_ref() {
+    local repo_url="$1"
+    local ref="$2"
+
+    # Full 40-char SHA — use as-is (no network round trip, no existence check)
+    if [[ "${ref}" =~ ^[0-9a-f]{40}$ ]]; then
+        echo "${ref}"
+        return
+    fi
+
+    # Ask the remote for a tag or a branch with exactly this name. Both
+    # patterns go out in one query and only the first output line is used.
+    # NOTE(review): if a tag and a branch share the name, which one wins
+    # depends on git's output order — confirm that matches the intent.
+    # NOTE(review): for an annotated tag this is presumably the tag object's
+    # hash rather than the commit's — confirm downstream consumers accept that.
+    local hash
+    hash=$(git ls-remote "${repo_url}" "refs/tags/${ref}" "refs/heads/${ref}" 2>/dev/null | head -1 | cut -f1)
+    if [[ -n "${hash}" ]]; then
+        echo "${hash}"
+        return
+    fi
+
+    # Short hash (7+ chars): scan all refs for a SHA with this prefix.
+    # Only hashes that appear in the `git ls-remote` listing can match here.
+    if [[ "${ref}" =~ ^[0-9a-f]{7,}$ ]]; then
+        hash=$(git ls-remote "${repo_url}" 2>/dev/null | grep "^${ref}" | head -1 | cut -f1)
+        if [[ -n "${hash}" ]]; then
+            echo "${hash}"
+            return
+        fi
+    fi
+
+    # Nothing matched: explain why on stderr so stdout stays clean for callers
+    # that capture it with $(...).
+    echo "ERROR: Could not resolve ref '${ref}' for ${repo_url}" >&2
+    if [[ "${ref}" =~ ^[0-9a-f]+$ && ${#ref} -lt 7 ]]; then
+        echo "  Short hashes must be at least 7 characters (got ${#ref})." >&2
+    else
+        echo "  Tried: tag, branch, git ls-remote prefix match" >&2
+    fi
+    echo "  Use a full 40-char SHA, a tag name, a branch name, or a 7+ char short hash." >&2
+    return 1
+}
+
+# Resolve HEAD of a repo without needing to know the default branch name.
+#
+# Arguments:
+#   $1 - repository URL
+# Output:
+#   The hash the remote reports for HEAD, or nothing when the lookup fails.
+# Returns:
+#   Always 0. This script runs under `set -euo pipefail`, so a failing
+#   `git ls-remote` would otherwise make a plain `VAR=$(get_latest_hash ...)`
+#   assignment abort the script on the spot — silently, because git's stderr is
+#   discarded — before the caller's empty-result check can print its error.
+get_latest_hash() {
+    git ls-remote "${1}" HEAD 2>/dev/null | head -1 | cut -f1 || true
+}
+
+echo "=========================================="
+echo "llama-swap Unified Build (${BACKEND})"
+echo "=========================================="
+echo ""
+
+# Pin one upstream component to a commit hash and report the outcome.
+#   $1 - component name used in log and error messages
+#   $2 - repository URL
+#   $3 - requested ref; an empty string means "use the remote HEAD"
+#   $4 - name of the global variable that receives the resolved hash
+# Exits the script with status 1 when the ref cannot be resolved.
+pin_component() {
+    local label="$1" url="$2" wanted="$3" dest="$4" sha
+
+    if [[ -z "${wanted}" ]]; then
+        sha=$(get_latest_hash "${url}")
+        if [[ -z "${sha}" ]]; then
+            echo "ERROR: Could not determine latest commit for ${label}" >&2
+            exit 1
+        fi
+        echo "${label}: latest HEAD: ${sha}"
+    else
+        sha=$(resolve_ref "${url}" "${wanted}") || exit 1
+        echo "${label}: ${wanted} -> ${sha}"
+    fi
+
+    # Hand the result back through the caller-named global
+    printf -v "${dest}" '%s' "${sha}"
+}
+
+pin_component "llama.cpp" "${LLAMA_REPO}" "${LLAMA_REF:-}" LLAMA_HASH
+pin_component "whisper.cpp" "${WHISPER_REPO}" "${WHISPER_REF:-}" WHISPER_HASH
+pin_component "stable-diffusion.cpp" "${SD_REPO}" "${SD_REF:-}" SD_HASH
+
+# ik_llama.cpp is resolved for CUDA builds only; other builds record "n/a"
+if [[ "$BACKEND" != "cuda" ]]; then
+    IK_LLAMA_HASH="n/a"
+    echo "ik_llama.cpp: skipped (vulkan build)"
+else
+    pin_component "ik_llama.cpp" "${IK_LLAMA_REPO}" "${IK_LLAMA_REF:-}" IK_LLAMA_HASH
+fi
+
+# Resolve llama-swap ref
+if [[ -n "${LS_VERSION:-}" ]]; then
+    LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
+    echo "llama-swap: ${LS_VERSION} -> ${LS_HASH}"
+else
+    LS_HASH=$(get_latest_hash "${LLAMA_SWAP_REPO}")
+    if [[ -z "${LS_HASH}" ]]; then
+        echo "ERROR: Could not determine latest commit for llama-swap" >&2
+        exit 1
+    fi
+    echo "llama-swap: latest HEAD: ${LS_HASH}"
+fi
+
+echo ""
+echo "=========================================="
+echo "Starting Docker build..."
+echo "=========================================="
+echo ""
+
+# Directory containing this script; used both as the Docker build context and
+# to locate the Dockerfile, so the script works from any working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Arguments for `docker buildx build`: every resolved hash is passed as a
+# build arg, followed by the image tag and the Dockerfile path.
+BUILD_ARGS=(
+    --build-arg "BACKEND=${BACKEND}"
+    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
+    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
+    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
+    --build-arg "IK_LLAMA_COMMIT_HASH=${IK_LLAMA_HASH}"
+    --build-arg "LS_VERSION=${LS_HASH}"
+    -t "${DOCKER_IMAGE_TAG}"
+    -f "${SCRIPT_DIR}/Dockerfile"
+)
+
+# Cache policy: --no-cache takes precedence; otherwise a registry-backed cache
+# is used only when GITHUB_ACTIONS is "true" and ACT is not "true".
+# NOTE(review): ACT is presumably set by a local GitHub Actions runner that
+# cannot push to the registry — confirm.
+if [[ "$NO_CACHE" == true ]]; then
+    BUILD_ARGS+=(--no-cache)
+    echo "Note: Building without cache"
+elif [[ "${GITHUB_ACTIONS:-}" == "true" && "${ACT:-}" != "true" ]]; then
+    CACHE_REF="ghcr.io/mostlygeek/llama-swap:unified-${BACKEND}-cache"
+    BUILD_ARGS+=(
+        --cache-from "type=registry,ref=${CACHE_REF}"
+        --cache-to "type=registry,ref=${CACHE_REF},mode=max"
+    )
+    echo "Note: Using registry cache (${CACHE_REF})"
+fi
+
+# --load puts the result into the local image store, which the verification
+# steps below rely on when they `docker run` the tag.
+DOCKER_BUILDKIT=1 docker buildx build --load "${BUILD_ARGS[@]}" "${SCRIPT_DIR}"
+
+echo ""
+echo "=========================================="
+echo "Verifying build artifacts..."
+echo "=========================================="
+echo ""
+
+# Succeeds when the named executable is found by `which` inside the built image.
+image_has_binary() {
+    docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "$1" >/dev/null 2>&1
+}
+
+# Executables every image must ship; CUDA images also carry ik-llama-server.
+EXPECTED_BINARIES=(llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap)
+[[ "$BACKEND" != "cuda" ]] || EXPECTED_BINARIES+=(ik-llama-server)
+
+# Probe each one and remember the ones that are absent.
+MISSING_BINARIES=()
+for binary in "${EXPECTED_BINARIES[@]}"; do
+    image_has_binary "${binary}" || MISSING_BINARIES+=("${binary}")
+done
+
+if (( ${#MISSING_BINARIES[@]} > 0 )); then
+    echo "ERROR: Build succeeded but the following binaries are missing:"
+    printf '  - %s\n' "${MISSING_BINARIES[@]}"
+    echo ""
+    echo "Try running with --no-cache flag:"
+    echo "  ./build-image.sh --${BACKEND} --no-cache"
+    exit 1
+fi
+
+# Report the checked names in order, joined with ", ".
+VERIFIED_LIST=$(printf '%s, ' "${EXPECTED_BINARIES[@]}")
+VERIFIED_LIST="${VERIFIED_LIST%, }"
+echo "All expected binaries verified: ${VERIFIED_LIST}"
+
+echo ""
+echo "=========================================="
+echo "Building rootless image..."
+echo "=========================================="
+echo ""
+
+ROOTLESS_TAG="${DOCKER_IMAGE_TAG}-rootless"
+docker buildx build --load -t "${ROOTLESS_TAG}" - <<EOF
+FROM ${DOCKER_IMAGE_TAG}
+USER root
+RUN groupadd --system --gid 10001 llama-swap && \\
+    useradd --system --uid 10001 --gid 10001 \\
+      --home /app --shell /sbin/nologin llama-swap && \\
+    chown -R 10001:10001 /etc/llama-swap /models
+USER 10001
+EOF
+
+echo "Rootless image built: ${ROOTLESS_TAG}"
+
+echo ""
+echo "=========================================="
+echo "Build complete!"
+echo "=========================================="
+echo ""
+echo "Image tags:"
+echo "  ${DOCKER_IMAGE_TAG}"
+echo "  ${ROOTLESS_TAG}"
+echo ""
+echo "Built with:"
+echo "  llama.cpp:            ${LLAMA_HASH}"
+echo "  whisper.cpp:          ${WHISPER_HASH}"
+echo "  stable-diffusion.cpp: ${SD_HASH}"
+if [[ "$BACKEND" == "cuda" ]]; then
+    echo "  ik_llama.cpp:         ${IK_LLAMA_HASH}"
+fi
+echo "  llama-swap:           $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
+echo ""
+if [[ "$BACKEND" == "vulkan" ]]; then
+    echo "Run with:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
+    echo ""
+    echo "Note: For AMD GPUs, you may also need:"
+    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
+else
+    echo "Run with:"
+    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
+fi
@@ -0,0 +1,33 @@
+# placeholder example configuration
+#
+# All binaries referenced below are installed on PATH (/usr/local/bin) in the
+# unified image, so they are invoked by name. Every command listens on the
+# ${PORT} macro so llama-swap's default proxy target reaches the process; a
+# hard-coded port would additionally require an explicit `proxy` setting.
+healthCheckTimeout: 300
+logRequests: true
+
+models:
+  "llama":
+    cmd: >
+      llama-server
+      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+      --port ${PORT}
+
+  "whisper":
+    checkEndpoint: /v1/audio/transcriptions/
+    # the model flag is -m (long form --model)
+    cmd: >
+      whisper-server
+      --port ${PORT}
+      -m /models/whisper.bin
+      --flash-attn
+      --request-path /v1/audio/transcriptions --inference-path ""
+
+  "image":
+    checkEndpoint: /
+    cmd: |
+      sd-server
+      --listen-port ${PORT}
+      --diffusion-fa
+      --diffusion-model /models/z_image_turbo-Q8_0.gguf
+      --vae /models/ae.safetensors
+      --llm /models/qwen3-4b-instruct-2507-q8_0.gguf
+      --offload-to-cpu
+      --cfg-scale 1.0
+      --height 512 --width 512
+      --steps 8
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Install ik_llama.cpp - clone, build, and install binaries
+# Usage: ./install-ik-llama.sh <commit_hash>
+# Note: CUDA only; always built against builder-base-cuda
+set -e
+
+COMMIT_HASH="${1:-main}"
+
+mkdir -p /install/bin
+
+# Clone and checkout (init-based so cache-mounted build dir doesn't break clone)
+echo "=== Cloning ik_llama.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/ik_llama.cpp
+cd /src/ik_llama.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ikawrakow/ik_llama.cpp.git
+fi
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DBUILD_SHARED_LIBS=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DGGML_CUDA=ON
+    "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+    "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+    "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda -Wl,--allow-shlib-undefined"
+)
+
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building ik_llama.cpp ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target llama-server
+
+if [ ! -f "build/bin/llama-server" ]; then
+    echo "FATAL: llama-server not found in build/bin/" >&2
+    exit 1
+fi
+
+# Install as ik-llama-server to avoid collision with llama.cpp's llama-server
+cp "build/bin/llama-server" "/install/bin/ik-llama-server"
+echo "=== ik_llama.cpp build complete ==="
+ls -la /install/bin/
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Install llama-swap - download latest release binary from GitHub
+# Usage: ./install-llama-swap.sh [version]
+#   version: release version number (e.g., "170") or "latest" (default)
+set -e
+
+VERSION="${1:-latest}"
+REPO="mostlygeek/llama-swap"
+
+mkdir -p /install/bin
+
+# If a full commit hash is given, find the release tag that points to it.
+#
+# `git ls-remote --tags` lists a lightweight tag as "<commit>  refs/tags/<tag>",
+# while an annotated tag appears as "<tag-object>  refs/tags/<tag>" followed by
+# the peeled entry "<commit>  refs/tags/<tag>^{}". The commit hash of an
+# annotated tag therefore only ever matches the "^{}" line, so that suffix is
+# stripped from the name instead of discarding the line; otherwise such a
+# release would silently fall back to "latest".
+if echo "${VERSION}" | grep -qE '^[0-9a-f]{40}$'; then
+    echo "=== Resolving commit ${VERSION:0:7} to release tag ==="
+    TAG=$(git ls-remote --tags "https://github.com/${REPO}.git" 2>/dev/null \
+        | grep "^${VERSION}" | sed -e 's|.*refs/tags/||' -e 's|\^{}$||' | head -1)
+    if [ -n "${TAG}" ]; then
+        echo "Resolved to tag: ${TAG}"
+        VERSION="${TAG#v}"
+    else
+        # No tag points at this commit (or the lookup failed): use the newest release
+        echo "No release tag found for commit ${VERSION:0:7}, using latest"
+        VERSION="latest"
+    fi
+fi
+
+# Strip leading 'v' prefix so both "198" and "v198" work
+VERSION="${VERSION#v}"
+
+# Resolve "latest" to actual version number
+if [ "$VERSION" = "latest" ]; then
+    echo "=== Resolving latest llama-swap release ==="
+    VERSION=$(curl -fsSL "https://api.github.com/repos/${REPO}/releases/latest" \
+        | grep '"tag_name"' | head -1 | cut -d'"' -f4 | sed 's/^v//')
+    if [ -z "$VERSION" ]; then
+        echo "FATAL: Could not determine latest release version" >&2
+        exit 1
+    fi
+    echo "Latest version: ${VERSION}"
+fi
+
+
+ARCH=$(uname -m)
+case "$ARCH" in
+    x86_64) ARCH="amd64" ;;
+    aarch64|arm64) ARCH="arm64" ;;
+    *) echo "FATAL: Unsupported architecture: $ARCH" >&2; exit 1 ;;
+esac
+
+# Download and extract
+URL="https://github.com/${REPO}/releases/download/v${VERSION}/llama-swap_${VERSION}_linux_${ARCH}.tar.gz"
+echo "=== Downloading llama-swap v${VERSION} ==="
+echo "URL: $URL"
+curl -fSL -o /tmp/llama-swap.tar.gz "$URL"
+tar -xzf /tmp/llama-swap.tar.gz -C /install/bin/
+rm /tmp/llama-swap.tar.gz
+
+# Validate
+if [ ! -x "/install/bin/llama-swap" ]; then
+    echo "FATAL: llama-swap binary not found or not executable" >&2
+    ls -la /install/bin/ >&2
+    exit 1
+fi
+
+echo "$VERSION" > /install/llama-swap-version
+
+echo "=== llama-swap v${VERSION} installed ==="
+ls -la /install/bin/llama-swap
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Install llama.cpp - clone, build, and install binaries
+# Usage: BACKEND=cuda|vulkan ./install-llama.sh <commit_hash>
+set -e
+
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin
+
+# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
+echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/llama.cpp
+cd /src/llama.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ggml-org/llama.cpp.git
+fi
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DBUILD_SHARED_LIBS=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DLLAMA_BUILD_TESTS=OFF
+)
+
+# Backend-specific cmake flags. Exactly one GPU backend is enabled.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        # ${VAR:?msg} aborts the script when CMAKE_CUDA_ARCHITECTURES is unset or empty
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+    )
+else
+    # Any other value used to fall through and configure a build with no GPU
+    # backend at all, while still reporting "Building llama.cpp for <value>".
+    echo "FATAL: Unsupported BACKEND '${BACKEND}' (expected 'cuda' or 'vulkan')" >&2
+    exit 1
+fi
+
+TARGETS=(llama-cli llama-server)
+
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building llama.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+for bin in "${TARGETS[@]}"; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+echo "=== llama.cpp build complete ==="
+ls -la /install/bin/
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Install stable-diffusion.cpp - clone, build, and install binaries and library
+# Usage: BACKEND=cuda|vulkan ./install-sd.sh <commit_hash>
+set -e
+
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin /install/lib
+
+# Clone and checkout (init-based so cache-mounted /src/stable-diffusion.cpp/build dir doesn't break clone)
+echo "=== Cloning stable-diffusion.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/stable-diffusion.cpp
+cd /src/stable-diffusion.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/leejet/stable-diffusion.cpp.git
+fi
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+git submodule update --init --recursive --depth=1
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    -DSD_BUILD_EXAMPLES=ON
+)
+
+# Backend-specific cmake flags. Exactly one GPU backend is enabled.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        # ${VAR:?msg} aborts the script when CMAKE_CUDA_ARCHITECTURES is unset or empty
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        -DSD_CUDA=ON
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+        -DSD_VULKAN=ON
+    )
+else
+    # Any other value used to fall through and configure a build with no GPU
+    # backend at all, while still reporting "Building ... for <value>".
+    echo "FATAL: Unsupported BACKEND '${BACKEND}' (expected 'cuda' or 'vulkan')" >&2
+    exit 1
+fi
+
+TARGETS=(stable-diffusion sd-cli sd-server)
+
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building stable-diffusion.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+for bin in sd-cli sd-server; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
+
+echo "=== stable-diffusion.cpp build complete ==="
+ls -la /install/bin/ /install/lib/
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Install whisper.cpp - clone, build, and install binaries
+# Usage: BACKEND=cuda|vulkan ./install-whisper.sh <commit_hash>
+set -e
+
+COMMIT_HASH="${1:-master}"
+BACKEND="${BACKEND:-cuda}"
+
+mkdir -p /install/bin /install/lib
+
+# Clone and checkout (init-based so cache-mounted /src/whisper.cpp/build dir doesn't break clone)
+echo "=== Cloning whisper.cpp at ${COMMIT_HASH} ==="
+mkdir -p /src/whisper.cpp
+cd /src/whisper.cpp
+if [ ! -d .git ]; then
+    git init
+    git remote add origin https://github.com/ggml-org/whisper.cpp.git
+fi
+git fetch --depth=1 origin "${COMMIT_HASH}"
+git checkout FETCH_HEAD
+
+# Common cmake flags
+CMAKE_FLAGS=(
+    -DGGML_NATIVE=OFF
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+)
+
+# Backend-specific cmake flags. Exactly one GPU backend is enabled.
+if [ "$BACKEND" = "cuda" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=ON
+        -DGGML_VULKAN=OFF
+        # ${VAR:?msg} aborts the script when CMAKE_CUDA_ARCHITECTURES is unset or empty
+        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
+        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
+    )
+elif [ "$BACKEND" = "vulkan" ]; then
+    CMAKE_FLAGS+=(
+        -DGGML_CUDA=OFF
+        -DGGML_VULKAN=ON
+    )
+else
+    # Any other value used to fall through and configure a build with no GPU
+    # backend at all, while still reporting "Building whisper.cpp for <value>".
+    echo "FATAL: Unsupported BACKEND '${BACKEND}' (expected 'cuda' or 'vulkan')" >&2
+    exit 1
+fi
+
+TARGETS=(whisper-cli whisper-server)
+
+rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
+
+echo "=== Building whisper.cpp for ${BACKEND} ==="
+cmake -B build "${CMAKE_FLAGS[@]}"
+cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
+
+for bin in "${TARGETS[@]}"; do
+    if [ ! -f "build/bin/$bin" ]; then
+        echo "FATAL: $bin not found in build/bin/" >&2
+        exit 1
+    fi
+    cp "build/bin/$bin" "/install/bin/"
+done
+find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
+
+echo "=== whisper.cpp build complete ==="
+ls -la /install/bin/
@@ -22,7 +22,7 @@ models:
    cmd: llama-server --port ${PORT} -m /path/to/third_model.gguf
 ```

-With this configuration models will be hot swapped and loaded on demand. The special `${PORT}` macro provides a unique port per model. Useful if you want to run multiple models at the same time with the `groups` feature.
+With this configuration models will be hot swapped and loaded on demand. The special `${PORT}` macro provides a unique port per model which is useful if you want to run multiple models at the same time with the `matrix` feature.

 ## Advanced control with `cmd`

@@ -76,7 +76,7 @@ llama-swap supports many more features to customize how you want to manage your
 | --------- | ---------------------------------------------- |
 | `ttl`     | automatic unloading of models after a timeout  |
 | `macros`  | reusable snippets to use in configurations     |
-| `groups`  | run multiple models at a time                  |
+| `matrix`  | run multiple models at a time                  |
 | `hooks`   | event driven functionality                     |
 | `env`     | define environment variables per model         |
 | `aliases` | serve a model with different names             |
@@ -86,9 +86,12 @@ llama-swap supports many more features to customize how you want to manage your
 ## Full Configuration Example

 > [!NOTE]
-> This is a copy of `config.example.yaml`. Always check that for the most up to date examples.
+> Always check [config.example.yaml](https://github.com/mostlygeek/llama-swap/blob/main/config.example.yaml) for the most up to date reference for all example configurations.

 ```yaml
+# add this modeline for validation in vscode
+# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
+#
 # llama-swap YAML configuration example
 # -------------------------------------
 #
@@ -114,18 +117,72 @@ healthCheckTimeout: 500
 # - Valid log levels: debug, info, warn, error
 logLevel: info

+# logTimeFormat: enables and sets the logging timestamp format
+# - optional, default (disabled): ""
+# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
+#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
+#   "stamp", "stampmilli", "stampmicro", and "stampnano".
+# - For more info, read: https://pkg.go.dev/time#pkg-constants
+logTimeFormat: ""
+
+# logToStdout: controls what is logged to stdout
+# - optional, default: "proxy"
+# - valid values:
+#   - "proxy": logs generated by llama-swap when swapping models,
+#      handling requests, etc.
+#   - "upstream": a copy of the upstream process's stdout logs
+#   - "both": both the proxy and upstream logs interleaved together
+#   - "none": no logs are ever written to stdout
+logToStdout: "proxy"
+
 # metricsMaxInMemory: maximum number of metrics to keep in memory
 # - optional, default: 1000
 # - controls how many metrics are stored in memory before older ones are discarded
 # - useful for limiting memory usage when processing large volumes of metrics
 metricsMaxInMemory: 1000

+# captureBuffer: how many MBs to allocate for storing request/response captures
+# - optional, default: 10
+# - set to 0 to disable
+captureBuffer: 15
+
+# performance: configuration for system monitoring statistics
+# - timing values are duration strings like 1s, 1h30m, 90m, 2h10s, etc.
+performance:
+  # enable: boolean
+  # - default: false
+  enable: true
+
+  # every: delay between polling for new performance statistics
+  # - default: 5s
+  # - minimum duration 5s
+  every: 5s
+
 # startPort: sets the starting port number for the automatic ${PORT} macro.
 # - optional, default: 5800
 # - the ${PORT} macro can be used in model.cmd and model.proxy settings
 # - it is automatically incremented for every model that uses it
 startPort: 10001

+# sendLoadingState: inject loading status updates into the reasoning (thinking)
+# field
+# - optional, default: false
+# - when true, a stream of loading messages will be sent to the client in the
+#   reasoning field so chat UIs can show that loading is in progress.
+# - see #366 for more details
+sendLoadingState: true
+
+# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
+# - optional, default: false
+# - when true, model aliases will be output to the API model listing duplicating
+#   all fields except for Id so chat UIs can use the alias equivalent to the original.
+includeAliasesInList: false
+
+# globalTTL: the default TTL in seconds before unloading a model
+# - optional, default: 0 (never automatically unload)
+# - must be >= 0
+globalTTL: 0
+
 # macros: a dictionary of string substitutions
 # - optional, default: empty dictionary
 # - macros are reusable snippets
@@ -136,11 +193,13 @@ startPort: 10001
 # - macro names must not be a reserved name: PORT or MODEL_ID
 # - macro values can be numbers, bools, or strings
 # - macros can contain other macros, but they must be defined before they are used
+# - environment variables can be referenced with ${env.VAR_NAME} syntax
+#   - env macros are substituted first, before regular macros
+#   - if the env var is not set, config loading will fail with an error
 macros:
  # Example of a multi-line macro
  "latest-llama": >
-    /path/to/llama-server/llama-server-ec9e0301
-    --port ${PORT}
+    /path/to/llama-server/llama-server-ec9e0301 --port ${PORT}

  "default_ctx": 4096

@@ -148,6 +207,24 @@ macros:
  # but they must be previously declared.
  "default_args": "--ctx-size ${default_ctx}"

+  # Example of environment variable macros
+  # - ${env.VAR_NAME} pulls the value from the system environment
+  # - useful for paths, secrets, or machine-specific configuration
+  "models_dir": "${env.HOME}/models"
+
+# apiKeys: require an API key when making requests to inference endpoints
+# - optional, default: []
+# - when empty (the default) authorization will not be checked as llama-swap is default-allow
+# - each key is a non-empty string
+apiKeys:
+  - "sk-hunter2"
+  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
+  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"
+
+  # use environment variable macros to keep secrets out of the config
+  - "${env.API_KEY_1}"
+  - "${env.API_KEY_2}"
+
 # models: a dictionary of model configurations
 # - required
 # - each key is the model's ID, used in API requests
@@ -156,7 +233,7 @@ macros:
 # - below are examples of the all the settings a model can have
 models:
  # keys are the model names used in API requests
-  "llama":
+  "gpt-oss-120b":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
@@ -173,7 +250,7 @@ models:
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
-      --model path/to/llama-8B-Q4_K_M.gguf
+      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

@@ -181,13 +258,13 @@ models:
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    name: "llama 3.1 8B"
+    name: "gpt-oss 120B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    description: "A small but capable model used for quick testing"
+    description: "A thinking model from OpenAI"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
@@ -202,14 +279,6 @@ models:
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

-    # aliases: alternative model names that this model configuration is used for
-    # - optional, default: empty array
-    # - aliases must be unique globally
-    # - useful for impersonating a specific model
-    aliases:
-      - "gpt-4o-mini"
-      - "gpt-3.5-turbo"
-
    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
@@ -218,8 +287,10 @@ models:
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
-    # - optional, default: 0
-    # - ttl values must be a value greater than 0
+    # - optional, default: -1 (use global default)
+    # - ttl values must be a value greater than or equal to 0
+    # - a ttl of -1 will use the global TTL value as the default
+    # - a ttl of 0 will mean never unload
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

@@ -227,11 +298,11 @@ models:
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
-    useModelName: "qwen:qwq"
+    useModelName: "openai/gpt-oss-120B"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
-    # - only stripParams is currently supported
+    # - same capabilities as peer filters (stripParams, setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
@@ -241,6 +312,43 @@ models:
      # - recommended to stick to sampling parameters
      stripParams: "temperature, top_p, top_k"

+      # setParams: a dictionary of parameters to set/override in requests
+      # - optional, default: empty dictionary
+      # - useful for enforcing specific parameter values
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - always runs for the model
+      setParams:
+        # Example: enforce specific sampling parameters
+        temperature: 0.7
+        top_p: 0.9
+
+      # setParamsByID: a dictionary of parameters to set based on the model ID
+      # - optional, default: empty dictionary
+      # - combine with aliases to create variant behaviour without reloading the model
+      # - parameters are set in the request body JSON
+      # - runs after setParams, so it overrides any overlapping settings
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      # - model aliases will be automatically created for each key
+      setParamsByID:
+        "${MODEL_ID}":
+          chat_template_kwargs:
+            reasoning_effort: medium
+        "${MODEL_ID}:high":
+          chat_template_kwargs:
+            reasoning_effort: high
+        "${MODEL_ID}:low":
+          chat_template_kwargs:
+            reasoning_effort: low
+
+    # aliases: alternative model names that this model configuration is used for
+    # - optional, default: empty array
+    # - aliases must be unique globally
+    # - useful for impersonating a specific model
+    aliases:
+      - "gpt-4o-mini"
+
    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contain complex types it is recommended to keep it simple
@@ -251,7 +359,8 @@ models:

      # the ${temp} macro will remain a float
      temperature: ${temp}
-      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp}, context=${default_ctx}"
+      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp},
+        context=${default_ctx}"

      a_list:
        - 1
@@ -274,6 +383,26 @@ models:
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

+    # sendLoadingState: overrides the global sendLoadingState setting for this model
+    # - optional, default: undefined (use global setting)
+    sendLoadingState: false
+
+    # timeouts: configure proxy connection timeouts for this model
+    # - optional, defaults shown below
+    # - useful for models running on slower hardware that need longer timeouts
+    # - connect: TCP dial connection timeout in seconds, default: 30 seconds
+    # - keepalive: TCP connection keepalive timeout in seconds, default: 30 seconds
+    # - responseHeader: time to wait for response headers in seconds, default: 0 (no timeout)
+    # - tlsHandshake: TLS handshake timeout in seconds, default: 10 seconds
+    # - idleConn: idle connection timeout in seconds, default: 90 seconds
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 0
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
@@ -305,68 +434,83 @@ models:
    # - processes have 5 seconds to shutdown until forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}

-# groups: a dictionary of group settings
-# - optional, default: empty dictionary
-# - provides advanced controls over model swapping behaviour
-# - using groups some models can be kept loaded indefinitely, while others are swapped out
-# - model IDs must be defined in the Models section
-# - a model can only be a member of one group
-# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
-# - see issue #109 for details
+# =============================================================================
+# matrix: run concurrent models with a solver-based swap DSL
+# =============================================================================
 #
-# NOTE: the example below uses model names that are not defined above for demonstration purposes
-groups:
-  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
-  # to run a time across the whole llama-swap instance
-  "group1":
-    # swap: controls the model swapping behaviour in within the group
-    # - optional, default: true
-    # - true : only one model is allowed to run at a time
-    # - false: all models can run together, no swapping
-    swap: true
+# Note:
+# A config must use either a matrix or legacy groups, not both. A configuration error
+# will occur if both are defined. Configuration examples for legacy Groups can be found at:
+# https://github.com/mostlygeek/llama-swap/blob/40e39f7/config.example.yaml#L334-L396
+#
+# The matrix declares valid combinations of models that can run concurrently.
+# When a model is requested, the solver finds the cheapest way to make it
+# available by evicting as few (and least costly) running models as possible.
+#
+# Solver behavior:
+#   1. Request arrives for model X
+#   2. If X is already running, forward immediately. Done.
+#   3. Find all sets containing X
+#   4. For each candidate set, compute cost: sum of evict_costs for
+#      every running model NOT in that set
+#   5. Pick lowest cost candidate. Ties broken by definition order.
+#   6. Evict what needs to stop. Start X. Forward request.
+#
+# Subset semantics: a set [a, b, c] means any subset is valid.
+# Only the requested model is started — others are not preloaded.
+#
+# A model not appearing in any set can only run alone.
+#
+matrix:
+  # vars: short names for models (alphanumeric, 1-8 chars)
+  # - required for sets and evict_costs settings
+  # - each entry maps a short name to a real model ID. Do not use an alias
+  # - used to keep set DSL logic short and easier to read
+  # - sets and evict_costs only use identifiers defined in vars
+  vars:
+    g: gemma-model
+    q: qwen-model
+    m: mistral-model
+    v: voxtral-model
+    e: reranker-model
+    L: llama-70B
+    sd: stable-diffusion

-    # exclusive: controls how the group affects other groups
-    # - optional, default: true
-    # - true: causes all other groups to unload when this group runs a model
-    # - false: does not affect other groups
-    exclusive: true
+  # evict_costs: relative cost of losing a running model (default: 1)
+  evict_costs:
+    v: 50 # vllm backend, slow cold start
+    L: 30 # 70B weights, slow to load

-    # members references the models defined above
-    # required
-    members:
-      - "llama"
-      - "qwen-unlisted"
+  # sets: named sets of concurrent model combinations
+  # Values are DSL strings with operators:
+  #   &     AND (models run together)
+  #   |     OR  (alternatives)
+  #   ()    grouping
+  #   +ref  inline another set's expression
+  #
+  # Expansion examples:
+  #   "L"                  → [L]
+  #   "a & b"              → [a, b]
+  #   "a | b"              → [a], [b]
+  #   "(a | b) & c"        → [a, c], [b, c]
+  #   "(a | b) & (c | d)"  → [a,c], [a,d], [b,c], [b,d]
+  #   "+llms & v"          → expands llms inline, then applies & v
+  sets:
+    # LLM + TTS: switching between g/q/m won't evict v
+    # expands to: [g,v], [q,v], [m,v]
+    standard: "(g | q | m) & v"

-  # Example:
-  # - in group2 all models can run at the same time
-  # - when a different group is loaded it causes all running models in this group to unload
-  "group2":
-    swap: false
+    # LLM + TTS + reranker
+    # expands to: [g,v,e], [q,v,e]
+    with_rerank: "(g | q) & v & e"

-    # exclusive: false does not unload other groups when a model in group2 is requested
-    # - the models in group2 will be loaded but will not unload any other groups
-    exclusive: false
-    members:
-      - "docker-llama"
-      - "modelA"
-      - "modelB"
+    # LLM + image generation, no TTS
+    # expands to: [g,sd], [q,sd]
+    creative: "(g | q) & sd"

-  # Example:
-  # - a persistent group, prevents other groups from unloading it
-  "forever":
-    # persistent: prevents over groups from unloading the models in this group
-    # - optional, default: false
-    # - does not affect individual model behaviour
-    persistent: true
-
-    # set swap/exclusive to false to prevent swapping inside the group
-    # and the unloading of other groups
-    swap: false
-    exclusive: false
-    members:
-      - "forever-modelA"
-      - "forever-modelB"
-      - "forever-modelc"
+    # 70B model uses all GPUs, can only run alone
+    # expands to: [L]
+    full: "L"

 # hooks: a dictionary of event triggers and actions
 # - optional, default: empty dictionary
@@ -383,4 +527,68 @@ hooks:
    #   otherwise models will be loaded and swapped out
    preload:
      - "llama"
+
+# peers: a dictionary of remote peers and models they provide
+# - optional, default: empty dictionary
+# - peers can be another llama-swap
+# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
+peers:
+  # the key is the peer's ID
+  llama-swap-peer:
+    # proxy: a valid base URL to proxy requests to
+    # - required
+    # - requested path to llama-swap will be appended to the end of the proxy value
+    proxy: http://192.168.1.23
+    # models: a list of models served by the peer
+    # - required
+    models:
+      - model_a
+      - model_b
+      - embeddings/model_c
+  openrouter:
+    proxy: https://openrouter.ai/api
+    # apiKey: a string key to be injected into the request
+    # - optional, default: ""
+    # - if blank, no key will be added to the request
+    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
+    # - can be a string or a macro
+    apiKey: ${env.OPENROUTER_API_KEY}
+    models:
+      - meta-llama/llama-3.1-8b-instruct
+      - qwen/qwen3-235b-a22b-2507
+      - deepseek/deepseek-v3.2
+      - z-ai/glm-4.7
+      - moonshotai/kimi-k2-0905
+      - minimax/minimax-m2.1
+    # timeouts: configure proxy connection timeouts for this peer
+    # - optional, defaults shown below
+    # - useful when the peer runs on slower hardware
+    # - set any value to 0 to disable that timeout (not recommended)
+    timeouts:
+      connect: 30
+      keepalive: 30
+      responseHeader: 60
+      tlsHandshake: 10
+      idleConn: 90
+
+    # filters: a dictionary of filter settings for peer requests
+    # - optional, default: empty dictionary
+    # - same capabilities as model filters (stripParams, setParams)
+    filters:
+      # stripParams: a comma separated list of parameters to remove from the request
+      # - optional, default: ""
+      # - useful for removing parameters that the peer doesn't support
+      # - the `model` parameter can never be removed
+      stripParams: "temperature, top_p"
+
+      # setParams: a dictionary of parameters to set/override in requests to this peer
+      # - optional, default: empty dictionary
+      # - useful for injecting provider-specific settings like data retention policies
+      # - protected params like "model" cannot be overridden
+      # - values can be strings, numbers, booleans, arrays, or objects
+      setParams:
+        # Example: enforce zero-data-retention for OpenRouter
+        provider:
+          data_collection: "deny"
+          zdr: true
 ```
@@ -0,0 +1,264 @@
+# New Router Migration TODO
+
+This document tracks the work needed for [cmd/newrouter/main.go](../cmd/newrouter/main.go) and [internal/router/](../internal/router/) to reach feature parity with the legacy entrypoint at [llama-swap.go](../llama-swap.go) plus [proxy/proxymanager.go](../proxy/proxymanager.go).
+
+The work is split into phases so each can land and be tested independently. Earlier phases unblock later ones.
+
+## Current state (newrouter)
+
+`cmd/newrouter` already supports:
+
+- Loading config via `-config`
+- Selecting Matrix vs Group router based on config
+- Peer routing fallback
+- Plain HTTP listen (`-listen`)
+- Graceful shutdown on `SIGINT` / `SIGTERM`
+- Model extraction from JSON body, query string, and form bodies (see [router.go:88](../internal/router/router.go#L88))
+- `Server.ServeHTTP` dispatches a single request to peer or local router based on the requested model
+
+Everything below is missing or only partially implemented.
+
+---
+
+## Phase 1 — Package relocation -- Completed.
+
+Goal: move shared infrastructure packages out from under `proxy/` so the new router does not depend on the legacy proxy tree. This is a prerequisite for retiring `proxy/` in Phase 8.
+
+---
+
+## Phase 2 — Server lifecycle parity -- Completed.
+
+Goal: make `cmd/newrouter` a drop-in replacement for the legacy binary's process model, _without_ yet adding any extra HTTP endpoints.
+
+---
+
+## Phase 3 — `internal/chain` package -- Completed.
+
+API: `chain.New(mws...).Then(final)` for ServeMux registration; `Append` returns an extended Chain without mutating the receiver, so a base stack (auth/CORS) can be reused across many routes with per-route additions.
+
+---
+
+## Phase 4 — `internal/server` package scaffolding (ProxyManager replacement) -- Completed.
+
+Goal: build the [internal/server](../internal/server/) package so it can stand in for [proxy.ProxyManager](../proxy/proxymanager.go#L67) — the mux, lifecycle, model dispatch, custom endpoints, request filters, auth/CORS, and upstream passthrough. After this phase, `cmd/newrouter/main.go` constructs a `server.Server` instead of a bare `router.Server`.
+
+The legacy `ProxyManager` collapses three concerns into one struct: the HTTP mux, the model→process router, and the cross-cutting services (loggers, metrics, perf, inflight counter, version). The new layout keeps the `router.Router` implementations focused on model dispatch and lets `internal/server.Server` own the mux and all cross-cutting middleware. `server.Server` builds the `local` and `peer` routers directly and dispatches between them itself, so it fully **supersedes `internal/router.Server`** — see the cleanup item below.
+
+The phase is split into sub-phases that can land and be tested independently:
+
+| Sub-phase | Scope                                                                      |
+| --------- | -------------------------------------------------------------------------- |
+| 4a        | package scaffolding — struct, `New`, `ServeHTTP`, `Shutdown`, model routes |
+| 4b        | custom (non-model-dispatched) HTTP endpoints                               |
+| 4c        | request-body filter middleware                                             |
+| 4d        | auth & CORS middleware                                                     |
+| 4e        | upstream passthrough                                                       |
+
+The package is split by concern across stub files already in place:
+
+| File         | Responsibility                                  | Filled in by           |
+| ------------ | ----------------------------------------------- | ---------------------- |
+| `server.go`  | `Server` struct, `New`, `ServeHTTP`, `Shutdown` | 4a                     |
+| `log.go`     | `muxlog` combined logger; `/logs` handlers      | 4a                     |
+| `auth.go`    | `CreateAuthMiddleware`                          | 4d                     |
+| `filters.go` | request-body filter middleware                  | 4c                     |
+| `api.go`     | llama-swap-specific API handlers                | 4b / Phase 5 / Phase 6 |
+| `ui.go`      | embedded UI serving                             | Phase 7                |
+
+### Phase 4a — package scaffolding -- Completed.
+
+`server.Server` owns the mux, the `local`/`peer` routers, `muxlog`, and a
+shutdown context. `New` builds the routers, registers all model-dispatched
+routes on a stdlib `http.ServeMux`, and wraps the mux with the global CORS
+middleware. `localPeerHandler` resolves the model once via `router.FetchModel`
+and dispatches to `local` or `peer`. `Shutdown` stops both routers in parallel
+and is idempotent. `cmd/newrouter/main.go` now constructs `server.New(...)`;
+`internal/router/server.go` and `server_test.go` were removed as dead code.
+
+### Phase 4b — Custom HTTP endpoints -- Completed.
+
+`GET /v1/models` (local + peer models, aliases, metadata), `GET /health`,
+`GET /wol-health`, and `GET /` → `/ui` are registered. `GET /favicon.ico` is
+deferred to Phase 7 since it requires the embedded UI filesystem.
+
+### Phase 4c — Request-body filters -- Completed.
+
+`CreateFilterMiddleware` (in `filters.go`) applies `UseModelName`,
+`StripParams`, `SetParams`, and `SetParamsByID` to JSON requests, then
+re-attaches the body with `Content-Length` / `Transfer-Encoding` cleanup.
+
+### Phase 4d — Auth & CORS -- Completed.
+
+`CreateAuthMiddleware` validates API keys (Bearer / Basic / `x-api-key`) and
+strips the headers before upstream. `CreateCORSMiddleware` answers OPTIONS
+preflight; `/v1/models` echoes the `Origin`.
+
+### Phase 4e — Upstream passthrough -- Completed.
+
+`GET /upstream` → `/ui/models`, and `/upstream/<model>/<path>` proxies to the
+resolved model with multi-segment name resolution, canonical-form redirect
+(301/308), and prefix stripping.
+
+---
+
+## Phase 5 — Operations endpoints -- Completed.
+
+A new `router.LocalRouter` interface embeds `Router` and adds `RunningModels()`
+and `Unload(timeout, models...)`, both implemented once on `baseRouter` so
+`Group` and `Matrix` share them — the legacy matrix/group divergence at
+[proxymanager.go:1167](../proxy/proxymanager.go#L1167) collapses since
+`baseRouter` already unifies process storage. `Peer` does not implement it;
+`Server.local` is typed `LocalRouter`, `Server.peer` stays `Router`.
+
+`GET /unload` stops every local process; `GET /running` lists non-stopped
+processes joined against config for `cmd`/`proxy`/`ttl`/`name`/`description`.
+`startPreload` fires a background `GET /` at each `Hooks.OnStartup.Preload`
+model and emits `shared.ModelPreloadedEvent`.
+
+---
+
+## Phase 6 — Metrics, perf, and SSE -- Completed.
+
+`perf.Monitor` is created and started in `cmd/newrouter/main.go` (it outlives
+config reloads via `UpdateConfig`) and passed into `server.New`. `GET /metrics`
+serves `perf.Monitor.MetricsHandler()` output, 503 when disabled.
+
+`internal/process` emits `shared.ProcessStateChangeEvent` from `setState`.
+`server.inflightCounter` (atomic) + `CreateInflightMiddleware` track
+model-dispatched requests and emit `InFlightRequestsEvent`. `metricsMonitor`
+(in `metrics.go`) parses token usage from upstream responses via
+`CreateMetricsMiddleware`.
+
+The `/api` group (API-key protected) is registered: `POST /api/models/unload`,
+`POST /api/models/unload/{model...}`, `GET /api/events` (SSE: `modelStatus` /
+`logData` / `metrics` / `inflight`), `GET /api/metrics`, `GET /api/performance`
+(`?after=` RFC3339 filter), `GET /api/version`. `GET /api/captures/{id}`
+returns 501 until 6f.
+
+### Phase 6f — Request/response captures -- Completed.
+
+`proxy/cache` moved to `internal/cache`. `metricsMonitor` stores zstd+CBOR
+`ReqRespCapture` records in a sized `cache.Cache` (`captureBuffer` MB, 0
+disables). `CreateMetricsMiddleware` buffers request body/headers before
+dispatch; `record` builds the capture per a `captureFieldsByPath` table
+(`captures.go`) that trims large audio/image payloads, defaulting JSON routes
+to `captureAll`. `GET /api/captures/{id}` decompresses and returns the capture;
+`getMetrics` resolves `HasCapture` against the cache.
+
+---
+
+## Phase 7 — UI serving -- Completed.
+
+`internal/server/ui.go` embeds `ui_dist` and serves it. `GET /ui/` is
+brotli/gzip-aware via `serveCompressedFile`; unknown paths without a file
+extension fall back to `index.html` for SPA routing. `GET /favicon.ico` serves
+from the same embedded FS. The Makefile `ui` target copies the vite build into
+`internal/server/ui_dist`; a committed `placeholder.txt` keeps the embed valid
+before a build runs.
+
+---
+
+## Phase 8a - Review Part I
+
+- [x] All functionality from the proxy package has been migrated in the above phases — with the remaining gaps listed in Phase 8b
+- [x] Test coverage meets or exceeds the level from the proxy package — `internal/server` now at 76.6% vs 73.9% (`proxy`)
+
+### Findings
+
+**Gap 1 — Request logging middleware missing -- Resolved.**
+
+`CreateRequestLogMiddleware` ([log.go](../internal/server/log.go)) records one
+access-log line per request to `s.proxylog` in the legacy format
+`clientIP "METHOD PATH PROTO" status bodySize "UA" duration`, skipping
+`/wol-health`, `/api/performance`, and `/metrics`. A `statusRecorder` captures
+the status/body size (forwarding `Flush` for SSE) and `clientIP` honours
+`X-Forwarded-For` / `X-Real-IP`. It is wired as the outermost middleware in
+`routes()`, wrapping the CORS layer.
+
+**Gap 2 — Per-model log streaming not supported -- Resolved.**
+
+`Server.getLogger` ([log.go:50](../internal/server/log.go#L50)) only handles `""`, `"proxy"`, and `"upstream"`. The legacy `ProxyManager.getLogger` ([proxymanager_loghandlers.go:92](../proxy/proxymanager_loghandlers.go#L92)) additionally resolves a model ID against the active process groups / matrix and returns that process's logger. Callers of `GET /logs/stream/<modelID>` will get a 400 instead of the model's live log stream.
+
+**Gap 3 — `UseModelName` not applied to multipart form endpoints -- Resolved.**
+
+`CreateFormFilterMiddleware` ([filters.go](../internal/server/filters.go)) parses
+`multipart/form-data` requests, rewrites the `model` field with `UseModelName`,
+reconstructs the body via `rewriteMultipartModel`, and re-attaches it with
+`Content-Type` / `Content-Length` cleanup. It runs in `modelChain` after the
+JSON `filterMW`; each is a no-op for the other's Content-Type. Audio
+transcription (`/v1/audio/transcriptions`) and image edit (`/v1/images/edits`)
+now honour `use_model_name`.
+
+**Coverage gaps (0 % functions) -- Resolved.**
+
+The functions previously at 0 % (`handleListModels`, `handleMetrics`,
+`handleRootRedirect`, `handleUpstreamRedirect`, `handleUpstream`,
+`findModelInPath`, `handleAPICapture`, `handleAPIUnloadAll`,
+`handleAPIUnloadModel`, `CreateAuthMiddleware`, `extractAPIKey`,
+`handleLogStream`, `applyFilters`, `decompressBody`, `filterAcceptEncoding`,
+`handleUI`, `handleFavicon`) now have tests across `auth_test.go`, `api_test.go`,
+`filters_test.go`, `log_test.go`, and `extras_test.go`.
+
+---
+
+### Phase 8b - Fill gaps discovered in Phase 8a
+
+- [x] **Add request-log middleware** — `CreateRequestLogMiddleware` ([log.go](../internal/server/log.go)) records `clientIP "METHOD PATH PROTO" status bodySize "UA" duration` to `s.proxylog`, skips `/wol-health` / `/api/performance` / `/metrics`, and is wired as the outermost middleware in `routes()`.
+- [x] **Extend `getLogger` with model-ID resolution** — add a `default:` branch to `Server.getLogger` ([log.go:50](../internal/server/log.go#L50)) that resolves the ID via `s.local` (using a new `LocalRouter.GetProcess(name)` method or equivalent) and returns that process's `Logger()`. Match the fallback behaviour: return a 400 with `"invalid logger. Use 'proxy', 'upstream' or a model's ID"` when not found.
+- [x] **`UseModelName` rewrite for multipart endpoints** — `CreateFormFilterMiddleware` parses `multipart/form-data`, rewrites the `model` field according to `UseModelName`, reconstructs the body, and updates `Content-Type` / `Content-Length`. It is wired into `modelChain` after the JSON filter.
+- [x] **Raise test coverage to ≥ 74 %** — `internal/server` now at 76.1%; tests added for every 0 % function across `auth_test.go`, `api_test.go`, `filters_test.go`, `log_test.go`, and `extras_test.go`.
+
+---
+
+## Phase 8c - Review Part II (entrypoint comparison)
+
+A second pass comparing [cmd/newrouter/main.go](../cmd/newrouter/main.go) against
+the legacy [llama-swap.go](../llama-swap.go) + [proxy.New](../proxy/proxymanager.go#L104)
+surfaced four more gaps, all in logger setup.
+
+**Gap 4 — `LogToStdout` config ignored -- Resolved.**
+
+`cmd/newrouter/main.go` previously hardcoded `proxyLog` / `upstreamLog` to
+`os.Stdout`, and the old `muxlog()` helper built a Monitor that nothing wrote
+into — so `logToStdout` had no effect and `/logs` (combined history) was always
+empty. `server.NewLoggers` ([log.go](../internal/server/log.go)) now replicates
+the legacy switch: `proxy` / `upstream` monitors feed `muxLog` (or `io.Discard`)
+per `none` / `both` / `upstream` / `proxy`, so `muxLog` accumulates the combined
+history. `server.New` takes `muxlog` as a parameter. The loggers outlive config
+reloads, so a `LogToStdout` change requires a restart to take effect.
+
+**Gap 5 — `LogTimeFormat` config ignored -- Resolved.**
+
+`cmd/newrouter/main.go` now maps `cfg.LogTimeFormat` to a Go time layout via the
+`logTimeFormats` table and applies it (alongside log level) to the proxy and
+upstream monitors in `applyLogSettings`, re-applied on config reload.
+
+**Gap 6 — `LogRequests` deprecation warning missing.**
+
+The legacy [proxymanager.go:127](../proxy/proxymanager.go#L127) warns when the
+deprecated `logRequests` config key is set. `cmd/newrouter` does not. Low
+priority — left open.
+
+**Gap 7 — PID debug log missing -- Resolved.**
+
+`cmd/newrouter/main.go` now logs `PID: %d` at debug level after `applyLogSettings`,
+matching [llama-swap.go:71](../llama-swap.go#L71).
+
+---
+
+## Phase X (tbd) — Cutover
+
+- [ ] Swap `llama-swap.go` to delegate to `cmd/newrouter` (or rename newrouter to be the primary entrypoint)
+- [ ] Update `Makefile` build targets
+- [ ] Update docs / README references to the legacy binary
+- [ ] Remove `proxy/proxymanager*.go` and `gin-gonic` dependency once nothing imports them
+- [ ] Run `make test-all` and confirm concurrency suite still passes against the new entrypoint
+
+---
+
+## Cross-cutting concerns to keep in mind
+
+- **Single body read**: legacy and newrouter both buffer the request body once. When adding filters (Phase 4c), make sure the buffered bytes flow through `Content-Length` / `transfer-encoding` cleanup as in [proxymanager.go:872](../proxy/proxymanager.go#L872).
+- **Streaming flag in context**: legacy stashes `streaming` and `model` under `proxyCtxKey`. The new router uses `ModelKey` / `ModelIDKey` — pick one set of keys and use them consistently for metrics + log handlers.
+- **Matrix vs Group divergence**: any handler that calls `swapProcessGroup` or `findGroupByModelName` in the legacy needs a matrix branch too. The new router's `Router` interface already abstracts this — preserve that abstraction rather than reintroducing the branch in every handler.
+- **Shutdown ordering**: `httpServer.Shutdown` must drain inflight requests _before_ `Server.Shutdown` tears down processes, otherwise inflight requests 502. Current newrouter ordering at [main.go:87](../cmd/newrouter/main.go#L87) is correct — keep it.
@@ -1,25 +1,43 @@
 module github.com/mostlygeek/llama-swap

-go 1.25.4
+go 1.26.1

 require (
 	github.com/billziss-gh/golib v0.2.0
-	github.com/fsnotify/fsnotify v1.9.0
+	github.com/charmbracelet/bubbles v1.0.0
+	github.com/charmbracelet/bubbletea v1.3.10
+	github.com/charmbracelet/lipgloss v1.1.0
+	github.com/fxamacker/cbor/v2 v2.9.1
 	github.com/gin-gonic/gin v1.10.0
-	github.com/stretchr/testify v1.9.0
+	github.com/klauspost/compress v1.18.5
+	github.com/shirou/gopsutil/v4 v4.26.4
+	github.com/stretchr/testify v1.11.1
 	github.com/tidwall/gjson v1.18.0
 	github.com/tidwall/sjson v1.2.5
+	golang.org/x/sync v0.20.0
+	golang.org/x/sys v0.41.0
 	gopkg.in/yaml.v3 v3.0.1
 )

 require (
+	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
 	github.com/bytedance/sonic v1.11.6 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
+	github.com/charmbracelet/colorprofile v0.4.1 // indirect
+	github.com/charmbracelet/x/ansi v0.11.6 // indirect
+	github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
+	github.com/charmbracelet/x/term v0.2.2 // indirect
+	github.com/clipperhouse/displaywidth v0.9.0 // indirect
+	github.com/clipperhouse/stringish v0.1.1 // indirect
+	github.com/clipperhouse/uax29/v2 v2.5.0 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/ebitengine/purego v0.10.0 // indirect
+	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
 	github.com/gabriel-vasile/mimetype v1.4.3 // indirect
 	github.com/gin-contrib/sse v0.1.0 // indirect
+	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-playground/locales v0.14.1 // indirect
 	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-playground/validator/v10 v10.20.0 // indirect
@@ -27,19 +45,32 @@ require (
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
+	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
+	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-localereader v0.0.1 // indirect
+	github.com/mattn/go-runewidth v0.0.19 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
+	github.com/muesli/cancelreader v0.2.2 // indirect
+	github.com/muesli/termenv v0.16.0 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
 	github.com/tidwall/match v1.1.1 // indirect
 	github.com/tidwall/pretty v1.2.1 // indirect
+	github.com/tklauser/go-sysconf v0.3.16 // indirect
+	github.com/tklauser/numcpus v0.11.0 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
+	github.com/x448/float16 v0.8.4 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.45.0 // indirect
 	golang.org/x/net v0.47.0 // indirect
-	golang.org/x/sys v0.38.0 // indirect
 	golang.org/x/text v0.31.0 // indirect
 	google.golang.org/protobuf v1.34.1 // indirect
 )
@@ -1,9 +1,31 @@
+github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
+github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/billziss-gh/golib v0.2.0 h1:NyvcAQdfvM8xokKkKotiligKjKXzuQD4PPykg1nKc/8=
 github.com/billziss-gh/golib v0.2.0/go.mod h1:mZpUYANXZkDKSnyYbX9gfnyxwe0ddRhUtfXcsD5r8dw=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
+github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
+github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
+github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
+github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
+github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
+github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
+github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
+github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
+github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8=
+github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ=
+github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
+github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
+github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
+github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
+github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA=
+github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA=
+github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs=
+github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA=
+github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
+github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
 github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
 github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
 github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
@@ -11,14 +33,20 @@ github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQ
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
-github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU=
+github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
+github.com/fxamacker/cbor/v2 v2.9.1 h1:2rWm8B193Ll4VdjsJY28jxs70IdDsHRWgQYAI80+rMQ=
+github.com/fxamacker/cbor/v2 v2.9.1/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
 github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
 github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
 github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
 github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
 github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
 github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
@@ -29,28 +57,51 @@ github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBEx
 github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
 github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
 github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
-github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
-github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
+github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
 github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
 github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
+github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
+github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
+github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
+github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
+github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
+github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
+github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
+github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
+github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
 github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/shirou/gopsutil/v4 v4.26.4 h1:B4SXVbcwTyrocPHEmWBC4uCYr4Xcu3MK1TXqbprAOWY=
+github.com/shirou/gopsutil/v4 v4.26.4/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
@@ -61,8 +112,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
-github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
 github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
 github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
@@ -73,24 +125,40 @@ github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
 github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
 github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
 github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
+github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
+github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
+github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
 github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
+github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
+github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
+github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
 golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
 golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
 golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
+golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
+golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
 golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
 golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
-golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
+golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
 golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
 google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
@@ -0,0 +1,102 @@
+package cache
+
+import (
+	"errors"
+	"sync"
+)
+
+var (
+	ErrExceedsMaxSize = errors.New("item exceeds maximum cache size")
+	ErrNotFound       = errors.New("item not found")
+)
+
+// Cache is a mutex-guarded store of []byte values keyed by int, bounded by a
+// total byte budget. When an insert would push the total past maxSize,
+// entries are evicted in insertion (FIFO) order.
+type Cache struct {
+	mu      sync.Mutex     // taken by every exported method before touching the fields below
+	items   map[int][]byte // id -> stored bytes
+	order   []int          // ids in insertion order; order[0] is evicted first
+	size    int            // sum of len() over all stored values
+	maxSize int            // upper bound for size, in bytes
+}
+
+// New returns an empty Cache whose stored values may total at most maxBytes
+// bytes.
+func New(maxBytes int) *Cache {
+	return &Cache{
+		items:   make(map[int][]byte),
+		order:   make([]int, 0),
+		maxSize: maxBytes,
+	}
+}
+
+// Add stores data under id.
+//
+// It returns ErrExceedsMaxSize, leaving the cache unchanged, when len(data)
+// by itself is larger than the cache's byte limit. If id is already present
+// its value is replaced and id moves to the newest position in the eviction
+// order. Oldest entries are then evicted until the new value fits.
+//
+// The slice is stored as passed in; no copy is made.
+func (c *Cache) Add(id int, data []byte) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	dataSize := len(data)
+	if dataSize > c.maxSize {
+		return ErrExceedsMaxSize
+	}
+
+	// If key already exists, remove old entry from size and order
+	// (the stale value stays in c.items until it is overwritten below, and
+	// because id is no longer in c.order the eviction loop cannot pick it).
+	if old, exists := c.items[id]; exists {
+		c.size -= len(old)
+		c.removeOrder(id)
+	}
+
+	// Evict oldest (FIFO) until room available
+	for c.size+dataSize > c.maxSize && len(c.order) > 0 {
+		oldestID := c.order[0]
+		c.order = c.order[1:]
+		if evicted, exists := c.items[oldestID]; exists {
+			c.size -= len(evicted)
+			delete(c.items, oldestID)
+		}
+	}
+
+	c.items[id] = data
+	c.order = append(c.order, id)
+	c.size += dataSize
+	return nil
+}
+
+// removeOrder deletes the first occurrence of id from c.order; it is a no-op
+// when id is not present. It does not take c.mu itself: the only caller in
+// this file, Add, already holds the lock.
+func (c *Cache) removeOrder(id int) {
+	for i, v := range c.order {
+		if v == id {
+			c.order = append(c.order[:i], c.order[i+1:]...) // shifts the tail left in place
+			return
+		}
+	}
+}
+
+// Get returns the bytes stored under id, or ErrNotFound when id is absent.
+// The returned slice is the stored slice itself, not a copy.
+func (c *Cache) Get(id int) ([]byte, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	data, exists := c.items[id]
+	if !exists {
+		return nil, ErrNotFound
+	}
+	return data, nil
+}
+
+// Has reports whether an entry for id is currently stored.
+func (c *Cache) Has(id int) bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	_, exists := c.items[id]
+	return exists
+}
+
+// Size returns the total number of bytes currently held across all entries.
+func (c *Cache) Size() int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	return c.size
+}
+
+// Clear drops every entry and resets the tracked size to zero. The byte
+// limit is left unchanged.
+func (c *Cache) Clear() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.items = make(map[int][]byte)
+	c.order = c.order[:0] // truncate in place; the backing array is kept for reuse
+	c.size = 0
+}
@@ -0,0 +1,130 @@
+package cache
+
+import (
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestCache_Add covers insert-and-retrieve, rejection of an item larger than
+// the cache limit, FIFO eviction, and overwriting an existing key.
+func TestCache_Add(t *testing.T) {
+	t.Run("adds and retrieves item", func(t *testing.T) {
+		c := New(1024)
+		data := []byte("hello")
+		require.NoError(t, c.Add(1, data))
+
+		got, err := c.Get(1)
+		require.NoError(t, err)
+		assert.Equal(t, data, got)
+	})
+
+	t.Run("returns error for oversized item", func(t *testing.T) {
+		c := New(10)
+		err := c.Add(1, make([]byte, 20))
+		assert.ErrorIs(t, err, ErrExceedsMaxSize)
+	})
+
+	t.Run("evicts oldest items to make room", func(t *testing.T) {
+		c := New(100)
+
+		require.NoError(t, c.Add(1, make([]byte, 40)))
+		require.NoError(t, c.Add(2, make([]byte, 40)))
+		// Adding item 3 should evict item 1
+		require.NoError(t, c.Add(3, make([]byte, 40)))
+
+		assert.False(t, c.Has(1))
+		assert.True(t, c.Has(2))
+		assert.True(t, c.Has(3))
+	})
+
+	t.Run("overwrites existing key", func(t *testing.T) {
+		c := New(100)
+		require.NoError(t, c.Add(1, []byte("old")))
+		require.NoError(t, c.Add(1, []byte("new")))
+
+		got, err := c.Get(1)
+		require.NoError(t, err)
+		assert.Equal(t, []byte("new"), got)
+		assert.Equal(t, 3, c.Size()) // len("new") only; the replaced value must not be counted
+	})
+}
+
+// TestCache_Get verifies that looking up an absent key yields ErrNotFound.
+func TestCache_Get(t *testing.T) {
+	t.Run("returns ErrNotFound for missing key", func(t *testing.T) {
+		c := New(100)
+		_, err := c.Get(99)
+		assert.ErrorIs(t, err, ErrNotFound)
+	})
+}
+
+// TestCache_Has verifies Has for both a stored key and a key that was never
+// added.
+func TestCache_Has(t *testing.T) {
+	t.Run("returns true for existing key", func(t *testing.T) {
+		c := New(100)
+		require.NoError(t, c.Add(1, []byte("data")))
+		assert.True(t, c.Has(1))
+	})
+
+	t.Run("returns false for missing key", func(t *testing.T) {
+		c := New(100)
+		assert.False(t, c.Has(1))
+	})
+}
+
+// TestCache_Size verifies that Size follows the byte total as items are
+// added and that it shrinks when an eviction happens.
+func TestCache_Size(t *testing.T) {
+	t.Run("tracks byte usage", func(t *testing.T) {
+		c := New(1000)
+		assert.Equal(t, 0, c.Size())
+
+		require.NoError(t, c.Add(1, make([]byte, 100)))
+		assert.Equal(t, 100, c.Size())
+
+		require.NoError(t, c.Add(2, make([]byte, 200)))
+		assert.Equal(t, 300, c.Size())
+	})
+
+	t.Run("updates on eviction", func(t *testing.T) {
+		c := New(150)
+		require.NoError(t, c.Add(1, make([]byte, 100)))
+		require.NoError(t, c.Add(2, make([]byte, 100))) // 100+100 > 150, forces eviction
+
+		// Item 1 should be evicted, size = 100
+		assert.Equal(t, 100, c.Size())
+	})
+}
+
+// TestCache_Clear verifies that Clear removes every entry and resets Size to
+// zero.
+func TestCache_Clear(t *testing.T) {
+	t.Run("removes all items and resets size", func(t *testing.T) {
+		c := New(1000)
+		require.NoError(t, c.Add(1, []byte("a")))
+		require.NoError(t, c.Add(2, []byte("b")))
+
+		c.Clear()
+
+		assert.Equal(t, 0, c.Size())
+		assert.False(t, c.Has(1))
+		assert.False(t, c.Has(2))
+	})
+}
+
+// TestCache_Concurrent hammers one Cache from 10 goroutines, each doing 100
+// rounds of Add/Get/Has/Size on its own key range. It makes no assertions on
+// results; it only has to finish without panicking.
+// NOTE(review): presumably meant to be run with the race detector enabled.
+func TestCache_Concurrent(t *testing.T) {
+	t.Run("concurrent operations are safe", func(t *testing.T) {
+		c := New(10000)
+
+		var wg sync.WaitGroup
+		for i := 0; i < 10; i++ {
+			wg.Add(1)
+			go func(id int) {
+				defer wg.Done()
+				for j := 0; j < 100; j++ {
+					key := id*100 + j // distinct per goroutine: j < 100
+					_ = c.Add(key, []byte("data"))
+					_, _ = c.Get(key)
+					_ = c.Has(key)
+					_ = c.Size()
+				}
+			}(i)
+		}
+		wg.Wait()
+	})
+}
@@ -0,0 +1,63 @@
+// Package chain composes http.Handler middleware into a single handler.
+//
+// A Middleware wraps a downstream http.Handler and may run logic before or
+// after delegating to it, or short-circuit by not calling next at all
+// (e.g. auth failure, CORS preflight).
+package chain
+
+import "net/http"
+
+// Middleware wraps an http.Handler with cross-cutting behavior. It receives
+// the next handler in the chain and returns a handler that may call next,
+// modify the request/response around it, or short-circuit.
+type Middleware func(next http.Handler) http.Handler
+
+// Chain is a reusable middleware stack. Build it once with New (and optionally
+// extend per-route with Append), then call Then to wrap each terminal handler
+// when registering routes against an http.ServeMux:
+//
+//	api := chain.New(authMW, corsMW)
+//	mux.Handle("/v1/chat/completions", api.Then(dispatch))
+//	mux.Handle("/v1/embeddings",       api.Append(filters).Then(dispatch))
+//
+// Middlewares execute left-to-right: mws[0] runs first and may call into
+// mws[1], and so on, with the terminal handler invoked last. A middleware
+// that does not call next short-circuits the remainder of the chain.
+// A zero Chain is valid and applies no middleware.
+type Chain struct {
+	mws []Middleware // in execution order; mws[0] becomes the outermost wrapper in Then
+}
+
+// New returns a Chain that applies mws left-to-right around any terminal
+// handler passed to Then.
+func New(mws ...Middleware) Chain {
+	// Copy the variadic slice so the Chain owns its own backing array and
+	// does not share one with a slice the caller may have spread in.
+	cp := make([]Middleware, len(mws))
+	copy(cp, mws)
+	return Chain{mws: cp}
+}
+
+// Append returns a new Chain with mws added after the existing middleware.
+// The receiver is not modified, so a base Chain can be safely reused across
+// multiple routes that each need different per-route additions.
+func (c Chain) Append(mws ...Middleware) Chain {
+	// Build into a freshly allocated slice rather than append(c.mws, ...),
+	// so the result never shares a backing array with the receiver.
+	out := make([]Middleware, 0, len(c.mws)+len(mws))
+	out = append(out, c.mws...)
+	out = append(out, mws...)
+	return Chain{mws: out}
+}
+
+// Then wraps final with the chain's middleware and returns the resulting
+// handler, suitable for passing to http.ServeMux.Handle. With an empty chain,
+// Then returns final unchanged.
+func (c Chain) Then(final http.Handler) http.Handler {
+	h := final
+	// Wrap from the last middleware inward so that mws[0] ends up as the
+	// outermost handler and therefore runs first on each request.
+	for i := len(c.mws) - 1; i >= 0; i-- {
+		h = c.mws[i](h)
+	}
+	return h
+}
+
+// ThenFunc is shorthand for Then(http.HandlerFunc(f)).
+func (c Chain) ThenFunc(f http.HandlerFunc) http.Handler {
+	return c.Then(f) // f is an http.HandlerFunc, which already implements http.Handler
+}
@@ -0,0 +1,205 @@
+package chain
+
+import (
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// recordingMiddleware returns a Middleware that appends tag to *log before
+// calling next and "after-"+tag once next has returned.
+func recordingMiddleware(tag string, log *[]string) Middleware {
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			*log = append(*log, tag)
+			next.ServeHTTP(w, r)
+			*log = append(*log, "after-"+tag)
+		})
+	}
+}
+
+// TestChain_HandlersExecuteInDeclaredOrder verifies that middleware passed
+// to New runs in argument order on the way in and unwinds in reverse order
+// after the terminal handler returns.
+func TestChain_HandlersExecuteInDeclaredOrder(t *testing.T) {
+	var log []string
+	final := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		log = append(log, "final")
+	})
+
+	h := New(
+		recordingMiddleware("a", &log),
+		recordingMiddleware("b", &log),
+		recordingMiddleware("c", &log),
+	).Then(final)
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/", nil)
+	h.ServeHTTP(rec, req)
+
+	want := []string{"a", "b", "c", "final", "after-c", "after-b", "after-a"}
+	if !equal(log, want) {
+		t.Fatalf("execution order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_ShortCircuitsWhenMiddlewareDoesNotCallNext verifies that when a
+// middleware writes a response without calling next, nothing after it in the
+// chain runs (neither "inner" nor the terminal handler), while middleware
+// before it still unwinds normally.
+func TestChain_ShortCircuitsWhenMiddlewareDoesNotCallNext(t *testing.T) {
+	var log []string
+	final := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		log = append(log, "final")
+	})
+
+	// gate deliberately ignores next and answers 401 itself.
+	gate := func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			log = append(log, "gate")
+			w.WriteHeader(http.StatusUnauthorized)
+		})
+	}
+
+	h := New(
+		recordingMiddleware("outer", &log),
+		gate,
+		recordingMiddleware("inner", &log),
+	).Then(final)
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/", nil)
+	h.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusUnauthorized {
+		t.Fatalf("status: got %d, want %d", rec.Code, http.StatusUnauthorized)
+	}
+	want := []string{"outer", "gate", "after-outer"}
+	if !equal(log, want) {
+		t.Fatalf("short-circuit order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_EarlyWritesAreVisibleToLaterMiddleware verifies that every
+// handler in the chain shares the same ResponseWriter: a header set and body
+// bytes written by an earlier middleware are seen by later ones and appear,
+// in order, in the final response.
+func TestChain_EarlyWritesAreVisibleToLaterMiddleware(t *testing.T) {
+	header := func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("X-Set-By", "outer")
+			_, _ = io.WriteString(w, "outer:")
+			next.ServeHTTP(w, r)
+		})
+	}
+	inner := func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			// The outer middleware already set the header; we should see it.
+			// If not, a marker is written that breaks the expected body below.
+			if got := w.Header().Get("X-Set-By"); got != "outer" {
+				_, _ = io.WriteString(w, "missing-header;")
+			}
+			_, _ = io.WriteString(w, "inner:")
+			next.ServeHTTP(w, r)
+		})
+	}
+	final := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = io.WriteString(w, "final")
+	})
+
+	h := New(header, inner).Then(final)
+
+	rec := httptest.NewRecorder()
+	h.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+
+	body, _ := io.ReadAll(rec.Body)
+	if got := string(body); !strings.Contains(got, "outer:inner:final") {
+		t.Fatalf("body: got %q, want it to contain %q", got, "outer:inner:final")
+	}
+	if got := rec.Header().Get("X-Set-By"); got != "outer" {
+		t.Fatalf("header X-Set-By: got %q, want %q", got, "outer")
+	}
+}
+
+// TestChain_ReusableAcrossRoutesViaThen verifies that one base Chain can wrap
+// several terminal handlers registered on an http.ServeMux, and that each
+// route gets the full middleware stack. Requests go through a real
+// httptest.Server and are issued one after another, so the shared log has a
+// fixed order.
+func TestChain_ReusableAcrossRoutesViaThen(t *testing.T) {
+	var log []string
+	base := New(
+		recordingMiddleware("auth", &log),
+		recordingMiddleware("cors", &log),
+	)
+
+	mux := http.NewServeMux()
+	mux.Handle("/a", base.ThenFunc(func(w http.ResponseWriter, r *http.Request) {
+		log = append(log, "handler-a")
+	}))
+	mux.Handle("/b", base.ThenFunc(func(w http.ResponseWriter, r *http.Request) {
+		log = append(log, "handler-b")
+	}))
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+
+	for _, path := range []string{"/a", "/b"} {
+		resp, err := http.Get(srv.URL + path)
+		if err != nil {
+			t.Fatalf("GET %s: %v", path, err)
+		}
+		resp.Body.Close()
+	}
+
+	want := []string{
+		"auth", "cors", "handler-a", "after-cors", "after-auth",
+		"auth", "cors", "handler-b", "after-cors", "after-auth",
+	}
+	if !equal(log, want) {
+		t.Fatalf("reusable chain order mismatch:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_AppendDoesNotMutateReceiver verifies that Append returns an
+// independent Chain: after extending base, base itself must still apply only
+// its original middleware.
+func TestChain_AppendDoesNotMutateReceiver(t *testing.T) {
+	var log []string
+	base := New(recordingMiddleware("base", &log))
+	extended := base.Append(recordingMiddleware("extra", &log))
+
+	final := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		log = append(log, "final")
+	})
+
+	// Run extended first to surface any aliasing of the underlying slice.
+	rec := httptest.NewRecorder()
+	extended.Then(final).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+
+	rec = httptest.NewRecorder()
+	base.Then(final).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+
+	want := []string{
+		"base", "extra", "final", "after-extra", "after-base",
+		"base", "final", "after-base",
+	}
+	if !equal(log, want) {
+		t.Fatalf("Append must not mutate the receiver:\n got: %v\nwant: %v", log, want)
+	}
+}
+
+// TestChain_ZeroValueAndEmptyThenAreIdentity verifies that both the zero
+// Chain and New() with no arguments hand the terminal handler back from Then
+// without wrapping it.
+func TestChain_ZeroValueAndEmptyThenAreIdentity(t *testing.T) {
+	final := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusTeapot)
+	})
+
+	for name, c := range map[string]Chain{
+		"zero":  {},
+		"empty": New(),
+	} {
+		t.Run(name, func(t *testing.T) {
+			h := c.Then(final)
+			// Only the dynamic type is checked here; the status assertion
+			// below confirms the handler that actually runs is final.
+			if _, ok := h.(http.HandlerFunc); !ok {
+				t.Fatalf("expected http.HandlerFunc identity, got %T", h)
+			}
+			rec := httptest.NewRecorder()
+			h.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+			if rec.Code != http.StatusTeapot {
+				t.Fatalf("status: got %d, want %d", rec.Code, http.StatusTeapot)
+			}
+		})
+	}
+}
+
+// equal reports whether a and b have the same length and the same elements
+// in the same order. A nil slice and an empty slice compare equal.
+func equal(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
@@ -9,12 +9,19 @@ import (
 	"runtime"
 	"sort"
 	"strings"
+	"time"

 	"github.com/billziss-gh/golib/shlex"
 	"gopkg.in/yaml.v3"
 )

 const DEFAULT_GROUP_ID = "(default)"
+// Accepted values for Config.LogToStdout (yaml key "logToStdout"). Config
+// loading defaults the field to LogToStdoutProxy and rejects any value that
+// is not one of these four.
+const (
+	LogToStdoutProxy    = "proxy"
+	LogToStdoutUpstream = "upstream"
+	LogToStdoutBoth     = "both"
+	LogToStdoutNone     = "none"
+)

 type MacroEntry struct {
 	Name  string
@@ -81,6 +88,7 @@ type GroupConfig struct {
 var (
 	macroNameRegex    = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
 	macroPatternRegex = regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
+	// envMacroRegex matches ${env.NAME} and captures NAME, which must start
+	// with a letter or underscore followed by letters, digits or underscores.
+	// NOTE(review): presumably consumed by substituteEnvMacros - confirm.
+	envMacroRegex     = regexp.MustCompile(`\$\{env\.([a-zA-Z_][a-zA-Z0-9_]*)\}`)
 )

 // set default values for GroupConfig
@@ -114,11 +122,21 @@ type Config struct {
 	LogRequests        bool                   `yaml:"logRequests"`
 	LogLevel           string                 `yaml:"logLevel"`
 	LogTimeFormat      string                 `yaml:"logTimeFormat"`
+	LogToStdout        string                 `yaml:"logToStdout"`
 	MetricsMaxInMemory int                    `yaml:"metricsMaxInMemory"`
+	CaptureBuffer      int                    `yaml:"captureBuffer"`
+	Performance        PerformanceConfig      `yaml:"performance"`
+	GlobalTTL          int                    `yaml:"globalTTL"`
 	Models             map[string]ModelConfig `yaml:"models"` /* key is model ID */
 	Profiles           map[string][]string    `yaml:"profiles"`
 	Groups             map[string]GroupConfig `yaml:"groups"` /* key is group ID */

+	// swap matrix: solver-based alternative to groups
+	Matrix *MatrixConfig `yaml:"matrix"`
+
+	// populated during validation when matrix is configured
+	ExpandedSets []ExpandedSet `yaml:"-"`
+
 	// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
 	Macros MacroList `yaml:"macros"`

@@ -136,6 +154,12 @@ type Config struct {

 	// present aliases to /v1/models OpenAI API listing
 	IncludeAliasesInList bool `yaml:"includeAliasesInList"`
+
+	// support API keys, see issue #433, #50, #251
+	RequiredAPIKeys []string `yaml:"apiKeys"`
+
+	// support remote peers, see issue #433, #296
+	Peers PeerDictionaryConfig `yaml:"peers"`
 }

 func (c *Config) RealModelName(search string) (string, bool) {
@@ -170,29 +194,56 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	if err != nil {
 		return Config{}, err
 	}
+	yamlStr := string(data)

-	// default configuration values
+	// Phase 1: Substitute all ${env.VAR} macros at string level
+	// This is safe because env values are simple strings without YAML formatting
+	yamlStr, err = substituteEnvMacros(yamlStr)
+	if err != nil {
+		return Config{}, err
+	}
+
+	// Unmarshal into full Config with defaults
 	config := Config{
 		HealthCheckTimeout: 120,
 		StartPort:          5800,
 		LogLevel:           "info",
 		LogTimeFormat:      "",
+		LogToStdout:        LogToStdoutProxy,
 		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		GlobalTTL:          0,
 	}
-	err = yaml.Unmarshal(data, &config)
-	if err != nil {
+	if err = yaml.Unmarshal([]byte(yamlStr), &config); err != nil {
 		return Config{}, err
 	}

 	if config.HealthCheckTimeout < 15 {
-		// set a minimum of 15 seconds
 		config.HealthCheckTimeout = 15
 	}

+	// Apply defaults for performance config when section is missing
+	if config.Performance.Every == 0 {
+		config.Performance.Every = 5 * time.Second
+	}
+	if err = config.Performance.Validate(); err != nil {
+		return Config{}, fmt.Errorf("performance: %w", err)
+	}
+
 	if config.StartPort < 1 {
 		return Config{}, fmt.Errorf("startPort must be greater than 1")
 	}

+	if config.GlobalTTL < 0 {
+		return Config{}, fmt.Errorf("globalTTL must be >= 0")
+	}
+
+	switch config.LogToStdout {
+	case LogToStdoutProxy, LogToStdoutUpstream, LogToStdoutBoth, LogToStdoutNone:
+	default:
+		return Config{}, fmt.Errorf("logToStdout must be one of: proxy, upstream, both, none")
+	}
+
 	// Populate the aliases map
 	config.aliases = make(map[string]string)
 	for modelName, modelConfig := range config.Models {
@@ -204,55 +255,56 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 	}

-	/* check macro constraint rules:
-
-	- name must fit the regex ^[a-zA-Z0-9_-]+$
-	- names must be less than 64 characters (no reason, just cause)
-	- name can not be any reserved macros: PORT, MODEL_ID
-	- macro values must be less than 1024 characters
-	*/
+	// Validate global macros
 	for _, macro := range config.Macros {
 		if err = validateMacro(macro.Name, macro.Value); err != nil {
 			return Config{}, err
 		}
 	}

-	// Get and sort all model IDs first, makes testing more consistent
+	// Get and sort all model IDs for consistent port assignment
 	modelIds := make([]string, 0, len(config.Models))
 	for modelId := range config.Models {
 		modelIds = append(modelIds, modelId)
 	}
-	sort.Strings(modelIds) // This guarantees stable iteration order
+	sort.Strings(modelIds)

 	nextPort := config.StartPort
 	for _, modelId := range modelIds {
 		modelConfig := config.Models[modelId]
+		modelConfig.HealthCheckTimeout = config.HealthCheckTimeout

-		// Strip comments from command fields before macro expansion
+		// Strip comments from command fields
 		modelConfig.Cmd = StripComments(modelConfig.Cmd)
 		modelConfig.CmdStop = StripComments(modelConfig.CmdStop)

-		// validate model macros
+		// set model TTL to globalTTL if it is the default value
+		if modelConfig.UnloadAfter == MODEL_CONFIG_DEFAULT_TTL {
+			modelConfig.UnloadAfter = config.GlobalTTL
+		}
+
+		if modelConfig.UnloadAfter < 0 {
+			return Config{}, fmt.Errorf("model %s: invalid TTL value %d", modelId, modelConfig.UnloadAfter)
+		}
+
+		// Validate model macros
 		for _, macro := range modelConfig.Macros {
 			if err = validateMacro(macro.Name, macro.Value); err != nil {
 				return Config{}, fmt.Errorf("model %s: %s", modelId, err.Error())
 			}
 		}

-		// Merge global config and model macros. Model macros take precedence
-		mergedMacros := make(MacroList, 0, len(config.Macros)+len(modelConfig.Macros))
+		// Build merged macro list: MODEL_ID + global macros + model macros (model overrides global)
+		mergedMacros := make(MacroList, 0, len(config.Macros)+len(modelConfig.Macros)+1)
 		mergedMacros = append(mergedMacros, MacroEntry{Name: "MODEL_ID", Value: modelId})
-
-		// Add global macros first
 		mergedMacros = append(mergedMacros, config.Macros...)

-		// Add model macros (can override global)
+		// Add model macros (override globals with same name)
 		for _, entry := range modelConfig.Macros {
-			// Remove any existing global macro with same name
 			found := false
 			for i, existing := range mergedMacros {
 				if existing.Name == entry.Name {
-					mergedMacros[i] = entry // Override
+					mergedMacros[i] = entry
 					found = true
 					break
 				}
@@ -262,23 +314,40 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			}
 		}

-		// First pass: Substitute user-defined macros in reverse order (LIFO - last defined first)
-		// This allows later macros to reference earlier ones
+		// Substitute remaining macros in model fields (LIFO order)
 		for i := len(mergedMacros) - 1; i >= 0; i-- {
 			entry := mergedMacros[i]
 			macroSlug := fmt.Sprintf("${%s}", entry.Name)
 			macroStr := fmt.Sprintf("%v", entry.Value)

-			// Substitute in command fields
 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
 			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
 			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroStr)
 			modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroStr)
+			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
+			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)

-			// Substitute in metadata (recursive)
+			// Substitute macros in SetParamsByID keys and values
+			if len(modelConfig.Filters.SetParamsByID) > 0 {
+				newSetParamsByID := make(map[string]map[string]any, len(modelConfig.Filters.SetParamsByID))
+				for key, paramMap := range modelConfig.Filters.SetParamsByID {
+					newKey := strings.ReplaceAll(key, macroSlug, macroStr)
+					newValAny, err := substituteMacroInValue(any(paramMap), entry.Name, entry.Value)
+					if err != nil {
+						return Config{}, fmt.Errorf("model %s filters.setParamsByID: %s", modelId, err.Error())
+					}
+					newParamMap, ok := newValAny.(map[string]any)
+					if !ok {
+						return Config{}, fmt.Errorf("model %s filters.setParamsByID: unexpected type after macro substitution", modelId)
+					}
+					newSetParamsByID[newKey] = newParamMap
+				}
+				modelConfig.Filters.SetParamsByID = newSetParamsByID
+			}
+
+			// Substitute in metadata (type-preserving)
 			if len(modelConfig.Metadata) > 0 {
-				var err error
 				result, err := substituteMacroInValue(modelConfig.Metadata, entry.Name, entry.Value)
 				if err != nil {
 					return Config{}, fmt.Errorf("model %s metadata: %s", modelId, err.Error())
@@ -287,29 +356,25 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			}
 		}

-		// Final pass: check if PORT macro is needed after macro expansion
-		// ${PORT} is a resource on the local machine so a new port is only allocated
-		// if it is required in either cmd or proxy keys
+		// Handle PORT macro - only allocate if cmd uses it
 		cmdHasPort := strings.Contains(modelConfig.Cmd, "${PORT}")
 		proxyHasPort := strings.Contains(modelConfig.Proxy, "${PORT}")
-		if cmdHasPort || proxyHasPort { // either has it
-			if !cmdHasPort && proxyHasPort { // but both don't have it
+		if cmdHasPort || proxyHasPort {
+			if !cmdHasPort && proxyHasPort {
 				return Config{}, fmt.Errorf("model %s: proxy uses ${PORT} but cmd does not - ${PORT} is only available when used in cmd", modelId)
 			}

-			// Add PORT macro and substitute it
-			portEntry := MacroEntry{Name: "PORT", Value: nextPort}
 			macroSlug := "${PORT}"
 			macroStr := fmt.Sprintf("%v", nextPort)

 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
 			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
+			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
+			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)

-			// Substitute PORT in metadata
 			if len(modelConfig.Metadata) > 0 {
-				var err error
-				result, err := substituteMacroInValue(modelConfig.Metadata, portEntry.Name, portEntry.Value)
+				result, err := substituteMacroInValue(modelConfig.Metadata, "PORT", nextPort)
 				if err != nil {
 					return Config{}, fmt.Errorf("model %s metadata: %s", modelId, err.Error())
 				}
@@ -319,13 +384,15 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			nextPort++
 		}

-		// make sure there are no unknown macros that have not been replaced
+		// Validate no unknown macros remain
 		fieldMap := map[string]string{
 			"cmd":                 modelConfig.Cmd,
 			"cmdStop":             modelConfig.CmdStop,
 			"proxy":               modelConfig.Proxy,
 			"checkEndpoint":       modelConfig.CheckEndpoint,
 			"filters.stripParams": modelConfig.Filters.StripParams,
+			"name":                modelConfig.Name,
+			"description":         modelConfig.Description,
 		}

 		for fieldName, fieldValue := range fieldMap {
@@ -333,62 +400,94 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			for _, match := range matches {
 				macroName := match[1]
 				if macroName == "PID" && fieldName == "cmdStop" {
-					continue // this is ok, has to be replaced by process later
+					continue // replaced at runtime
 				}
-				// Reserved macros are always valid (they should have been substituted already)
 				if macroName == "PORT" || macroName == "MODEL_ID" {
 					return Config{}, fmt.Errorf("macro '${%s}' should have been substituted in %s.%s", macroName, modelId, fieldName)
 				}
-				// Any other macro is unknown
 				return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
 			}
 		}

-		// Check for unknown macros in metadata
 		if len(modelConfig.Metadata) > 0 {
-			if err := validateMetadataForUnknownMacros(modelConfig.Metadata, modelId); err != nil {
+			if err := validateNestedForUnknownMacros(modelConfig.Metadata, fmt.Sprintf("model %s metadata", modelId)); err != nil {
 				return Config{}, err
 			}
 		}

-		// Validate the proxy URL.
-		if _, err := url.Parse(modelConfig.Proxy); err != nil {
-			return Config{}, fmt.Errorf(
-				"model %s: invalid proxy URL: %w", modelId, err,
-			)
+		// Validate SetParamsByID keys and values
+		for key, paramMap := range modelConfig.Filters.SetParamsByID {
+			if matches := macroPatternRegex.FindAllStringSubmatch(key, -1); len(matches) > 0 {
+				return Config{}, fmt.Errorf("unknown macro '${%s}' found in model %s filters.setParamsByID key", matches[0][1], modelId)
+			}
+			if err := validateNestedForUnknownMacros(any(paramMap), fmt.Sprintf("model %s filters.setParamsByID[%s]", modelId, key)); err != nil {
+				return Config{}, err
+			}
+		}
+
+		// Auto-register setParamsByID keys as aliases (skip the model's own ID)
+		for key := range modelConfig.Filters.SetParamsByID {
+			if key == modelId {
+				continue
+			}
+			if _, exists := config.Models[key]; exists {
+				return Config{}, fmt.Errorf("model %s filters.setParamsByID: key '%s' conflicts with an existing model ID", modelId, key)
+			}
+			if existingModel, exists := config.aliases[key]; exists {
+				if existingModel != modelId {
+					return Config{}, fmt.Errorf("duplicate alias '%s' in model %s filters.setParamsByID, already used by model %s", key, modelId, existingModel)
+				}
+				continue // already registered as explicit alias for this model
+			}
+			config.aliases[key] = modelId
+			modelConfig.Aliases = append(modelConfig.Aliases, key)
+		}
+
+		if _, err := url.Parse(modelConfig.Proxy); err != nil {
+			return Config{}, fmt.Errorf("model %s: invalid proxy URL: %w", modelId, err)
 		}

-		// if sendLoadingState is nil, set it to the global config value
-		// see #366
 		if modelConfig.SendLoadingState == nil {
-			v := config.SendLoadingState // copy it
+			v := config.SendLoadingState
 			modelConfig.SendLoadingState = &v
 		}

 		config.Models[modelId] = modelConfig
 	}

-	config = AddDefaultGroupToConfig(config)
-	// check that members are all unique in the groups
-	memberUsage := make(map[string]string) // maps member to group it appears in
-	for groupID, groupConfig := range config.Groups {
-		prevSet := make(map[string]bool)
-		for _, member := range groupConfig.Members {
-			// Check for duplicates within this group
-			if _, found := prevSet[member]; found {
-				return Config{}, fmt.Errorf("duplicate model member %s found in group: %s", member, groupID)
-			}
-			prevSet[member] = true
+	// groups XOR matrix
+	if config.Matrix != nil && len(config.Groups) > 0 {
+		return Config{}, fmt.Errorf("config cannot use both 'groups' and 'matrix'")
+	}

-			// Check if member is used in another group
-			if existingGroup, exists := memberUsage[member]; exists {
-				return Config{}, fmt.Errorf("model member %s is used in multiple groups: %s and %s", member, existingGroup, groupID)
+	if config.Matrix != nil {
+		expandedSets, err := ValidateMatrix(*config.Matrix, config.Models)
+		if err != nil {
+			return Config{}, fmt.Errorf("matrix: %w", err)
+		}
+		config.ExpandedSets = expandedSets
+	} else {
+		config = AddDefaultGroupToConfig(config)
+
+		// Validate group members
+		memberUsage := make(map[string]string)
+		for groupID, groupConfig := range config.Groups {
+			prevSet := make(map[string]bool)
+			for _, member := range groupConfig.Members {
+				if _, found := prevSet[member]; found {
+					return Config{}, fmt.Errorf("duplicate model member %s found in group: %s", member, groupID)
+				}
+				prevSet[member] = true
+
+				if existingGroup, exists := memberUsage[member]; exists {
+					return Config{}, fmt.Errorf("model member %s is used in multiple groups: %s and %s", member, existingGroup, groupID)
+				}
+				memberUsage[member] = groupID
 			}
-			memberUsage[member] = groupID
 		}
 	}

-	// clean up hooks preload
+	// Clean up hooks preload
 	if len(config.Hooks.OnStartup.Preload) > 0 {
 		var toPreload []string
 		for _, modelID := range config.Hooks.OnStartup.Preload {
@@ -400,10 +499,56 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 				toPreload = append(toPreload, real)
 			}
 		}
-
 		config.Hooks.OnStartup.Preload = toPreload
 	}

+	// Validate API keys (env macros already substituted at string level)
+	for i, apikey := range config.RequiredAPIKeys {
+		if apikey == "" {
+			return Config{}, fmt.Errorf("empty api key found in apiKeys")
+		}
+		if strings.Contains(apikey, " ") {
+			return Config{}, fmt.Errorf("api key cannot contain spaces: `%s`", apikey)
+		}
+		config.RequiredAPIKeys[i] = apikey
+	}
+
+	// Process peers with global macro substitution
+	for peerName, peerConfig := range config.Peers {
+		// Substitute global macros (LIFO order)
+		for i := len(config.Macros) - 1; i >= 0; i-- {
+			entry := config.Macros[i]
+			macroSlug := fmt.Sprintf("${%s}", entry.Name)
+			macroStr := fmt.Sprintf("%v", entry.Value)
+
+			peerConfig.ApiKey = strings.ReplaceAll(peerConfig.ApiKey, macroSlug, macroStr)
+			peerConfig.Filters.StripParams = strings.ReplaceAll(peerConfig.Filters.StripParams, macroSlug, macroStr)
+
+			// Substitute in setParams (type-preserving)
+			if len(peerConfig.Filters.SetParams) > 0 {
+				result, err := substituteMacroInValue(peerConfig.Filters.SetParams, entry.Name, entry.Value)
+				if err != nil {
+					return Config{}, fmt.Errorf("peers.%s.filters.setParams: %w", peerName, err)
+				}
+				peerConfig.Filters.SetParams = result.(map[string]any)
+			}
+		}
+
+		// Validate no unknown macros remain
+		if matches := macroPatternRegex.FindAllStringSubmatch(peerConfig.ApiKey, -1); len(matches) > 0 {
+			return Config{}, fmt.Errorf("peers.%s.apiKey: unknown macro '${%s}'", peerName, matches[0][1])
+		}
+		if matches := macroPatternRegex.FindAllStringSubmatch(peerConfig.Filters.StripParams, -1); len(matches) > 0 {
+			return Config{}, fmt.Errorf("peers.%s.filters.stripParams: unknown macro '${%s}'", peerName, matches[0][1])
+		}
+		if len(peerConfig.Filters.SetParams) > 0 {
+			if err := validateNestedForUnknownMacros(peerConfig.Filters.SetParams, fmt.Sprintf("peers.%s.filters.setParams", peerName)); err != nil {
+				return Config{}, err
+			}
+		}
+		config.Peers[peerName] = peerConfig
+	}
+
 	return config, nil
 }

@@ -512,9 +657,6 @@ func validateMacro(name string, value any) error {
 	// Validate that value is a scalar type
 	switch v := value.(type) {
 	case string:
-		if len(v) >= 1024 {
-			return fmt.Errorf("macro value for '%s' exceeds maximum length of 1024 characters", name)
-		}
 		// Check for self-reference
 		macroSlug := fmt.Sprintf("${%s}", name)
 		if strings.Contains(v, macroSlug) {
@@ -534,20 +676,26 @@ func validateMacro(name string, value any) error {
 	return nil
 }

-// validateMetadataForUnknownMacros recursively checks for any remaining macro references in metadata
-func validateMetadataForUnknownMacros(value any, modelId string) error {
+// validateNestedForUnknownMacros recursively checks for any remaining macro references in nested structures
+func validateNestedForUnknownMacros(value any, context string) error {
 	switch v := value.(type) {
 	case string:
 		matches := macroPatternRegex.FindAllStringSubmatch(v, -1)
 		for _, match := range matches {
 			macroName := match[1]
-			return fmt.Errorf("model %s metadata: unknown macro '${%s}'", modelId, macroName)
+			return fmt.Errorf("%s: unknown macro '${%s}'", context, macroName)
+		}
+		// Check for unsubstituted env macros
+		envMatches := envMacroRegex.FindAllStringSubmatch(v, -1)
+		for _, match := range envMatches {
+			varName := match[1]
+			return fmt.Errorf("%s: environment variable '%s' not set", context, varName)
 		}
 		return nil

 	case map[string]any:
 		for _, val := range v {
-			if err := validateMetadataForUnknownMacros(val, modelId); err != nil {
+			if err := validateNestedForUnknownMacros(val, context); err != nil {
 				return err
 			}
 		}
@@ -555,7 +703,7 @@ func validateMetadataForUnknownMacros(value any, modelId string) error {

 	case []any:
 		for _, val := range v {
-			if err := validateMetadataForUnknownMacros(val, modelId); err != nil {
+			if err := validateNestedForUnknownMacros(val, context); err != nil {
 				return err
 			}
 		}
@@ -614,3 +762,67 @@ func substituteMacroInValue(value any, macroName string, macroValue any) (any, e
 		return value, nil
 	}
 }
+
+// substituteEnvMacros expands every ${env.VAR_NAME} macro in s with the value of
+// the named environment variable and returns the rewritten text.
+//
+// An error is returned when a referenced variable is not set or its value is
+// rejected by sanitization. To keep macros that appear only inside YAML comments
+// from triggering errors, the set of macros to resolve is collected from a
+// comment-free rendering of s (unmarshal followed by marshal); the substitution
+// itself is still applied to the original text.
+func substituteEnvMacros(s string) (string, error) {
+	// Start from the raw input: if the YAML cannot be round-tripped we scan it
+	// as-is, so the user sees the env var error instead of a YAML parse error.
+	scanStr := s
+
+	var parsed any
+	if yaml.Unmarshal([]byte(s), &parsed) == nil {
+		if stripped, err := yaml.Marshal(parsed); err == nil {
+			// Round-tripping dropped the comments; scan this version instead.
+			scanStr = string(stripped)
+		}
+	}
+
+	return substituteEnvMacrosInString(s, scanStr)
+}
+
+// substituteEnvMacrosInString finds ${env.VAR} macros in scanStr and substitutes
+// them in target. This separation allows scanning comment-free YAML while
+// substituting in the original string.
+//
+// Every referenced variable is resolved and sanitized first, in scanStr order,
+// so the first missing or invalid variable is the one reported. target is then
+// rewritten in a single pass: replacement text is never rescanned, so an
+// environment value that itself contains "${env.OTHER}" is inserted verbatim
+// instead of being expanded a second time. Macros that occur in target but not
+// in scanStr (for example inside YAML comments) are left untouched.
+//
+// Returns the rewritten target, or an error if a variable is unset or its value
+// fails sanitizeEnvValueForYAML.
+func substituteEnvMacrosInString(target, scanStr string) (string, error) {
+	matches := envMacroRegex.FindAllStringSubmatch(scanStr, -1)
+	if len(matches) == 0 {
+		return target, nil
+	}
+
+	// Keyed by the full macro text, e.g. "${env.HOME}".
+	replacements := make(map[string]string, len(matches))
+	for _, match := range matches {
+		fullMatch := match[0] // ${env.VAR_NAME}
+		varName := match[1]   // VAR_NAME
+		if _, resolved := replacements[fullMatch]; resolved {
+			continue // same macro referenced more than once
+		}
+
+		value, exists := os.LookupEnv(varName)
+		if !exists {
+			return "", fmt.Errorf("environment variable '%s' is not set", varName)
+		}
+
+		// Sanitize the value for safe YAML substitution
+		value, err := sanitizeEnvValueForYAML(value, varName)
+		if err != nil {
+			return "", err
+		}
+		replacements[fullMatch] = value
+	}
+
+	// NOTE(review): assumes envMacroRegex matches in target line up with the
+	// literal macro occurrences (i.e. the variable-name pattern cannot contain
+	// "${") - confirm against the regex definition.
+	rewritten := envMacroRegex.ReplaceAllStringFunc(target, func(macro string) string {
+		if value, ok := replacements[macro]; ok {
+			return value
+		}
+		return macro
+	})
+	return rewritten, nil
+}
+
+// sanitizeEnvValueForYAML prepares an environment variable value for textual
+// insertion into YAML source.
+//
+// Values containing a newline, carriage return, or NUL byte are rejected with an
+// error naming varName. Otherwise every backslash is doubled and every double
+// quote is backslash-escaped, so the value reads back correctly inside a
+// double-quoted YAML string. In unquoted contexts the added escapes appear
+// literally in the resulting value.
+func sanitizeEnvValueForYAML(value, varName string) (string, error) {
+	// These characters would break YAML structure regardless of quoting context.
+	if strings.IndexAny(value, "\n\r\x00") >= 0 {
+		return "", fmt.Errorf("environment variable '%s' contains newlines or null bytes which are not allowed in YAML substitution", varName)
+	}
+
+	// One pass over the input: `\` becomes `\\` and `"` becomes `\"`.
+	escaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`)
+	return escaper.Replace(value), nil
+}
@@ -7,6 +7,7 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 )
@@ -163,9 +164,19 @@ groups:

 	modelLoadingState := false

+	defaultTimeout := TimeoutsConfig{
+		Connect:        30,
+		KeepAlive:      30,
+		ResponseHeader: 0,
+		TLSHandshake:   10,
+		ExpectContinue: 1,
+		IdleConn:       90,
+	}
+
 	expected := Config{
 		LogLevel:      "info",
 		LogTimeFormat: "",
+		LogToStdout:   LogToStdoutProxy,
 		StartPort:     5800,
 		Macros: MacroList{
 			{"svr-path", "path/to/server"},
@@ -178,42 +189,54 @@ groups:
 		SendLoadingState: false,
 		Models: map[string]ModelConfig{
 			"model1": {
-				Cmd:              "path/to/cmd --arg1 one",
-				Proxy:            "http://localhost:8080",
-				Aliases:          []string{"m1", "model-one"},
-				Env:              []string{"VAR1=value1", "VAR2=value2"},
-				CheckEndpoint:    "/health",
-				Name:             "Model 1",
-				Description:      "This is model 1",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8080",
+				Aliases:            []string{"m1", "model-one"},
+				Env:                []string{"VAR1=value1", "VAR2=value2"},
+				CheckEndpoint:      "/health",
+				Name:               "Model 1",
+				Description:        "This is model 1",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model2": {
-				Cmd:              "path/to/server --arg1 one",
-				Proxy:            "http://localhost:8081",
-				Aliases:          []string{"m2"},
-				Env:              []string{},
-				CheckEndpoint:    "/",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/server --arg1 one",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"m2"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model3": {
-				Cmd:              "path/to/cmd --arg1 one",
-				Proxy:            "http://localhost:8081",
-				Aliases:          []string{"mthree"},
-				Env:              []string{},
-				CheckEndpoint:    "/",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"mthree"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model4": {
-				Cmd:              "path/to/cmd --arg1 one",
-				Proxy:            "http://localhost:8082",
-				CheckEndpoint:    "/",
-				Aliases:          []string{},
-				Env:              []string{},
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				Proxy:              "http://localhost:8082",
+				CheckEndpoint:      "/",
+				Aliases:            []string{},
+				Env:                []string{},
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 		},
 		HealthCheckTimeout: 15,
 		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		Performance: PerformanceConfig{
+			Every: 5 * time.Second,
+		},
 		Profiles: map[string][]string{
 			"test": {"model1", "model2"},
 		},
@@ -7,6 +7,7 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 )
@@ -155,9 +156,19 @@ groups:

 	modelLoadingState := false

+	defaultTimeout := TimeoutsConfig{
+		Connect:        30,
+		KeepAlive:      30,
+		ResponseHeader: 0,
+		TLSHandshake:   10,
+		ExpectContinue: 1,
+		IdleConn:       90,
+	}
+
 	expected := Config{
 		LogLevel:      "info",
 		LogTimeFormat: "",
+		LogToStdout:   LogToStdoutProxy,
 		StartPort:     5800,
 		Macros: MacroList{
 			{"svr-path", "path/to/server"},
@@ -165,44 +176,56 @@ groups:
 		SendLoadingState: false,
 		Models: map[string]ModelConfig{
 			"model1": {
-				Cmd:              "path/to/cmd --arg1 one",
-				CmdStop:          "taskkill /f /t /pid ${PID}",
-				Proxy:            "http://localhost:8080",
-				Aliases:          []string{"m1", "model-one"},
-				Env:              []string{"VAR1=value1", "VAR2=value2"},
-				CheckEndpoint:    "/health",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8080",
+				Aliases:            []string{"m1", "model-one"},
+				Env:                []string{"VAR1=value1", "VAR2=value2"},
+				CheckEndpoint:      "/health",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model2": {
-				Cmd:              "path/to/server --arg1 one",
-				CmdStop:          "taskkill /f /t /pid ${PID}",
-				Proxy:            "http://localhost:8081",
-				Aliases:          []string{"m2"},
-				Env:              []string{},
-				CheckEndpoint:    "/",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/server --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"m2"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model3": {
-				Cmd:              "path/to/cmd --arg1 one",
-				CmdStop:          "taskkill /f /t /pid ${PID}",
-				Proxy:            "http://localhost:8081",
-				Aliases:          []string{"mthree"},
-				Env:              []string{},
-				CheckEndpoint:    "/",
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8081",
+				Aliases:            []string{"mthree"},
+				Env:                []string{},
+				CheckEndpoint:      "/",
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 			"model4": {
-				Cmd:              "path/to/cmd --arg1 one",
-				CmdStop:          "taskkill /f /t /pid ${PID}",
-				Proxy:            "http://localhost:8082",
-				CheckEndpoint:    "/",
-				Aliases:          []string{},
-				Env:              []string{},
-				SendLoadingState: &modelLoadingState,
+				Cmd:                "path/to/cmd --arg1 one",
+				CmdStop:            "taskkill /f /t /pid ${PID}",
+				Proxy:              "http://localhost:8082",
+				CheckEndpoint:      "/",
+				Aliases:            []string{},
+				Env:                []string{},
+				SendLoadingState:   &modelLoadingState,
+				Timeouts:           defaultTimeout,
+				HealthCheckTimeout: 15,
 			},
 		},
 		HealthCheckTimeout: 15,
 		MetricsMaxInMemory: 1000,
+		CaptureBuffer:      5,
+		Performance: PerformanceConfig{
+			Every: 5 * time.Second,
+		},
 		Profiles: map[string][]string{
 			"test": {"model1", "model2"},
 		},
@@ -0,0 +1,114 @@
+package config
+
+import (
+	"slices"
+	"sort"
+	"strings"
+)
+
+// ProtectedParams is the list of request parameters that filters may neither set nor strip.
+// They are protected to prevent breaking the proxy's ability to route requests correctly.
+var ProtectedParams = []string{"model"}
+
+// Filters contains filter settings for modifying request parameters.
+// Used by both models and peers.
+type Filters struct {
+	// StripParams is a comma-separated list of parameters to remove from requests.
+	// Protected params (like "model") can never be removed.
+	StripParams string `yaml:"stripParams"`
+
+	// SetParams is a dictionary of parameters to set/override in requests.
+	// Protected params (like "model") cannot be set.
+	SetParams map[string]any `yaml:"setParams"`
+
+	// SetParamsByID maps requested model IDs to parameters to set/override in requests.
+	// Useful with aliases: a single loaded model can behave differently depending on
+	// which alias the client used. Applied after SetParams, so it can override those values.
+	// Protected params (like "model") cannot be set.
+	SetParamsByID map[string]map[string]any `yaml:"setParamsByID"`
+}
+
+// SanitizedStripParams splits StripParams on commas and returns the cleaned
+// parameter names in sorted order. Surrounding whitespace is trimmed; empty
+// entries, repeated entries, and protected params are dropped. The result is
+// nil when StripParams is empty or nothing remains after cleaning.
+func (f Filters) SanitizedStripParams() []string {
+	if f.StripParams == "" {
+		return nil
+	}
+
+	var (
+		pieces = strings.Split(f.StripParams, ",")
+		unique = make(map[string]bool)
+		result = make([]string, 0, len(pieces))
+	)
+	for _, piece := range pieces {
+		name := strings.TrimSpace(piece)
+		switch {
+		case name == "":
+			// blank entry, e.g. from a trailing comma
+		case unique[name]:
+			// already collected
+		case slices.Contains(ProtectedParams, name):
+			// protected params are never stripped
+		default:
+			unique[name] = true
+			result = append(result, name)
+		}
+	}
+
+	if len(result) == 0 {
+		return nil
+	}
+
+	slices.Sort(result)
+	return result
+}
+
+// SanitizedSetParamsByID returns the params to set for the given requestedModelID
+// with protected params removed, together with the remaining keys in sorted order
+// for consistent iteration.
+// Returns (nil, nil) if the ID has no entry, its entry is empty, or all of its
+// params are protected.
+func (f Filters) SanitizedSetParamsByID(requestedModelID string) (map[string]any, []string) {
+	// Indexing a nil or empty map is safe and simply reports "not found".
+	params, found := f.SetParamsByID[requestedModelID]
+	if !found {
+		return nil, nil
+	}
+
+	// Reuse the SetParams sanitizer so both filters apply exactly the same
+	// protection, sorting, and empty-result rules.
+	return Filters{SetParams: params}.SanitizedSetParams()
+}
+
+// SanitizedSetParams returns a copy of SetParams without the protected params,
+// along with the surviving keys sorted for consistent iteration order.
+// Returns (nil, nil) when SetParams is empty or every entry is protected.
+func (f Filters) SanitizedSetParams() (map[string]any, []string) {
+	if len(f.SetParams) == 0 {
+		return nil, nil
+	}
+
+	allowed := make(map[string]any, len(f.SetParams))
+	names := make([]string, 0, len(f.SetParams))
+	for name, val := range f.SetParams {
+		// Only params that are not protected are carried over.
+		if !slices.Contains(ProtectedParams, name) {
+			allowed[name] = val
+			names = append(names, name)
+		}
+	}
+
+	if len(names) == 0 {
+		return nil, nil
+	}
+
+	// Sort keys for consistent ordering
+	sort.Strings(names)
+	return allowed, names
+}
@@ -0,0 +1,285 @@
+package config
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestFilters_SanitizedStripParams checks that the comma-separated StripParams
+// string comes back trimmed, de-duplicated, without protected params and in
+// sorted order, and that nil is returned when nothing is left.
+func TestFilters_SanitizedStripParams(t *testing.T) {
+	cases := []struct {
+		name     string
+		input    string
+		expected []string
+	}{
+		{"empty string", "", nil},
+		{"single param", "temperature", []string{"temperature"}},
+		// the result is sorted alphabetically
+		{"multiple params", "temperature, top_p, top_k", []string{"temperature", "top_k", "top_p"}},
+		{"model param filtered", "model, temperature, top_p", []string{"temperature", "top_p"}},
+		{"only model param", "model", nil},
+		{"duplicates removed", "temperature, top_p, temperature", []string{"temperature", "top_p"}},
+		{"extra whitespace", "  temperature  ,  top_p  ", []string{"temperature", "top_p"}},
+		{"empty values filtered", "temperature,,top_p,", []string{"temperature", "top_p"}},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			filters := Filters{StripParams: tc.input}
+			assert.Equal(t, tc.expected, filters.SanitizedStripParams())
+		})
+	}
+}
+
+func TestFilters_SanitizedSetParams(t *testing.T) {
+	tests := []struct {
+		name       string
+		setParams  map[string]any
+		wantParams map[string]any
+		wantKeys   []string
+	}{
+		{
+			name:       "empty setParams",
+			setParams:  nil,
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name:       "empty map",
+			setParams:  map[string]any{},
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name: "normal params",
+			setParams: map[string]any{
+				"temperature": 0.7,
+				"top_p":       0.9,
+			},
+			wantParams: map[string]any{
+				"temperature": 0.7,
+				"top_p":       0.9,
+			},
+			wantKeys: []string{"temperature", "top_p"},
+		},
+		{
+			name: "protected model param filtered",
+			setParams: map[string]any{
+				"model":       "should-be-filtered",
+				"temperature": 0.7,
+			},
+			wantParams: map[string]any{
+				"temperature": 0.7,
+			},
+			wantKeys: []string{"temperature"},
+		},
+		{
+			name: "only protected param",
+			setParams: map[string]any{
+				"model": "should-be-filtered",
+			},
+			wantParams: nil,
+			wantKeys:   nil,
+		},
+		{
+			name: "complex nested values",
+			setParams: map[string]any{
+				"provider": map[string]any{
+					"data_collection": "deny",
+					"allow_fallbacks": false,
+				},
+				"transforms": []string{"middle-out"},
+			},
+			wantParams: map[string]any{
+				"provider": map[string]any{
+					"data_collection": "deny",
+					"allow_fallbacks": false,
+				},
+				"transforms": []string{"middle-out"},
+			},
+			wantKeys: []string{"provider", "transforms"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := Filters{SetParams: tt.setParams}
+			gotParams, gotKeys := f.SanitizedSetParams()
+
+			assert.Equal(t, len(tt.wantKeys), len(gotKeys), "keys length mismatch")
+			for i, key := range gotKeys {
+				assert.Equal(t, tt.wantKeys[i], key, "key mismatch at %d", i)
+			}
+
+			if tt.wantParams == nil {
+				assert.Nil(t, gotParams, "expected nil params")
+				return
+			}
+
+			assert.Equal(t, len(tt.wantParams), len(gotParams), "params length mismatch")
+			for key, wantValue := range tt.wantParams {
+				gotValue, exists := gotParams[key]
+				assert.True(t, exists, "missing key: %s", key)
+				// Simple comparison for basic types
+				switch v := wantValue.(type) {
+				case string, int, float64, bool:
+					assert.Equal(t, v, gotValue, "value mismatch for key %s", key)
+				}
+			}
+		})
+	}
+}
+
+func TestFilters_SanitizedSetParamsByID(t *testing.T) {
+	tests := []struct {
+		name             string
+		setParamsByID    map[string]map[string]any
+		requestedModelID string
+		wantParams       map[string]any
+		wantKeys         []string
+	}{
+		{
+			name:             "empty SetParamsByID returns nil",
+			setParamsByID:    nil,
+			requestedModelID: "model1",
+			wantParams:       nil,
+			wantKeys:         nil,
+		},
+		{
+			name:             "empty map returns nil",
+			setParamsByID:    map[string]map[string]any{},
+			requestedModelID: "model1",
+			wantParams:       nil,
+			wantKeys:         nil,
+		},
+		{
+			name: "non-matching model ID returns nil",
+			setParamsByID: map[string]map[string]any{
+				"model2": {"temperature": 0.9},
+			},
+			requestedModelID: "model1",
+			wantParams:       nil,
+			wantKeys:         nil,
+		},
+		{
+			name: "matching model ID returns correct params",
+			setParamsByID: map[string]map[string]any{
+				"model1": {"temperature": 0.7, "top_p": 0.9},
+				"model2": {"temperature": 0.5},
+			},
+			requestedModelID: "model1",
+			wantParams: map[string]any{
+				"temperature": 0.7,
+				"top_p":       0.9,
+			},
+			wantKeys: []string{"temperature", "top_p"},
+		},
+		{
+			name: "protected param model is filtered out",
+			setParamsByID: map[string]map[string]any{
+				"model1": {
+					"model":       "should-be-filtered",
+					"temperature": 0.7,
+				},
+			},
+			requestedModelID: "model1",
+			wantParams: map[string]any{
+				"temperature": 0.7,
+			},
+			wantKeys: []string{"temperature"},
+		},
+		{
+			name: "only protected param returns nil",
+			setParamsByID: map[string]map[string]any{
+				"model1": {
+					"model": "should-be-filtered",
+				},
+			},
+			requestedModelID: "model1",
+			wantParams:       nil,
+			wantKeys:         nil,
+		},
+		{
+			name: "keys are sorted",
+			setParamsByID: map[string]map[string]any{
+				"model1": {
+					"z_param": "z",
+					"a_param": "a",
+					"m_param": "m",
+				},
+			},
+			requestedModelID: "model1",
+			wantParams: map[string]any{
+				"z_param": "z",
+				"a_param": "a",
+				"m_param": "m",
+			},
+			wantKeys: []string{"a_param", "m_param", "z_param"},
+		},
+		{
+			name: "alias style key lookup",
+			setParamsByID: map[string]map[string]any{
+				"model1:high": {"reasoning_effort": "high"},
+				"model1:low":  {"reasoning_effort": "low"},
+			},
+			requestedModelID: "model1:high",
+			wantParams: map[string]any{
+				"reasoning_effort": "high",
+			},
+			wantKeys: []string{"reasoning_effort"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := Filters{SetParamsByID: tt.setParamsByID}
+			gotParams, gotKeys := f.SanitizedSetParamsByID(tt.requestedModelID)
+
+			if tt.wantParams == nil {
+				assert.Nil(t, gotParams)
+				assert.Nil(t, gotKeys)
+				return
+			}
+
+			assert.Equal(t, tt.wantKeys, gotKeys)
+			assert.Equal(t, tt.wantParams, gotParams)
+		})
+	}
+}
+
+func TestProtectedParams(t *testing.T) {
+	// Verify that "model" is protected
+	assert.Contains(t, ProtectedParams, "model")
+}
@@ -104,6 +104,62 @@ models:
 	assert.Contains(t, err.Error(), "self-reference")
 }

+// TestConfig_MacroInNameAndDescription verifies that user-defined macros are
+// substituted inside a model's name and description fields.
+func TestConfig_MacroInNameAndDescription(t *testing.T) {
+	yamlDoc := `
+startPort: 10000
+macros:
+  "VARIANT": "Q4_K_M"
+  "FAMILY": "llama"
+
+models:
+  my-model:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "${FAMILY} ${VARIANT}"
+    description: "A ${FAMILY} model in ${VARIANT} format"
+`
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, err)
+
+	model := cfg.Models["my-model"]
+	assert.Equal(t, "llama Q4_K_M", model.Name)
+	assert.Equal(t, "A llama model in Q4_K_M format", model.Description)
+}
+
+// TestConfig_ModelIDInNameAndDescription verifies that the MODEL_ID macro is
+// replaced by the model's own key inside its name and description fields.
+func TestConfig_ModelIDInNameAndDescription(t *testing.T) {
+	yamlDoc := `
+startPort: 10000
+models:
+  llama-3b:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "Model: ${MODEL_ID}"
+    description: "Running ${MODEL_ID}"
+`
+
+	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.NoError(t, err)
+
+	model := cfg.Models["llama-3b"]
+	assert.Equal(t, "Model: llama-3b", model.Name)
+	assert.Equal(t, "Running llama-3b", model.Description)
+}
+
+// TestConfig_UnknownMacroInNameDescription verifies that loading fails, and
+// that the error names the macro, when a model name uses an undefined macro.
+func TestConfig_UnknownMacroInNameDescription(t *testing.T) {
+	yamlDoc := `
+startPort: 10000
+models:
+  test:
+    cmd: echo ok
+    proxy: http://localhost:8080
+    name: "Model ${UNDEFINED}"
+`
+
+	_, loadErr := LoadConfigFromReader(strings.NewReader(yamlDoc))
+	assert.Error(t, loadErr)
+	assert.Contains(t, loadErr.Error(), "UNDEFINED")
+}
+
 // Test undefined macro reference error
 func TestConfig_UndefinedMacroReference(t *testing.T) {
 	content := `
@@ -0,0 +1,226 @@
+package config
+
+import (
+	"fmt"
+	"regexp"
+	"sort"
+
+	"gopkg.in/yaml.v3"
+)
+
+// varKeyPattern is the shape a matrix var key must have: 1 to 8 ASCII
+// letters or digits, nothing else.
+var varKeyPattern = regexp.MustCompile(`^[a-zA-Z0-9]{1,8}$`)
+
+// MatrixConfig represents the swap matrix configuration block.
+type MatrixConfig struct {
+	// Var maps a short var ID (the identifiers used in set DSL expressions)
+	// to a model name; read from the "vars" key.
+	Var        map[string]string `yaml:"vars"`
+	// EvictCosts holds an eviction cost per var ID.
+	EvictCosts map[string]int    `yaml:"evict_costs"`
+	// Sets holds the named DSL expressions in YAML definition order.
+	Sets       OrderedSets       `yaml:"sets"`
+}
+
+// SetEntry is a single named set with its DSL expression.
+type SetEntry struct {
+	// Name is the set's key in the YAML "sets" mapping.
+	Name string
+	// DSL is the raw, unparsed expression string for the set.
+	DSL  string
+}
+
+// OrderedSets preserves YAML definition order of sets (used for tie-breaking).
+type OrderedSets []SetEntry
+
+// UnmarshalYAML decodes a YAML mapping of set name -> DSL expression into a
+// slice, keeping the entries in the order they appear in the document.
+//
+// It returns an error when the node is not a mapping, when a key or value
+// cannot be decoded as a string, or when a set name is defined more than once.
+// The duplicate check lives here because a custom unmarshaler working on a
+// *yaml.Node does not go through the duplicate-key detection yaml.v3 applies
+// when decoding into ordinary maps and structs; without it a repeated name
+// would be accepted silently.
+func (os *OrderedSets) UnmarshalYAML(value *yaml.Node) error {
+	if value.Kind != yaml.MappingNode {
+		return fmt.Errorf("sets must be a mapping")
+	}
+
+	entries := make([]SetEntry, 0, len(value.Content)/2)
+	seen := make(map[string]bool, len(value.Content)/2)
+
+	// A mapping node stores its children as alternating key, value nodes.
+	for i := 0; i+1 < len(value.Content); i += 2 {
+		keyNode := value.Content[i]
+		valueNode := value.Content[i+1]
+
+		var name string
+		if err := keyNode.Decode(&name); err != nil {
+			return fmt.Errorf("failed to decode set name: %w", err)
+		}
+		if seen[name] {
+			return fmt.Errorf("line %d: duplicate set name %q", keyNode.Line, name)
+		}
+		seen[name] = true
+
+		var dsl string
+		if err := valueNode.Decode(&dsl); err != nil {
+			return fmt.Errorf("failed to decode DSL for set %q: %w", name, err)
+		}
+
+		entries = append(entries, SetEntry{Name: name, DSL: dsl})
+	}
+
+	*os = entries
+	return nil
+}
+
+// ExpandedSet is one valid combination of concurrent models (real model names).
+type ExpandedSet struct {
+	// SetName is the name of the set this combination was expanded from.
+	SetName string
+	// DSL is the unexpanded expression of that set.
+	DSL     string
+	Models  []string // real model names, sorted
+}
+
+// ValidateMatrix validates the matrix config and returns all expanded sets.
+//
+// Validation steps, in order:
+//   - at least one set and at least one var must be defined;
+//   - every var key must match varKeyPattern and name a model in models;
+//   - every evict_costs value must be positive and keyed by a known var ID;
+//   - every +ref in a set's DSL must name a defined set, with no cycles.
+//
+// Map-backed sections are checked in sorted key order, so a config with
+// several invalid entries always reports the same error.
+//
+// Sets are then expanded in dependency order. Each combination's var IDs are
+// resolved to model names, de-duplicated (two var IDs may point at the same
+// model) and sorted. An error is returned when the running total of
+// combinations exceeds maxDSLExpansions.
+func ValidateMatrix(matrix MatrixConfig, models map[string]ModelConfig) ([]ExpandedSet, error) {
+	if len(matrix.Sets) == 0 {
+		return nil, fmt.Errorf("matrix must define at least one set")
+	}
+
+	if len(matrix.Var) == 0 {
+		return nil, fmt.Errorf("matrix must define at least one var")
+	}
+
+	// Validate var entries in sorted key order for a stable error.
+	varIDs := make([]string, 0, len(matrix.Var))
+	for id := range matrix.Var {
+		varIDs = append(varIDs, id)
+	}
+	sort.Strings(varIDs)
+	for _, id := range varIDs {
+		if !varKeyPattern.MatchString(id) {
+			return nil, fmt.Errorf("var key %q must be alphanumeric and 1-8 characters", id)
+		}
+		modelName := matrix.Var[id]
+		if _, exists := models[modelName]; !exists {
+			return nil, fmt.Errorf("var key %q references unknown model %q", id, modelName)
+		}
+	}
+
+	// Validate evict_costs; a nil map simply contributes no keys.
+	costKeys := make([]string, 0, len(matrix.EvictCosts))
+	for key := range matrix.EvictCosts {
+		costKeys = append(costKeys, key)
+	}
+	sort.Strings(costKeys)
+	for _, key := range costKeys {
+		cost := matrix.EvictCosts[key]
+		if cost <= 0 {
+			return nil, fmt.Errorf("evict_cost for %q must be a positive integer, got %d", key, cost)
+		}
+		if _, ok := matrix.Var[key]; !ok {
+			return nil, fmt.Errorf("evict_costs: unknown var ID %q", key)
+		}
+	}
+
+	// Index the sets by name; used both to check +ref targets and to look up
+	// each set's DSL during expansion.
+	setDSL := make(map[string]string, len(matrix.Sets))
+	for _, entry := range matrix.Sets {
+		setDSL[entry.Name] = entry.DSL
+	}
+
+	// Build dependency graph for +ref topological sort
+	deps := make(map[string][]string, len(matrix.Sets)) // setName -> set names it depends on
+	for _, entry := range matrix.Sets {
+		refs, err := extractRefs(entry.DSL)
+		if err != nil {
+			return nil, fmt.Errorf("set %q: %w", entry.Name, err)
+		}
+		for _, ref := range refs {
+			if _, defined := setDSL[ref]; !defined {
+				return nil, fmt.Errorf("set %q references undefined set %q", entry.Name, ref)
+			}
+		}
+		deps[entry.Name] = refs
+	}
+
+	// Topological sort with cycle detection
+	order, err := topologicalSort(matrix.Sets, deps)
+	if err != nil {
+		return nil, err
+	}
+
+	// Expand sets in topological order so every +ref is resolved before use.
+	resolvedRefs := make(map[string][][]string) // set name -> expanded alias-level combos
+	var allExpanded []ExpandedSet
+	totalCombinations := 0
+
+	for _, name := range order {
+		dsl := setDSL[name]
+		combos, err := ParseAndExpandDSL(dsl, resolvedRefs)
+		if err != nil {
+			return nil, fmt.Errorf("set %q: %w", name, err)
+		}
+
+		resolvedRefs[name] = combos
+
+		// Resolve var IDs to real model names
+		for _, combo := range combos {
+			resolved := make([]string, len(combo))
+			for i, ident := range combo {
+				realName, ok := matrix.Var[ident]
+				if !ok {
+					return nil, fmt.Errorf("set %q: unknown var ID %q", name, ident)
+				}
+				resolved[i] = realName
+			}
+			allExpanded = append(allExpanded, ExpandedSet{
+				SetName: name,
+				DSL:     dsl,
+				// Distinct var IDs can map to one model, so de-duplicate
+				// again at the model-name level; the result is also sorted.
+				Models: dedupAndSort(resolved),
+			})
+		}
+
+		totalCombinations += len(combos)
+		if totalCombinations > maxDSLExpansions {
+			return nil, fmt.Errorf("total expanded combinations (%d) exceed limit of %d", totalCombinations, maxDSLExpansions)
+		}
+	}
+
+	return allExpanded, nil
+}
+
+// topologicalSort orders set names so that every set appears after all the
+// sets listed for it in deps. Sets are visited in definition order, which
+// keeps the output deterministic. A dependency cycle yields an error.
+func topologicalSort(sets OrderedSets, deps map[string][]string) ([]string, error) {
+	const (
+		unvisited = iota
+		inProgress
+		done
+	)
+
+	marks := make(map[string]int, len(sets))
+	var sorted []string
+
+	var walk func(set string) error
+	walk = func(set string) error {
+		if marks[set] == done {
+			return nil
+		}
+		// Reaching a set that is still being walked means we looped back.
+		if marks[set] == inProgress {
+			return fmt.Errorf("circular reference detected involving set %q", set)
+		}
+
+		marks[set] = inProgress
+		for _, dependency := range deps[set] {
+			if err := walk(dependency); err != nil {
+				return err
+			}
+		}
+		marks[set] = done
+
+		// Appended only after all dependencies, hence dependency order.
+		sorted = append(sorted, set)
+		return nil
+	}
+
+	for _, entry := range sets {
+		if marks[entry.Name] != unvisited {
+			continue
+		}
+		if err := walk(entry.Name); err != nil {
+			return nil, err
+		}
+	}
+
+	return sorted, nil
+}
+
+// ResolvedEvictCosts returns the configured evict costs keyed by real model
+// name. A key that is a known var ID is translated to the model it refers
+// to; any other key is kept unchanged. The result is never nil.
+// Models without a configured cost are simply absent from the result; the
+// default of 1 for them is presumably applied by the caller (TODO confirm).
+func (m *MatrixConfig) ResolvedEvictCosts() map[string]int {
+	resolved := make(map[string]int)
+	// Ranging over a nil map is a no-op, so unset EvictCosts gives an empty map.
+	for id, cost := range m.EvictCosts {
+		target := id
+		if model, isVar := m.Var[id]; isVar {
+			target = model
+		}
+		resolved[target] = cost
+	}
+	return resolved
+}
@@ -0,0 +1,376 @@
+package config
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// maxDSLExpansions caps how many combinations DSL expansion may produce
+// before it is aborted with an error.
+const maxDSLExpansions = 1000
+
+// Token types for the DSL lexer
+type tokenType int
+
+const (
+	tokIdent  tokenType = iota // model alias or name
+	tokAnd                     // &
+	tokOr                      // |
+	tokLParen                  // (
+	tokRParen                  // )
+	tokRef                     // +setName
+	tokEOF
+)
+
+// token is one lexical unit of a DSL expression: its type plus the text it
+// carries (the operator character, the identifier, the set name of a +ref
+// without the leading '+', or "" for EOF).
+type token struct {
+	typ tokenType
+	val string
+}
+
+// tokenize converts a DSL string into its token sequence, always terminated
+// by a tokEOF token. Whitespace is skipped. An error is returned for a '+'
+// that is not followed by a set name and for any character that is neither
+// an operator nor an identifier character. Positions in error messages are
+// rune indexes into the input.
+func tokenize(input string) ([]token, error) {
+	chars := []rune(input)
+	var out []token
+
+	// identEnd returns the index just past the run of identifier characters
+	// that starts at from (equal to from when there is no such run).
+	identEnd := func(from int) int {
+		for from < len(chars) && isIdentChar(chars[from]) {
+			from++
+		}
+		return from
+	}
+
+	// Single-character operators and their token types.
+	operators := map[rune]tokenType{
+		'&': tokAnd,
+		'|': tokOr,
+		'(': tokLParen,
+		')': tokRParen,
+	}
+
+	for pos := 0; pos < len(chars); {
+		c := chars[pos]
+
+		if unicode.IsSpace(c) {
+			pos++
+			continue
+		}
+
+		if typ, isOperator := operators[c]; isOperator {
+			out = append(out, token{typ, string(c)})
+			pos++
+			continue
+		}
+
+		if c == '+' {
+			// +ref: the set name is the identifier right after the '+'.
+			nameStart := pos + 1
+			nameEnd := identEnd(nameStart)
+			if nameEnd == nameStart {
+				return nil, fmt.Errorf("expected set name after '+' at position %d", nameStart)
+			}
+			out = append(out, token{tokRef, string(chars[nameStart:nameEnd])})
+			pos = nameEnd
+			continue
+		}
+
+		if !isIdentChar(c) {
+			return nil, fmt.Errorf("unexpected character %q at position %d", c, pos)
+		}
+		end := identEnd(pos)
+		out = append(out, token{tokIdent, string(chars[pos:end])})
+		pos = end
+	}
+
+	return append(out, token{tokEOF, ""}), nil
+}
+
+// isIdentChar reports whether ch may appear in an identifier: any Unicode
+// letter or digit, or one of '_', '-' and '.'.
+func isIdentChar(ch rune) bool {
+	switch ch {
+	case '_', '-', '.':
+		return true
+	}
+	return unicode.IsLetter(ch) || unicode.IsDigit(ch)
+}
+
+// AST node types
+//
+// dslNode is implemented by every AST node. Its only method is an unexported
+// marker, so node types can only be declared inside this package.
+type dslNode interface {
+	dslNode()
+}
+
+// andNode combines its children with '&'; expansion takes the cartesian
+// product of the children's combinations.
+type andNode struct {
+	children []dslNode
+}
+
+// orNode lists alternatives joined with '|'; expansion takes the union of
+// the children's combinations.
+type orNode struct {
+	children []dslNode
+}
+
+// leafNode is a single identifier.
+type leafNode struct {
+	name string
+}
+
+// refNode is a +setName reference to another set.
+type refNode struct {
+	setName string
+}
+
+// Marker methods that make each node type satisfy dslNode.
+func (andNode) dslNode()  {}
+func (orNode) dslNode()   {}
+func (leafNode) dslNode() {}
+func (refNode) dslNode()  {}
+
+// parser holds state for recursive-descent parsing.
+type parser struct {
+	// tokens is the token stream being parsed.
+	tokens []token
+	// pos is the index in tokens of the next token to be consumed.
+	pos    int
+}
+
+// peek returns the current token without consuming it, or an EOF token once
+// the position has moved past the end of the token slice.
+func (p *parser) peek() token {
+	if p.pos >= len(p.tokens) {
+		return token{tokEOF, ""}
+	}
+	return p.tokens[p.pos]
+}
+
+// next returns the current token and advances past it. An EOF token is
+// never consumed, so calling next at the end keeps returning EOF.
+func (p *parser) next() token {
+	current := p.peek()
+	if current.typ == tokEOF {
+		return current
+	}
+	p.pos++
+	return current
+}
+
+// expect consumes the next token and checks that it has the given type.
+// The consumed token is returned in both cases; on a mismatch it comes with
+// an error describing the expected type and the text actually found.
+func (p *parser) expect(typ tokenType) (token, error) {
+	got := p.next()
+	if got.typ == typ {
+		return got, nil
+	}
+	return got, fmt.Errorf("expected token type %d, got %q", typ, got.val)
+}
+
+// parse builds the AST for a complete token stream and fails if any token
+// is left over after the expression.
+//
+// Grammar, as implemented by the parse* methods:
+//
+//	expr    = orExpr
+//	orExpr  = andExpr ('|' andExpr)*
+//	andExpr = atom ('&' atom)*
+//	atom    = ident | '+' ident | '(' expr ')'
+//
+// & binds tighter than |, so "a | b & c" means "a | (b & c)"
+func parse(tokens []token) (dslNode, error) {
+	p := parser{tokens: tokens}
+
+	root, err := p.parseExpr()
+	if err != nil {
+		return nil, err
+	}
+
+	if trailing := p.peek(); trailing.typ != tokEOF {
+		return nil, fmt.Errorf("unexpected token %q after expression", trailing.val)
+	}
+	return root, nil
+}
+
+// parseExpr parses a full expression. It is the top rule of the grammar and
+// delegates directly to parseOrExpr, the lowest-precedence operator.
+func (p *parser) parseExpr() (dslNode, error) {
+	return p.parseOrExpr()
+}
+
+// parseOrExpr parses one or more and-expressions separated by '|'. A single
+// operand is returned unchanged; several are wrapped in an orNode that keeps
+// them in source order.
+func (p *parser) parseOrExpr() (dslNode, error) {
+	first, err := p.parseAndExpr()
+	if err != nil {
+		return nil, err
+	}
+
+	alternatives := []dslNode{first}
+	for p.peek().typ == tokOr {
+		p.next() // consume |
+		alt, err := p.parseAndExpr()
+		if err != nil {
+			return nil, err
+		}
+		alternatives = append(alternatives, alt)
+	}
+
+	if len(alternatives) == 1 {
+		return first, nil
+	}
+	return orNode{children: alternatives}, nil
+}
+
+// parseAndExpr parses one or more atoms separated by '&'. A single atom is
+// returned unchanged; several are wrapped in an andNode that keeps them in
+// source order.
+func (p *parser) parseAndExpr() (dslNode, error) {
+	first, err := p.parseAtom()
+	if err != nil {
+		return nil, err
+	}
+
+	operands := []dslNode{first}
+	for p.peek().typ == tokAnd {
+		p.next() // consume &
+		operand, err := p.parseAtom()
+		if err != nil {
+			return nil, err
+		}
+		operands = append(operands, operand)
+	}
+
+	if len(operands) == 1 {
+		return first, nil
+	}
+	return andNode{children: operands}, nil
+}
+
+// parseAtom parses the smallest unit of an expression: an identifier, a
+// +setName reference, or a parenthesized sub-expression. Any other token is
+// reported as unexpected, and a '(' without its ')' as a missing parenthesis.
+func (p *parser) parseAtom() (dslNode, error) {
+	tok := p.peek()
+
+	if tok.typ == tokIdent {
+		p.next()
+		return leafNode{name: tok.val}, nil
+	}
+	if tok.typ == tokRef {
+		p.next()
+		return refNode{setName: tok.val}, nil
+	}
+	if tok.typ != tokLParen {
+		return nil, fmt.Errorf("unexpected token %q", tok.val)
+	}
+
+	p.next() // consume (
+	inner, err := p.parseExpr()
+	if err != nil {
+		return nil, err
+	}
+	// The token after the sub-expression is consumed and must be ')'.
+	if closing := p.next(); closing.typ != tokRParen {
+		return nil, fmt.Errorf("missing closing parenthesis")
+	}
+	return inner, nil
+}
+
+// expand turns an AST into the list of combinations it describes.
+//
+//   - a leaf yields one combination holding its name;
+//   - a +ref yields a copy of that set's entry in resolvedRefs (error if absent);
+//   - an OR node yields the concatenation of its children's combinations;
+//   - an AND node yields the cartesian product of its children's combinations.
+//
+// OR and AND expansion fail once more than maxDSLExpansions combinations
+// would be produced.
+func expand(node dslNode, resolvedRefs map[string][][]string) ([][]string, error) {
+	switch typed := node.(type) {
+	case leafNode:
+		return [][]string{{typed.name}}, nil
+
+	case refNode:
+		prior, found := resolvedRefs[typed.setName]
+		if !found {
+			return nil, fmt.Errorf("unknown set reference +%s", typed.setName)
+		}
+		// Copy every combination so callers cannot alter the stored set.
+		clone := make([][]string, 0, len(prior))
+		for _, combo := range prior {
+			clone = append(clone, append(make([]string, 0, len(combo)), combo...))
+		}
+		return clone, nil
+
+	case orNode:
+		var union [][]string
+		for _, alternative := range typed.children {
+			combos, err := expand(alternative, resolvedRefs)
+			if err != nil {
+				return nil, err
+			}
+			union = append(union, combos...)
+			if len(union) <= maxDSLExpansions {
+				continue
+			}
+			return nil, fmt.Errorf("DSL expansion exceeded %d combinations", maxDSLExpansions)
+		}
+		return union, nil
+
+	case andNode:
+		// Seed with a single empty combination, the identity for the product.
+		product := [][]string{{}}
+		for _, factor := range typed.children {
+			combos, err := expand(factor, resolvedRefs)
+			if err != nil {
+				return nil, err
+			}
+			if product, err = cartesianProduct(product, combos, maxDSLExpansions); err != nil {
+				return nil, err
+			}
+		}
+		return product, nil
+
+	default:
+		return nil, fmt.Errorf("unknown node type %T", node)
+	}
+}
+
+// cartesianProduct computes the cartesian product of two sets of combinations:
+// every combination of left is concatenated with every combination of right,
+// left-major. It returns an error if the product would hold more than limit
+// combinations; the size is computed in int64 so the check itself cannot
+// overflow a 32-bit int.
+//
+// The parameter is named limit rather than cap so that it does not shadow
+// the predeclared cap function.
+func cartesianProduct(left, right [][]string, limit int) ([][]string, error) {
+	size := int64(len(left)) * int64(len(right))
+	if size > int64(limit) {
+		return nil, fmt.Errorf("DSL expansion exceeded %d combinations", limit)
+	}
+	result := make([][]string, 0, int(size))
+	for _, l := range left {
+		for _, r := range right {
+			// Fresh backing array per combination: inputs are never aliased.
+			combo := make([]string, 0, len(l)+len(r))
+			combo = append(combo, l...)
+			combo = append(combo, r...)
+			result = append(result, combo)
+		}
+	}
+	return result, nil
+}
+
+// ParseAndExpandDSL runs the whole DSL pipeline on one expression: tokenize,
+// parse, expand. resolvedRefs supplies the already-expanded sets that +ref
+// atoms are replaced with. Each returned combination is de-duplicated and
+// sorted. A blank expression is rejected; tokenizer and parser failures are
+// wrapped with a "tokenize: " or "parse: " prefix.
+func ParseAndExpandDSL(dsl string, resolvedRefs map[string][][]string) ([][]string, error) {
+	expr := strings.TrimSpace(dsl)
+	if expr == "" {
+		return nil, fmt.Errorf("empty DSL expression")
+	}
+
+	toks, err := tokenize(expr)
+	if err != nil {
+		return nil, fmt.Errorf("tokenize: %w", err)
+	}
+
+	ast, err := parse(toks)
+	if err != nil {
+		return nil, fmt.Errorf("parse: %w", err)
+	}
+
+	combos, err := expand(ast, resolvedRefs)
+	if err != nil {
+		return nil, err
+	}
+
+	// Normalize each combination so "a & a" collapses and order is stable.
+	for idx := range combos {
+		combos[idx] = dedupAndSort(combos[idx])
+	}
+	return combos, nil
+}
+
+// dedupAndSort returns the distinct entries of items in ascending order.
+// The input slice is left untouched; an empty input yields a nil slice.
+func dedupAndSort(items []string) []string {
+	known := make(map[string]struct{}, len(items))
+	var distinct []string
+	for _, item := range items {
+		if _, dup := known[item]; dup {
+			continue
+		}
+		known[item] = struct{}{}
+		distinct = append(distinct, item)
+	}
+	sort.Strings(distinct)
+	return distinct
+}
+
+// extractRefs tokenizes a DSL string and returns the distinct set names it
+// references via +ref, in order of first appearance. No parsing is done; the
+// result feeds the dependency graph used for topological sorting. The slice
+// is nil when the expression has no references.
+func extractRefs(dsl string) ([]string, error) {
+	toks, err := tokenize(dsl)
+	if err != nil {
+		return nil, err
+	}
+
+	var names []string
+	recorded := make(map[string]bool)
+	for _, tok := range toks {
+		if tok.typ != tokRef || recorded[tok.val] {
+			continue
+		}
+		recorded[tok.val] = true
+		names = append(names, tok.val)
+	}
+	return names, nil
+}
@@ -0,0 +1,300 @@
+package config
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestDSL_Tokenize checks the lexer's output for identifiers, operators,
+// parentheses and +refs, with and without whitespace, plus its two error
+// cases (a bare '+' and an illegal character).
+func TestDSL_Tokenize(t *testing.T) {
+	// Every successful tokenization ends with this token.
+	eof := token{tokEOF, ""}
+
+	cases := []struct {
+		name   string
+		input  string
+		expect []token
+		errMsg string
+	}{
+		{
+			name:   "single identifier",
+			input:  "abc",
+			expect: []token{{tokIdent, "abc"}, eof},
+		},
+		{
+			name:   "identifier with hyphens and dots",
+			input:  "model-name.v2",
+			expect: []token{{tokIdent, "model-name.v2"}, eof},
+		},
+		{
+			name:   "and expression",
+			input:  "a & b",
+			expect: []token{{tokIdent, "a"}, {tokAnd, "&"}, {tokIdent, "b"}, eof},
+		},
+		{
+			name:   "or expression",
+			input:  "a | b",
+			expect: []token{{tokIdent, "a"}, {tokOr, "|"}, {tokIdent, "b"}, eof},
+		},
+		{
+			name:  "parentheses",
+			input: "(a | b) & c",
+			expect: []token{
+				{tokLParen, "("}, {tokIdent, "a"}, {tokOr, "|"}, {tokIdent, "b"}, {tokRParen, ")"},
+				{tokAnd, "&"}, {tokIdent, "c"},
+				eof,
+			},
+		},
+		{
+			name:   "ref token",
+			input:  "+llms & v",
+			expect: []token{{tokRef, "llms"}, {tokAnd, "&"}, {tokIdent, "v"}, eof},
+		},
+		{
+			name:  "no whitespace",
+			input: "(a|b)&c",
+			expect: []token{
+				{tokLParen, "("}, {tokIdent, "a"}, {tokOr, "|"}, {tokIdent, "b"}, {tokRParen, ")"},
+				{tokAnd, "&"}, {tokIdent, "c"},
+				eof,
+			},
+		},
+		{name: "empty ref", input: "+", errMsg: "expected set name after '+'"},
+		{name: "invalid character", input: "a @ b", errMsg: "unexpected character"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := tokenize(tc.input)
+			if tc.errMsg == "" {
+				require.NoError(t, err)
+				assert.Equal(t, tc.expect, got)
+				return
+			}
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tc.errMsg)
+		})
+	}
+}
+
+func TestDSL_ParseAndExpand(t *testing.T) {
+	tests := []struct {
+		name   string
+		dsl    string
+		refs   map[string][][]string
+		expect [][]string
+		errMsg string
+	}{
+		{
+			name:   "single model",
+			dsl:    "L",
+			expect: [][]string{{"L"}},
+		},
+		{
+			name:   "two models with AND",
+			dsl:    "a & b",
+			expect: [][]string{{"a", "b"}},
+		},
+		{
+			name:   "two models with OR",
+			dsl:    "a | b",
+			expect: [][]string{{"a"}, {"b"}},
+		},
+		{
+			name:   "three models with OR",
+			dsl:    "a | b | c",
+			expect: [][]string{{"a"}, {"b"}, {"c"}},
+		},
+		{
+			name: "cartesian product (a|b) & (c|d)",
+			dsl:  "(a | b) & (c | d)",
+			expect: [][]string{
+				{"a", "c"},
+				{"a", "d"},
+				{"b", "c"},
+				{"b", "d"},
+			},
+		},
+		{
+			name: "three-way AND",
+			dsl:  "a & b & c",
+			expect: [][]string{
+				{"a", "b", "c"},
+			},
+		},
+		{
+			name: "(g | q | m) & v",
+			dsl:  "(g | q | m) & v",
+			expect: [][]string{
+				{"g", "v"},
+				{"q", "v"},
+				{"m", "v"},
+			},
+		},
+		{
+			name: "(g | q) & v & e",
+			dsl:  "(g | q) & v & e",
+			expect: [][]string{
+				{"e", "g", "v"},
+				{"e", "q", "v"},
+			},
+		},
+		{
+			name: "precedence: a | b & c means a | (b & c)",
+			dsl:  "a | b & c",
+			expect: [][]string{
+				{"a"},
+				{"b", "c"},
+			},
+		},
+		{
+			name: "+ref inlining",
+			dsl:  "+llms & v",
+			refs: map[string][][]string{
+				"llms": {{"g"}, {"q"}, {"m"}},
+			},
+			expect: [][]string{
+				{"g", "v"},
+				{"q", "v"},
+				{"m", "v"},
+			},
+		},
+		{
+			name: "+ref chained",
+			dsl:  "+with_tts & e",
+			refs: map[string][][]string{
+				"with_tts": {{"g", "v"}, {"q", "v"}, {"m", "v"}},
+			},
+			expect: [][]string{
+				{"e", "g", "v"},
+				{"e", "q", "v"},
+				{"e", "m", "v"},
+			},
+		},
+		{
+			name: "dedup within combination",
+			dsl:  "a & a",
+			expect: [][]string{
+				{"a"},
+			},
+		},
+		{
+			name:   "empty expression",
+			dsl:    "",
+			errMsg: "empty DSL expression",
+		},
+		{
+			name:   "unmatched open paren",
+			dsl:    "(a | b",
+			errMsg: "missing closing parenthesis",
+		},
+		{
+			name:   "unmatched close paren",
+			dsl:    "a | b)",
+			errMsg: "unexpected token",
+		},
+		{
+			name:   "unknown ref",
+			dsl:    "+unknown",
+			errMsg: "unknown set reference +unknown",
+		},
+		{
+			name:   "empty parens",
+			dsl:    "()",
+			errMsg: "unexpected token",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			refs := tt.refs
+			if refs == nil {
+				refs = map[string][][]string{}
+			}
+			result, err := ParseAndExpandDSL(tt.dsl, refs)
+			if tt.errMsg != "" {
+				require.Error(t, err)
+				assert.Contains(t, err.Error(), tt.errMsg)
+			} else {
+				require.NoError(t, err)
+				assert.Equal(t, tt.expect, result)
+			}
+		})
+	}
+}
+
+// TestDSL_ExpansionCap checks that an expression expanding to more than 1000
+// combinations is rejected.
+func TestDSL_ExpansionCap(t *testing.T) {
+	// (a0|a1|...|a31) & (b0|b1|...|b31) = 32 * 32 = 1024 combos
+	const width = 32
+	left := make([]string, 0, width)
+	right := make([]string, 0, width)
+	for i := 0; i < width; i++ {
+		left = append(left, fmt.Sprintf("a%d", i))
+		right = append(right, fmt.Sprintf("b%d", i))
+	}
+
+	expr := "(" + join(left, " | ") + ") & (" + join(right, " | ") + ")"
+
+	_, err := ParseAndExpandDSL(expr, map[string][][]string{})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "exceeded")
+}
+
+// TestDSL_ExtractRefs checks that +ref names are returned in order of
+// appearance and that an expression without refs yields an empty result.
+func TestDSL_ExtractRefs(t *testing.T) {
+	withRefs, err := extractRefs("+llms & v & +other")
+	require.NoError(t, err)
+	assert.Equal(t, []string{"llms", "other"}, withRefs)
+
+	withoutRefs, err := extractRefs("a & b")
+	require.NoError(t, err)
+	assert.Empty(t, withoutRefs)
+}
+
+// join concatenates items with sep between consecutive elements; an empty
+// input yields "".
+func join(items []string, sep string) string {
+	if len(items) == 0 {
+		return ""
+	}
+	joined := items[0]
+	for _, item := range items[1:] {
+		joined += sep + item
+	}
+	return joined
+}
@@ -0,0 +1,305 @@
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// makeModels builds a model table for tests: one ModelConfig per name, each
+// with Cmd set to "echo <name>".
+func makeModels(names ...string) map[string]ModelConfig {
+	models := make(map[string]ModelConfig, len(names))
+	for i := range names {
+		id := names[i]
+		models[id] = ModelConfig{Cmd: "echo " + id}
+	}
+	return models
+}
+
+// TestValidateMatrix_Basic expands two sets and checks both the order and the
+// contents of the resulting combinations.
+func TestValidateMatrix_Basic(t *testing.T) {
+	models := makeModels("gemma", "qwen", "mistral", "voxtral", "llama70B")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{
+			"g": "gemma",
+			"q": "qwen",
+			"m": "mistral",
+			"v": "voxtral",
+			"L": "llama70B",
+		},
+		EvictCosts: map[string]int{
+			"L": 30,
+			"v": 50,
+		},
+		Sets: OrderedSets{
+			{Name: "standard", DSL: "(g | q | m) & v"},
+			{Name: "full", DSL: "L"},
+		},
+	}
+
+	expanded, err := ValidateMatrix(matrix, models)
+	require.NoError(t, err)
+
+	// standard expands to [gemma,voxtral], [qwen,voxtral], [mistral,voxtral]
+	// full expands to [llama70B]
+	want := []struct {
+		setName string
+		models  []string
+	}{
+		{"standard", []string{"gemma", "voxtral"}},
+		{"standard", []string{"qwen", "voxtral"}},
+		{"standard", []string{"mistral", "voxtral"}},
+		{"full", []string{"llama70B"}},
+	}
+
+	// require, not assert: a short result must stop the test here instead of
+	// panicking on the index expressions below.
+	require.Len(t, expanded, len(want))
+
+	for i, w := range want {
+		assert.Equal(t, w.setName, expanded[i].SetName, "entry %d set name", i)
+		assert.Equal(t, w.models, expanded[i].Models, "entry %d models", i)
+	}
+}
+
+// TestValidateMatrix_WithRef checks that "+name" references to earlier sets
+// are expanded, including a reference to a set that itself uses a reference.
+func TestValidateMatrix_WithRef(t *testing.T) {
+	models := makeModels("gemma", "qwen", "mistral", "voxtral", "reranker")
+
+	matrix := MatrixConfig{
+		Var: map[string]string{
+			"g": "gemma",
+			"q": "qwen",
+			"m": "mistral",
+			"v": "voxtral",
+			"e": "reranker",
+		},
+		Sets: OrderedSets{
+			{Name: "llms", DSL: "g | q | m"},
+			{Name: "with_tts", DSL: "+llms & v"},
+			{Name: "mega", DSL: "+with_tts & e"},
+		},
+	}
+
+	expanded, err := ValidateMatrix(matrix, models)
+	require.NoError(t, err)
+
+	// llms: [gemma], [qwen], [mistral]
+	// with_tts: [gemma,voxtral], [qwen,voxtral], [mistral,voxtral]
+	// mega: [gemma,reranker,voxtral], [qwen,reranker,voxtral], [mistral,reranker,voxtral]
+	assert.Len(t, expanded, 9)
+
+	// Check mega entries. require, not assert: if no mega entries came back
+	// the test must stop here rather than panic on megaEntries[0].
+	megaEntries := filterBySetName(expanded, "mega")
+	require.Len(t, megaEntries, 3)
+	assert.Equal(t, []string{"gemma", "reranker", "voxtral"}, megaEntries[0].Models)
+}
+
+// TestValidateMatrix_MapIDRequired checks that a DSL expression may only use
+// var IDs: "voxtral" is a real model name here but has no var entry.
+func TestValidateMatrix_MapIDRequired(t *testing.T) {
+	cfg := MatrixConfig{
+		Var:  map[string]string{"g": "gemma"},
+		Sets: OrderedSets{{Name: "combo", DSL: "g & voxtral"}},
+	}
+
+	_, err := ValidateMatrix(cfg, makeModels("gemma", "voxtral"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown var ID")
+}
+
+func TestValidateMatrix_InvalidAliasKey(t *testing.T) {
+	models := makeModels("gemma")
+
+	tests := []struct {
+		name   string
+		alias  string
+		errMsg string
+	}{
+		{"too long", "abcdefghi", "alphanumeric and 1-8 characters"},
+		{"has underscore", "a_b", "alphanumeric and 1-8 characters"},
+		{"has hyphen", "a-b", "alphanumeric and 1-8 characters"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			matrix := MatrixConfig{
+				Var:  map[string]string{tt.alias: "gemma"},
+				Sets: OrderedSets{{Name: "s", DSL: tt.alias}},
+			}
+			_, err := ValidateMatrix(matrix, models)
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tt.errMsg)
+		})
+	}
+}
+
+// TestValidateMatrix_AliasReferencesUnknownModel checks that a var pointing
+// at a model missing from the model table is rejected.
+func TestValidateMatrix_AliasReferencesUnknownModel(t *testing.T) {
+	cfg := MatrixConfig{
+		Var:  map[string]string{"x": "nonexistent"},
+		Sets: OrderedSets{{Name: "s", DSL: "x"}},
+	}
+
+	_, err := ValidateMatrix(cfg, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown model")
+}
+
+// TestValidateMatrix_EvictCostInvalid checks the rejected evict_costs shapes:
+// non-positive costs and a cost keyed by an undeclared var ID.
+func TestValidateMatrix_EvictCostInvalid(t *testing.T) {
+	models := makeModels("gemma")
+
+	cases := []struct {
+		name   string
+		costs  map[string]int
+		errMsg string
+	}{
+		{"zero cost", map[string]int{"g": 0}, "positive integer"},
+		{"negative cost", map[string]int{"g": -1}, "positive integer"},
+		{"unknown var ID in evict_costs", map[string]int{"unknown": 5}, "unknown var ID"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cfg := MatrixConfig{
+				Var:        map[string]string{"g": "gemma"},
+				EvictCosts: tc.costs,
+				Sets:       OrderedSets{{Name: "s", DSL: "g"}},
+			}
+			_, err := ValidateMatrix(cfg, models)
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tc.errMsg)
+		})
+	}
+}
+
+// TestValidateMatrix_CycleDetection checks that two sets referencing each
+// other are reported as a circular reference.
+func TestValidateMatrix_CycleDetection(t *testing.T) {
+	cyclic := OrderedSets{
+		{Name: "a", DSL: "+b"},
+		{Name: "b", DSL: "+a"},
+	}
+	cfg := MatrixConfig{
+		Var:  map[string]string{"g": "gemma"},
+		Sets: cyclic,
+	}
+
+	_, err := ValidateMatrix(cfg, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "circular reference")
+}
+
+// TestValidateMatrix_UndefinedRefTarget checks that a "+name" reference to a
+// set that was never declared is rejected.
+func TestValidateMatrix_UndefinedRefTarget(t *testing.T) {
+	cfg := MatrixConfig{
+		Var:  map[string]string{"g": "gemma"},
+		Sets: OrderedSets{{Name: "a", DSL: "+nonexistent"}},
+	}
+
+	_, err := ValidateMatrix(cfg, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "references undefined set")
+}
+
+func TestValidateMatrix_NoSets(t *testing.T) {
+	_, err := ValidateMatrix(MatrixConfig{}, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "at least one set")
+}
+
+// TestValidateMatrix_UnknownMapIDInDSL checks that an identifier in a set's
+// DSL that is not declared under Var is rejected.
+func TestValidateMatrix_UnknownMapIDInDSL(t *testing.T) {
+	cfg := MatrixConfig{
+		Var:  map[string]string{"g": "gemma"},
+		Sets: OrderedSets{{Name: "s", DSL: "g & nonexistent"}},
+	}
+
+	_, err := ValidateMatrix(cfg, makeModels("gemma"))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown var ID")
+}
+
+// TestValidateMatrix_ResolvedEvictCosts checks that costs configured per var
+// ID come back keyed by the model name each var points at.
+func TestValidateMatrix_ResolvedEvictCosts(t *testing.T) {
+	vars := map[string]string{
+		"g": "gemma",
+		"L": "llama70B",
+	}
+	costsByVar := map[string]int{
+		"L": 30,
+		"g": 5,
+	}
+	mc := &MatrixConfig{Var: vars, EvictCosts: costsByVar}
+
+	resolved := mc.ResolvedEvictCosts()
+	assert.Equal(t, 30, resolved["llama70B"])
+	assert.Equal(t, 5, resolved["gemma"])
+}
+
+func TestValidateMatrix_ConfigXOR(t *testing.T) {
+	// groups and matrix both defined
+	yaml := `
+models:
+  model1:
+    cmd: echo model1
+    proxy: http://localhost:8080
+groups:
+  group1:
+    members:
+      - model1
+matrix:
+  sets:
+    s: "model1"
+`
+	_, err := LoadConfigFromReader(strings.NewReader(yaml))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "cannot use both")
+}
+
+func TestValidateMatrix_ConfigMatrixOnly(t *testing.T) {
+	yaml := `
+models:
+  gemma:
+    cmd: echo gemma
+    proxy: http://localhost:8080
+  qwen:
+    cmd: echo qwen
+    proxy: http://localhost:8081
+matrix:
+  vars:
+    g: gemma
+    q: qwen
+  sets:
+    combo: "g | q"
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(yaml))
+	require.NoError(t, err)
+	assert.NotNil(t, cfg.Matrix)
+	assert.Len(t, cfg.ExpandedSets, 2)
+	// Groups should be empty when matrix is used
+	assert.Empty(t, cfg.Groups)
+}
+
+// filterBySetName returns, in their original order, the entries of sets whose
+// SetName equals name. The result is nil when nothing matches.
+func filterBySetName(sets []ExpandedSet, name string) []ExpandedSet {
+	var matched []ExpandedSet
+	for i := range sets {
+		if sets[i].SetName != name {
+			continue
+		}
+		matched = append(matched, sets[i])
+	}
+	return matched
+}
@@ -3,10 +3,23 @@ package config
 import (
 	"errors"
 	"runtime"
-	"slices"
-	"strings"
 )

+const (
+	// MODEL_CONFIG_DEFAULT_TTL is the value ModelConfig.UnmarshalYAML assigns
+	// to UnloadAfter by default; that assignment is annotated "use GlobalTTL".
+	MODEL_CONFIG_DEFAULT_TTL = -1
+)
+
+// TimeoutsConfig holds timeout settings for proxy connections
+// 0 = no timeout
+//
+// NOTE(review): the unit is not stated in this declaration. The defaults set
+// for it in this package (30/30/10/1/90) are commented as matching
+// http.DefaultTransport, so the values are presumably seconds; confirm where
+// they are consumed.
+type TimeoutsConfig struct {
+	Connect        int `yaml:"connect"`
+	KeepAlive      int `yaml:"keepalive"`
+	ResponseHeader int `yaml:"responseHeader"`
+	TLSHandshake   int `yaml:"tlsHandshake"`
+	ExpectContinue int `yaml:"expectContinue"`
+	IdleConn       int `yaml:"idleConn"`
+}
+
 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
 	CmdStop       string   `yaml:"cmdStop"`
@@ -38,6 +51,12 @@ type ModelConfig struct {

 	// override global setting
 	SendLoadingState *bool `yaml:"sendLoadingState"`
+
+	// Timeout settings for proxy connections
+	Timeouts TimeoutsConfig `yaml:"timeouts"`
+
+	// Copy of HealthCheckTimeout from global config
+	HealthCheckTimeout int `yaml:"healthCheckTimeout"`
 }

 func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -49,12 +68,22 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
 		Aliases:          []string{},
 		Env:              []string{},
 		CheckEndpoint:    "/health",
-		UnloadAfter:      0,
+		UnloadAfter:      MODEL_CONFIG_DEFAULT_TTL, // use GlobalTTL
 		Unlisted:         false,
 		UseModelName:     "",
 		ConcurrencyLimit: 0,
 		Name:             "",
 		Description:      "",
+
+		// matches http.DefaultTransport
+		Timeouts: TimeoutsConfig{
+			Connect:        30,
+			KeepAlive:      30,
+			ResponseHeader: 0,
+			TLSHandshake:   10,
+			ExpectContinue: 1,
+			IdleConn:       90,
+		},
 	}

 	// the default cmdStop to taskkill /f /t /pid ${PID}
@@ -74,16 +103,15 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
 	return SanitizeCommand(m.Cmd)
 }

-// ModelFilters see issue #174
+// ModelFilters embeds Filters and adds legacy support for strip_params field
+// See issue #174
 type ModelFilters struct {
-	StripParams string `yaml:"stripParams"`
+	Filters `yaml:",inline"`
 }

 func (m *ModelFilters) UnmarshalYAML(unmarshal func(interface{}) error) error {
 	type rawModelFilters ModelFilters
-	defaults := rawModelFilters{
-		StripParams: "",
-	}
+	defaults := rawModelFilters{}

 	if err := unmarshal(&defaults); err != nil {
 		return err
@@ -104,25 +132,8 @@ func (m *ModelFilters) UnmarshalYAML(unmarshal func(interface{}) error) error {
 	return nil
 }

+// SanitizedStripParams wraps Filters.SanitizedStripParams for backwards compatibility
+// Returns ([]string, error) to match existing API
 func (f ModelFilters) SanitizedStripParams() ([]string, error) {
-	if f.StripParams == "" {
-		return nil, nil
-	}
-
-	params := strings.Split(f.StripParams, ",")
-	cleaned := make([]string, 0, len(params))
-	seen := make(map[string]bool)
-
-	for _, param := range params {
-		trimmed := strings.TrimSpace(param)
-		if trimmed == "model" || trimmed == "" || seen[trimmed] {
-			continue
-		}
-		seen[trimmed] = true
-		cleaned = append(cleaned, trimmed)
-	}
-
-	// sort cleaned
-	slices.Sort(cleaned)
-	return cleaned, nil
+	return f.Filters.SanitizedStripParams(), nil
 }
@@ -0,0 +1,172 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestConfig_ModelConfigSanitizedCommand(t *testing.T) {
+	config := &ModelConfig{
+		Cmd: `python model1.py \
+    --arg1 value1 \
+    --arg2 value2`,
+	}
+
+	args, err := config.SanitizedCommand()
+	assert.NoError(t, err)
+	assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args)
+}
+
+func TestConfig_ModelFilters(t *testing.T) {
+	content := `
+macros:
+  default_strip: "temperature, top_p"
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      # macros inserted and list is cleaned of duplicates and empty strings
+      stripParams: "model, top_k, top_k, temperature, ${default_strip}, , ,"
+  # check for strip_params (legacy field name) compatibility
+  legacy:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      strip_params: "model, top_k, top_k, temperature, ${default_strip}, , ,"
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	for modelId, modelConfig := range config.Models {
+		t.Run(fmt.Sprintf("Testing macros in filters for model %s", modelId), func(t *testing.T) {
+			assert.Equal(t, "model, top_k, top_k, temperature, temperature, top_p, , ,", modelConfig.Filters.StripParams)
+			sanitized, err := modelConfig.Filters.SanitizedStripParams()
+			if assert.NoError(t, err) {
+				// model has been removed
+				// empty strings have been removed
+				// duplicates have been removed
+				assert.Equal(t, []string{"temperature", "top_k", "top_p"}, sanitized)
+			}
+		})
+	}
+}
+
+// TestConfig_ModelSendLoadingState checks that a per-model sendLoadingState
+// overrides the global value and that a model without one ends up with the
+// global value.
+func TestConfig_ModelSendLoadingState(t *testing.T) {
+	content := `
+sendLoadingState: true
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    sendLoadingState: false
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	assert.True(t, cfg.SendLoadingState)
+
+	// model1 sets the flag explicitly.
+	explicit := cfg.Models["model1"].SendLoadingState
+	if assert.NotNil(t, explicit) {
+		assert.False(t, *explicit)
+	}
+
+	// model2 omits it and is expected to carry the global value.
+	inherited := cfg.Models["model2"].SendLoadingState
+	if assert.NotNil(t, inherited) {
+		assert.True(t, *inherited)
+	}
+}
+
+// TestConfig_SetParamsByIDAutoAlias checks that setParamsByID keys are
+// registered as aliases of the model that declares them.
+func TestConfig_SetParamsByIDAutoAlias(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:high":
+          reasoning_effort: high
+        "${MODEL_ID}:low":
+          reasoning_effort: low
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	autoAliases := []string{"model1:high", "model1:low"}
+
+	// Keys (other than the model's own ID) should be registered as aliases
+	for _, alias := range autoAliases {
+		realName, found := cfg.RealModelName(alias)
+		assert.True(t, found, alias+" should be an auto-registered alias")
+		assert.Equal(t, "model1", realName)
+	}
+
+	// Auto-aliases should also appear in modelConfig.Aliases
+	declared := cfg.Models["model1"].Aliases
+	for _, alias := range autoAliases {
+		assert.Contains(t, declared, alias)
+	}
+}
+
+func TestConfig_SetParamsByIDAutoAliasConflictWithModelID(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        model2:
+          reasoning_effort: high
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.ErrorContains(t, err, "conflicts with an existing model ID")
+}
+
+func TestConfig_SetParamsByIDAutoAliasConflictWithOtherModel(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "shared-alias":
+          reasoning_effort: high
+  model2:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      setParamsByID:
+        "shared-alias":
+          reasoning_effort: low
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.ErrorContains(t, err, "duplicate alias")
+}
+
+// TestConfig_ModelFiltersWithSetParams checks that stripParams and setParams
+// can be configured together on one model and are both readable afterwards.
+func TestConfig_ModelFiltersWithSetParams(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      stripParams: "top_k"
+      setParams:
+        temperature: 0.7
+        top_p: 0.9
+        stop:
+          - "<|end|>"
+          - "<|stop|>"
+`
+	cfg, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	filters := cfg.Models["model1"].Filters
+
+	// Check stripParams
+	strip, err := filters.SanitizedStripParams()
+	assert.NoError(t, err)
+	assert.Equal(t, []string{"top_k"}, strip)
+
+	// Check setParams: the keys are expected in sorted order
+	params, paramKeys := filters.SanitizedSetParams()
+	assert.NotNil(t, params)
+	assert.Equal(t, []string{"stop", "temperature", "top_p"}, paramKeys)
+	assert.Equal(t, 0.7, params["temperature"])
+	assert.Equal(t, 0.9, params["top_p"])
+}
@@ -0,0 +1,63 @@
+package config
+
+import (
+	"fmt"
+	"net/url"
+)
+
+// PeerDictionaryConfig is a set of peer configurations keyed by string
+// (presumably the peer's name; confirm against the loader).
+type PeerDictionaryConfig map[string]PeerConfig
+
+// PeerConfig describes one peer. It is populated and validated by
+// PeerConfig.UnmarshalYAML.
+type PeerConfig struct {
+	// Proxy is the peer's URL as written in YAML; it must not be empty.
+	Proxy    string   `yaml:"proxy"`
+	// ProxyURL is Proxy parsed with url.Parse. It is never read from YAML.
+	ProxyURL *url.URL `yaml:"-"`
+	// ApiKey is optional; it defaults to the empty string.
+	ApiKey   string   `yaml:"apiKey"`
+	// Models must contain at least one entry.
+	Models   []string `yaml:"models"`
+	Filters  Filters  `yaml:"filters"`
+
+	// Timeout settings for proxy connections
+	Timeouts TimeoutsConfig `yaml:"timeouts"`
+}
+
+// UnmarshalYAML decodes a peer entry on top of default values and validates
+// it. It returns an error when proxy is empty, when proxy cannot be parsed as
+// a URL, or when models is empty. On success the parsed URL is stored in
+// ProxyURL.
+func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// Method-less alias so that unmarshal does not call back into this function.
+	type rawPeerConfig PeerConfig
+
+	// Proxy, ApiKey and Filters start at their zero values.
+	raw := rawPeerConfig{
+		Models: []string{},
+
+		// mostly matches http.DefaultTransport but with a 60s ResponseHeader timeout
+		// to match the pre PR #619 functionality
+		Timeouts: TimeoutsConfig{
+			Connect:        30,
+			KeepAlive:      30,
+			ResponseHeader: 60,
+			TLSHandshake:   10,
+			ExpectContinue: 1,
+			IdleConn:       90,
+		},
+	}
+
+	if err := unmarshal(&raw); err != nil {
+		return err
+	}
+
+	// proxy is mandatory
+	if raw.Proxy == "" {
+		return fmt.Errorf("proxy is required")
+	}
+
+	// proxy must parse as a URL; keep the parsed form for later use
+	target, parseErr := url.Parse(raw.Proxy)
+	if parseErr != nil {
+		return fmt.Errorf("invalid peer proxy URL (%s): %w", raw.Proxy, parseErr)
+	}
+	raw.ProxyURL = target
+
+	// at least one model must be listed
+	if len(raw.Models) == 0 {
+		return fmt.Errorf("peer models can not be empty")
+	}
+
+	*c = PeerConfig(raw)
+	return nil
+}
@@ -0,0 +1,209 @@
+package config
+
+import (
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+// TestPeerConfig_UnmarshalYAML runs valid and invalid peer documents through
+// yaml.Unmarshal. An empty wantErr means the document must decode cleanly;
+// otherwise the error text must contain wantErr.
+func TestPeerConfig_UnmarshalYAML(t *testing.T) {
+	cases := []struct {
+		name    string
+		input   string
+		wantErr string
+	}{
+		{
+			name: "valid config",
+			input: `
+proxy: http://192.168.1.23
+models:
+  - model_a
+  - model_b
+`,
+		},
+		{
+			name: "valid config with apiKey",
+			input: `
+proxy: https://openrouter.ai/api
+apiKey: sk-test-key
+models:
+  - meta-llama/llama-3.1-8b-instruct
+`,
+		},
+		{
+			name: "missing proxy",
+			input: `
+models:
+  - model_a
+`,
+			wantErr: "proxy is required",
+		},
+		{
+			name: "empty proxy",
+			input: `
+proxy: ""
+models:
+  - model_a
+`,
+			wantErr: "proxy is required",
+		},
+		{
+			name: "invalid proxy URL",
+			input: `
+proxy: "://invalid"
+models:
+  - model_a
+`,
+			wantErr: "invalid peer proxy URL",
+		},
+		{
+			name: "missing models",
+			input: `
+proxy: http://localhost:8080
+`,
+			wantErr: "peer models can not be empty",
+		},
+		{
+			name: "empty models",
+			input: `
+proxy: http://localhost:8080
+models: []
+`,
+			wantErr: "peer models can not be empty",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			var peer PeerConfig
+			err := yaml.Unmarshal([]byte(tc.input), &peer)
+
+			switch {
+			case tc.wantErr == "" && err != nil:
+				t.Errorf("unexpected error: %v", err)
+			case tc.wantErr == "":
+				// success expected and observed
+			case err == nil:
+				t.Errorf("expected error containing %q, got nil", tc.wantErr)
+			case !contains(err.Error(), tc.wantErr):
+				t.Errorf("expected error containing %q, got %q", tc.wantErr, err.Error())
+			}
+		})
+	}
+}
+
+// TestPeerConfig_ProxyURL checks that decoding a peer stores the parsed proxy
+// URL and that its scheme, host and path match the configured string.
+func TestPeerConfig_ProxyURL(t *testing.T) {
+	yamlData := `
+proxy: http://192.168.1.23:8080/api
+apiKey: sk-test
+models:
+  - model_a
+`
+	var peer PeerConfig
+	if err := yaml.Unmarshal([]byte(yamlData), &peer); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	parsed := peer.ProxyURL
+	if parsed == nil {
+		t.Fatal("ProxyURL should not be nil")
+	}
+
+	// Checked in the same order as the URL components are reported.
+	checks := []struct{ part, want, got string }{
+		{"host", "192.168.1.23:8080", parsed.Host},
+		{"scheme", "http", parsed.Scheme},
+		{"path", "/api", parsed.Path},
+	}
+	for _, c := range checks {
+		if c.got != c.want {
+			t.Errorf("expected %s %q, got %q", c.part, c.want, c.got)
+		}
+	}
+}
+
+// contains reports whether substr occurs anywhere in s.
+func contains(s, substr string) bool {
+	// A needle longer than the haystack can never match.
+	if len(substr) > len(s) {
+		return false
+	}
+	return searchSubstring(s, substr)
+}
+
+// searchSubstring reports whether substr occurs in s by comparing substr
+// against every window of the same length, left to right.
+func searchSubstring(s, substr string) bool {
+	width := len(substr)
+	for start := 0; start+width <= len(s); start++ {
+		if s[start:start+width] == substr {
+			return true
+		}
+	}
+	return false
+}
+
+// TestPeerConfig_WithFilters checks that filters.setParams is decoded for a
+// peer, including a nested mapping value.
+func TestPeerConfig_WithFilters(t *testing.T) {
+	yamlData := `
+proxy: https://openrouter.ai/api
+apiKey: sk-test
+models:
+  - model_a
+filters:
+  setParams:
+    temperature: 0.7
+    provider:
+      data_collection: deny
+`
+	var peer PeerConfig
+	if err := yaml.Unmarshal([]byte(yamlData), &peer); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	params := peer.Filters.SetParams
+	if params == nil {
+		t.Fatal("Filters.SetParams should not be nil")
+	}
+
+	if temperature := params["temperature"]; temperature != 0.7 {
+		t.Errorf("expected temperature 0.7, got %v", temperature)
+	}
+
+	// The nested mapping must decode as a string-keyed map.
+	provider, isMap := params["provider"].(map[string]any)
+	if !isMap {
+		t.Fatal("provider should be a map")
+	}
+	if policy := provider["data_collection"]; policy != "deny" {
+		t.Errorf("expected data_collection deny, got %v", policy)
+	}
+}
+
+// TestPeerConfig_WithBothFilters checks that stripParams and setParams can be
+// configured together on one peer and are both readable afterwards.
+func TestPeerConfig_WithBothFilters(t *testing.T) {
+	yamlData := `
+proxy: https://openrouter.ai/api
+apiKey: sk-test
+models:
+  - model_a
+filters:
+  stripParams: "temperature, top_p"
+  setParams:
+    max_tokens: 1000
+`
+	var config PeerConfig
+	err := yaml.Unmarshal([]byte(yamlData), &config)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// Check stripParams. Fatalf, not Errorf: with fewer than two entries the
+	// index expressions below would panic instead of reporting a failure.
+	stripParams := config.Filters.SanitizedStripParams()
+	if len(stripParams) != 2 {
+		t.Fatalf("expected 2 strip params, got %d", len(stripParams))
+	}
+	if stripParams[0] != "temperature" || stripParams[1] != "top_p" {
+		t.Errorf("unexpected strip params: %v", stripParams)
+	}
+
+	// Check setParams
+	if config.Filters.SetParams == nil {
+		t.Fatal("Filters.SetParams should not be nil")
+	}
+	if config.Filters.SetParams["max_tokens"] != 1000 {
+		t.Errorf("expected max_tokens 1000, got %v", config.Filters.SetParams["max_tokens"])
+	}
+}
@@ -0,0 +1,34 @@
+package config
+
+import (
+	"fmt"
+	"time"
+)
+
+// PerformanceConfig holds configuration for system performance monitoring
+type PerformanceConfig struct {
+	// Disabled is read from the "disabled" key and stays false when the key
+	// is absent.
+	Disabled bool          `yaml:"disabled"`
+	// Every is read from the "every" key. UnmarshalYAML defaults it to 5s and
+	// Validate rejects anything shorter than 5s.
+	Every    time.Duration `yaml:"every"`
+}
+
+// UnmarshalYAML decodes the performance section on top of defaults: Every
+// starts at 5s and Disabled at false, and any key present in the YAML
+// overrides its default.
+func (p *PerformanceConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// Method-less alias so that unmarshal does not call back into this function.
+	type rawPerformanceConfig PerformanceConfig
+
+	var raw rawPerformanceConfig
+	raw.Every = 5 * time.Second
+
+	err := unmarshal(&raw)
+	if err != nil {
+		return err
+	}
+
+	*p = PerformanceConfig(raw)
+	return nil
+}
+
+// Validate checks the PerformanceConfig values and returns an error if invalid
+func (p *PerformanceConfig) Validate() error {
+	if p.Every < 5*time.Second {
+		return fmt.Errorf("every must be at least 5s, got %v", p.Every)
+	}
+	return nil
+}
@@ -0,0 +1,98 @@
+package config
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestPerformanceConfig_Defaults(t *testing.T) {
+	content := `
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	// When performance section is missing, defaults should be applied
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 5*time.Second, config.Performance.Every)
+}
+
+// TestPerformanceConfig_CustomValues checks that both performance keys are
+// honoured when set explicitly.
+//
+// The fixture uses the "disabled" key because that is the only boolean key
+// PerformanceConfig declares; an "enable" key matches no field and would
+// leave the Disabled assertion untested.
+func TestPerformanceConfig_CustomValues(t *testing.T) {
+	content := `
+performance:
+  disabled: false
+  every: 30s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 30*time.Second, config.Performance.Every)
+}
+
+func TestPerformanceConfig_Disabled(t *testing.T) {
+	content := `
+performance:
+  disabled: true
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.True(t, config.Performance.Disabled)
+	// Duration defaults should still apply
+	assert.Equal(t, 5*time.Second, config.Performance.Every)
+}
+
+func TestPerformanceConfig_PartialValues(t *testing.T) {
+	content := `
+performance:
+  every: 10s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	// enable should default to true
+	assert.False(t, config.Performance.Disabled)
+	assert.Equal(t, 10*time.Second, config.Performance.Every)
+}
+
+// TestPerformanceConfig_InvalidEvery checks that an interval below the 5s
+// minimum is rejected when the config is loaded.
+func TestPerformanceConfig_InvalidEvery(t *testing.T) {
+	content := `
+performance:
+  every: 4s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	_, err := LoadConfigFromReader(strings.NewReader(content))
+	// ErrorContains reports a failure on a nil error; calling err.Error()
+	// after a non-fatal assert.Error would panic instead.
+	assert.ErrorContains(t, err, "every must be at least 5s")
+}
+
+func TestPerformanceConfig_ComplexDurations(t *testing.T) {
+	content := `
+performance:
+  every: 1m30s
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+
+	assert.Equal(t, 90*time.Second, config.Performance.Every)
+}
@@ -1,54 +1,54 @@
-// Copyright (c) Roman Atachiants and contributore. All rights reserved.
-// Licensed under the MIT license. See LICENSE file in the project root for detaile.
-
-package event
-
-import (
-	"sync"
-	"sync/atomic"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-/*
-cpu: 13th Gen Intel(R) Core(TM) i7-13700K
-BenchmarkSubcribeConcurrent-24    	 1826686	       606.3 ns/op	    1648 B/op	       5 allocs/op
-*/
-func BenchmarkSubscribeConcurrent(b *testing.B) {
-	d := NewDispatcher()
-	b.ReportAllocs()
-	b.ResetTimer()
-
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			unsub := Subscribe(d, func(ev MyEvent1) {})
-			unsub()
-		}
-	})
-}
-
-func TestDefaultPublish(t *testing.T) {
-	var wg sync.WaitGroup
-
-	// Subscribe
-	var count int64
-	defer On(func(ev MyEvent1) {
-		atomic.AddInt64(&count, 1)
-		wg.Done()
-	})()
-
-	defer OnType(TypeEvent1, func(ev MyEvent1) {
-		atomic.AddInt64(&count, 1)
-		wg.Done()
-	})()
-
-	// Publish
-	wg.Add(4)
-	Emit(MyEvent1{})
-	Emit(MyEvent1{})
-
-	// Wait and check
-	wg.Wait()
-	assert.Equal(t, int64(4), count)
-}
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+/*
+cpu: 13th Gen Intel(R) Core(TM) i7-13700K
+BenchmarkSubcribeConcurrent-24    	 1826686	       606.3 ns/op	    1648 B/op	       5 allocs/op
+*/
+// BenchmarkSubscribeConcurrent measures a Subscribe call immediately followed
+// by the function it returns, issued from the goroutines started by
+// b.RunParallel against a single shared dispatcher.
+func BenchmarkSubscribeConcurrent(b *testing.B) {
+	d := NewDispatcher()
+	b.ReportAllocs()
+	// Reset after setup so that creating the dispatcher is not timed.
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			unsub := Subscribe(d, func(ev MyEvent1) {})
+			unsub()
+		}
+	})
+}
+
+// TestDefaultPublish exercises the package-level On, OnType and Emit helpers,
+// which take no dispatcher argument. Two handlers are registered and two
+// events emitted, so four handler invocations are expected in total.
+func TestDefaultPublish(t *testing.T) {
+	var wg sync.WaitGroup
+
+	// Subscribe
+	var count int64
+	// On returns a function; deferring its call runs it when the test returns
+	// (presumably an unsubscribe, as with Subscribe elsewhere in this package).
+	defer On(func(ev MyEvent1) {
+		atomic.AddInt64(&count, 1)
+		wg.Done()
+	})()
+
+	defer OnType(TypeEvent1, func(ev MyEvent1) {
+		atomic.AddInt64(&count, 1)
+		wg.Done()
+	})()
+
+	// Publish
+	// 2 handlers x 2 events = 4 Done calls; Add must come before the first Emit.
+	wg.Add(4)
+	Emit(MyEvent1{})
+	Emit(MyEvent1{})
+
+	// Wait and check
+	wg.Wait()
+	assert.Equal(t, int64(4), count)
+}
@@ -1,324 +1,324 @@
-// Copyright (c) Roman Atachiants and contributore. All rights reserved.
-// Licensed under the MIT license. See LICENSE file in the project root for detaile.
-
-package event
-
-import (
-	"fmt"
-	"sync"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestPublish(t *testing.T) {
-	d := NewDispatcher()
-	var wg sync.WaitGroup
-
-	// Subscribe, must be received in order
-	var count int64
-	defer Subscribe(d, func(ev MyEvent1) {
-		assert.Equal(t, int(atomic.AddInt64(&count, 1)), ev.Number)
-		wg.Done()
-	})()
-
-	// Publish
-	wg.Add(3)
-	Publish(d, MyEvent1{Number: 1})
-	Publish(d, MyEvent1{Number: 2})
-	Publish(d, MyEvent1{Number: 3})
-
-	// Wait and check
-	wg.Wait()
-	assert.Equal(t, int64(3), count)
-}
-
-func TestUnsubscribe(t *testing.T) {
-	d := NewDispatcher()
-	assert.Equal(t, 0, d.count(TypeEvent1))
-	unsubscribe := Subscribe(d, func(ev MyEvent1) {
-		// Nothing
-	})
-
-	assert.Equal(t, 1, d.count(TypeEvent1))
-	unsubscribe()
-	assert.Equal(t, 0, d.count(TypeEvent1))
-}
-
-func TestConcurrent(t *testing.T) {
-	const max = 1000000
-	var count int64
-	var wg sync.WaitGroup
-	wg.Add(1)
-
-	d := NewDispatcher()
-	defer Subscribe(d, func(ev MyEvent1) {
-		if current := atomic.AddInt64(&count, 1); current == max {
-			wg.Done()
-		}
-	})()
-
-	// Asynchronously publish
-	go func() {
-		for i := 0; i < max; i++ {
-			Publish(d, MyEvent1{})
-		}
-	}()
-
-	defer Subscribe(d, func(ev MyEvent1) {
-		// Subscriber that does nothing
-	})()
-
-	wg.Wait()
-	assert.Equal(t, max, int(count))
-}
-
-func TestSubscribeDifferentType(t *testing.T) {
-	d := NewDispatcher()
-	assert.Panics(t, func() {
-		SubscribeTo(d, TypeEvent1, func(ev MyEvent1) {})
-		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
-	})
-}
-
-func TestPublishDifferentType(t *testing.T) {
-	d := NewDispatcher()
-	assert.Panics(t, func() {
-		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
-		Publish(d, MyEvent1{})
-	})
-}
-
-func TestCloseDispatcher(t *testing.T) {
-	d := NewDispatcher()
-	defer SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})()
-
-	assert.NoError(t, d.Close())
-	assert.Panics(t, func() {
-		SubscribeTo(d, TypeEvent1, func(ev MyEvent2) {})
-	})
-}
-
-func TestMatrix(t *testing.T) {
-	const amount = 1000
-	for _, subs := range []int{1, 10, 100} {
-		for _, topics := range []int{1, 10} {
-			expected := subs * topics * amount
-			t.Run(fmt.Sprintf("%dx%d", topics, subs), func(t *testing.T) {
-				var count atomic.Int64
-				var wg sync.WaitGroup
-				wg.Add(expected)
-
-				d := NewDispatcher()
-				for i := 0; i < subs; i++ {
-					for id := 0; id < topics; id++ {
-						defer SubscribeTo(d, uint32(id), func(ev MyEvent3) {
-							count.Add(1)
-							wg.Done()
-						})()
-					}
-				}
-
-				for n := 0; n < amount; n++ {
-					for id := 0; id < topics; id++ {
-						go Publish(d, MyEvent3{ID: id})
-					}
-				}
-
-				wg.Wait()
-				assert.Equal(t, expected, int(count.Load()))
-			})
-		}
-	}
-}
-
-func TestConcurrentSubscriptionRace(t *testing.T) {
-	// This test specifically targets the race condition that occurs when multiple
-	// goroutines try to subscribe to different event types simultaneously.
-	// Without the CAS loop, subscriptions could be lost due to registry corruption.
-
-	const numGoroutines = 100
-	const numEventTypes = 50
-
-	d := NewDispatcher()
-	defer d.Close()
-
-	var wg sync.WaitGroup
-	var receivedCount int64
-	var subscribedTypes sync.Map // Thread-safe map
-
-	wg.Add(numGoroutines)
-
-	// Start multiple goroutines that subscribe to different event types concurrently
-	for i := 0; i < numGoroutines; i++ {
-		go func(goroutineID int) {
-			defer wg.Done()
-
-			// Each goroutine subscribes to a unique event type
-			eventType := uint32(goroutineID%numEventTypes + 1000) // Offset to avoid collision with other tests
-
-			// Subscribe to the event type
-			SubscribeTo(d, eventType, func(ev MyEvent3) {
-				atomic.AddInt64(&receivedCount, 1)
-			})
-
-			// Record that this type was subscribed
-			subscribedTypes.Store(eventType, true)
-		}(i)
-	}
-
-	// Wait for all subscriptions to complete
-	wg.Wait()
-
-	// Count the number of unique event types subscribed
-	expectedTypes := 0
-	subscribedTypes.Range(func(key, value interface{}) bool {
-		expectedTypes++
-		return true
-	})
-
-	// Small delay to ensure all subscriptions are fully processed
-	time.Sleep(10 * time.Millisecond)
-
-	// Publish events to each subscribed type
-	subscribedTypes.Range(func(key, value interface{}) bool {
-		eventType := key.(uint32)
-		Publish(d, MyEvent3{ID: int(eventType)})
-		return true
-	})
-
-	// Wait for all events to be processed
-	time.Sleep(50 * time.Millisecond)
-
-	// Verify that we received at least the expected number of events
-	// (there might be more if multiple goroutines subscribed to the same event type)
-	received := atomic.LoadInt64(&receivedCount)
-	assert.GreaterOrEqual(t, int(received), expectedTypes,
-		"Should have received at least %d events, got %d", expectedTypes, received)
-
-	// Verify that we have the expected number of unique event types
-	assert.Equal(t, numEventTypes, expectedTypes,
-		"Should have exactly %d unique event types", numEventTypes)
-}
-
-func TestConcurrentHandlerRegistration(t *testing.T) {
-	const numGoroutines = 100
-
-	// Test concurrent subscriptions to the same event type
-	t.Run("SameEventType", func(t *testing.T) {
-		d := NewDispatcher()
-		var handlerCount int64
-		var wg sync.WaitGroup
-
-		// Start multiple goroutines subscribing to the same event type (0x1)
-		for i := 0; i < numGoroutines; i++ {
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-				SubscribeTo(d, uint32(0x1), func(ev MyEvent1) {
-					atomic.AddInt64(&handlerCount, 1)
-				})
-			}()
-		}
-
-		wg.Wait()
-
-		// Verify all handlers were registered by publishing an event
-		atomic.StoreInt64(&handlerCount, 0)
-		Publish(d, MyEvent1{})
-
-		// Small delay to ensure all handlers have executed
-		time.Sleep(10 * time.Millisecond)
-
-		assert.Equal(t, int64(numGoroutines), atomic.LoadInt64(&handlerCount),
-			"Not all handlers were registered due to race condition")
-	})
-
-	// Test concurrent subscriptions to different event types
-	t.Run("DifferentEventTypes", func(t *testing.T) {
-		d := NewDispatcher()
-		var wg sync.WaitGroup
-		receivedEvents := make(map[uint32]*int64)
-
-		// Create multiple event types and subscribe concurrently
-		for i := 0; i < numGoroutines; i++ {
-			eventType := uint32(100 + i)
-			counter := new(int64)
-			receivedEvents[eventType] = counter
-
-			wg.Add(1)
-			go func(et uint32, cnt *int64) {
-				defer wg.Done()
-				SubscribeTo(d, et, func(ev MyEvent3) {
-					atomic.AddInt64(cnt, 1)
-				})
-			}(eventType, counter)
-		}
-
-		wg.Wait()
-
-		// Publish events to all types
-		for eventType := uint32(100); eventType < uint32(100+numGoroutines); eventType++ {
-			Publish(d, MyEvent3{ID: int(eventType)})
-		}
-
-		// Small delay to ensure all handlers have executed
-		time.Sleep(10 * time.Millisecond)
-
-		// Verify all event types received their events
-		for eventType, counter := range receivedEvents {
-			assert.Equal(t, int64(1), atomic.LoadInt64(counter),
-				"Event type %d did not receive its event", eventType)
-		}
-	})
-}
-
-func TestBackpressure(t *testing.T) {
-	d := NewDispatcher()
-	d.maxQueue = 10
-
-	var processedCount int64
-	unsub := SubscribeTo(d, uint32(0x200), func(ev MyEvent3) {
-		atomic.AddInt64(&processedCount, 1)
-	})
-	defer unsub()
-
-	const eventsToPublish = 1000
-	for i := 0; i < eventsToPublish; i++ {
-		Publish(d, MyEvent3{ID: 0x200})
-	}
-
-	time.Sleep(100 * time.Millisecond)
-
-	// Verify all events were eventually processed
-	finalProcessed := atomic.LoadInt64(&processedCount)
-	assert.Equal(t, int64(eventsToPublish), finalProcessed)
-	t.Logf("Events processed: %d/%d", finalProcessed, eventsToPublish)
-}
-
-// ------------------------------------- Test Events -------------------------------------
-
-const (
-	TypeEvent1 = 0x1
-	TypeEvent2 = 0x2
-)
-
-type MyEvent1 struct {
-	Number int
-}
-
-func (t MyEvent1) Type() uint32 { return TypeEvent1 }
-
-type MyEvent2 struct {
-	Text string
-}
-
-func (t MyEvent2) Type() uint32 { return TypeEvent2 }
-
-type MyEvent3 struct {
-	ID int
-}
-
-func (t MyEvent3) Type() uint32 { return uint32(t.ID) }
+// Copyright (c) Roman Atachiants and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for details.
+
+package event
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestPublish sends three numbered events to a single subscriber and checks
+// that the n-th handled event carries Number == n.
+func TestPublish(t *testing.T) {
+	dispatcher := NewDispatcher()
+
+	var (
+		pending sync.WaitGroup
+		handled int64
+	)
+
+	cancel := Subscribe(dispatcher, func(ev MyEvent1) {
+		position := atomic.AddInt64(&handled, 1)
+		assert.Equal(t, int(position), ev.Number)
+		pending.Done()
+	})
+	defer cancel()
+
+	const events = 3
+	pending.Add(events)
+	for number := 1; number <= events; number++ {
+		Publish(dispatcher, MyEvent1{Number: number})
+	}
+
+	// Block until the handler has run once per published event.
+	pending.Wait()
+	assert.Equal(t, int64(3), handled)
+}
+
+// TestUnsubscribe checks that the dispatcher's count for TypeEvent1 goes from
+// zero to one on Subscribe and back to zero after the returned cancel function
+// is called.
+func TestUnsubscribe(t *testing.T) {
+	dispatcher := NewDispatcher()
+	assert.Equal(t, 0, dispatcher.count(TypeEvent1))
+
+	cancel := Subscribe(dispatcher, func(ev MyEvent1) {})
+	assert.Equal(t, 1, dispatcher.count(TypeEvent1))
+
+	cancel()
+	assert.Equal(t, 0, dispatcher.count(TypeEvent1))
+}
+
+// TestConcurrent publishes one million events from a background goroutine
+// while a second, no-op subscriber is attached, and checks that the counting
+// subscriber handles every event.
+func TestConcurrent(t *testing.T) {
+	const total = 1000000
+
+	var (
+		handled  int64
+		finished sync.WaitGroup
+	)
+	finished.Add(1)
+
+	dispatcher := NewDispatcher()
+	cancelCounter := Subscribe(dispatcher, func(ev MyEvent1) {
+		// Release the waiter once the last expected event has been counted.
+		if atomic.AddInt64(&handled, 1) == total {
+			finished.Done()
+		}
+	})
+	defer cancelCounter()
+
+	// Publish in the background so the subscription below overlaps with it.
+	go func() {
+		for sent := 0; sent < total; sent++ {
+			Publish(dispatcher, MyEvent1{})
+		}
+	}()
+
+	cancelIdle := Subscribe(dispatcher, func(ev MyEvent1) {})
+	defer cancelIdle()
+
+	finished.Wait()
+	assert.Equal(t, total, int(handled))
+}
+
+// TestSubscribeDifferentType expects a panic when TypeEvent1 is subscribed
+// first with a MyEvent1 handler and then with a MyEvent2 handler.
+func TestSubscribeDifferentType(t *testing.T) {
+	dispatcher := NewDispatcher()
+	mixedHandlers := func() {
+		SubscribeTo(dispatcher, TypeEvent1, func(ev MyEvent1) {})
+		SubscribeTo(dispatcher, TypeEvent1, func(ev MyEvent2) {})
+	}
+
+	assert.Panics(t, mixedHandlers)
+}
+
+// TestPublishDifferentType expects a panic from subscribing a MyEvent2 handler
+// under TypeEvent1 and then publishing a MyEvent1 (whose Type is TypeEvent1).
+func TestPublishDifferentType(t *testing.T) {
+	dispatcher := NewDispatcher()
+	mismatchedPublish := func() {
+		SubscribeTo(dispatcher, TypeEvent1, func(ev MyEvent2) {})
+		Publish(dispatcher, MyEvent1{})
+	}
+
+	assert.Panics(t, mismatchedPublish)
+}
+
+// TestCloseDispatcher checks that Close returns no error and that subscribing
+// to the dispatcher afterwards panics.
+func TestCloseDispatcher(t *testing.T) {
+	dispatcher := NewDispatcher()
+	cancel := SubscribeTo(dispatcher, TypeEvent1, func(ev MyEvent2) {})
+	defer cancel()
+
+	assert.NoError(t, dispatcher.Close())
+
+	subscribeAfterClose := func() {
+		SubscribeTo(dispatcher, TypeEvent1, func(ev MyEvent2) {})
+	}
+	assert.Panics(t, subscribeAfterClose)
+}
+
+// TestMatrix runs one subtest per (topics, subscribers) combination. Each
+// subtest attaches `subs` handlers to every topic, publishes 1000 events per
+// topic from separate goroutines, and expects subs*topics*1000 handler calls.
+func TestMatrix(t *testing.T) {
+	const perTopic = 1000
+	subscriberCounts := []int{1, 10, 100}
+	topicCounts := []int{1, 10}
+
+	for _, subs := range subscriberCounts {
+		for _, topics := range topicCounts {
+			want := subs * topics * perTopic
+			name := fmt.Sprintf("%dx%d", topics, subs)
+
+			t.Run(name, func(t *testing.T) {
+				var (
+					delivered atomic.Int64
+					pending   sync.WaitGroup
+				)
+				pending.Add(want)
+
+				dispatcher := NewDispatcher()
+				for s := 0; s < subs; s++ {
+					for topic := 0; topic < topics; topic++ {
+						cancel := SubscribeTo(dispatcher, uint32(topic), func(ev MyEvent3) {
+							delivered.Add(1)
+							pending.Done()
+						})
+						// Deferred to the end of the subtest, after all deliveries.
+						defer cancel()
+					}
+				}
+
+				// Every publish call runs on its own goroutine.
+				for round := 0; round < perTopic; round++ {
+					for topic := 0; topic < topics; topic++ {
+						go Publish(dispatcher, MyEvent3{ID: topic})
+					}
+				}
+
+				pending.Wait()
+				assert.Equal(t, want, int(delivered.Load()))
+			})
+		}
+	}
+}
+
+// TestConcurrentSubscriptionRace subscribes from numGoroutines goroutines at
+// once, spread over numEventTypes event type IDs, then publishes one event per
+// subscribed type and checks that at least one delivery per type was counted.
+// Waiting for delivery is done with fixed sleeps, so the test is timing-based.
+func TestConcurrentSubscriptionRace(t *testing.T) {
+	// This test specifically targets the race condition that occurs when multiple
+	// goroutines try to subscribe to different event types simultaneously.
+	// Without the CAS loop, subscriptions could be lost due to registry corruption.
+
+	const numGoroutines = 100
+	const numEventTypes = 50
+
+	d := NewDispatcher()
+	defer d.Close()
+
+	var wg sync.WaitGroup
+	var receivedCount int64
+	var subscribedTypes sync.Map // Thread-safe map
+
+	wg.Add(numGoroutines)
+
+	// Start multiple goroutines that subscribe to different event types concurrently
+	for i := 0; i < numGoroutines; i++ {
+		go func(goroutineID int) {
+			defer wg.Done()
+
+			// Event type IDs repeat every numEventTypes goroutines, so with the
+			// constants above each ID is subscribed by two goroutines.
+			eventType := uint32(goroutineID%numEventTypes + 1000) // Offset to avoid collision with other tests
+
+			// Subscribe to the event type
+			SubscribeTo(d, eventType, func(ev MyEvent3) {
+				atomic.AddInt64(&receivedCount, 1)
+			})
+
+			// Record that this type was subscribed
+			subscribedTypes.Store(eventType, true)
+		}(i)
+	}
+
+	// Wait for all subscriptions to complete
+	wg.Wait()
+
+	// Count the number of unique event types subscribed
+	expectedTypes := 0
+	subscribedTypes.Range(func(key, value interface{}) bool {
+		expectedTypes++
+		return true
+	})
+
+	// Small delay to ensure all subscriptions are fully processed
+	time.Sleep(10 * time.Millisecond)
+
+	// Publish events to each subscribed type
+	subscribedTypes.Range(func(key, value interface{}) bool {
+		eventType := key.(uint32)
+		Publish(d, MyEvent3{ID: int(eventType)})
+		return true
+	})
+
+	// Wait for all events to be processed
+	time.Sleep(50 * time.Millisecond)
+
+	// Verify that we received at least the expected number of events
+	// (there might be more if multiple goroutines subscribed to the same event type)
+	received := atomic.LoadInt64(&receivedCount)
+	assert.GreaterOrEqual(t, int(received), expectedTypes,
+		"Should have received at least %d events, got %d", expectedTypes, received)
+
+	// Verify that we have the expected number of unique event types
+	assert.Equal(t, numEventTypes, expectedTypes,
+		"Should have exactly %d unique event types", numEventTypes)
+}
+
+// TestConcurrentHandlerRegistration checks that handlers registered from many
+// goroutines at once are all retained, both when they share one event type and
+// when every goroutine uses its own event type.
+//
+// Delivery is awaited by polling with a deadline instead of a fixed sleep, so a
+// slow or heavily loaded machine does not turn late delivery into a failure.
+func TestConcurrentHandlerRegistration(t *testing.T) {
+	const numGoroutines = 100
+
+	// Upper bound on how long to wait for delivery, and how often to re-check.
+	const (
+		deliveryTimeout = 2 * time.Second
+		pollInterval    = time.Millisecond
+	)
+
+	// Test concurrent subscriptions to the same event type
+	t.Run("SameEventType", func(t *testing.T) {
+		d := NewDispatcher()
+		var handlerCount int64
+		var wg sync.WaitGroup
+
+		// Start multiple goroutines subscribing to the same event type (0x1)
+		for i := 0; i < numGoroutines; i++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				SubscribeTo(d, uint32(0x1), func(ev MyEvent1) {
+					atomic.AddInt64(&handlerCount, 1)
+				})
+			}()
+		}
+
+		wg.Wait()
+
+		// Verify all handlers were registered by publishing an event
+		atomic.StoreInt64(&handlerCount, 0)
+		Publish(d, MyEvent1{})
+
+		// Wait until every handler has run, or give up at the deadline.
+		assert.Eventually(t, func() bool {
+			return atomic.LoadInt64(&handlerCount) == int64(numGoroutines)
+		}, deliveryTimeout, pollInterval,
+			"Timed out waiting for all handlers to run")
+
+		// Exact-count check; also reports the observed count on failure.
+		assert.Equal(t, int64(numGoroutines), atomic.LoadInt64(&handlerCount),
+			"Not all handlers were registered due to race condition")
+	})
+
+	// Test concurrent subscriptions to different event types
+	t.Run("DifferentEventTypes", func(t *testing.T) {
+		d := NewDispatcher()
+		var wg sync.WaitGroup
+		receivedEvents := make(map[uint32]*int64)
+
+		// Create multiple event types and subscribe concurrently
+		for i := 0; i < numGoroutines; i++ {
+			eventType := uint32(100 + i)
+			counter := new(int64)
+			receivedEvents[eventType] = counter
+
+			wg.Add(1)
+			go func(et uint32, cnt *int64) {
+				defer wg.Done()
+				SubscribeTo(d, et, func(ev MyEvent3) {
+					atomic.AddInt64(cnt, 1)
+				})
+			}(eventType, counter)
+		}
+
+		wg.Wait()
+
+		// Publish events to all types
+		for eventType := uint32(100); eventType < uint32(100+numGoroutines); eventType++ {
+			Publish(d, MyEvent3{ID: int(eventType)})
+		}
+
+		// Wait until every event type has counted its event, or give up at the
+		// deadline. The map is only read from here on, so no locking is needed.
+		assert.Eventually(t, func() bool {
+			for _, counter := range receivedEvents {
+				if atomic.LoadInt64(counter) != 1 {
+					return false
+				}
+			}
+			return true
+		}, deliveryTimeout, pollInterval,
+			"Timed out waiting for every event type to receive its event")
+
+		// Verify all event types received their events
+		for eventType, counter := range receivedEvents {
+			assert.Equal(t, int64(1), atomic.LoadInt64(counter),
+				"Event type %d did not receive its event", eventType)
+		}
+	})
+}
+
+// TestBackpressure publishes 1000 events through a dispatcher whose maxQueue is
+// set to 10 and checks that the single subscriber processes every one of them.
+//
+// Processing is awaited by polling with a deadline instead of a fixed sleep, so
+// a slow or heavily loaded machine does not turn late delivery into a failure.
+func TestBackpressure(t *testing.T) {
+	d := NewDispatcher()
+	d.maxQueue = 10
+
+	var processedCount int64
+	unsub := SubscribeTo(d, uint32(0x200), func(ev MyEvent3) {
+		atomic.AddInt64(&processedCount, 1)
+	})
+	defer unsub()
+
+	const eventsToPublish = 1000
+	for i := 0; i < eventsToPublish; i++ {
+		Publish(d, MyEvent3{ID: 0x200})
+	}
+
+	// Wait until every published event has been handled, or give up at the deadline.
+	assert.Eventually(t, func() bool {
+		return atomic.LoadInt64(&processedCount) == int64(eventsToPublish)
+	}, 2*time.Second, time.Millisecond,
+		"Timed out waiting for all published events to be processed")
+
+	// Verify all events were eventually processed
+	finalProcessed := atomic.LoadInt64(&processedCount)
+	assert.Equal(t, int64(eventsToPublish), finalProcessed)
+	t.Logf("Events processed: %d/%d", finalProcessed, eventsToPublish)
+}
+
+// ------------------------------------- Test Events -------------------------------------
+
+// Event type IDs reported by the fixed-type test events below.
+const (
+	TypeEvent1 = 0x1 // returned by MyEvent1.Type
+	TypeEvent2 = 0x2 // returned by MyEvent2.Type
+)
+
+// MyEvent1 is a test event carrying an integer payload.
+type MyEvent1 struct {
+	Number int
+}
+
+// Type always returns TypeEvent1.
+func (t MyEvent1) Type() uint32 { return TypeEvent1 }
+
+// MyEvent2 is a test event carrying a text payload.
+type MyEvent2 struct {
+	Text string
+}
+
+// Type always returns TypeEvent2.
+func (t MyEvent2) Type() uint32 { return TypeEvent2 }
+
+// MyEvent3 is a test event whose type ID is chosen per value through ID.
+type MyEvent3 struct {
+	ID int
+}
+
+// Type returns ID converted to uint32.
+func (t MyEvent3) Type() uint32 { return uint32(t.ID) }
@@ -1,4 +1,4 @@
-package proxy
+package logmon

 import (
 	"context"
@@ -8,15 +8,25 @@ import (
 	"sync"
 	"time"

-	"github.com/mostlygeek/llama-swap/event"
+	"github.com/mostlygeek/llama-swap/internal/event"
 )

+// DataEventID is the event type ID reported by DataEvent.Type.
+const DataEventID = 0x04
+
+// DataEvent is the event published on the monitor's event bus; Data holds the
+// bytes being broadcast.
+type DataEvent struct {
+	Data []byte
+}
+
+// Type returns DataEventID.
+func (e DataEvent) Type() uint32 {
+	return DataEventID
+}
+
 // circularBuffer is a fixed-size circular byte buffer that overwrites
 // oldest data when full. It provides O(1) writes and O(n) reads.
 type circularBuffer struct {
-	data []byte // pre-allocated capacity
-	head int    // next write position
-	size int    // current number of bytes stored (0 to cap)
+	data []byte
+	head int
+	size int
 }

 func newCircularBuffer(capacity int) *circularBuffer {
@@ -27,8 +37,6 @@ func newCircularBuffer(capacity int) *circularBuffer {
 	}
 }

-// Write appends bytes to the buffer, overwriting oldest data when full.
-// Data is copied into the internal buffer (not stored by reference).
 func (cb *circularBuffer) Write(p []byte) {
 	if len(p) == 0 {
 		return
@@ -36,7 +44,6 @@ func (cb *circularBuffer) Write(p []byte) {

 	cap := len(cb.data)

-	// If input is larger than capacity, only keep the last cap bytes
 	if len(p) >= cap {
 		copy(cb.data, p[len(p)-cap:])
 		cb.head = 0
@@ -44,28 +51,22 @@ func (cb *circularBuffer) Write(p []byte) {
 		return
 	}

-	// Calculate how much space is available from head to end of buffer
 	firstPart := cap - cb.head
 	if firstPart >= len(p) {
-		// All data fits without wrapping
 		copy(cb.data[cb.head:], p)
 		cb.head = (cb.head + len(p)) % cap
 	} else {
-		// Data wraps around
 		copy(cb.data[cb.head:], p[:firstPart])
 		copy(cb.data[:len(p)-firstPart], p[firstPart:])
 		cb.head = len(p) - firstPart
 	}

-	// Update size
 	cb.size += len(p)
 	if cb.size > cap {
 		cb.size = cap
 	}
 }

-// GetHistory returns all buffered data in correct order (oldest to newest).
-// Returns a new slice (copy), not a view into internal buffer.
 func (cb *circularBuffer) GetHistory() []byte {
 	if cb.size == 0 {
 		return nil
@@ -74,14 +75,11 @@ func (cb *circularBuffer) GetHistory() []byte {
 	result := make([]byte, cb.size)
 	cap := len(cb.data)

-	// Calculate start position (oldest data)
 	start := (cb.head - cb.size + cap) % cap

 	if start+cb.size <= cap {
-		// Data is contiguous, single copy
 		copy(result, cb.data[start:start+cb.size])
 	} else {
-		// Data wraps around, two copies
 		firstPart := cap - start
 		copy(result[:firstPart], cb.data[start:])
 		copy(result[firstPart:], cb.data[:cb.size-firstPart])
@@ -90,42 +88,38 @@ func (cb *circularBuffer) GetHistory() []byte {
 	return result
 }

-type LogLevel int
+type Level int

 const (
-	LevelDebug LogLevel = iota
+	LevelDebug Level = iota
 	LevelInfo
 	LevelWarn
 	LevelError

-	LogBufferSize = 100 * 1024
+	BufferSize = 100 * 1024
 )

-type LogMonitor struct {
+type Monitor struct {
 	eventbus *event.Dispatcher
 	mu       sync.RWMutex
 	buffer   *circularBuffer
 	bufferMu sync.RWMutex

-	// typically this can be os.Stdout
 	stdout io.Writer

-	// logging levels
-	level  LogLevel
-	prefix string
-
-	// timestamps
+	level      Level
+	prefix     string
 	timeFormat string
 }

-func NewLogMonitor() *LogMonitor {
-	return NewLogMonitorWriter(os.Stdout)
+func New() *Monitor {
+	return NewWriter(os.Stdout)
 }

-func NewLogMonitorWriter(stdout io.Writer) *LogMonitor {
-	return &LogMonitor{
+func NewWriter(stdout io.Writer) *Monitor {
+	return &Monitor{
 		eventbus:   event.NewDispatcherConfig(1000),
-		buffer:     nil, // lazy initialized on first Write
+		buffer:     nil,
 		stdout:     stdout,
 		level:      LevelInfo,
 		prefix:     "",
@@ -133,7 +127,7 @@ func NewLogMonitorWriter(stdout io.Writer) *LogMonitor {
 	}
 }

-func (w *LogMonitor) Write(p []byte) (n int, err error) {
+func (w *Monitor) Write(p []byte) (n int, err error) {
 	if len(p) == 0 {
 		return 0, nil
 	}
@@ -145,19 +139,18 @@ func (w *LogMonitor) Write(p []byte) (n int, err error) {

 	w.bufferMu.Lock()
 	if w.buffer == nil {
-		w.buffer = newCircularBuffer(LogBufferSize)
+		w.buffer = newCircularBuffer(BufferSize)
 	}
 	w.buffer.Write(p)
 	w.bufferMu.Unlock()

-	// Make a copy for broadcast to preserve immutability
 	bufferCopy := make([]byte, len(p))
 	copy(bufferCopy, p)
 	w.broadcast(bufferCopy)
 	return n, nil
 }

-func (w *LogMonitor) GetHistory() []byte {
+func (w *Monitor) GetHistory() []byte {
 	w.bufferMu.RLock()
 	defer w.bufferMu.RUnlock()
 	if w.buffer == nil {
@@ -168,41 +161,41 @@ func (w *LogMonitor) GetHistory() []byte {

 // Clear releases the buffer memory, making it eligible for GC.
 // The buffer will be lazily re-allocated on the next Write.
-func (w *LogMonitor) Clear() {
+func (w *Monitor) Clear() {
 	w.bufferMu.Lock()
 	w.buffer = nil
 	w.bufferMu.Unlock()
 }

-func (w *LogMonitor) OnLogData(callback func(data []byte)) context.CancelFunc {
-	return event.Subscribe(w.eventbus, func(e LogDataEvent) {
+func (w *Monitor) OnLogData(callback func(data []byte)) context.CancelFunc {
+	return event.Subscribe(w.eventbus, func(e DataEvent) {
 		callback(e.Data)
 	})
 }

-func (w *LogMonitor) broadcast(msg []byte) {
-	event.Publish(w.eventbus, LogDataEvent{Data: msg})
+func (w *Monitor) broadcast(msg []byte) {
+	event.Publish(w.eventbus, DataEvent{Data: msg})
 }

-func (w *LogMonitor) SetPrefix(prefix string) {
+func (w *Monitor) SetPrefix(prefix string) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	w.prefix = prefix
 }

-func (w *LogMonitor) SetLogLevel(level LogLevel) {
+func (w *Monitor) SetLogLevel(level Level) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	w.level = level
 }

-func (w *LogMonitor) SetLogTimeFormat(timeFormat string) {
+func (w *Monitor) SetLogTimeFormat(timeFormat string) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	w.timeFormat = timeFormat
 }

-func (w *LogMonitor) formatMessage(level string, msg string) []byte {
+func (w *Monitor) formatMessage(level string, msg string) []byte {
 	prefix := ""
 	if w.prefix != "" {
 		prefix = fmt.Sprintf("[%s] ", w.prefix)
@@ -211,49 +204,38 @@ func (w *LogMonitor) formatMessage(level string, msg string) []byte {
 	if w.timeFormat != "" {
 		timestamp = fmt.Sprintf("%s ", time.Now().Format(w.timeFormat))
 	}
-	return []byte(fmt.Sprintf("%s%s[%s] %s\n", timestamp, prefix, level, msg))
+	return fmt.Appendf(nil, "%s%s[%s] %s\n", timestamp, prefix, level, msg)
 }

-func (w *LogMonitor) log(level LogLevel, msg string) {
+func (w *Monitor) log(level Level, msg string) {
 	if level < w.level {
 		return
 	}
 	w.Write(w.formatMessage(level.String(), msg))
 }

-func (w *LogMonitor) Debug(msg string) {
-	w.log(LevelDebug, msg)
-}
+func (w *Monitor) Debug(msg string) { w.log(LevelDebug, msg) }
+func (w *Monitor) Info(msg string)  { w.log(LevelInfo, msg) }
+func (w *Monitor) Warn(msg string)  { w.log(LevelWarn, msg) }
+func (w *Monitor) Error(msg string) { w.log(LevelError, msg) }

-func (w *LogMonitor) Info(msg string) {
-	w.log(LevelInfo, msg)
-}
-
-func (w *LogMonitor) Warn(msg string) {
-	w.log(LevelWarn, msg)
-}
-
-func (w *LogMonitor) Error(msg string) {
-	w.log(LevelError, msg)
-}
-
-func (w *LogMonitor) Debugf(format string, args ...interface{}) {
+func (w *Monitor) Debugf(format string, args ...any) {
 	w.log(LevelDebug, fmt.Sprintf(format, args...))
 }

-func (w *LogMonitor) Infof(format string, args ...interface{}) {
+func (w *Monitor) Infof(format string, args ...any) {
 	w.log(LevelInfo, fmt.Sprintf(format, args...))
 }

-func (w *LogMonitor) Warnf(format string, args ...interface{}) {
+func (w *Monitor) Warnf(format string, args ...any) {
 	w.log(LevelWarn, fmt.Sprintf(format, args...))
 }

-func (w *LogMonitor) Errorf(format string, args ...interface{}) {
+func (w *Monitor) Errorf(format string, args ...any) {
 	w.log(LevelError, fmt.Sprintf(format, args...))
 }

-func (l LogLevel) String() string {
+func (l Level) String() string {
 	switch l {
 	case LevelDebug:
 		return "DEBUG"
@@ -1,4 +1,4 @@
-package proxy
+package logmon

 import (
 	"bytes"
@@ -10,9 +10,8 @@ import (
 )

 func TestLogMonitor(t *testing.T) {
-	logMonitor := NewLogMonitorWriter(io.Discard)
+	logMonitor := NewWriter(io.Discard)

-	// A WaitGroup is used to wait for all the expected writes to complete
 	var wg sync.WaitGroup

 	client1Messages := make([]byte, 0)
@@ -34,10 +33,8 @@ func TestLogMonitor(t *testing.T) {
 	logMonitor.Write([]byte("2"))
 	logMonitor.Write([]byte("3"))

-	// wait for all writes to complete
 	wg.Wait()

-	// Check the buffer
 	expectedHistory := "123"
 	history := string(logMonitor.GetHistory())

@@ -57,14 +54,11 @@ func TestLogMonitor(t *testing.T) {
 }

 func TestWrite_ImmutableBuffer(t *testing.T) {
-	// Create a new LogMonitor instance
-	lm := NewLogMonitorWriter(io.Discard)
+	lm := NewWriter(io.Discard)

-	// Prepare a message to write
 	msg := []byte("Hello, World!")
 	lenmsg := len(msg)

-	// Write the message to the LogMonitor
 	n, err := lm.Write(msg)
 	if err != nil {
 		t.Fatalf("Write failed: %v", err)
@@ -74,13 +68,10 @@ func TestWrite_ImmutableBuffer(t *testing.T) {
 		t.Errorf("Expected %d bytes written but got %d", lenmsg, n)
 	}

-	// Change the original message
-	msg[0] = 'B' // This should not affect the buffer
+	msg[0] = 'B'

-	// Get the history from the LogMonitor
 	history := lm.GetHistory()

-	// Check that the history contains the original message, not the modified one
 	expected := []byte("Hello, World!")
 	if !bytes.Equal(history, expected) {
 		t.Errorf("Expected history to be %q, got %q", expected, history)
@@ -88,16 +79,12 @@ func TestWrite_ImmutableBuffer(t *testing.T) {
 }

 func TestWrite_LogTimeFormat(t *testing.T) {
-	// Create a new LogMonitor instance
-	lm := NewLogMonitorWriter(io.Discard)
+	lm := NewWriter(io.Discard)

-	// Enable timestamps
 	lm.timeFormat = time.RFC3339

-	// Write the message to the LogMonitor
 	lm.Info("Hello, World!")

-	// Get the history from the LogMonitor
 	history := lm.GetHistory()

 	timestamp := ""
@@ -115,48 +102,40 @@ func TestWrite_LogTimeFormat(t *testing.T) {
 }

 func TestCircularBuffer_WrapAround(t *testing.T) {
-	// Create a small buffer to test wrap-around
 	cb := newCircularBuffer(10)

-	// Write "hello" (5 bytes)
 	cb.Write([]byte("hello"))
 	if got := string(cb.GetHistory()); got != "hello" {
 		t.Errorf("Expected 'hello', got %q", got)
 	}

-	// Write "world" (5 bytes) - buffer now full
 	cb.Write([]byte("world"))
 	if got := string(cb.GetHistory()); got != "helloworld" {
 		t.Errorf("Expected 'helloworld', got %q", got)
 	}

-	// Write "12345" (5 bytes) - should overwrite "hello"
 	cb.Write([]byte("12345"))
 	if got := string(cb.GetHistory()); got != "world12345" {
 		t.Errorf("Expected 'world12345', got %q", got)
 	}

-	// Write data larger than buffer capacity
-	cb.Write([]byte("abcdefghijklmnop")) // 16 bytes, only last 10 kept
+	cb.Write([]byte("abcdefghijklmnop"))
 	if got := string(cb.GetHistory()); got != "ghijklmnop" {
 		t.Errorf("Expected 'ghijklmnop', got %q", got)
 	}
 }

 func TestCircularBuffer_BoundaryConditions(t *testing.T) {
-	// Test empty buffer
 	cb := newCircularBuffer(10)
 	if got := cb.GetHistory(); got != nil {
 		t.Errorf("Expected nil for empty buffer, got %q", got)
 	}

-	// Test exact capacity
 	cb.Write([]byte("1234567890"))
 	if got := string(cb.GetHistory()); got != "1234567890" {
 		t.Errorf("Expected '1234567890', got %q", got)
 	}

-	// Test write exactly at capacity boundary
 	cb = newCircularBuffer(10)
 	cb.Write([]byte("12345"))
 	cb.Write([]byte("67890"))
@@ -166,19 +145,16 @@ func TestCircularBuffer_BoundaryConditions(t *testing.T) {
 }

 func TestLogMonitor_LazyInit(t *testing.T) {
-	lm := NewLogMonitorWriter(io.Discard)
+	lm := NewWriter(io.Discard)

-	// Buffer should be nil before any writes
 	if lm.buffer != nil {
 		t.Error("Expected buffer to be nil before first write")
 	}

-	// GetHistory should return nil when buffer is nil
 	if got := lm.GetHistory(); got != nil {
 		t.Errorf("Expected nil history before first write, got %q", got)
 	}

-	// Write should lazily initialize the buffer
 	lm.Write([]byte("test"))

 	if lm.buffer == nil {
@@ -191,15 +167,13 @@ func TestLogMonitor_LazyInit(t *testing.T) {
 }

 func TestLogMonitor_Clear(t *testing.T) {
-	lm := NewLogMonitorWriter(io.Discard)
+	lm := NewWriter(io.Discard)

-	// Write some data
 	lm.Write([]byte("hello"))
 	if got := string(lm.GetHistory()); got != "hello" {
 		t.Errorf("Expected 'hello', got %q", got)
 	}

-	// Clear should release the buffer
 	lm.Clear()

 	if lm.buffer != nil {
@@ -212,9 +186,8 @@ func TestLogMonitor_Clear(t *testing.T) {
 }

 func TestLogMonitor_ClearAndReuse(t *testing.T) {
-	lm := NewLogMonitorWriter(io.Discard)
+	lm := NewWriter(io.Discard)

-	// Write, clear, then write again
 	lm.Write([]byte("first"))
 	lm.Clear()
 	lm.Write([]byte("second"))
@@ -225,13 +198,12 @@ func TestLogMonitor_ClearAndReuse(t *testing.T) {
 }

 func BenchmarkLogMonitorWrite(b *testing.B) {
-	// Test data of varying sizes
 	smallMsg := []byte("small message\n")
 	mediumMsg := []byte(strings.Repeat("medium message content ", 10) + "\n")
 	largeMsg := []byte(strings.Repeat("large message content for benchmarking ", 100) + "\n")

 	b.Run("SmallWrite", func(b *testing.B) {
-		lm := NewLogMonitorWriter(io.Discard)
+		lm := NewWriter(io.Discard)
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			lm.Write(smallMsg)
@@ -239,7 +211,7 @@ func BenchmarkLogMonitorWrite(b *testing.B) {
 	})

 	b.Run("MediumWrite", func(b *testing.B) {
-		lm := NewLogMonitorWriter(io.Discard)
+		lm := NewWriter(io.Discard)
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			lm.Write(mediumMsg)
@@ -247,7 +219,7 @@ func BenchmarkLogMonitorWrite(b *testing.B) {
 	})

 	b.Run("LargeWrite", func(b *testing.B) {
-		lm := NewLogMonitorWriter(io.Discard)
+		lm := NewWriter(io.Discard)
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			lm.Write(largeMsg)
@@ -255,8 +227,7 @@ func BenchmarkLogMonitorWrite(b *testing.B) {
 	})

 	b.Run("WithSubscribers", func(b *testing.B) {
-		lm := NewLogMonitorWriter(io.Discard)
-		// Add some subscribers
+		lm := NewWriter(io.Discard)
 		for i := 0; i < 5; i++ {
 			lm.OnLogData(func(data []byte) {})
 		}
@@ -267,8 +238,7 @@ func BenchmarkLogMonitorWrite(b *testing.B) {
 	})

 	b.Run("GetHistory", func(b *testing.B) {
-		lm := NewLogMonitorWriter(io.Discard)
-		// Pre-populate with data
+		lm := NewWriter(io.Discard)
 		for i := 0; i < 1000; i++ {
 			lm.Write(mediumMsg)
 		}
@@ -278,39 +248,3 @@ func BenchmarkLogMonitorWrite(b *testing.B) {
 		}
 	})
 }
-
-/*
-Benchmark Results - MBP M1 Pro
-
-Before (ring.Ring):
-| Benchmark                       | ns/op      | bytes/op | allocs/op |
-|---------------------------------|------------|----------|-----------|
-| SmallWrite (14B)                | 43 ns      | 40 B     | 2         |
-| MediumWrite (241B)              | 76 ns      | 264 B    | 2         |
-| LargeWrite (4KB)                | 504 ns     | 4,120 B  | 2         |
-| WithSubscribers (5 subs)        | 355 ns     | 264 B    | 2         |
-| GetHistory (after 1000 writes)  | 145,000 ns | 1.2 MB   | 22        |
-
-After (circularBuffer 10KB):
-| Benchmark                       | ns/op      | bytes/op | allocs/op |
-|---------------------------------|------------|----------|-----------|
-| SmallWrite (14B)                | 26 ns      | 16 B     | 1         |
-| MediumWrite (241B)              | 67 ns      | 240 B    | 1         |
-| LargeWrite (4KB)                | 774 ns     | 4,096 B  | 1         |
-| WithSubscribers (5 subs)        | 325 ns     | 240 B    | 1         |
-| GetHistory (after 1000 writes)  | 1,042 ns   | 10,240 B | 1         |
-
-After (circularBuffer 100KB):
-| Benchmark                       | ns/op      | bytes/op  | allocs/op |
-|---------------------------------|------------|-----------|-----------|
-| SmallWrite (14B)                | 26 ns      | 16 B      | 1         |
-| MediumWrite (241B)              | 66 ns      | 240 B     | 1         |
-| LargeWrite (4KB)                | 753 ns     | 4,096 B   | 1         |
-| WithSubscribers (5 subs)        | 309 ns     | 240 B     | 1         |
-| GetHistory (after 1000 writes)  | 7,788 ns   | 106,496 B | 1         |
-
-Summary:
- GetHistory: 139x faster (10KB), 18x faster (100KB)
- Allocations: reduced from 2 to 1 across all operations
- Small/medium writes: ~1.1-1.6x faster
-*/
@@ -0,0 +1,214 @@
+package perf
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// ParseNvidiaSmiLine parses a single line from nvidia-smi CSV output.
+// Format: index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw
+func ParseNvidiaSmiLine(line string) *GpuStat {
+	fields := strings.Split(line, ",")
+	if len(fields) < 9 {
+		return nil
+	}
+
+	id, _ := strconv.Atoi(strings.TrimSpace(fields[0]))
+	name := strings.TrimSpace(fields[1])
+	uuid := strings.TrimSpace(fields[2])
+	tempC, _ := strconv.Atoi(strings.TrimSpace(fields[3]))
+	gpuUtil, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64)
+	memUsed, _ := strconv.Atoi(strings.TrimSpace(fields[5]))
+	memTotal, _ := strconv.Atoi(strings.TrimSpace(fields[6]))
+	fanSpeed, _ := strconv.ParseFloat(strings.TrimSpace(fields[7]), 64)
+	powerDraw, _ := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64)
+
+	var memUtil float64
+	if memTotal > 0 {
+		memUtil = float64(memUsed) / float64(memTotal) * 100
+	}
+
+	return &GpuStat{
+		Timestamp:   time.Now(),
+		ID:          id,
+		Name:        name,
+		UUID:        uuid,
+		TempC:       tempC,
+		GpuUtilPct:  gpuUtil,
+		MemUtilPct:  memUtil,
+		MemUsedMB:   memUsed,
+		MemTotalMB:  memTotal,
+		FanSpeedPct: fanSpeed,
+		PowerDrawW:  powerDraw,
+	}
+}
+
+// mactopOutput maps the subset of mactop's headless JSON output that is
+// relevant to GpuStat. Note that mactop's memory object is whole-system memory,
+// not GPU-attributed; the darwin monitor overlays ioreg's GPU-attributed
+// unified memory (see overlayIoregMem) so both backends report consistent
+// memory figures.
+type mactopOutput struct {
+	// SocMetrics carries the GPU power and temperature readings that
+	// ParseMactopLine copies into PowerDrawW and TempC.
+	SocMetrics struct {
+		GPUPower float64 `json:"gpu_power"`
+		GPUFreq  int     `json:"gpu_freq_mhz"`
+		GPUTemp  float64 `json:"gpu_temp"`
+	} `json:"soc_metrics"`
+	// Memory is divided by 1024*1024 in ParseMactopLine, i.e. treated as bytes.
+	Memory struct {
+		Total uint64 `json:"total"`
+		Used  uint64 `json:"used"`
+	} `json:"memory"`
+	// GPUUsage is copied verbatim into GpuStat.GpuUtilPct.
+	GPUUsage   float64 `json:"gpu_usage"`
+	// SystemInfo supplies the display name and the core count appended to it.
+	SystemInfo struct {
+		Name         string `json:"name"`
+		GPUCoreCount int    `json:"gpu_core_count"`
+	} `json:"system_info"`
+	// Fans lists each fan's current RPM and its min/max range; the range is
+	// used to express the speed as a percentage.
+	Fans []struct {
+		RPM    int `json:"rpm"`
+		MinRPM int `json:"min_rpm"`
+		MaxRPM int `json:"max_rpm"`
+	} `json:"fans"`
+	// Temperatures are grouped averages; only the "Memory" group is consumed.
+	Temperatures []struct {
+		Group string  `json:"group"`
+		Avg   float64 `json:"avg_celsius"`
+	} `json:"temperatures"`
+}
+
+// ioreg output uses ` = ` (with spaces) for top-level device properties and
+// `=` (no spaces) for values inside nested dictionaries such as
+// PerformanceStatistics.
+var (
+	// reIoregModel captures the quoted value of the top-level "model" property.
+	reIoregModel     = regexp.MustCompile(`"model"\s*=\s*"([^"]+)"`)
+	// reIoregCoreCount captures the integer "gpu-core-count" property.
+	reIoregCoreCount = regexp.MustCompile(`"gpu-core-count"\s*=\s*(\d+)`)
+	// reIoregUtil captures the nested "Device Utilization %" value.
+	reIoregUtil      = regexp.MustCompile(`"Device Utilization %"=(\d+)`)
+	// reIoregMemUsed captures the nested "In use system memory" value. The
+	// closing quote in the pattern keeps it from matching the
+	// "In use system memory (driver)" key.
+	reIoregMemUsed   = regexp.MustCompile(`"In use system memory"=(\d+)`)
+)
+
+// ParseIoregOutput parses `ioreg -r -c IOGPU -d 1 -f` output into a GpuStat for
+// the Apple Silicon integrated GPU. This is a fallback for when mactop is not
+// installed: utilization and used memory are available, but power, temperature,
+// and fan speed are not exposed by ioreg. memTotalMB is the unified memory size
+// supplied by the caller, since Apple Silicon shares memory between CPU and GPU.
+// Returns nil if no GPU device is found in the output.
+func ParseIoregOutput(out []byte, memTotalMB int) *GpuStat {
+	utilMatch := reIoregUtil.FindSubmatch(out)
+	memMatch := reIoregMemUsed.FindSubmatch(out)
+	if utilMatch == nil && memMatch == nil {
+		return nil
+	}
+
+	var gpuUtil float64
+	if utilMatch != nil {
+		gpuUtil, _ = strconv.ParseFloat(string(utilMatch[1]), 64)
+	}
+
+	const toMB = 1024 * 1024
+	var memUsedMB int
+	if memMatch != nil {
+		memUsedBytes, _ := strconv.ParseInt(string(memMatch[1]), 10, 64)
+		memUsedMB = int(memUsedBytes / toMB)
+	}
+
+	var memUtil float64
+	if memTotalMB > 0 {
+		memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
+	}
+
+	name := "Apple GPU"
+	if m := reIoregModel.FindSubmatch(out); m != nil {
+		name = string(m[1])
+	}
+	if m := reIoregCoreCount.FindSubmatch(out); m != nil {
+		if cores, err := strconv.Atoi(string(m[1])); err == nil && cores > 0 {
+			name = fmt.Sprintf("%s (%d-core GPU)", name, cores)
+		}
+	}
+
+	return &GpuStat{
+		Timestamp:  time.Now(),
+		ID:         0,
+		Name:       name,
+		GpuUtilPct: gpuUtil,
+		MemUtilPct: memUtil,
+		MemUsedMB:  memUsedMB,
+		MemTotalMB: memTotalMB,
+	}
+}
+
+// ParseMactopLine parses a single line of mactop headless JSON output into a
+// GpuStat for the Apple Silicon integrated GPU.
+//
+// Returns nil if the line is blank or is not valid JSON for mactopOutput.
+// Memory figures come from mactop's whole-system memory object and are
+// converted from bytes to whole megabytes. Fan speed is the average, across
+// all fans with a valid RPM range, of each fan's position within that range,
+// bounded to 0-100%.
+func ParseMactopLine(line string) *GpuStat {
+	line = strings.TrimSpace(line)
+	if line == "" {
+		return nil
+	}
+
+	var out mactopOutput
+	if err := json.Unmarshal([]byte(line), &out); err != nil {
+		return nil
+	}
+
+	const toMB = 1024 * 1024
+	memUsedMB := int(out.Memory.Used / toMB)
+	memTotalMB := int(out.Memory.Total / toMB)
+
+	var memUtil float64
+	if memTotalMB > 0 {
+		memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
+	}
+
+	name := out.SystemInfo.Name
+	if name == "" {
+		name = "Apple GPU"
+	}
+	if out.SystemInfo.GPUCoreCount > 0 {
+		name = fmt.Sprintf("%s (%d-core GPU)", name, out.SystemInfo.GPUCoreCount)
+	}
+
+	// Unified memory has no dedicated VRAM sensor; use the memory temperature
+	// group when mactop exposes it.
+	var vramTempC int
+	for _, t := range out.Temperatures {
+		if strings.EqualFold(t.Group, "Memory") {
+			vramTempC = int(math.Round(t.Avg))
+			break
+		}
+	}
+
+	// Average fan load across all fans as a percentage of their RPM range.
+	// Fans without a usable range (max <= min) are left out of the average.
+	var fanSpeed float64
+	var fanCount int
+	for _, f := range out.Fans {
+		if f.MaxRPM > f.MinRPM {
+			pct := float64(f.RPM-f.MinRPM) / float64(f.MaxRPM-f.MinRPM) * 100
+			// A reading outside the advertised range must not produce a
+			// negative percentage or one above 100.
+			if pct < 0 {
+				pct = 0
+			} else if pct > 100 {
+				pct = 100
+			}
+			fanSpeed += pct
+			fanCount++
+		}
+	}
+	if fanCount > 0 {
+		fanSpeed /= float64(fanCount)
+	}
+
+	return &GpuStat{
+		Timestamp:   time.Now(),
+		ID:          0,
+		Name:        name,
+		TempC:       int(math.Round(out.SocMetrics.GPUTemp)),
+		VramTempC:   vramTempC,
+		GpuUtilPct:  out.GPUUsage,
+		MemUtilPct:  memUtil,
+		MemUsedMB:   memUsedMB,
+		MemTotalMB:  memTotalMB,
+		FanSpeedPct: fanSpeed,
+		PowerDrawW:  out.SocMetrics.GPUPower,
+	}
+}
@@ -0,0 +1,206 @@
+package perf
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/ring"
+)
+
+// Sentinel errors used by the stat collectors in this package.
+var (
+	// ErrNotImplemented marks a collector that has no implementation; Start
+	// does not log it as an error when reading system stats.
+	ErrNotImplemented = errors.New("not implemented")
+	// ErrNoGpuTool is returned when no GPU monitoring backend can be used.
+	ErrNoGpuTool      = errors.New("no GPU monitoring tool available")
+)
+
+// Monitor periodically collects system and GPU statistics, keeps a bounded
+// history of them in ring buffers, and fans new samples out to subscribers.
+type Monitor struct {
+	// mutex guards conf, the ring buffers, the stop fields and both listener
+	// maps.
+	mutex   sync.RWMutex
+	log     *logmon.Monitor
+	conf    config.PerformanceConfig
+	// sysRing holds recent system samples; gpuRing holds one slice of GpuStat
+	// per GPU sample. Both are sized by ringCapacity.
+	sysRing ring.Buffer[SysStat]
+	gpuRing ring.Buffer[[]GpuStat]
+
+	// stopCtx/stopCancel are set by Start; stopCancel is nil while the
+	// monitor is not running.
+	stopCtx    context.Context
+	stopCancel context.CancelFunc
+
+	// Listener sets, keyed by the channels handed out by Subscribe.
+	sysListeners map[chan SysStat]struct{}
+	gpuListeners map[chan []GpuStat]struct{}
+}
+
+// ringCapacity returns how many samples fit into one hour at the configured
+// sampling interval c.Every, with a minimum of one slot.
+//
+// A zero or negative interval yields a capacity of 1 instead of dividing by
+// zero; this matters because UpdateConfig can pass a config whose Every has
+// not been normalized.
+func ringCapacity(c config.PerformanceConfig) int {
+	if c.Every <= 0 {
+		return 1
+	}
+	n := int(time.Hour / c.Every)
+	if n < 1 {
+		n = 1
+	}
+	return n
+}
+
+func New(c config.PerformanceConfig, logger *logmon.Monitor) (*Monitor, error) {
+
+	if c.Every < 100*time.Millisecond {
+		c.Every = 100 * time.Millisecond
+	}
+
+	if logger == nil {
+		return nil, errors.New("logger is required")
+	}
+
+	capacity := ringCapacity(c)
+	return &Monitor{
+		conf:         c,
+		log:          logger,
+		sysRing:      ring.NewBuffer[SysStat](capacity),
+		gpuRing:      ring.NewBuffer[[]GpuStat](capacity),
+		sysListeners: make(map[chan SysStat]struct{}),
+		gpuListeners: make(map[chan []GpuStat]struct{}),
+	}, nil
+}
+
+// Stop cancels the collection goroutines started by Start. Calling it on a
+// monitor that is not running is a no-op.
+func (m *Monitor) Stop() {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+
+	if cancel := m.stopCancel; cancel != nil {
+		cancel()
+		// A nil stopCancel is what marks the monitor as stopped.
+		m.stopCancel = nil
+	}
+}
+
+// UpdateConfig updates the monitor configuration and restarts if changed.
+//
+// The sampling interval is normalized the same way New does (minimum 100ms)
+// before it is compared or stored. Without this, a zero or negative Every
+// would reach ringCapacity and time.NewTicker, both of which panic on it, and
+// an otherwise identical config would look "changed" on every call.
+//
+// When the configuration differs, collection is stopped, the history buffers
+// are recreated for the new interval (discarding old samples), and collection
+// is restarted unless the new configuration is disabled.
+func (m *Monitor) UpdateConfig(newConf config.PerformanceConfig) {
+	if newConf.Every < 100*time.Millisecond {
+		newConf.Every = 100 * time.Millisecond
+	}
+
+	m.mutex.RLock()
+	changed := m.conf != newConf
+	m.mutex.RUnlock()
+
+	if !changed {
+		return
+	}
+
+	m.Stop()
+	m.mutex.Lock()
+	m.conf = newConf
+	capacity := ringCapacity(newConf)
+	m.sysRing = ring.NewBuffer[SysStat](capacity)
+	m.gpuRing = ring.NewBuffer[[]GpuStat](capacity)
+	m.mutex.Unlock()
+	if !newConf.Disabled {
+		m.Start()
+	}
+}
+
+// Subscribe registers a new listener and returns its system-stat channel, its
+// GPU-stat channel, and a function that removes both from the monitor. Each
+// channel has room for one pending sample. The channels are not closed by the
+// returned function.
+func (m *Monitor) Subscribe() (chan SysStat, chan []GpuStat, func()) {
+	sysChan := make(chan SysStat, 1)
+	gpuChan := make(chan []GpuStat, 1)
+
+	m.mutex.Lock()
+	m.sysListeners[sysChan] = struct{}{}
+	m.gpuListeners[gpuChan] = struct{}{}
+	m.mutex.Unlock()
+
+	return sysChan, gpuChan, func() {
+		m.mutex.Lock()
+		delete(m.sysListeners, sysChan)
+		delete(m.gpuListeners, gpuChan)
+		m.mutex.Unlock()
+	}
+}
+
+// Start launches two background goroutines: one samples system stats on a
+// ticker, the other consumes GPU samples from the platform backend. Each new
+// sample is pushed into its ring buffer and offered to every subscriber
+// without blocking (a subscriber whose channel is full misses that sample).
+// Calling Start on a running monitor is a no-op.
+//
+// The goroutines capture their own context and interval at start time. They
+// must not re-read m.stopCtx or m.conf: those fields are replaced by a later
+// Stop/Start cycle (see UpdateConfig), and a goroutine that picked up the new
+// context would never terminate.
+func (m *Monitor) Start() {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	if m.stopCancel != nil {
+		return
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	m.stopCtx, m.stopCancel = ctx, cancel
+
+	// Snapshot the interval under the lock. time.NewTicker panics on a
+	// non-positive duration, so fall back to the minimum New enforces.
+	every := m.conf.Every
+	if every <= 0 {
+		every = 100 * time.Millisecond
+	}
+
+	go func() {
+		tick := time.NewTicker(every)
+		defer tick.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-tick.C:
+				s, err := ReadSysStats()
+				if err != nil {
+					if !errors.Is(err, ErrNotImplemented) {
+						m.log.Errorf("failed to read sys stats: %s", err.Error())
+					}
+					continue
+				}
+				m.mutex.Lock()
+				// Stop may have run while the sample was being read; do not
+				// write a stale sample into buffers that may have been reset.
+				if ctx.Err() != nil {
+					m.mutex.Unlock()
+					return
+				}
+				m.sysRing.Push(s)
+				for l := range m.sysListeners {
+					select {
+					case l <- s:
+					default:
+					}
+				}
+				m.mutex.Unlock()
+			}
+		}
+	}()
+
+	go func() {
+		gpuCh, err := getGpuStats(ctx, every, m.log)
+		if err != nil {
+			if errors.Is(err, ErrNotImplemented) || errors.Is(err, ErrNoGpuTool) {
+				m.log.Infof("GPU monitoring not available: %s", err.Error())
+			} else {
+				m.log.Errorf("failed to initialize GPU monitoring: %s", err.Error())
+			}
+			return
+		}
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case g, ok := <-gpuCh:
+				if !ok {
+					// The backend closes its channel on cancellation too;
+					// only report a closure that was not caused by Stop.
+					if ctx.Err() == nil {
+						m.log.Errorf("failed reading from gpuCh - stopping read goroutine")
+					}
+					return
+				}
+				m.mutex.Lock()
+				if ctx.Err() != nil {
+					m.mutex.Unlock()
+					return
+				}
+				m.gpuRing.Push(g)
+				for l := range m.gpuListeners {
+					select {
+					case l <- g:
+					default:
+					}
+				}
+				m.mutex.Unlock()
+			}
+		}
+	}()
+}
+
+// Current returns the buffered history: the system samples as returned by the
+// ring buffer's Slice, and the GPU samples with every per-tick snapshot
+// flattened into a single slice (nil when no GPU samples are buffered).
+func (m *Monitor) Current() ([]SysStat, []GpuStat) {
+	m.mutex.RLock()
+	defer m.mutex.RUnlock()
+
+	var flat []GpuStat
+	for _, batch := range m.gpuRing.Slice() {
+		flat = append(flat, batch...)
+	}
+	return m.sysRing.Slice(), flat
+}
+
+// ReadSysStats takes a single system-stat sample by delegating to the
+// platform-specific readSysStats.
+func ReadSysStats() (SysStat, error) {
+	return readSysStats()
+}
+
+// GetGpuStats is the exported entry point to the platform-specific
+// getGpuStats; it forwards its arguments unchanged and returns the backend's
+// sample channel or the error explaining why no backend could be started.
+func GetGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	return getGpuStats(ctx, every, logger)
+}
@@ -0,0 +1,208 @@
+package perf
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/shirou/gopsutil/v4/cpu"
+	"github.com/shirou/gopsutil/v4/load"
+	"github.com/shirou/gopsutil/v4/mem"
+)
+
+// getGpuStats selects the GPU backend on macOS: mactop is tried first and
+// ioreg is the fallback. The reason a backend was rejected is logged at debug
+// level. ErrNoGpuTool is returned when neither backend is usable.
+func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	ch, err := tryMactop(ctx, every, logger)
+	if err == nil {
+		logger.Info("using mactop for GPU monitoring")
+		return ch, nil
+	}
+	logger.Debugf("mactop: %s", err.Error())
+
+	ch, err = tryIoreg(ctx, every, logger)
+	if err == nil {
+		logger.Info("using ioreg for GPU monitoring")
+		return ch, nil
+	}
+	logger.Debugf("ioreg: %s", err.Error())
+
+	return nil, ErrNoGpuTool
+}
+
+// tryIoreg is the fallback Apple Silicon backend used when mactop is not
+// installed. It samples `ioreg -r -c IOGPU -d 1 -f` on a ticker (at most once
+// per second) and publishes each sample as a one-element slice. ioreg exposes
+// GPU utilization and used memory, but not power, temperature, or fan speed.
+// The returned channel is closed when ctx is cancelled.
+func tryIoreg(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if _, lookErr := exec.LookPath("ioreg"); lookErr != nil {
+		return nil, ErrNoGpuTool
+	}
+
+	// Probe once up front: if ioreg shows no GPU device the caller should
+	// fall through to ErrNoGpuTool rather than get an always-silent channel.
+	if sampleIoreg(ctx) == nil {
+		return nil, fmt.Errorf("ioreg reported no GPU device")
+	}
+
+	interval := every
+	if interval < time.Second {
+		interval = time.Second
+	}
+
+	samples := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(samples)
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+
+			stat := sampleIoreg(ctx)
+			if stat == nil {
+				continue
+			}
+			// Never block the poller: drop the sample if the consumer has
+			// not picked up the previous one yet.
+			select {
+			case samples <- []GpuStat{*stat}:
+			default:
+			}
+		}
+	}()
+
+	return samples, nil
+}
+
+// sampleIoreg runs ioreg once and parses a single GpuStat, or returns nil.
+//
+// The command is bounded by a 5 second timeout derived from ctx so that a
+// hung ioreg process cannot stall the polling loop (or the mactop reader,
+// which calls this for every sample via overlayIoregMem). nil is returned when
+// ioreg fails, times out, or its output contains no GPU device. The total
+// memory passed to the parser is the system's physical memory, left at 0 if it
+// cannot be read.
+func sampleIoreg(ctx context.Context) *GpuStat {
+	runCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+	defer cancel()
+
+	out, err := exec.CommandContext(runCtx, "ioreg", "-r", "-c", "IOGPU", "-d", "1", "-f").Output()
+	if err != nil {
+		return nil
+	}
+
+	var memTotalMB int
+	if vmStat, err := mem.VirtualMemory(); err == nil {
+		memTotalMB = int(vmStat.Total / (1024 * 1024))
+	}
+
+	return ParseIoregOutput(out, memTotalMB)
+}
+
+// overlayIoregMem overwrites the three memory fields of stat with the values
+// from a fresh ioreg sample. mactop reports whole-system memory only, so this
+// keeps the mactop and ioreg backends on the same memory semantics. If no
+// ioreg sample can be obtained, stat is left exactly as mactop filled it.
+func overlayIoregMem(ctx context.Context, stat *GpuStat) {
+	if fromIoreg := sampleIoreg(ctx); fromIoreg != nil {
+		stat.MemUsedMB, stat.MemTotalMB, stat.MemUtilPct =
+			fromIoreg.MemUsedMB, fromIoreg.MemTotalMB, fromIoreg.MemUtilPct
+	}
+}
+
+// tryMactop starts mactop (https://github.com/metaspartan/mactop) in headless
+// JSON mode and forwards every successfully parsed sample on the returned
+// channel as a one-element slice. mactop prints one JSON object per sample on
+// stdout. The channel is closed once mactop's output ends and the process has
+// been waited on.
+func tryMactop(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if _, lookErr := exec.LookPath("mactop"); lookErr != nil {
+		return nil, ErrNoGpuTool
+	}
+
+	// mactop measures power across the sampling window, so never ask for a
+	// window shorter than one second.
+	intervalMs := 1000
+	if ms := int(every.Milliseconds()); ms > intervalMs {
+		intervalMs = ms
+	}
+
+	args := []string{
+		"--headless",
+		"--format", "json",
+		"--interval", fmt.Sprintf("%d", intervalMs),
+	}
+	cmd := exec.CommandContext(ctx, "mactop", args...)
+
+	pipe, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, fmt.Errorf("mactop stdout pipe failed: %w", err)
+	}
+	if err = cmd.Start(); err != nil {
+		return nil, fmt.Errorf("mactop start failed: %w", err)
+	}
+
+	samples := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(samples)
+
+		lines := bufio.NewScanner(pipe)
+		// A single mactop JSON object can be long; allow lines up to 1 MiB.
+		lines.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+		for lines.Scan() {
+			text := strings.TrimSpace(lines.Text())
+			if text == "" {
+				continue
+			}
+
+			stat := ParseMactopLine(text)
+			if stat == nil {
+				continue
+			}
+
+			// Replace mactop's whole-system memory figures with ioreg's
+			// GPU-attributed ones so both backends agree.
+			overlayIoregMem(ctx, stat)
+			select {
+			case samples <- []GpuStat{*stat}:
+			default:
+			}
+		}
+		cmd.Wait()
+	}()
+
+	return samples, nil
+}
+
+// readSysStats takes one system sample via gopsutil: per-core CPU utilization,
+// memory totals, swap usage and load averages. Failing to read CPU or memory
+// is an error; swap and load averages are optional and stay zero when they
+// cannot be read. Sizes are reported in whole megabytes.
+func readSysStats() (SysStat, error) {
+	const bytesPerMB = 1024 * 1024
+
+	perCore, err := cpu.Percent(0, true)
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	vm, err := mem.VirtualMemory()
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	stat := SysStat{
+		CpuUtilPerCore: perCore,
+		MemTotalMB:     int(vm.Total / bytesPerMB),
+		MemUsedMB:      int(vm.Used / bytesPerMB),
+		MemFreeMB:      int(vm.Free / bytesPerMB),
+	}
+
+	if swap, swapErr := mem.SwapMemory(); swapErr == nil {
+		stat.SwapTotalMB = int(swap.Total / bytesPerMB)
+		stat.SwapUsedMB = int(swap.Used / bytesPerMB)
+	}
+
+	if avg, loadErr := load.Avg(); loadErr == nil {
+		stat.LoadAvg1 = avg.Load1
+		stat.LoadAvg5 = avg.Load5
+		stat.LoadAvg15 = avg.Load15
+	}
+
+	stat.Timestamp = time.Now()
+	return stat, nil
+}
@@ -0,0 +1,313 @@
+package perf
+
+import (
+	"io"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// newTestLogger returns a logger whose output is discarded.
+func newTestLogger() *logmon.Monitor {
+	return logmon.NewWriter(io.Discard)
+}
+
+// TestNew_DefaultConfig checks that a zero-value config gets the 100ms
+// minimum sampling interval.
+func TestNew_DefaultConfig(t *testing.T) {
+	logger := newTestLogger()
+
+	m, err := New(config.PerformanceConfig{}, logger)
+	require.NoError(t, err)
+	require.NotNil(t, m)
+
+	assert.Equal(t, 100*time.Millisecond, m.conf.Every)
+}
+
+// TestNew_CustomConfig checks that an interval above the minimum is kept.
+func TestNew_CustomConfig(t *testing.T) {
+	logger := newTestLogger()
+
+	cfg := config.PerformanceConfig{
+		Every: 500 * time.Millisecond,
+	}
+
+	m, err := New(cfg, logger)
+	require.NoError(t, err)
+
+	assert.Equal(t, 500*time.Millisecond, m.conf.Every)
+}
+
+// TestNew_NilLogger checks that New rejects a nil logger.
+func TestNew_NilLogger(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, nil)
+	assert.Error(t, err)
+	assert.Nil(t, m)
+}
+
+// TestNew_BelowMinimumConfig checks that an interval below 100ms is raised
+// to 100ms.
+func TestNew_BelowMinimumConfig(t *testing.T) {
+	logger := newTestLogger()
+
+	cfg := config.PerformanceConfig{
+		Every: 1 * time.Millisecond,
+	}
+
+	m, err := New(cfg, logger)
+	require.NoError(t, err)
+
+	assert.Equal(t, 100*time.Millisecond, m.conf.Every)
+}
+
+// TestSubscribe_ReturnsChannels checks that Subscribe returns non-nil
+// channels and a non-nil unsubscribe function.
+func TestSubscribe_ReturnsChannels(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	sysCh, gpuCh, unsub := m.Subscribe()
+	defer unsub()
+
+	assert.NotNil(t, sysCh)
+	assert.NotNil(t, gpuCh)
+	assert.NotNil(t, unsub)
+}
+
+// TestSubscribe_UnsubscribeRemovesListeners checks that the unsubscribe
+// function removes both listener entries registered by Subscribe.
+func TestSubscribe_UnsubscribeRemovesListeners(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	_, _, unsub := m.Subscribe()
+
+	m.mutex.RLock()
+	assert.Len(t, m.sysListeners, 1)
+	assert.Len(t, m.gpuListeners, 1)
+	m.mutex.RUnlock()
+
+	unsub()
+
+	m.mutex.RLock()
+	assert.Len(t, m.sysListeners, 0)
+	assert.Len(t, m.gpuListeners, 0)
+	m.mutex.RUnlock()
+}
+
+// TestSubscribe_MultipleSubscriptions checks that each Subscribe call gets
+// its own pair of channels and that both are tracked.
+func TestSubscribe_MultipleSubscriptions(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	sysCh1, gpuCh1, unsub1 := m.Subscribe()
+	sysCh2, gpuCh2, unsub2 := m.Subscribe()
+	defer unsub1()
+	defer unsub2()
+
+	assert.NotEqual(t, sysCh1, sysCh2)
+	assert.NotEqual(t, gpuCh1, gpuCh2)
+
+	m.mutex.RLock()
+	assert.Len(t, m.sysListeners, 2)
+	assert.Len(t, m.gpuListeners, 2)
+	m.mutex.RUnlock()
+}
+
+// TestCurrent_EmptyByDefault checks that a new monitor has no history.
+func TestCurrent_EmptyByDefault(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	sysStats, gpuStats := m.Current()
+	assert.Empty(t, sysStats)
+	assert.Empty(t, gpuStats)
+}
+
+// TestCurrent_ReturnsCopies pushes one sample into each ring and checks that
+// Current returns them, and that mutating the returned system slice does not
+// change what a later Current call returns.
+func TestCurrent_ReturnsCopies(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	now := time.Now()
+	m.sysRing.Push(SysStat{Timestamp: now, MemTotalMB: 1024})
+	m.gpuRing.Push([]GpuStat{{Timestamp: now, ID: 0, Name: "gpu0"}})
+
+	sysStats, gpuStats := m.Current()
+
+	assert.Len(t, sysStats, 1)
+	assert.Len(t, gpuStats, 1)
+	assert.Equal(t, 1024, sysStats[0].MemTotalMB)
+	assert.Equal(t, "gpu0", gpuStats[0].Name)
+
+	// modifying the returned slice should not affect the original
+	sysStats[0].MemTotalMB = 999
+	original, _ := m.Current()
+	assert.Equal(t, 1024, original[0].MemTotalMB)
+}
+
+// TestStart_CollectsSysStats runs the monitor for 350ms at a 100ms interval
+// and expects at least one system sample. Skipped in -short mode.
+func TestStart_CollectsSysStats(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping slow test")
+	}
+
+	m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
+	require.NoError(t, err)
+
+	m.Start()
+
+	time.Sleep(350 * time.Millisecond)
+	m.Stop()
+
+	sysStats, _ := m.Current()
+	assert.NotEmpty(t, sysStats, "expected sys stats to be collected")
+}
+
+// TestStart_StopStopsGoroutines checks the running-state bookkeeping:
+// stopCancel is set after Start and cleared after Stop. It does not observe
+// the goroutines themselves. Skipped in -short mode.
+func TestStart_StopStopsGoroutines(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping slow test")
+	}
+
+	m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
+	require.NoError(t, err)
+
+	m.Start()
+	if m.stopCancel == nil {
+		t.Error("stopCancel should not be nil after Start()")
+	}
+
+	m.Stop()
+	if m.stopCancel != nil {
+		t.Error("stopCancel should be nil after Stop()")
+	}
+}
+
+// TestStart_SubscriberReceivesStats subscribes before starting the monitor
+// and expects a populated system sample within 500ms. Skipped in -short mode.
+func TestStart_SubscriberReceivesStats(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping slow test")
+	}
+
+	m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
+	require.NoError(t, err)
+
+	sysCh, _, unsub := m.Subscribe()
+	defer unsub()
+
+	m.Start()
+	defer m.Stop()
+
+	select {
+	case s := <-sysCh:
+		assert.False(t, s.Timestamp.IsZero())
+		assert.NotEmpty(t, s.CpuUtilPerCore)
+	case <-time.After(500 * time.Millisecond):
+		t.Fatal("timed out waiting for sys stats")
+	}
+}
+
+// TestReadSysStats checks that a direct sample has a timestamp, per-core CPU
+// data and a positive memory total.
+func TestReadSysStats(t *testing.T) {
+	s, err := ReadSysStats()
+	require.NoError(t, err)
+
+	assert.False(t, s.Timestamp.IsZero())
+	assert.NotEmpty(t, s.CpuUtilPerCore)
+	assert.Greater(t, s.MemTotalMB, 0)
+}
+
+// TestCurrent_ConcurrentAccess calls Current from ten goroutines at once and
+// expects each to see the single buffered sample of each kind.
+func TestCurrent_ConcurrentAccess(t *testing.T) {
+	m, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	m.sysRing.Push(SysStat{Timestamp: time.Now(), MemTotalMB: 1024})
+	m.gpuRing.Push([]GpuStat{{Timestamp: time.Now(), ID: 0}})
+
+	var wg sync.WaitGroup
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			sys, gpu := m.Current()
+			assert.Len(t, sys, 1)
+			assert.Len(t, gpu, 1)
+		}()
+	}
+	wg.Wait()
+}
+
+// TestParseNvidiaSmiLine_ValidLine checks every field parsed from a complete
+// nine-column row, including the derived memory utilization (8192/10240).
+func TestParseNvidiaSmiLine_ValidLine(t *testing.T) {
+	line := "0, NVIDIA GeForce RTX 3080, GPU-12345678-1234-1234-1234-123456789abc, 65, 80, 8192, 10240, 75, 250"
+
+	stat := ParseNvidiaSmiLine(line)
+	require.NotNil(t, stat)
+
+	assert.Equal(t, 0, stat.ID)
+	assert.Equal(t, "NVIDIA GeForce RTX 3080", stat.Name)
+	assert.Equal(t, "GPU-12345678-1234-1234-1234-123456789abc", stat.UUID)
+	assert.Equal(t, 65, stat.TempC)
+	assert.Equal(t, 80.0, stat.GpuUtilPct)
+	assert.Equal(t, 8192, stat.MemUsedMB)
+	assert.Equal(t, 10240, stat.MemTotalMB)
+	assert.Equal(t, 75.0, stat.FanSpeedPct)
+	assert.Equal(t, 250.0, stat.PowerDrawW)
+	assert.InDelta(t, 80.0, stat.MemUtilPct, 0.01)
+}
+
+// TestParseNvidiaSmiLine_ShortLine checks that a three-column row is rejected.
+func TestParseNvidiaSmiLine_ShortLine(t *testing.T) {
+	line := "0, NVIDIA GPU, GPU-123"
+
+	stat := ParseNvidiaSmiLine(line)
+	assert.Nil(t, stat)
+}
+
+// TestParseNvidiaSmiLine_MissingFields checks that a row with eight of the
+// nine required columns is rejected.
+func TestParseNvidiaSmiLine_MissingFields(t *testing.T) {
+	line := "0, NVIDIA GPU, GPU-123, 65, 80, 8192, 10240, 75"
+
+	stat := ParseNvidiaSmiLine(line)
+	assert.Nil(t, stat)
+}
+
+// TestParseNvidiaSmiLine_ZeroMemoryTotal checks that a zero memory total
+// yields 0% memory utilization rather than a division by zero.
+func TestParseNvidiaSmiLine_ZeroMemoryTotal(t *testing.T) {
+	line := "0, NVIDIA GPU, GPU-123, 65, 80, 0, 0, 75, 250"
+
+	stat := ParseNvidiaSmiLine(line)
+	require.NotNil(t, stat)
+	assert.Equal(t, 0.0, stat.MemUtilPct)
+}
+
+// ioregSample is a fixture in the shape of ioreg output for one GPU device:
+// spaced ` = ` top-level properties plus a nested PerformanceStatistics
+// dictionary that uses `=` without spaces.
+const ioregSample = `+-o AGXAcceleratorG13X  <class AGXAcceleratorG13X, id 0x1000009a1, registered, matched, active, busy 0 (39191 ms), retain 108>
+    {
+      "model" = "Apple M1 Pro"
+      "gpu-core-count" = 16
+      "PerformanceStatistics" = {"In use system memory (driver)"=0,"Alloc system memory"=14511046656,"Tiler Utilization %"=34,"recoveryCount"=0,"Renderer Utilization %"=34,"Device Utilization %"=34,"In use system memory"=7688503296}
+      "IOClass" = "AGXAcceleratorG13X"
+    }`
+
+// TestParseIoregOutput_ValidOutput parses the fixture and checks the name,
+// utilization and memory fields, and that the fields ioreg does not provide
+// stay zero.
+func TestParseIoregOutput_ValidOutput(t *testing.T) {
+	const memTotalMB = 32768
+
+	stat := ParseIoregOutput([]byte(ioregSample), memTotalMB)
+	require.NotNil(t, stat)
+
+	assert.Equal(t, 0, stat.ID)
+	assert.Equal(t, "Apple M1 Pro (16-core GPU)", stat.Name)
+	assert.Equal(t, 34.0, stat.GpuUtilPct)
+	assert.Equal(t, 7688503296/(1024*1024), stat.MemUsedMB)
+	assert.Equal(t, memTotalMB, stat.MemTotalMB)
+	assert.InDelta(t, float64(stat.MemUsedMB)/memTotalMB*100, stat.MemUtilPct, 0.01)
+	// Not exposed by ioreg.
+	assert.Equal(t, 0, stat.TempC)
+	assert.Equal(t, 0.0, stat.PowerDrawW)
+	assert.Equal(t, 0.0, stat.FanSpeedPct)
+}
+
+// TestParseIoregOutput_NoGpuDevice checks that output with neither the
+// utilization nor the memory property yields nil.
+func TestParseIoregOutput_NoGpuDevice(t *testing.T) {
+	stat := ParseIoregOutput([]byte("no gpu here"), 32768)
+	assert.Nil(t, stat)
+}
+
+// TestParseIoregOutput_ZeroMemTotal checks that a zero total yields 0%
+// memory utilization.
+func TestParseIoregOutput_ZeroMemTotal(t *testing.T) {
+	stat := ParseIoregOutput([]byte(ioregSample), 0)
+	require.NotNil(t, stat)
+	assert.Equal(t, 0.0, stat.MemUtilPct)
+}
+
+// TestParseIoregOutput_MissingModel checks the "Apple GPU" fallback name when
+// the output has no model property, and that 1048576 bytes becomes 1 MB.
+func TestParseIoregOutput_MissingModel(t *testing.T) {
+	const out = `"Device Utilization %"=50,"In use system memory"=1048576`
+
+	stat := ParseIoregOutput([]byte(out), 1024)
+	require.NotNil(t, stat)
+	assert.Equal(t, "Apple GPU", stat.Name)
+	assert.Equal(t, 50.0, stat.GpuUtilPct)
+	assert.Equal(t, 1, stat.MemUsedMB)
+}
@@ -0,0 +1,584 @@
+//go:build unix && !darwin
+
+package perf
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net"
+	"os"
+	"os/exec"
+	"os/user"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/shirou/gopsutil/v4/cpu"
+	"github.com/shirou/gopsutil/v4/load"
+	"github.com/shirou/gopsutil/v4/mem"
+	psnet "github.com/shirou/gopsutil/v4/net"
+)
+
+func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if ch, err := tryLACT(ctx, every, logger); err == nil {
+		logger.Info("using LACT for GPU monitoring")
+		return ch, nil
+	} else {
+		logger.Debugf("LACT: %s", err.Error())
+	}
+
+	if ch, err := tryNvidiaSmi(ctx, every, logger); err == nil {
+		logger.Info("using nvidia-smi for GPU monitoring")
+		return ch, nil
+	} else {
+		logger.Debugf("nvidia-smi: %s", err.Error())
+	}
+
+	if ch, err := tryRocmSmi(ctx, every, logger); err == nil {
+		logger.Info("using rocm-smi for GPU monitoring")
+		return ch, nil
+	} else {
+		logger.Debugf("rocm-smi: %s", err.Error())
+	}
+
+	if ch, err := trySysfs(ctx, every, logger); err == nil {
+		logger.Info("using sysfs for GPU monitoring")
+		return ch, nil
+	} else {
+		logger.Debugf("sysfs: %s", err.Error())
+	}
+
+	return nil, ErrNoGpuTool
+}
+
+// tryLACT uses the LACT daemon's unix socket as the GPU backend.
+//
+// It first probes the daemon once: the socket must exist, accept a connection
+// and list at least one device, otherwise an error is returned so the caller
+// can try the next backend. After that a goroutine polls on a ticker, opening
+// a fresh connection per tick, and publishes the stats of all devices as one
+// slice. Devices whose stats cannot be read or that report no VRAM are left
+// out; ticks that produce no stats publish nothing. The returned channel is
+// closed when ctx is cancelled.
+func tryLACT(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	socketPath := lactSocketPath()
+	if socketPath == "" {
+		return nil, ErrNoGpuTool
+	}
+
+	conn, err := net.DialTimeout("unix", socketPath, 2*time.Second)
+	if err != nil {
+		return nil, fmt.Errorf("cannot connect to LACT socket: %w", err)
+	}
+	defer conn.Close()
+
+	conn.SetDeadline(time.Now().Add(5 * time.Second))
+
+	devices, err := lactListDevices(conn)
+	if err != nil {
+		return nil, fmt.Errorf("LACT ListDevices failed: %w", err)
+	}
+
+	if len(devices) == 0 {
+		return nil, fmt.Errorf("LACT returned no devices")
+	}
+
+	// time.NewTicker panics on a non-positive interval; fall back to one
+	// second rather than crashing the process on a bad configuration value.
+	if every <= 0 {
+		every = time.Second
+	}
+
+	// poll takes one sample of every device over a fresh connection. It
+	// returns nil when the daemon cannot be reached or queried.
+	poll := func() []GpuStat {
+		// Re-resolve the path each time so a daemon restart on a different
+		// socket is picked up.
+		path := lactSocketPath()
+		if path == "" {
+			return nil
+		}
+
+		c, err := net.DialTimeout("unix", path, 2*time.Second)
+		if err != nil {
+			return nil
+		}
+		defer c.Close()
+		c.SetDeadline(time.Now().Add(5 * time.Second))
+
+		devs, err := lactListDevices(c)
+		if err != nil {
+			return nil
+		}
+
+		stats := make([]GpuStat, 0, len(devs))
+		for i, d := range devs {
+			stat, err := lactGetDeviceStats(c, d.ID, d.Name, i)
+			if err != nil || stat.MemTotalMB == 0 {
+				continue
+			}
+			stats = append(stats, stat)
+		}
+		return stats
+	}
+
+	ch := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(ch)
+		ticker := time.NewTicker(every)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				stats := poll()
+				if len(stats) == 0 {
+					continue
+				}
+				// Never block the poller on a slow consumer.
+				select {
+				case ch <- stats:
+				default:
+				}
+			}
+		}
+	}()
+
+	return ch, nil
+}
+
+func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return nil, ErrNoGpuTool
+	}
+
+	sec := int(every.Seconds())
+	if sec < 1 {
+		sec = 1
+	}
+
+	cmd := exec.CommandContext(ctx, "nvidia-smi",
+		"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
+		"--format=csv,noheader,nounits",
+		"--loop", fmt.Sprintf("%d", sec),
+	)
+
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err)
+	}
+
+	if err := cmd.Start(); err != nil {
+		return nil, fmt.Errorf("nvidia-smi start failed: %w", err)
+	}
+
+	ch := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(ch)
+
+		scanner := bufio.NewScanner(stdout)
+		for scanner.Scan() {
+			line := strings.TrimSpace(scanner.Text())
+			if line == "" {
+				continue
+			}
+
+			stat := ParseNvidiaSmiLine(line)
+			if stat != nil {
+				select {
+				case ch <- []GpuStat{*stat}:
+				default:
+				}
+			}
+		}
+		cmd.Wait()
+	}()
+
+	return ch, nil
+}
+
+// tryRocmSmi polls rocm-smi on a ticker (at most once per second). Every poll
+// runs the CLI with CSV output under a five second timeout, parses each data
+// row against the most recent header row, and publishes all parsed GPUs as one
+// slice. Failed or empty polls publish nothing. The returned channel is closed
+// when ctx is cancelled.
+func tryRocmSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if _, lookErr := exec.LookPath("rocm-smi"); lookErr != nil {
+		return nil, ErrNoGpuTool
+	}
+
+	const pollTimeout = 5 * time.Second
+	interval := every
+	if interval < time.Second {
+		interval = time.Second
+	}
+
+	samples := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(samples)
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+
+			pollCtx, cancel := context.WithTimeout(ctx, pollTimeout)
+			raw, runErr := exec.CommandContext(pollCtx, "rocm-smi", "-i", "-P", "-t", "-f", "-u", "--showmemuse", "--showmeminfo", "vram", "--showproductname", "--csv").Output()
+			// Read the timeout state before cancel() overwrites the context error.
+			timedOut := pollCtx.Err() == context.DeadlineExceeded
+			cancel()
+			if runErr != nil {
+				if timedOut {
+					logger.Debug("rocm-smi timed out")
+				}
+				continue
+			}
+
+			var (
+				header string
+				gpus   = make([]GpuStat, 0)
+			)
+			rows := bufio.NewScanner(strings.NewReader(string(raw)))
+			for rows.Scan() {
+				row := strings.TrimSpace(rows.Text())
+				switch {
+				case row == "":
+					// skip blank separators
+				case strings.HasPrefix(row, "device,"):
+					// A header row describes the data rows that follow it.
+					header = row
+				default:
+					if stat := parseRocmSmiLine(header, row); stat != nil {
+						gpus = append(gpus, *stat)
+					}
+				}
+			}
+
+			if len(gpus) == 0 {
+				continue
+			}
+			select {
+			case samples <- gpus:
+			default:
+			}
+		}
+	}()
+
+	return samples, nil
+}
+
+// parseRocmSmiLine converts one rocm-smi CSV data row into a GpuStat, using
+// header (the CSV header row) to decide which column means what. Unknown
+// columns are ignored and numeric values that fail to parse are left at zero.
+//
+// Returns nil when header or line is empty, when the two have a different
+// number of columns, or when the row has no "device" column of the form
+// "card<N>".
+//
+// The display name is "<model> <device> (<gfx version>)", where the model is
+// the Card Series or, failing that, the Device Name. The parenthesized suffix
+// is omitted when no GFX version is available, and the bare device string is
+// used when no model is known.
+func parseRocmSmiLine(header string, line string) *GpuStat {
+	if header == "" || line == "" {
+		return nil
+	}
+	labels := strings.Split(header, ",")
+	fields := strings.Split(line, ",")
+	if len(labels) != len(fields) {
+		return nil
+	}
+
+	result := &GpuStat{
+		Timestamp: time.Now(),
+		ID:        -1, // sentinel: replaced once a valid device column is seen
+	}
+
+	var device string
+	var deviceName string
+	var cardSeries string
+	var gfxVersion string
+
+	const toMB = 1024 * 1024
+
+	for i, col := range labels {
+		val := strings.TrimSpace(fields[i])
+		switch col {
+		case "device":
+			device = val
+			id, err := strconv.Atoi(strings.TrimPrefix(val, "card"))
+			if err != nil {
+				return nil
+			}
+			result.ID = id
+		case "Device Name":
+			deviceName = val
+		case "GUID":
+			result.UUID = val
+		case "Temperature (Sensor edge) (C)":
+			tempC, _ := strconv.ParseFloat(val, 64)
+			result.TempC = int(tempC)
+		case "Temperature (Sensor memory) (C)":
+			vramTempC, _ := strconv.ParseFloat(val, 64)
+			result.VramTempC = int(vramTempC)
+		case "Fan speed (%)":
+			fanSpeed, _ := strconv.ParseFloat(val, 64)
+			result.FanSpeedPct = fanSpeed
+		// Both power column labels are accepted and mapped to PowerDrawW.
+		case "Current Socket Graphics Package Power (W)":
+			fallthrough
+		case "Average Graphics Package Power (W)":
+			powerDraw, _ := strconv.ParseFloat(val, 64)
+			result.PowerDrawW = powerDraw
+		case "GPU use (%)":
+			gpuUtil, _ := strconv.ParseFloat(val, 64)
+			result.GpuUtilPct = gpuUtil
+		case "GPU Memory Allocated (VRAM%)":
+			memUtil, _ := strconv.ParseFloat(val, 64)
+			result.MemUtilPct = memUtil
+		case "VRAM Total Memory (B)":
+			memTotal, _ := strconv.ParseUint(val, 10, 64)
+			result.MemTotalMB = int(memTotal / toMB)
+		case "VRAM Total Used Memory (B)":
+			memUsed, _ := strconv.ParseUint(val, 10, 64)
+			result.MemUsedMB = int(memUsed / toMB)
+		case "Card Series":
+			cardSeries = val
+		case "GFX Version":
+			gfxVersion = val
+		}
+	}
+
+	if result.ID == -1 {
+		return nil
+	}
+
+	// Prefer the card series as the model; fall back to the device name.
+	model := ""
+	if cardSeries != "" && cardSeries != "N/A" {
+		model = cardSeries
+	} else if deviceName != "" && deviceName != "N/A" {
+		model = deviceName
+	}
+
+	name := device
+	if model != "" {
+		name = model + " " + device
+		// Do not append an empty "()" or "(N/A)" when the GFX version column
+		// is missing or unset.
+		if gfxVersion != "" && gfxVersion != "N/A" {
+			name += " (" + gfxVersion + ")"
+		}
+	}
+	result.Name = name
+
+	return result
+}
+
+// trySysfs is a placeholder for a sysfs-based GPU stats source. It ignores all
+// of its arguments and always returns a nil channel with ErrNotImplemented.
+func trySysfs(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	return nil, ErrNotImplemented
+}
+
+func lactSocketPath() string {
+	if p := os.Getenv("LACT_DAEMON_SOCKET_PATH"); p != "" {
+		if _, err := os.Stat(p); err == nil {
+			return p
+		}
+	}
+
+	rootPath := "/run/lactd.sock"
+	if _, err := os.Stat(rootPath); err == nil {
+		return rootPath
+	}
+
+	u, err := user.Current()
+	if err != nil {
+		return ""
+	}
+	userPath := filepath.Join("/run/user", u.Uid, "lactd.sock")
+	if _, err := os.Stat(userPath); err == nil {
+		return userPath
+	}
+
+	return ""
+}
+
+// lactRequest is the JSON shape that lactSendRequest marshals and writes to
+// the LACT connection: a command name plus optional arguments.
+type lactRequest struct {
+	Command string      `json:"command"`
+	Args    interface{} `json:"args,omitempty"` // left out of the JSON when nil
+}
+
+// lactResponse is the JSON shape that lactSendRequest decodes from the LACT
+// connection. lactSendRequest treats any Status other than "ok" as an error
+// and includes the raw Data in the error text.
+type lactResponse struct {
+	Status string          `json:"status"`
+	Data   json.RawMessage `json:"data"` // kept raw so each caller decodes its own payload
+}
+
+// lactDeviceEntry is one element of the "list_devices" reply as decoded by
+// lactListDevices.
+type lactDeviceEntry struct {
+	ID   string `json:"id"`
+	Name string `json:"name"`
+}
+
+// lactDeviceStats is the "device_stats" reply as decoded by
+// lactGetDeviceStats. Every reading is a pointer so that a key which is
+// missing or null in the JSON stays nil; lactGetDeviceStats reports such
+// readings as zero.
+type lactDeviceStats struct {
+	Fan struct {
+		// lactGetDeviceStats scales this by 255 to obtain a percentage.
+		PwmCurrent *uint8 `json:"pwm_current"`
+	} `json:"fan"`
+	Vram struct {
+		// lactGetDeviceStats divides both values by 1024*1024 to obtain MB.
+		Total *uint64 `json:"total"`
+		Used  *uint64 `json:"used"`
+	} `json:"vram"`
+	Power struct {
+		// lactGetDeviceStats prefers Average when it is positive, else Current.
+		Average *float64 `json:"average"`
+		Current *float64 `json:"current"`
+	} `json:"power"`
+	// Temps is keyed by sensor name; lactGetDeviceStats looks up "edge",
+	// "junction", "mem" and "VRAM".
+	Temps       map[string]lactTempEntry `json:"temps"`
+	BusyPercent *uint8                   `json:"busy_percent"`
+}
+
+// lactTempEntry is one sensor's entry in lactDeviceStats.Temps. Current is
+// nil when the key is missing or null in the JSON.
+type lactTempEntry struct {
+	Current *float64 `json:"current"`
+}
+
+// lactSendRequest writes req to conn as a single newline-terminated JSON
+// document, reads one newline-terminated reply, and returns the reply's data
+// payload. It returns an error when marshalling, writing, reading or decoding
+// fails, or when the reply's status is anything other than "ok".
+//
+// NOTE(review): a fresh bufio.Reader is created per call, so any bytes it
+// buffers beyond the first newline are discarded. That is harmless only if
+// the peer sends exactly one line per request — confirm against the protocol.
+func lactSendRequest(conn net.Conn, req lactRequest) (json.RawMessage, error) {
+	payload, err := json.Marshal(req)
+	if err != nil {
+		return nil, err
+	}
+
+	if _, err = conn.Write(append(payload, '\n')); err != nil {
+		return nil, err
+	}
+
+	raw, err := bufio.NewReader(conn).ReadBytes('\n')
+	if err != nil {
+		return nil, err
+	}
+
+	var reply lactResponse
+	if err = json.Unmarshal(raw, &reply); err != nil {
+		return nil, err
+	}
+
+	if reply.Status == "ok" {
+		return reply.Data, nil
+	}
+	return nil, fmt.Errorf("LACT error: %s", string(reply.Data))
+}
+
+// lactListDevices issues the "list_devices" command on conn and decodes the
+// reply into a slice of device entries. Any transport or decoding error is
+// returned with a nil slice.
+func lactListDevices(conn net.Conn) ([]lactDeviceEntry, error) {
+	raw, err := lactSendRequest(conn, lactRequest{Command: "list_devices"})
+	if err != nil {
+		return nil, err
+	}
+
+	var found []lactDeviceEntry
+	if decodeErr := json.Unmarshal(raw, &found); decodeErr != nil {
+		return nil, decodeErr
+	}
+	return found, nil
+}
+
+// lactGetDeviceStats issues the "device_stats" command for the device with
+// the given id and converts the reply into a GpuStat. name and index are
+// copied into the result's Name and ID, and id into its UUID. Readings that
+// are absent from the reply are reported as zero. A transport or decoding
+// error is returned with a zero GpuStat.
+func lactGetDeviceStats(conn net.Conn, id string, name string, index int) (GpuStat, error) {
+	raw, err := lactSendRequest(conn, lactRequest{
+		Command: "device_stats",
+		Args: struct {
+			ID string `json:"id"`
+		}{ID: id},
+	})
+	if err != nil {
+		return GpuStat{}, err
+	}
+
+	var stats lactDeviceStats
+	if err = json.Unmarshal(raw, &stats); err != nil {
+		return GpuStat{}, err
+	}
+
+	// bytesToMB converts an optional byte count to whole megabytes.
+	bytesToMB := func(b *uint64) int {
+		if b == nil {
+			return 0
+		}
+		return int(*b / 1024 / 1024)
+	}
+	usedMB := bytesToMB(stats.Vram.Used)
+	totalMB := bytesToMB(stats.Vram.Total)
+
+	memPct := 0.0
+	if totalMB > 0 {
+		memPct = float64(usedMB) / float64(totalMB) * 100
+	}
+
+	busyPct := 0.0
+	if stats.BusyPercent != nil {
+		busyPct = float64(*stats.BusyPercent)
+	}
+
+	// The PWM reading is scaled from a 0-255 value to a percentage.
+	fanPct := 0.0
+	if stats.Fan.PwmCurrent != nil {
+		fanPct = float64(*stats.Fan.PwmCurrent) / 255.0 * 100.0
+	}
+
+	// A positive average wins; otherwise use the current reading if present.
+	watts := 0.0
+	switch {
+	case stats.Power.Average != nil && *stats.Power.Average > 0:
+		watts = *stats.Power.Average
+	case stats.Power.Current != nil:
+		watts = *stats.Power.Current
+	}
+
+	// Core temperature: "edge", then "junction", then any sensor with a value.
+	coreTemp := 0
+	edge, hasEdge := stats.Temps["edge"]
+	junction, hasJunction := stats.Temps["junction"]
+	switch {
+	case hasEdge && edge.Current != nil:
+		coreTemp = int(*edge.Current)
+	case hasJunction && junction.Current != nil:
+		coreTemp = int(*junction.Current)
+	default:
+		for _, sensor := range stats.Temps {
+			if sensor.Current != nil {
+				coreTemp = int(*sensor.Current)
+				break
+			}
+		}
+	}
+
+	// nvidia uses "VRAM", amd "mem"
+	memTemp := 0
+	for _, sensorName := range []string{"mem", "VRAM"} {
+		sensor, present := stats.Temps[sensorName]
+		if !present || sensor.Current == nil || *sensor.Current <= 0 {
+			continue
+		}
+		memTemp = int(*sensor.Current)
+		break
+	}
+
+	return GpuStat{
+		Timestamp:   time.Now(),
+		ID:          index,
+		Name:        name,
+		UUID:        id,
+		TempC:       coreTemp,
+		VramTempC:   memTemp,
+		GpuUtilPct:  busyPct,
+		MemUtilPct:  memPct,
+		MemUsedMB:   usedMB,
+		MemTotalMB:  totalMB,
+		FanSpeedPct: fanPct,
+		PowerDrawW:  watts,
+	}, nil
+}
+
+// readSysfs is a placeholder for reading GPU stats from sysfs. It always
+// returns a nil slice with ErrNotImplemented.
+func readSysfs() ([]GpuStat, error) {
+	return nil, ErrNotImplemented
+}
+
+// readSysStats takes one snapshot of host CPU, memory, swap, load-average and
+// network counters. A failure to read CPU or memory figures is returned as an
+// error; swap, load and network figures are best-effort and stay at their zero
+// values (an empty slice for network) when they cannot be read. The "lo"
+// interface is left out of the network list.
+func readSysStats() (SysStat, error) {
+	perCore, err := cpu.Percent(0, true)
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	vm, err := mem.VirtualMemory()
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	const bytesPerMB = 1024 * 1024
+
+	snapshot := SysStat{
+		CpuUtilPerCore: perCore,
+		MemTotalMB:     int(vm.Total / bytesPerMB),
+		MemUsedMB:      int(vm.Used / bytesPerMB),
+		MemFreeMB:      int(vm.Free / bytesPerMB),
+		NetIO:          make([]NetIOStat, 0),
+	}
+
+	if swap, swapErr := mem.SwapMemory(); swapErr == nil {
+		snapshot.SwapTotalMB = int(swap.Total / bytesPerMB)
+		snapshot.SwapUsedMB = int(swap.Used / bytesPerMB)
+	}
+
+	if avg, loadErr := load.Avg(); loadErr == nil {
+		snapshot.LoadAvg1 = avg.Load1
+		snapshot.LoadAvg5 = avg.Load5
+		snapshot.LoadAvg15 = avg.Load15
+	}
+
+	if counters, netErr := psnet.IOCounters(true); netErr == nil {
+		for _, nic := range counters {
+			if nic.Name != "lo" {
+				snapshot.NetIO = append(snapshot.NetIO, NetIOStat{
+					Name:      nic.Name,
+					BytesRecv: nic.BytesRecv,
+					BytesSent: nic.BytesSent,
+				})
+			}
+		}
+	}
+
+	// Stamp the snapshot last, once every reading has been collected.
+	snapshot.Timestamp = time.Now()
+	return snapshot, nil
+}
@@ -0,0 +1,114 @@
+package perf
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/shirou/gopsutil/v4/cpu"
+	"github.com/shirou/gopsutil/v4/mem"
+	"github.com/shirou/gopsutil/v4/net"
+)
+
+// getGpuStats starts the GPU stats source for this platform. nvidia-smi is the
+// only source tried here: on success its channel is returned, otherwise the
+// failure is logged at debug level and ErrNoGpuTool is returned.
+func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	ch, err := tryNvidiaSmiWindows(ctx, every, logger)
+	if err != nil {
+		logger.Debugf("nvidia-smi: %s", err.Error())
+		return nil, ErrNoGpuTool
+	}
+
+	logger.Info("using nvidia-smi for GPU monitoring")
+	return ch, nil
+}
+
+// tryNvidiaSmiWindows starts nvidia-smi in loop mode on Windows and returns
+// a channel receiving GPU stat snapshots. Returns ErrNoGpuTool if nvidia-smi
+// is not available.
+//
+// every is rounded down to whole seconds with a minimum of one second. The
+// child process is bound to ctx, and the returned channel is closed once its
+// output ends. Each parsed output line is sent as a one-element slice; the
+// send never blocks, so a snapshot is dropped when the channel's single
+// buffer slot is still full. Read and exit failures that are not caused by
+// ctx being cancelled are logged at debug level when logger is non-nil.
+func tryNvidiaSmiWindows(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return nil, ErrNoGpuTool
+	}
+
+	sec := int(every.Seconds())
+	if sec < 1 {
+		sec = 1
+	}
+
+	cmd := exec.CommandContext(ctx, "nvidia-smi",
+		"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
+		"--format=csv,noheader,nounits",
+		"--loop", fmt.Sprintf("%d", sec),
+	)
+
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err)
+	}
+
+	if err := cmd.Start(); err != nil {
+		return nil, fmt.Errorf("nvidia-smi start failed: %w", err)
+	}
+
+	ch := make(chan []GpuStat, 1)
+
+	go func() {
+		defer close(ch)
+
+		scanner := bufio.NewScanner(stdout)
+		for scanner.Scan() {
+			line := strings.TrimSpace(scanner.Text())
+			if line == "" {
+				continue
+			}
+
+			stat := ParseNvidiaSmiLine(line)
+			if stat != nil {
+				select {
+				case ch <- []GpuStat{*stat}:
+				default:
+				}
+			}
+		}
+
+		// A cancelled ctx kills the child, which is an expected way for the
+		// stream to end; only report failures that happen while still active.
+		if err := scanner.Err(); err != nil && ctx.Err() == nil && logger != nil {
+			logger.Debugf("nvidia-smi: reading output failed: %s", err.Error())
+		}
+
+		// Wait must always be called so the child's resources are released.
+		if err := cmd.Wait(); err != nil && ctx.Err() == nil && logger != nil {
+			logger.Debugf("nvidia-smi: exited with error: %s", err.Error())
+		}
+	}()
+
+	return ch, nil
+}
+
+// readSysStats takes one snapshot of host CPU, memory and network counters.
+// A failure to read CPU or memory figures is returned as an error; network
+// counters are best-effort and yield an empty slice when they cannot be read.
+// Swap and load-average fields are not populated by this implementation.
+func readSysStats() (SysStat, error) {
+	perCore, err := cpu.Percent(0, true)
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	vm, err := mem.VirtualMemory()
+	if err != nil {
+		return SysStat{}, err
+	}
+
+	const bytesPerMB = 1024 * 1024
+
+	interfaces := make([]NetIOStat, 0)
+	counters, netErr := net.IOCounters(true)
+	if netErr == nil {
+		for _, nic := range counters {
+			interfaces = append(interfaces, NetIOStat{
+				Name:      nic.Name,
+				BytesRecv: nic.BytesRecv,
+				BytesSent: nic.BytesSent,
+			})
+		}
+	}
+
+	snapshot := SysStat{
+		Timestamp:      time.Now(),
+		CpuUtilPerCore: perCore,
+		MemTotalMB:     int(vm.Total / bytesPerMB),
+		MemUsedMB:      int(vm.Used / bytesPerMB),
+		MemFreeMB:      int(vm.Free / bytesPerMB),
+		NetIO:          interfaces,
+	}
+	return snapshot, nil
+}
@@ -0,0 +1,129 @@
+package perf
+
+import (
+	"fmt"
+	"net/http"
+	"sort"
+	"strings"
+)
+
+// mbToBytes is the multiplier (1024*1024) used to turn the stats' *MB fields
+// back into byte values for the exported metrics.
+const mbToBytes = int64(1024 * 1024)
+
+// MetricsHandler builds an http.HandlerFunc that writes the newest system
+// sample and the newest sample per GPU in Prometheus text format. Nothing is
+// written for a category that has no samples yet.
+func (m *Monitor) MetricsHandler() http.HandlerFunc {
+	return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		sysHistory, gpuHistory := m.Current()
+		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+
+		// Only the last element of the system history is exported.
+		if n := len(sysHistory); n > 0 {
+			writeSysMetrics(w, sysHistory[n-1])
+		}
+
+		if len(gpuHistory) == 0 {
+			return
+		}
+		writeGpuMetrics(w, latestPerGPU(gpuHistory))
+	})
+}
+
+// writeSysMetrics writes one system sample to w in Prometheus text format:
+// per-core CPU utilisation, memory and swap sizes in bytes, the three load
+// averages and, when the sample has any, per-interface network byte counters.
+func writeSysMetrics(w http.ResponseWriter, s SysStat) {
+	fmt.Fprint(w, "# HELP llamaswap_cpu_util_percent CPU utilization per core (0-100)\n")
+	fmt.Fprint(w, "# TYPE llamaswap_cpu_util_percent gauge\n")
+	for core, utilisation := range s.CpuUtilPerCore {
+		fmt.Fprintf(w, "llamaswap_cpu_util_percent{core=\"%d\"} %g\n", core, utilisation)
+	}
+
+	// Sizes are stored in MB on the sample and exported as bytes.
+	totalBytes := int64(s.MemTotalMB) * mbToBytes
+	fmt.Fprint(w, "# HELP llamaswap_memory_total_bytes Total memory in bytes\n")
+	fmt.Fprint(w, "# TYPE llamaswap_memory_total_bytes gauge\n")
+	fmt.Fprintf(w, "llamaswap_memory_total_bytes %d\n", totalBytes)
+
+	usedBytes := int64(s.MemUsedMB) * mbToBytes
+	fmt.Fprint(w, "# HELP llamaswap_memory_used_bytes Used memory in bytes\n")
+	fmt.Fprint(w, "# TYPE llamaswap_memory_used_bytes gauge\n")
+	fmt.Fprintf(w, "llamaswap_memory_used_bytes %d\n", usedBytes)
+
+	freeBytes := int64(s.MemFreeMB) * mbToBytes
+	fmt.Fprint(w, "# HELP llamaswap_memory_free_bytes Free memory in bytes\n")
+	fmt.Fprint(w, "# TYPE llamaswap_memory_free_bytes gauge\n")
+	fmt.Fprintf(w, "llamaswap_memory_free_bytes %d\n", freeBytes)
+
+	swapTotalBytes := int64(s.SwapTotalMB) * mbToBytes
+	fmt.Fprint(w, "# HELP llamaswap_swap_total_bytes Total swap in bytes\n")
+	fmt.Fprint(w, "# TYPE llamaswap_swap_total_bytes gauge\n")
+	fmt.Fprintf(w, "llamaswap_swap_total_bytes %d\n", swapTotalBytes)
+
+	swapUsedBytes := int64(s.SwapUsedMB) * mbToBytes
+	fmt.Fprint(w, "# HELP llamaswap_swap_used_bytes Used swap in bytes\n")
+	fmt.Fprint(w, "# TYPE llamaswap_swap_used_bytes gauge\n")
+	fmt.Fprintf(w, "llamaswap_swap_used_bytes %d\n", swapUsedBytes)
+
+	fmt.Fprint(w, "# HELP llamaswap_load_average Load average\n")
+	fmt.Fprint(w, "# TYPE llamaswap_load_average gauge\n")
+	fmt.Fprintf(w, "llamaswap_load_average{interval=\"1m\"} %g\n", s.LoadAvg1)
+	fmt.Fprintf(w, "llamaswap_load_average{interval=\"5m\"} %g\n", s.LoadAvg5)
+	fmt.Fprintf(w, "llamaswap_load_average{interval=\"15m\"} %g\n", s.LoadAvg15)
+
+	// The network family, header included, is skipped without interfaces.
+	if len(s.NetIO) == 0 {
+		return
+	}
+	fmt.Fprint(w, "# HELP llamaswap_network_bytes_total Total network bytes transferred\n")
+	fmt.Fprint(w, "# TYPE llamaswap_network_bytes_total counter\n")
+	for _, nic := range s.NetIO {
+		label := sanitizeLabel(nic.Name)
+		fmt.Fprintf(w, "llamaswap_network_bytes_total{interface=\"%s\",direction=\"recv\"} %d\n", label, nic.BytesRecv)
+		fmt.Fprintf(w, "llamaswap_network_bytes_total{interface=\"%s\",direction=\"sent\"} %d\n", label, nic.BytesSent)
+	}
+}
+
+// writeGpuMetrics writes every GPU gauge for each entry of gpus to w in
+// Prometheus text format. Metrics are grouped by family: one HELP/TYPE header
+// followed by a sample per GPU. The uuid label is only emitted for GPUs whose
+// UUID is non-empty. Nothing is written when gpus is empty.
+func writeGpuMetrics(w http.ResponseWriter, gpus []GpuStat) {
+	if len(gpus) == 0 {
+		return
+	}
+
+	// family describes one metric family and how to read it off a GpuStat.
+	type family struct {
+		help string
+		name string
+		read func(GpuStat) float64
+	}
+
+	families := []family{
+		{help: "GPU temperature in Celsius", name: "llamaswap_gpu_temperature_celsius", read: func(g GpuStat) float64 { return float64(g.TempC) }},
+		{help: "GPU VRAM temperature in Celsius", name: "llamaswap_gpu_vram_temperature_celsius", read: func(g GpuStat) float64 { return float64(g.VramTempC) }},
+		{help: "GPU utilization percent (0-100)", name: "llamaswap_gpu_util_percent", read: func(g GpuStat) float64 { return g.GpuUtilPct }},
+		{help: "GPU memory utilization percent (0-100)", name: "llamaswap_gpu_memory_util_percent", read: func(g GpuStat) float64 { return g.MemUtilPct }},
+		{help: "GPU memory used in bytes", name: "llamaswap_gpu_memory_used_bytes", read: func(g GpuStat) float64 { return float64(g.MemUsedMB) * float64(mbToBytes) }},
+		{help: "GPU memory total in bytes", name: "llamaswap_gpu_memory_total_bytes", read: func(g GpuStat) float64 { return float64(g.MemTotalMB) * float64(mbToBytes) }},
+		{help: "GPU fan speed percent (0-100)", name: "llamaswap_gpu_fan_speed_percent", read: func(g GpuStat) float64 { return g.FanSpeedPct }},
+		{help: "GPU power draw in watts", name: "llamaswap_gpu_power_draw_watts", read: func(g GpuStat) float64 { return g.PowerDrawW }},
+	}
+
+	for _, fam := range families {
+		fmt.Fprintf(w, "# HELP %s %s\n", fam.name, fam.help)
+		fmt.Fprintf(w, "# TYPE %s gauge\n", fam.name)
+		for _, gpu := range gpus {
+			if gpu.UUID == "" {
+				fmt.Fprintf(w, "%s{id=\"%d\",name=\"%s\"} %g\n",
+					fam.name, gpu.ID, sanitizeLabel(gpu.Name), fam.read(gpu))
+				continue
+			}
+			fmt.Fprintf(w, "%s{id=\"%d\",name=\"%s\",uuid=\"%s\"} %g\n",
+				fam.name, gpu.ID, sanitizeLabel(gpu.Name), sanitizeLabel(gpu.UUID), fam.read(gpu))
+		}
+	}
+}
+
+// latestPerGPU reduces stats to one entry per GPU ID, keeping the sample with
+// the newest Timestamp, and returns the survivors ordered by ascending ID.
+// When two samples for the same ID share a timestamp, the earlier one in
+// stats is kept.
+func latestPerGPU(stats []GpuStat) []GpuStat {
+	newest := make(map[int]GpuStat)
+	for _, sample := range stats {
+		held, seen := newest[sample.ID]
+		if seen && !sample.Timestamp.After(held.Timestamp) {
+			continue
+		}
+		newest[sample.ID] = sample
+	}
+
+	ordered := make([]GpuStat, 0, len(newest))
+	for _, sample := range newest {
+		ordered = append(ordered, sample)
+	}
+	sort.Slice(ordered, func(a, b int) bool {
+		return ordered[a].ID < ordered[b].ID
+	})
+	return ordered
+}
+
+// sanitizeLabel escapes characters that are invalid in Prometheus label values.
+func sanitizeLabel(s string) string {
+	return strings.NewReplacer(`"`, `\"`, `\`, `\\`, "\n", `\n`).Replace(s)
+}
@@ -0,0 +1,248 @@
+package perf
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestSanitizeLabel checks that quotes, backslashes and newlines are escaped
+// and that plain or empty input comes back unchanged.
+func TestSanitizeLabel(t *testing.T) {
+	for _, tc := range []struct {
+		input string
+		want  string
+	}{
+		{"normal", "normal"},
+		{"", ""},
+		{`with"quote`, `with\"quote`},
+		{`with\backslash`, `with\\backslash`},
+		{"with\nnewline", `with\nnewline`},
+		{`"both\n"`, `\"both\\n\"`},
+	} {
+		got := sanitizeLabel(tc.input)
+		assert.Equal(t, tc.want, got, "input: %q", tc.input)
+	}
+}
+
+// TestLatestPerGPU_Empty checks that a nil input produces an empty result.
+func TestLatestPerGPU_Empty(t *testing.T) {
+	assert.Empty(t, latestPerGPU(nil))
+}
+
+// TestLatestPerGPU_Single checks that a lone sample is returned as-is.
+func TestLatestPerGPU_Single(t *testing.T) {
+	only := GpuStat{ID: 0, Name: "gpu0", Timestamp: time.Now()}
+
+	got := latestPerGPU([]GpuStat{only})
+
+	require.Len(t, got, 1)
+	assert.Equal(t, "gpu0", got[0].Name)
+}
+
+// TestLatestPerGPU_PicksLatest checks that, for two samples sharing an ID,
+// the one with the newer timestamp is kept.
+func TestLatestPerGPU_PicksLatest(t *testing.T) {
+	newer := time.Now()
+	older := newer.Add(-time.Second)
+
+	got := latestPerGPU([]GpuStat{
+		{ID: 0, Name: "old", TempC: 50, Timestamp: older},
+		{ID: 0, Name: "new", TempC: 70, Timestamp: newer},
+	})
+
+	require.Len(t, got, 1)
+	assert.Equal(t, "new", got[0].Name)
+	assert.Equal(t, 70, got[0].TempC)
+}
+
+// TestLatestPerGPU_MultipleGPUsSortedByID checks that samples for distinct
+// GPUs are all kept and come back in ascending ID order.
+func TestLatestPerGPU_MultipleGPUsSortedByID(t *testing.T) {
+	stamp := time.Now()
+	shuffled := []GpuStat{
+		{ID: 2, Name: "gpu2", Timestamp: stamp},
+		{ID: 0, Name: "gpu0", Timestamp: stamp},
+		{ID: 1, Name: "gpu1", Timestamp: stamp},
+	}
+
+	got := latestPerGPU(shuffled)
+
+	require.Len(t, got, 3)
+	for wantID := 0; wantID < 3; wantID++ {
+		assert.Equal(t, wantID, got[wantID].ID)
+	}
+}
+
+// TestWriteSysMetrics renders a fully populated sample and checks that every
+// expected metric line appears in the output.
+func TestWriteSysMetrics(t *testing.T) {
+	sample := SysStat{
+		CpuUtilPerCore: []float64{10.5, 20.0},
+		MemTotalMB:     8192,
+		MemUsedMB:      4096,
+		MemFreeMB:      4096,
+		SwapTotalMB:    2048,
+		SwapUsedMB:     512,
+		LoadAvg1:       1.5,
+		LoadAvg5:       1.2,
+		LoadAvg15:      0.9,
+		NetIO: []NetIOStat{
+			{Name: "eth0", BytesRecv: 1000, BytesSent: 2000},
+		},
+	}
+
+	rec := httptest.NewRecorder()
+	writeSysMetrics(rec, sample)
+	output := rec.Body.String()
+
+	expected := []string{
+		`llamaswap_cpu_util_percent{core="0"} 10.5`,
+		`llamaswap_cpu_util_percent{core="1"} 20`,
+		"llamaswap_memory_total_bytes 8589934592",
+		"llamaswap_memory_used_bytes 4294967296",
+		"llamaswap_memory_free_bytes 4294967296",
+		"llamaswap_swap_total_bytes 2147483648",
+		"llamaswap_swap_used_bytes 536870912",
+		`llamaswap_load_average{interval="1m"} 1.5`,
+		`llamaswap_load_average{interval="5m"} 1.2`,
+		`llamaswap_load_average{interval="15m"} 0.9`,
+		`llamaswap_network_bytes_total{interface="eth0",direction="recv"} 1000`,
+		`llamaswap_network_bytes_total{interface="eth0",direction="sent"} 2000`,
+	}
+	for _, line := range expected {
+		assert.Contains(t, output, line)
+	}
+}
+
+// TestWriteSysMetrics_NoNetIO checks that the network metric family is left
+// out entirely when the sample has no interfaces.
+func TestWriteSysMetrics_NoNetIO(t *testing.T) {
+	sample := SysStat{CpuUtilPerCore: []float64{5.0}}
+
+	rec := httptest.NewRecorder()
+	writeSysMetrics(rec, sample)
+
+	assert.NotContains(t, rec.Body.String(), "llamaswap_network_bytes_total")
+}
+
+// TestWriteGpuMetrics_Empty checks that nothing is written for a nil GPU list.
+func TestWriteGpuMetrics_Empty(t *testing.T) {
+	rec := httptest.NewRecorder()
+
+	writeGpuMetrics(rec, nil)
+
+	output := rec.Body.String()
+	assert.Empty(t, output)
+}
+
+// TestWriteGpuMetrics renders one GPU and checks that every metric family is
+// emitted with the id, name and uuid labels. The two memory byte gauges are
+// matched on name and labels only.
+func TestWriteGpuMetrics(t *testing.T) {
+	card := GpuStat{
+		ID:          0,
+		Name:        "NVIDIA RTX 4090",
+		UUID:        "GPU-1234",
+		TempC:       75,
+		GpuUtilPct:  85.5,
+		MemUtilPct:  60.0,
+		MemUsedMB:   8192,
+		MemTotalMB:  24576,
+		FanSpeedPct: 55.0,
+		PowerDrawW:  300.5,
+	}
+
+	rec := httptest.NewRecorder()
+	writeGpuMetrics(rec, []GpuStat{card})
+	output := rec.Body.String()
+
+	expected := []string{
+		`llamaswap_gpu_temperature_celsius{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 75`,
+		`llamaswap_gpu_vram_temperature_celsius{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 0`,
+		`llamaswap_gpu_util_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 85.5`,
+		`llamaswap_gpu_memory_util_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 60`,
+		`llamaswap_gpu_memory_used_bytes{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"}`,
+		`llamaswap_gpu_memory_total_bytes{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"}`,
+		`llamaswap_gpu_fan_speed_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 55`,
+		`llamaswap_gpu_power_draw_watts{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 300.5`,
+	}
+	for _, line := range expected {
+		assert.Contains(t, output, line)
+	}
+}
+
+// TestWriteGpuMetrics_VramTemp checks that the core and VRAM temperatures are
+// exported as separate gauges with their own values.
+func TestWriteGpuMetrics_VramTemp(t *testing.T) {
+	card := GpuStat{ID: 0, Name: "AMD RX 7900", UUID: "GPU-5678", TempC: 70, VramTempC: 85}
+
+	rec := httptest.NewRecorder()
+	writeGpuMetrics(rec, []GpuStat{card})
+	output := rec.Body.String()
+
+	assert.Contains(t, output, `llamaswap_gpu_temperature_celsius{id="0",name="AMD RX 7900",uuid="GPU-5678"} 70`)
+	assert.Contains(t, output, `llamaswap_gpu_vram_temperature_celsius{id="0",name="AMD RX 7900",uuid="GPU-5678"} 85`)
+}
+
+// TestWriteGpuMetrics_EmptyUUID checks that the uuid label is dropped, and the
+// name label kept, for a GPU without a UUID.
+func TestWriteGpuMetrics_EmptyUUID(t *testing.T) {
+	card := GpuStat{ID: 3, Name: "AMD RX 7900", UUID: ""}
+
+	rec := httptest.NewRecorder()
+	writeGpuMetrics(rec, []GpuStat{card})
+	output := rec.Body.String()
+
+	assert.NotContains(t, output, "uuid=")
+	assert.Contains(t, output, `name="AMD RX 7900"`)
+}
+
+// TestWriteGpuMetrics_LabelSanitization checks that quotes in the name and a
+// newline in the UUID are escaped in the emitted labels.
+func TestWriteGpuMetrics_LabelSanitization(t *testing.T) {
+	card := GpuStat{ID: 0, Name: `GPU "special"`, UUID: "uuid\nline"}
+
+	rec := httptest.NewRecorder()
+	writeGpuMetrics(rec, []GpuStat{card})
+	output := rec.Body.String()
+
+	assert.Contains(t, output, `name="GPU \"special\""`)
+	assert.Contains(t, output, `uuid="uuid\nline"`)
+}
+
+// TestMetricsHandler_ContentType checks that the handler sets the Prometheus
+// text exposition content type.
+func TestMetricsHandler_ContentType(t *testing.T) {
+	monitor, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	rec := httptest.NewRecorder()
+	monitor.MetricsHandler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
+
+	contentType := rec.Header().Get("Content-Type")
+	assert.Equal(t, "text/plain; version=0.0.4; charset=utf-8", contentType)
+}
+
+// TestMetricsHandler_EmptyStats checks that a monitor with no samples answers
+// 200 with a body that is empty once surrounding whitespace is trimmed.
+func TestMetricsHandler_EmptyStats(t *testing.T) {
+	monitor, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	rec := httptest.NewRecorder()
+	monitor.MetricsHandler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
+
+	assert.Equal(t, http.StatusOK, rec.Code)
+	trimmed := strings.TrimSpace(rec.Body.String())
+	assert.Empty(t, trimmed)
+}
+
+// TestMetricsHandler_WithSysStats checks that a pushed system sample shows up
+// as CPU and memory metrics in the response.
+func TestMetricsHandler_WithSysStats(t *testing.T) {
+	monitor, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	sample := SysStat{
+		Timestamp:      time.Now(),
+		CpuUtilPerCore: []float64{25.0},
+		MemTotalMB:     4096,
+		MemUsedMB:      2048,
+		MemFreeMB:      2048,
+	}
+	monitor.sysRing.Push(sample)
+
+	rec := httptest.NewRecorder()
+	monitor.MetricsHandler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
+
+	output := rec.Body.String()
+	assert.Contains(t, output, "llamaswap_cpu_util_percent")
+	assert.Contains(t, output, "llamaswap_memory_total_bytes")
+}
+
+// TestMetricsHandler_UsesLatestSysStat pushes two system samples and checks
+// that the response reflects the one pushed last.
+func TestMetricsHandler_UsesLatestSysStat(t *testing.T) {
+	monitor, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	stamp := time.Now()
+	for _, sample := range []SysStat{
+		{Timestamp: stamp.Add(-time.Second), MemTotalMB: 1000},
+		{Timestamp: stamp, MemTotalMB: 8192},
+	} {
+		monitor.sysRing.Push(sample)
+	}
+
+	rec := httptest.NewRecorder()
+	monitor.MetricsHandler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
+
+	// 8192 MB = 8589934592 bytes
+	assert.Contains(t, rec.Body.String(), "llamaswap_memory_total_bytes 8589934592")
+}
+
+// TestMetricsHandler_WithGpuStats checks that a pushed GPU sample shows up in
+// the response with its name label.
+func TestMetricsHandler_WithGpuStats(t *testing.T) {
+	monitor, err := New(config.PerformanceConfig{}, newTestLogger())
+	require.NoError(t, err)
+
+	card := GpuStat{ID: 0, Name: "TestGPU", UUID: "uuid-0", TempC: 65, Timestamp: time.Now()}
+	monitor.gpuRing.Push([]GpuStat{card})
+
+	rec := httptest.NewRecorder()
+	monitor.MetricsHandler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
+
+	output := rec.Body.String()
+	assert.Contains(t, output, "llamaswap_gpu_temperature_celsius")
+	assert.Contains(t, output, `name="TestGPU"`)
+}
@@ -0,0 +1,40 @@
+package perf
+
+import "time"
+
+// GpuStat is one sample of a single GPU's readings. It is serialised to JSON
+// with the snake_case keys given in the field tags.
+type GpuStat struct {
+	Timestamp time.Time `json:"timestamp"`
+
+	ID          int     `json:"id"`
+	Name        string  `json:"name"`
+	UUID        string  `json:"uuid"` // may be empty; the metrics writer then omits the uuid label
+	TempC       int     `json:"temp_c"`
+	VramTempC   int     `json:"vram_temp_c"`
+	GpuUtilPct  float64 `json:"gpu_util_pct"`
+	MemUtilPct  float64 `json:"mem_util_pct"`
+	MemUsedMB   int     `json:"mem_used_mb"`  // multiplied by 1024*1024 when exported as bytes
+	MemTotalMB  int     `json:"mem_total_mb"` // multiplied by 1024*1024 when exported as bytes
+	FanSpeedPct float64 `json:"fan_speed_pct"`
+	PowerDrawW  float64 `json:"power_draw_w"`
+}
+
+// NetIOStat holds the received and sent byte counters of one network
+// interface, identified by Name.
+type NetIOStat struct {
+	Name      string `json:"name"`
+	BytesRecv uint64 `json:"bytes_recv"`
+	BytesSent uint64 `json:"bytes_sent"`
+}
+
+// SysStat is one sample of host-level readings. It is serialised to JSON with
+// the snake_case keys given in the field tags.
+type SysStat struct {
+	Timestamp time.Time `json:"timestamp"`
+
+	CpuUtilPerCore []float64   `json:"cpu_util_per_core"` // one entry per core, exported with a core index label
+	MemTotalMB     int         `json:"mem_total_mb"`
+	MemUsedMB      int         `json:"mem_used_mb"`
+	MemFreeMB      int         `json:"mem_free_mb"`
+	SwapTotalMB    int         `json:"swap_total_mb"`
+	SwapUsedMB     int         `json:"swap_used_mb"`
+	LoadAvg1       float64     `json:"load_avg_1"`
+	LoadAvg5       float64     `json:"load_avg_5"`
+	LoadAvg15      float64     `json:"load_avg_15"`
+	NetIO          []NetIOStat `json:"net_io"`
+}
@@ -0,0 +1,49 @@
+package process
+
+import (
+	"fmt"
+	"net"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+// simpleResponderPath is the path of the simple-responder test binary. It is
+// assigned in TestMain before any test runs.
+var simpleResponderPath string
+
+// skipIfNoSimpleResponder skips the calling test when the simple-responder
+// binary does not exist at simpleResponderPath.
+func skipIfNoSimpleResponder(t *testing.T) {
+	t.Helper()
+	_, err := os.Stat(simpleResponderPath)
+	if !os.IsNotExist(err) {
+		return
+	}
+	t.Skipf("simple-responder not found at %s, run `make simple-responder`", simpleResponderPath)
+}
+
+// TestMain works out where the simple-responder binary for the current
+// platform lives under ../../build and then runs the package's tests.
+func TestMain(m *testing.M) {
+	binary := fmt.Sprintf("simple-responder_%s_%s", runtime.GOOS, runtime.GOARCH)
+	if runtime.GOOS == "windows" {
+		// The Windows build uses a fixed name without the os/arch suffix.
+		binary = "simple-responder.exe"
+	}
+	simpleResponderPath = filepath.Join("..", "..", "build", binary)
+	m.Run()
+}
+
+// getFreePort asks the OS for an unused TCP port on 127.0.0.1 by listening on
+// port 0, and returns the port that was assigned. The listener is closed
+// before returning. The test fails immediately if listening is not possible.
+func getFreePort(t *testing.T) int {
+	t.Helper()
+	listener, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("getFreePort: %v", err)
+	}
+	defer listener.Close()
+
+	tcpAddr := listener.Addr().(*net.TCPAddr)
+	return tcpAddr.Port
+}
+
+// simpleResponderCmd builds a command line for the simple-responder binary
+// that listens on a freshly chosen free port. The binary path is converted to
+// forward slashes, args are appended after the "-port" flag, and everything is
+// joined with single spaces. It returns the command line and the chosen port.
+func simpleResponderCmd(t *testing.T, args ...string) (string, int) {
+	// Mark as a helper, like the sibling helpers, so a failure raised while
+	// picking the port is reported at the calling test's line.
+	t.Helper()
+	port := getFreePort(t)
+	cmdPath := filepath.ToSlash(simpleResponderPath)
+	base := []string{cmdPath, fmt.Sprintf("-port %d", port)}
+	base = append(base, args...)
+	return strings.Join(base, " "), port
+}
@@ -0,0 +1,49 @@
+package process
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+)
+
+// ProcessState names a stage in a process's lifecycle.
+type ProcessState string
+
+const (
+	StateStopped  ProcessState = "stopped"
+	StateStarting ProcessState = "starting"
+	StateReady    ProcessState = "ready"
+	StateStopping ProcessState = "stopping"
+
+	// process is shutdown and will not be restarted
+	StateShutdown ProcessState = "shutdown"
+)
+
+// Process is the set of operations available on a managed process: running
+// and stopping it, waiting for readiness, querying its state, forwarding HTTP
+// requests to it and reaching its log monitor.
+type Process interface {
+	// Run starts the process and blocks until the process is terminated.
+	// The timeout parameter controls how long to wait for the process to get
+	// to a ready state to process traffic
+	Run(timeout time.Duration) error
+
+	// WaitReady blocks until the process is ready to serve requests
+	// or the context is cancelled. It returns nil when the process is ready
+	WaitReady(context.Context) error
+
+	// Stop blocks until the process has terminated. It returns nil when
+	// the process terminated as expected (exit 0)
+	Stop(timeout time.Duration) error
+
+	// State returns the current state of the process
+	// Note: this is a snapshot of the state at the time of the call
+	// and may change at any time after the call returns.
+	State() ProcessState
+
+	// ServeHTTP forwards requests to the underlying process
+	// Calling it when the process is not ready will result in a
+	// 503 response with a body indicating it is a llama-swap-error
+	ServeHTTP(http.ResponseWriter, *http.Request)
+
+	// Logger returns the monitor that captures this process's stdout/stderr.
+	Logger() *logmon.Monitor
+}
@@ -0,0 +1,684 @@
+package process
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"net/http"
+	"net/http/httptest"
+	"net/http/httputil"
+	"net/url"
+	"os/exec"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/event"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/shared"
+)
+
+// ErrStartAborted is the error delivered to the Run caller and to pending
+// WaitReady callers when an in-flight start is cancelled before the process
+// became ready (for example by a Stop request arriving mid-start).
+var ErrStartAborted = fmt.Errorf("aborted")
+
+// cmdWaitDelay is the upper bound the runtime will wait for child I/O to
+// drain after the process exits before force-closing the stdout/stderr
+// pipes. Required so that cmd.Wait() returns even when a forked grandchild
+// inherits and holds the pipes open (e.g. a shell wrapper that backgrounds
+// the real binary). killProcess sends the stop signal directly (not via the
+// cmd context), so this delay is measured from process exit rather than from
+// the stop request, and stays independent of the caller's graceful timeout.
+const cmdWaitDelay = 10 * time.Second
+
+// parentCancelGraceTimeout is the graceful timeout used when the process is
+// torn down because parentCtx was cancelled (final router teardown or app
+// shutdown). In the normal flow the process has already been stopped via
+// Stop() by this point, so killProcess is a no-op kill; the short grace just
+// bounds the rare case where a process is still alive when its context is cut.
+const parentCancelGraceTimeout = time.Second
+
+// runReq is the message Run sends on runCh. timeout is handed to doStart as
+// the health-check timeout; respond receives the result of the Run call.
+type runReq struct {
+	timeout time.Duration
+	respond chan error
+}
+
+// stopReq is the message Stop sends on stopCh. timeout is the graceful
+// timeout passed to killProcess; respond is signalled once the stop request
+// has been handled.
+type stopReq struct {
+	timeout time.Duration
+	respond chan error
+}
+
+// waitReadyReq is the message WaitReady sends on waitReadyCh. respond
+// receives nil once the process is ready, or an error otherwise.
+type waitReadyReq struct {
+	respond chan error
+}
+
+// startResult is what doStart returns to the run goroutine. On success cmd,
+// cmdDone (closed after cmd.Wait returns), cancel (releases the cmd context)
+// and handlerFn (the reverse-proxy handler) are set; on failure only err is.
+type startResult struct {
+	cmd       *exec.Cmd
+	cmdDone   chan struct{}
+	cancel    context.CancelFunc
+	handlerFn http.HandlerFunc
+	err       error
+}
+
+// ProcessCommand is a Process backed by an external command. Lifecycle
+// changes are serialized through the run goroutine, which receives requests
+// over runCh, stopCh and waitReadyCh; the atomic fields below are the parts
+// read concurrently by other goroutines.
+type ProcessCommand struct {
+	id        string
+	config    config.ModelConfig
+	parentCtx context.Context
+
+	// processLogger receives the upstream command's stdout/stderr;
+	// proxyLogger receives this type's own lifecycle messages.
+	processLogger *logmon.Monitor
+	proxyLogger   *logmon.Monitor
+
+	// waitDelay is assigned to cmd.WaitDelay when starting the upstream
+	// process. Defaults to cmdWaitDelay; tests override it to keep the
+	// pipe-close backstop from dominating their runtime.
+	waitDelay time.Duration
+
+	// Unbuffered request channels serviced by run().
+	runCh       chan runReq
+	stopCh      chan stopReq
+	waitReadyCh chan waitReadyReq
+
+	// current ProcessState. Written only by run(); read by State() via atomic load.
+	state atomic.Value
+
+	// stores the active reverse-proxy handler when the process is running.
+	// Written only by run(); read by ServeHTTP via atomic load.
+	handler atomic.Pointer[http.HandlerFunc]
+
+	lastUse  atomic.Int64 // unix nano timestamp of last ServeHTTP completion
+	inflight atomic.Int64 // current in-flight ServeHTTP calls
+}
+
+// Compile-time check that *ProcessCommand satisfies Process.
+var _ Process = (*ProcessCommand)(nil)
+
+// New builds a ProcessCommand in StateStopped and launches its run goroutine,
+// which services Run/Stop/WaitReady requests until parentCtx is cancelled.
+// The returned error is always nil in this implementation.
+func New(
+	parentCtx context.Context,
+	id string,
+	conf config.ModelConfig,
+	processLogger *logmon.Monitor,
+	proxyLogger *logmon.Monitor,
+) (*ProcessCommand, error) {
+	pc := new(ProcessCommand)
+
+	pc.id = id
+	pc.config = conf
+	pc.parentCtx = parentCtx
+	pc.processLogger = processLogger
+	pc.proxyLogger = proxyLogger
+	pc.waitDelay = cmdWaitDelay
+
+	// Unbuffered: each request is handed directly to the run goroutine.
+	pc.runCh = make(chan runReq)
+	pc.stopCh = make(chan stopReq)
+	pc.waitReadyCh = make(chan waitReadyReq)
+
+	pc.state.Store(StateStopped)
+	go pc.run()
+
+	return pc, nil
+}
+
+// Logger returns the monitor that the upstream command's stdout and stderr
+// are written to.
+func (p *ProcessCommand) Logger() *logmon.Monitor {
+	return p.processLogger
+}
+
+// run is the single-writer goroutine that owns all mutable lifecycle state
+// (current ProcessState, the running *exec.Cmd, the active reverse-proxy
+// handler, and the list of WaitReady subscribers). The mutating public
+// methods (Run / Stop / WaitReady) are thin clients that send a request on
+// one of the channels below and wait for a response — this funnels concurrent
+// callers through a single serialization point so the state machine never
+// observes a race. State and ServeHTTP do not go through the channels; they
+// read the atomic mirrors (p.state, p.handler) that this goroutine writes.
+//
+// The goroutine exits only when parentCtx is cancelled.
+func (p *ProcessCommand) run() {
+	// Mutable state — only read/written from this goroutine. ServeHTTP reads
+	// p.handler concurrently, which is why handler is an atomic.Pointer.
+	// p.state mirrors `state` so State() can observe transitions; setState
+	// writes both.
+	state := StateStopped
+	setState := func(s ProcessState) {
+		old := state
+		state = s
+		p.state.Store(s)
+		if old != s {
+			event.Emit(shared.ProcessStateChangeEvent{
+				ProcessName: p.id,
+				OldState:    string(old),
+				NewState:    string(s),
+			})
+		}
+	}
+	var (
+		cmd          *exec.Cmd
+		cmdDone      <-chan struct{}
+		cmdCancel    context.CancelFunc
+		readyWaiters []waitReadyReq
+		// runResp parks the in-flight Run caller's response channel. The
+		// interface contract is that Run blocks until the process is
+		// terminated, so we hold this until Stop, parentCtx, or an
+		// upstream exit unblocks it via respondRun.
+		runResp chan<- error
+	)
+
+	// notifyWaiters wakes every blocked WaitReady caller with the given result.
+	// Used on transitions out of StateStarting (ready, failed, aborted, or
+	// shutdown) — anything that resolves the "is it ready yet?" question.
+	notifyWaiters := func(err error) {
+		for _, w := range readyWaiters {
+			select {
+			case w.respond <- err:
+			default:
+			}
+		}
+		readyWaiters = nil
+	}
+
+	// respondRun delivers the final Run result, if a Run caller is parked.
+	respondRun := func(err error) {
+		if runResp != nil {
+			runResp <- err
+			runResp = nil
+		}
+	}
+
+	for {
+		select {
+		// Shutdown: parent context cancelled. Tear down any running process,
+		// wake any pending WaitReady callers with an error, then exit the
+		// goroutine permanently. Subsequent public-method calls will fail
+		// because parentCtx.Done() unblocks their send-side selects.
+		case <-p.parentCtx.Done():
+			// Mark shutdown before killProcess so concurrent State() readers
+			// stop treating this process as ready while the (possibly slow)
+			// teardown is in progress.
+			setState(StateShutdown)
+			if cmd != nil {
+				p.handler.Store(nil)
+				p.killProcess(cmd, cmdCancel, cmdDone, parentCancelGraceTimeout)
+				cmd = nil
+				cmdDone = nil
+				cmdCancel = nil
+			}
+			notifyWaiters(fmt.Errorf("[%s] shutdown", p.id))
+			respondRun(fmt.Errorf("[%s] shutdown", p.id))
+			return
+
+		// Upstream exited on its own (not via Stop). Drop handler state,
+		// transition to Stopped, and unblock the parked Run caller.
+		// cmdDone is nil while no process is running, so this case is
+		// dormant outside of StateReady.
+		case <-cmdDone:
+			if cmdCancel != nil {
+				cmdCancel()
+			}
+			cmd = nil
+			cmdDone = nil
+			cmdCancel = nil
+			p.handler.Store(nil)
+			setState(StateStopped)
+			respondRun(fmt.Errorf("[%s] upstream exited unexpectedly", p.id))
+
+		// WaitReady: if we're already in a terminal-for-this-question state,
+		// respond immediately; otherwise queue the caller and let a future
+		// state transition wake them via notifyWaiters.
+		case req := <-p.waitReadyCh:
+			switch state {
+			case StateReady:
+				req.respond <- nil
+			case StateShutdown:
+				req.respond <- fmt.Errorf("[%s] shutdown", p.id)
+			default:
+				readyWaiters = append(readyWaiters, req)
+			}
+
+		// Run: start the upstream process. Only valid from StateStopped.
+		// doStart can take a long time (health-check polling), so it runs in
+		// a separate goroutine and we wait on resultCh. While waiting we also
+		// listen for an incoming Stop — that's how callers cancel an in-flight
+		// start.
+		case req := <-p.runCh:
+			if state != StateStopped {
+				req.respond <- fmt.Errorf("[%s] could not be started in %s state", p.id, state)
+				continue
+			}
+			setState(StateStarting)
+
+			startCtx, cancelStart := context.WithCancel(context.Background())
+			resultCh := make(chan startResult, 1)
+			go func() {
+				resultCh <- p.doStart(startCtx, req.timeout)
+			}()
+
+			// pendingStop holds a Stop request that arrived mid-start, so we
+			// can respond to it AFTER we've finished tearing the start down.
+			var pendingStop *stopReq
+			select {
+			// doStart finished on its own — either successfully (latch
+			// cmd/handler and move to Ready) or with an error (back to
+			// Stopped). Either way wake WaitReady subscribers and reply
+			// to the Run caller.
+			case res := <-resultCh:
+				if res.err == nil {
+					cmd = res.cmd
+					cmdDone = res.cmdDone
+					cmdCancel = res.cancel
+					fn := res.handlerFn
+					p.handler.Store(&fn)
+					setState(StateReady)
+					notifyWaiters(nil)
+					// Park the Run response — Run blocks until the process
+					// terminates, so we only fire this when Stop, parentCtx,
+					// or the upstream exit takes the process down.
+					runResp = req.respond
+
+					// Start TTL goroutine if configured — self-terminates
+					// when state leaves StateReady.
+					if p.config.UnloadAfter > 0 {
+						// Seed lastUse with the ready time. ServeHTTP only
+						// records lastUse when a request completes, so without
+						// this a process that has served nothing since it
+						// became ready (zero value, or a stale timestamp from
+						// a previous run) would look idle for longer than the
+						// TTL and be unloaded on the first tick.
+						p.lastUse.Store(time.Now().UnixNano())
+
+						ttlDuration := time.Duration(p.config.UnloadAfter) * time.Second
+						go func() {
+							ticker := time.NewTicker(time.Second)
+							defer ticker.Stop()
+							for range ticker.C {
+								if p.State() != StateReady {
+									return
+								}
+								// Never unload while a request is being served.
+								if p.inflight.Load() != 0 {
+									continue
+								}
+								if time.Since(time.Unix(0, p.lastUse.Load())) > ttlDuration {
+									p.proxyLogger.Infof("<%s> Unloading model, TTL of %ds reached", p.id, p.config.UnloadAfter)
+									p.Stop(10 * time.Second)
+									return
+								}
+							}
+						}()
+					}
+				} else {
+					setState(StateStopped)
+					notifyWaiters(res.err)
+					req.respond <- res.err
+				}
+
+			// Stop arrived while doStart was still running. Cancel the
+			// start context to abort it, then wait for doStart to return.
+			// If doStart had already crossed the finish line before
+			// cancellation took effect, it returns a live cmd that we
+			// must kill ourselves. The Run caller gets ErrStartAborted; the
+			// Stop caller is parked in pendingStop and answered below.
+			case stop := <-p.stopCh:
+				cancelStart()
+				res := <-resultCh
+				if res.cmd != nil {
+					p.killProcess(res.cmd, res.cancel, res.cmdDone, stop.timeout)
+				}
+				setState(StateStopped)
+				notifyWaiters(ErrStartAborted)
+				req.respond <- ErrStartAborted
+				pendingStop = &stop
+
+			// Parent context cancelled (e.g. config reload) while doStart
+			// was still running. Stop() returns early when parentCtx is
+			// done and never sends on stopCh, so we must handle shutdown
+			// here to avoid leaving doStart running indefinitely.
+			case <-p.parentCtx.Done():
+				cancelStart()
+				// Mark shutdown before tearing the process down: killProcess
+				// may block (e.g. taskkill on Windows is slow to spawn), and
+				// callers observing State() should see StateShutdown promptly
+				// rather than a stale StateStarting.
+				setState(StateShutdown)
+				res := <-resultCh
+				if res.cmd != nil {
+					p.killProcess(res.cmd, res.cancel, res.cmdDone, parentCancelGraceTimeout)
+				}
+				notifyWaiters(fmt.Errorf("[%s] shutdown", p.id))
+				respondRun(fmt.Errorf("[%s] shutdown", p.id))
+				return
+			}
+			// cancelStart is idempotent; calling it again here ensures the
+			// context is released even on the success path (govet leak check).
+			cancelStart()
+			if pendingStop != nil {
+				pendingStop.respond <- nil
+			}
+
+		// Stop: tear down a running process.
+		case stop := <-p.stopCh:
+			if cmd != nil {
+				setState(StateStopping)
+				p.killProcess(cmd, cmdCancel, cmdDone, stop.timeout)
+				cmd = nil
+				cmdDone = nil
+				cmdCancel = nil
+				p.handler.Store(nil)
+			}
+			// Stop is a no-op (and not an error) when already Stopped — this
+			// is what makes it idempotent for callers that don't track state.
+			setState(StateStopped)
+			respondRun(nil)
+			stop.respond <- nil
+		}
+	}
+}
+
+// doStart launches the upstream command and waits for it to become healthy.
+//
+// It builds the reverse proxy for p.config.Proxy, starts the command, and —
+// unless the check endpoint is "none" — polls the endpoint through the proxy
+// until it answers 200 OK, healthCheckTimeout elapses, the command exits, or
+// startCtx is cancelled.
+//
+// On success the result carries the live cmd, its cmdDone channel (closed
+// after cmd.Wait returns), the cmd context's cancel func, and the handler to
+// install. On failure only err is set; where the command had been started it
+// is torn down (or has already exited) before returning. A cancelled startCtx
+// yields ErrStartAborted.
+func (p *ProcessCommand) doStart(startCtx context.Context, healthCheckTimeout time.Duration) startResult {
+	if p.config.Proxy == "" {
+		return startResult{err: fmt.Errorf("upstream proxy missing")}
+	}
+
+	args, err := p.config.SanitizedCommand()
+	if err != nil {
+		return startResult{err: fmt.Errorf("unable to get sanitized command: %w", err)}
+	}
+
+	proxyURL, err := url.Parse(p.config.Proxy)
+	if err != nil {
+		return startResult{err: fmt.Errorf("invalid proxy URL %q: %w", p.config.Proxy, err)}
+	}
+
+	reverseProxy := httputil.NewSingleHostReverseProxy(proxyURL)
+	reverseProxy.Transport = &http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   time.Duration(p.config.Timeouts.Connect) * time.Second,
+			KeepAlive: time.Duration(p.config.Timeouts.KeepAlive) * time.Second,
+		}).DialContext,
+		TLSHandshakeTimeout:   time.Duration(p.config.Timeouts.TLSHandshake) * time.Second,
+		ResponseHeaderTimeout: time.Duration(p.config.Timeouts.ResponseHeader) * time.Second,
+		ExpectContinueTimeout: time.Duration(p.config.Timeouts.ExpectContinue) * time.Second,
+		ForceAttemptHTTP2:     true,
+		MaxIdleConns:          100,
+		MaxIdleConnsPerHost:   10,
+		IdleConnTimeout:       time.Duration(p.config.Timeouts.IdleConn) * time.Second,
+	}
+	// Ask buffering front-ends not to buffer server-sent-event responses.
+	reverseProxy.ModifyResponse = func(resp *http.Response) error {
+		if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
+			resp.Header.Set("X-Accel-Buffering", "no")
+		}
+		return nil
+	}
+	// httputil.ReverseProxy panics with http.ErrAbortHandler when the upstream
+	// disconnects after response headers have been sent. Recover here so the
+	// streaming termination is treated as a normal client/upstream disconnect.
+	// see: https://github.com/golang/go/issues/23643
+	handlerFn := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		defer func() {
+			if rec := recover(); rec != nil {
+				if rec == http.ErrAbortHandler {
+					p.proxyLogger.Infof("<%s> recovered from upstream disconnection during streaming", p.id)
+				} else {
+					p.proxyLogger.Warnf("<%s> recovered from panic: %v", p.id, rec)
+				}
+			}
+		}()
+		reverseProxy.ServeHTTP(w, r)
+	})
+
+	// cmdCtx + cmd.Cancel are wired as a safety net: if the context is ever
+	// cancelled while the process is alive, cmd.Cancel sends SIGTERM / CmdStop
+	// and the runtime escalates to SIGKILL after cmd.WaitDelay. In the normal
+	// teardown path killProcess sends the stop signal directly instead, so
+	// cmd.WaitDelay only acts as the inherited-pipe backstop measured from
+	// process exit (see killProcess).
+	cmdCtx, cmdCancel := context.WithCancel(context.Background())
+	cmd := exec.CommandContext(cmdCtx, args[0], args[1:]...)
+	cmd.Stderr = p.processLogger
+	cmd.Stdout = p.processLogger
+	cmd.Env = append(cmd.Environ(), p.config.Env...)
+	cmd.Cancel = func() error { return p.sendStopSignal(cmd) }
+	cmd.WaitDelay = p.waitDelay
+	setProcAttributes(cmd)
+
+	p.proxyLogger.Debugf("<%s> Executing start command: %s, env: %s", p.id, strings.Join(args, " "), strings.Join(p.config.Env, ", "))
+
+	cmdDone := make(chan struct{})
+	if err := cmd.Start(); err != nil {
+		cmdCancel()
+		return startResult{err: fmt.Errorf("failed to start command '%s': %w", strings.Join(args, " "), err)}
+	}
+
+	// Reap the command and publish its exit by closing cmdDone.
+	go func() {
+		waitErr := cmd.Wait()
+		switch st := p.State(); {
+		case waitErr == nil:
+			p.proxyLogger.Debugf("<%s> process exited cleanly", p.id)
+		case st == StateStopping || st == StateShutdown:
+			// Expected: we force-terminated the process. A forced kill exits
+			// the child with a non-zero code (e.g. taskkill /f on Windows
+			// yields exit status 1), so this is not an error.
+			p.proxyLogger.Debugf("<%s> process stopped by llama-swap: %v", p.id, waitErr)
+		default:
+			if exitErr, ok := waitErr.(*exec.ExitError); ok {
+				p.proxyLogger.Debugf("<%s> process exited: code=%d, err=%v", p.id, exitErr.ExitCode(), waitErr)
+			} else {
+				p.proxyLogger.Debugf("<%s> process exited with error: %v", p.id, waitErr)
+			}
+		}
+		close(cmdDone)
+	}()
+
+	// abort tears down the still-running command and reports err.
+	abort := func(err error) startResult {
+		p.killProcess(cmd, cmdCancel, cmdDone, 5*time.Second)
+		return startResult{err: err}
+	}
+	// prematureExit is used once cmdDone is closed: the command is already
+	// gone, so only the context needs releasing.
+	prematureExit := func() startResult {
+		cmdCancel()
+		return startResult{err: fmt.Errorf("upstream command exited prematurely")}
+	}
+
+	if startCtx.Err() != nil {
+		return abort(ErrStartAborted)
+	}
+
+	checkEndpoint := strings.TrimSpace(p.config.CheckEndpoint)
+	if checkEndpoint == "none" {
+		return startResult{cmd: cmd, cmdDone: cmdDone, cancel: cmdCancel, handlerFn: handlerFn}
+	}
+
+	// Wait 250ms for the command to start up before health checking
+	select {
+	case <-startCtx.Done():
+		return abort(ErrStartAborted)
+	case <-time.After(250 * time.Millisecond):
+	}
+
+	deadline := time.Now().Add(healthCheckTimeout)
+	for {
+		select {
+		case <-startCtx.Done():
+			return abort(ErrStartAborted)
+		case <-cmdDone:
+			return prematureExit()
+		default:
+		}
+
+		if time.Now().After(deadline) {
+			return abort(fmt.Errorf("health check timed out after %v", healthCheckTimeout))
+		}
+
+		// Use the trimmed endpoint (the same value compared against "none"
+		// above) and never hand a nil request to the proxy: an unparsable
+		// endpoint fails the start instead of panicking in this goroutine.
+		req, reqErr := http.NewRequestWithContext(startCtx, "GET", checkEndpoint, nil)
+		if reqErr != nil {
+			return abort(fmt.Errorf("invalid health check endpoint %q: %w", checkEndpoint, reqErr))
+		}
+		rr := httptest.NewRecorder()
+		reverseProxy.ServeHTTP(rr, req)
+		resp := rr.Result()
+		resp.Body.Close()
+		if resp.StatusCode == http.StatusOK {
+			p.proxyLogger.Infof("<%s> Health check passed on %s%s", p.id, p.config.Proxy, checkEndpoint)
+			break
+		} else if startCtx.Err() != nil {
+			return abort(ErrStartAborted)
+		}
+
+		// Pause one second between attempts, still reacting to cancellation
+		// and to the command exiting.
+		select {
+		case <-startCtx.Done():
+			return abort(ErrStartAborted)
+		case <-cmdDone:
+			return prematureExit()
+		case <-time.After(time.Second):
+		}
+	}
+
+	return startResult{cmd: cmd, cmdDone: cmdDone, cancel: cmdCancel, handlerFn: handlerFn}
+}
+
+// sendStopSignal runs the configured CmdStop (if any) or asks
+// terminateProcessTree to stop the upstream process. It is wired up as
+// cmd.Cancel so it fires when the cmd's context is cancelled, and it is also
+// called directly by killProcess.
+//
+// It returns nil when there is no process to stop, the error from running
+// CmdStop when CmdStop is configured and could be sanitized, and otherwise
+// the error from terminateProcessTree.
+func (p *ProcessCommand) sendStopSignal(cmd *exec.Cmd) error {
+	if cmd == nil || cmd.Process == nil {
+		p.processLogger.Debugf("<%s> sendStopSignal() called with nil cmd or process, nothing to stop", p.id)
+		return nil
+	}
+	pid := cmd.Process.Pid
+	if p.config.CmdStop != "" {
+		p.processLogger.Debugf("<%s> sendStopSignal() using CmdStop %q for pid %d", p.id, p.config.CmdStop, pid)
+		// Substitute the literal ${PID} placeholder with the upstream pid
+		// before splitting the stop command into arguments.
+		stopArgs, err := config.SanitizeCommand(
+			strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", pid)),
+		)
+		if err == nil {
+			p.processLogger.Debugf("<%s> sendStopSignal() running stop command: %s", p.id, strings.Join(stopArgs, " "))
+			stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
+			// The stop command runs with the same environment as the upstream.
+			stopCmd.Env = cmd.Env
+			setProcAttributes(stopCmd)
+			runErr := stopCmd.Run()
+			if runErr != nil {
+				p.processLogger.Errorf("<%s> sendStopSignal() stop command failed: %v", p.id, runErr)
+			} else {
+				p.processLogger.Debugf("<%s> sendStopSignal() stop command completed for pid %d", p.id, pid)
+			}
+			return runErr
+		}
+		// fall through to terminateProcessTree if sanitize failed
+		p.processLogger.Errorf("<%s> sendStopSignal() failed to sanitize CmdStop %q: %v, falling back to terminateProcessTree", p.id, p.config.CmdStop, err)
+	}
+	// On Unix this SIGTERMs the whole process group so a forked grandchild
+	// (e.g. a shell wrapper that backgrounds the real binary) is taken down
+	// with the parent rather than orphaned.
+	// NOTE(review): the debug message below is also emitted on the
+	// sanitize-failure fallback path, where CmdStop was in fact configured.
+	p.processLogger.Debugf("<%s> sendStopSignal() no CmdStop configured, calling terminateProcessTree for pid %d", p.id, pid)
+	termErr := terminateProcessTree(cmd)
+	if termErr != nil {
+		p.processLogger.Errorf("<%s> sendStopSignal() terminateProcessTree failed for pid %d: %v", p.id, pid, termErr)
+	}
+	return termErr
+}
+
+// killProcess terminates the upstream process. The flow:
+//
+//  1. Send the graceful stop signal (CmdStop / SIGTERM) directly — NOT by
+//     cancelling cmdCtx. Cancelling the context would start cmd.WaitDelay
+//     immediately, which force-kills the process WaitDelay after the signal
+//     and would silently cap gracefulTimeout at WaitDelay whenever
+//     gracefulTimeout is the longer of the two.
+//  2. We wait up to gracefulTimeout for the process to exit on its own.
+//  3. If still alive, we SIGKILL the process group directly (Unix) so any
+//     forked descendant is force-terminated alongside the parent.
+//  4. We wait on cmdDone. cmd.WaitDelay (set when the cmd was built) is the
+//     critical backstop here: once the process exits, if a forked grandchild
+//     inherited the stdout/stderr pipes and is still holding them, the runtime
+//     force-closes the pipes WaitDelay after the exit and cmd.Wait() unblocks.
+//     Because we never cancelled the context, that WaitDelay timer measures
+//     from process exit (see os/exec awaitGoroutines), not from this call.
+//     Without WaitDelay this select would hang forever (the v219 bug).
+//
+// cancel() is still invoked (deferred) to release the context, but only after
+// the process has exited and os/exec's ctx watcher has already torn down, so it
+// never re-fires cmd.Cancel.
+//
+// Parameters: cmd is the upstream command (may be nil, in which case no
+// signal is sent); cancel releases the cmd's context (a nil cancel makes the
+// whole call a no-op); cmdDone is closed once cmd.Wait has returned;
+// gracefulTimeout bounds the wait between the stop signal and the forced kill.
+func (p *ProcessCommand) killProcess(cmd *exec.Cmd, cancel context.CancelFunc, cmdDone <-chan struct{}, gracefulTimeout time.Duration) {
+	if cancel == nil {
+		return
+	}
+	defer cancel()
+
+	// Deliver CmdStop / SIGTERM in a goroutine so a slow or hanging CmdStop
+	// cannot block the run() goroutine; the gracefulTimeout + killProcessTree
+	// path below still guarantees teardown.
+	if cmd != nil {
+		go func() {
+			p.proxyLogger.Debugf("[%s] sending stop signal with timeout %v", p.id, gracefulTimeout)
+			if err := p.sendStopSignal(cmd); err != nil {
+				p.proxyLogger.Warnf("[%s] stop signal failed: %v", p.id, err)
+			}
+		}()
+	}
+
+	timer := time.NewTimer(gracefulTimeout)
+	defer timer.Stop()
+
+	// Graceful window: return as soon as the process has exited on its own.
+	select {
+	case <-cmdDone:
+		return
+	case <-timer.C:
+	}
+
+	if cmd != nil {
+		// SIGKILL the whole process group on Unix so any descendant that
+		// ignored or outlived the graceful signal is force-terminated too.
+		_ = killProcessTree(cmd)
+	}
+	// Block until cmd.Wait has returned so callers can rely on the process
+	// being reaped when killProcess returns.
+	<-cmdDone
+}
+
+// ID returns the identifier this process was created with.
+func (p *ProcessCommand) ID() string { return p.id }
+
+// Run hands a start request to the run goroutine and blocks until a result
+// is delivered on the request's response channel or parentCtx is cancelled,
+// in which case a shutdown error is returned. timeout is forwarded as the
+// start's health-check timeout.
+func (p *ProcessCommand) Run(timeout time.Duration) error {
+	result := make(chan error, 1)
+	shutdown := p.parentCtx.Done()
+
+	// Phase 1: submit the request, unless the parent is already gone.
+	select {
+	case <-shutdown:
+		return fmt.Errorf("[%s] shutdown", p.id)
+	case p.runCh <- runReq{timeout: timeout, respond: result}:
+	}
+
+	// Phase 2: wait for the outcome.
+	select {
+	case <-shutdown:
+		return fmt.Errorf("[%s] shutdown", p.id)
+	case runErr := <-result:
+		return runErr
+	}
+}
+
+// WaitReady registers with the run goroutine and blocks until it reports a
+// readiness result or ctx is cancelled. While the request has not yet been
+// accepted, cancellation of parentCtx also ends the wait with a shutdown
+// error.
+func (p *ProcessCommand) WaitReady(ctx context.Context) error {
+	answer := make(chan error, 1)
+	cancelled := ctx.Done()
+
+	// Phase 1: submit the request.
+	select {
+	case <-cancelled:
+		return ctx.Err()
+	case <-p.parentCtx.Done():
+		return fmt.Errorf("[%s] shutdown", p.id)
+	case p.waitReadyCh <- waitReadyReq{respond: answer}:
+	}
+
+	// Phase 2: wait for the readiness verdict.
+	select {
+	case <-cancelled:
+		return ctx.Err()
+	case readyErr := <-answer:
+		return readyErr
+	}
+}
+
+// Stop hands a stop request to the run goroutine and blocks until it has
+// been answered. If parentCtx is cancelled before the request is accepted,
+// a shutdown error is returned instead. timeout is the graceful window
+// granted to the upstream process.
+func (p *ProcessCommand) Stop(timeout time.Duration) error {
+	done := make(chan error, 1)
+
+	select {
+	case <-p.parentCtx.Done():
+		return fmt.Errorf("[%s] shutdown", p.id)
+	case p.stopCh <- stopReq{timeout: timeout, respond: done}:
+	}
+
+	stopErr := <-done
+	return stopErr
+}
+
+// State returns the most recently published ProcessState, falling back to
+// StateStopped when no ProcessState value has been stored.
+func (p *ProcessCommand) State() ProcessState {
+	current, isState := p.state.Load().(ProcessState)
+	if !isState {
+		return StateStopped
+	}
+	return current
+}
+
+// ServeHTTP forwards the request to the active handler. When no handler is
+// installed it answers 503 with a llama-swap-error body. For forwarded
+// requests it tracks the in-flight count and records the completion time in
+// lastUse.
+func (p *ProcessCommand) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	active := p.handler.Load()
+	if active == nil {
+		msg := fmt.Sprintf("llama-swap-error: [%s] process is not ready", p.id)
+		http.Error(w, msg, http.StatusServiceUnavailable)
+		return
+	}
+
+	p.inflight.Add(1)
+	defer func() {
+		// Stamp the completion time before releasing the in-flight slot.
+		finished := time.Now().UnixNano()
+		p.lastUse.Store(finished)
+		p.inflight.Add(-1)
+	}()
+
+	forward := *active
+	forward(w, r)
+}
@@ -0,0 +1,262 @@
+//go:build !windows
+
+package process
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+)
+
+// TestProcessCommand_StopForkingWrapper is a regression for the bug reported
+// against v219 where Stop would hang indefinitely when the upstream command
+// is a shell wrapper that forks the real binary (e.g. `#!/bin/bash` then
+// `"$@"`). After SIGTERM the wrapper dies but the grandchild inherits the
+// stdout/stderr pipes; cmd.Wait() blocks waiting for the pipe-copy goroutine
+// to drain EOF, which never happens while the grandchild holds the fds.
+//
+// The fix is cmd.WaitDelay (combined with exec.CommandContext + cmd.Cancel),
+// which causes the runtime to force-close the pipes after the delay so
+// cmd.Wait() — and therefore Stop — returns.
+func TestProcessCommand_StopForkingWrapper(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	port := getFreePort(t)
+	dir := t.TempDir()
+	pidFile := filepath.Join(dir, "child.pid")
+
+	// Wrapper script: backgrounds the child (which inherits stdout/stderr),
+	// records its PID for cleanup, then waits. When SIGTERM hits bash it
+	// dies without forwarding the signal; the grandchild keeps running and
+	// keeps the inherited pipe fds open. This is the scenario reported in
+	// the v219 regression.
+	wrapper := filepath.Join(dir, "wrapper.sh")
+	script := fmt.Sprintf("#!/bin/bash\n%q -port %d -silent &\necho $! > %q\nwait\n",
+		simpleResponderPath, port, pidFile)
+	if err := os.WriteFile(wrapper, []byte(script), 0o755); err != nil {
+		t.Fatalf("WriteFile: %v", err)
+	}
+	// Best-effort kill of the backgrounded child once the test finishes.
+	t.Cleanup(func() { killChildFromPidFile(pidFile) })
+
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                wrapper,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+	// Shrink the pipe-close backstop so the test doesn't sit at the
+	// production default (10s). Must be set before Run() so doStart picks
+	// it up when building the cmd.
+	const testWaitDelay = 250 * time.Millisecond
+	p.waitDelay = testWaitDelay
+
+	runErr := runAsync(t, p)
+
+	// Stop must return within a bounded time even though the grandchild
+	// is still holding the pipe open. Budget is generous on top of
+	// testWaitDelay to absorb scheduling jitter on slow CI runners; the
+	// pre-fix behaviour was an unbounded hang, so any reasonable cap
+	// distinguishes pass from fail.
+	stopReturned := make(chan error, 1)
+	stopStart := time.Now()
+	go func() { stopReturned <- p.Stop(testStopTimeout) }()
+
+	const stopBudget = testWaitDelay + 2*time.Second
+	select {
+	case err := <-stopReturned:
+		if err != nil {
+			t.Fatalf("Stop: %v", err)
+		}
+		t.Logf("Stop returned in %v", time.Since(stopStart))
+	case <-time.After(stopBudget):
+		t.Fatalf("Stop did not return within %v — cmd.Wait() likely hung on inherited pipe", stopBudget)
+	}
+
+	if got := p.State(); got != StateStopped {
+		t.Errorf("after Stop: expected state %s, got %s", StateStopped, got)
+	}
+
+	// Run blocks until the process terminates, so it must have been released
+	// by the Stop above; its error value is not inspected here.
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Errorf("Run did not return after Stop")
+	}
+}
+
+// TestProcessCommand_StopHonorsGracefulTimeout is a regression for the bug
+// where cmd.WaitDelay capped the graceful shutdown window. killProcess used to
+// cancel the cmd context to deliver SIGTERM, which starts cmd.WaitDelay
+// immediately; a process whose SIGTERM handler needs longer than WaitDelay to
+// finish was force-killed early even though Stop was given a much longer
+// timeout. The fix sends the signal directly so WaitDelay measures from process
+// exit (its inherited-pipe backstop role), leaving the graceful window to the
+// caller's Stop timeout.
+func TestProcessCommand_StopHonorsGracefulTimeout(t *testing.T) {
+	dir := t.TempDir()
+	marker := filepath.Join(dir, "graceful.done")
+	ready := filepath.Join(dir, "trap.ready")
+
+	// On SIGTERM, sleep past the (short) WaitDelay, then write the marker and
+	// exit cleanly. If WaitDelay still drove the kill, bash would be SIGKILLed
+	// mid-handler and the marker would never be written. The ready file is
+	// written only after the trap is installed so the test does not race
+	// SIGTERM ahead of it (CheckEndpoint:none marks ready before bash runs).
+	script := filepath.Join(dir, "graceful.sh")
+	body := fmt.Sprintf(
+		"#!/bin/bash\ncleanup() { sleep 0.6; echo done > %q; exit 0; }\ntrap cleanup SIGTERM\necho ready > %q\nwhile true; do sleep 0.1; done\n",
+		marker, ready,
+	)
+	if err := os.WriteFile(script, []byte(body), 0o755); err != nil {
+		t.Fatalf("WriteFile: %v", err)
+	}
+
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:           script,
+		Proxy:         "http://127.0.0.1:1", // unused: health check disabled
+		CheckEndpoint: "none",
+	})
+	// WaitDelay shorter than the handler's 0.6s sleep, and far shorter than the
+	// Stop timeout below — this is the window the old code mis-killed in.
+	p.waitDelay = 200 * time.Millisecond
+
+	// The script loops forever, so make sure it is torn down even when the
+	// test bails out before reaching the Stop call below. Stop is a no-op
+	// once the process is already stopped, so the success path is unaffected.
+	t.Cleanup(func() { p.Stop(testStopTimeout) })
+
+	runErr := runAsync(t, p)
+
+	// Wait until the trap is installed before stopping.
+	trapDeadline := time.Now().Add(2 * time.Second)
+	for {
+		if _, err := os.Stat(ready); err == nil {
+			break
+		}
+		if time.Now().After(trapDeadline) {
+			t.Fatalf("script did not install SIGTERM trap in time")
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+
+	stopStart := time.Now()
+	if err := p.Stop(5 * time.Second); err != nil {
+		t.Fatalf("Stop: %v", err)
+	}
+	elapsed := time.Since(stopStart)
+
+	// The handler must have run to completion (marker written) rather than
+	// being force-killed at waitDelay.
+	if _, err := os.Stat(marker); err != nil {
+		t.Fatalf("graceful handler did not complete (marker missing): %v", err)
+	}
+	// And Stop must have waited for the handler (>~0.6s), not returned at the
+	// 200ms waitDelay.
+	if elapsed < 500*time.Millisecond {
+		t.Fatalf("Stop returned in %v — process was killed before its graceful handler finished", elapsed)
+	}
+
+	if got := p.State(); got != StateStopped {
+		t.Errorf("after Stop: expected state %s, got %s", StateStopped, got)
+	}
+	// Run blocks until the process terminates, so Stop must have released it.
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Errorf("Run did not return after Stop")
+	}
+}
+
+// TestProcessCommand_StopReapsForkedGrandchild verifies that stopping a forking
+// wrapper takes down the backgrounded grandchild too, rather than leaving it as
+// an orphan. The fix is Setpgid (runtime_unix.go): the wrapper leads its own
+// process group, so the stop signal is delivered to the whole group via the
+// negative PID and reaches the grandchild the wrapper never reaped.
+func TestProcessCommand_StopReapsForkedGrandchild(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	port := getFreePort(t)
+	dir := t.TempDir()
+	pidFile := filepath.Join(dir, "child.pid")
+
+	// Wrapper script: start the responder in the background, record its PID
+	// in pidFile, then wait on it.
+	wrapper := filepath.Join(dir, "wrapper.sh")
+	script := fmt.Sprintf("#!/bin/bash\n%q -port %d -silent &\necho $! > %q\nwait\n",
+		simpleResponderPath, port, pidFile)
+	if err := os.WriteFile(wrapper, []byte(script), 0o755); err != nil {
+		t.Fatalf("WriteFile: %v", err)
+	}
+	// Best-effort kill in case the assertion below fails and the grandchild
+	// is in fact still running.
+	t.Cleanup(func() { killChildFromPidFile(pidFile) })
+
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                wrapper,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+
+	runErr := runAsync(t, p)
+
+	// Read the grandchild PID the wrapper recorded.
+	var childPID int
+	deadline := time.Now().Add(2 * time.Second)
+	for {
+		data, err := os.ReadFile(pidFile)
+		if err == nil {
+			if pid, perr := strconv.Atoi(strings.TrimSpace(string(data))); perr == nil && pid > 0 {
+				childPID = pid
+				break
+			}
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("wrapper did not record grandchild PID")
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop: %v", err)
+	}
+
+	// After Stop the grandchild must be gone. Signal 0 probes liveness without
+	// actually sending a signal; give it a brief window to exit after the
+	// group SIGTERM.
+	proc, err := os.FindProcess(childPID)
+	if err != nil {
+		t.Fatalf("FindProcess: %v", err)
+	}
+	gone := false
+	// Up to 100 probes, 10ms apart.
+	for i := 0; i < 100; i++ {
+		if err := proc.Signal(syscall.Signal(0)); err != nil {
+			gone = true
+			break
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	if !gone {
+		t.Errorf("grandchild PID %d still alive after Stop — process group was not reaped", childPID)
+	}
+
+	// Run blocks until the process terminates, so Stop must have released it.
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Errorf("Run did not return after Stop")
+	}
+}
+
+// killChildFromPidFile is a best-effort cleanup helper: it parses the PID the
+// wrapper script stored in pidFile and kills that process so leaked orphans
+// don't pile up between test runs. Any failure along the way (unreadable
+// file, non-numeric or non-positive PID, lookup error) is silently ignored.
+func killChildFromPidFile(pidFile string) {
+	raw, readErr := os.ReadFile(pidFile)
+	if readErr != nil {
+		return
+	}
+
+	childPID, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
+	if convErr != nil || childPID <= 0 {
+		return
+	}
+
+	if child, findErr := os.FindProcess(childPID); findErr == nil {
+		_ = child.Kill()
+	}
+}
@@ -0,0 +1,646 @@
+package process
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"runtime"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+)
+
+// Timing knobs shared by the process tests in this file.
+const (
+	// testStartTimeout is handed to Run and also bounds the WaitReady wait
+	// in runAsync.
+	testStartTimeout    = 3 * time.Second
+	// testStopTimeout is the timeout the tests pass to Stop.
+	testStopTimeout     = 2 * time.Second
+	// testReturnTimeout bounds how long a test waits for Run to return
+	// once the process has been stopped.
+	testReturnTimeout   = 1 * time.Second
+	// testPollInterval is the sleep between polls of process state.
+	testPollInterval    = 20 * time.Millisecond
+	// testLogPollInterval is the sleep between polls of captured log output.
+	testLogPollInterval = 10 * time.Millisecond
+)
+
+// newProcessCommand builds a ProcessCommand for conf, named after the running
+// test and rooted at context.Background(). Both loggers passed to New share a
+// single writer over io.Discard. A construction error fails the test
+// immediately.
+func newProcessCommand(t *testing.T, conf config.ModelConfig) *ProcessCommand {
+	t.Helper()
+	discard := logmon.NewWriter(io.Discard)
+	proc, newErr := New(context.Background(), t.Name(), conf, discard, discard)
+	if newErr != nil {
+		t.Fatalf("New: %v", newErr)
+	}
+	return proc
+}
+
+// runAsync launches p.Run on its own goroutine and then blocks until
+// WaitReady reports the process ready (or testStartTimeout expires, which
+// fails the test). This matches the interface contract where Run blocks until
+// the process is terminated. The returned channel is buffered and receives
+// Run's eventual error exactly once.
+func runAsync(t *testing.T, p *ProcessCommand) <-chan error {
+	t.Helper()
+
+	runErr := make(chan error, 1)
+	go func() {
+		runErr <- p.Run(testStartTimeout)
+	}()
+
+	readyCtx, cancelReady := context.WithTimeout(context.Background(), testStartTimeout)
+	defer cancelReady()
+
+	waitErr := p.WaitReady(readyCtx)
+	if waitErr != nil {
+		t.Fatalf("WaitReady: %v", waitErr)
+	}
+	return runErr
+}
+
+// TestProcessCommand_StartStop walks one ProcessCommand through a full
+// lifecycle and checks ServeHTTP at each stage: 503 with a "llama-swap-error"
+// body before Run, 200 with the upstream's body while ready, and 503 again
+// once Stop has completed. It also checks the reported state after Run and
+// after Stop, and that Run returns nil after a clean Stop.
+func TestProcessCommand_StartStop(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	cmd, port := simpleResponderCmd(t, "-silent", "-respond hello")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+	// Safety net in case an assertion below aborts the test before Stop.
+	t.Cleanup(func() { p.Stop(testStopTimeout) })
+
+	// The same request is replayed against a fresh recorder at each stage.
+	req := httptest.NewRequest("GET", "/test", nil)
+
+	// before start: no handler
+	rr := httptest.NewRecorder()
+	p.ServeHTTP(rr, req)
+	if rr.Code != http.StatusServiceUnavailable {
+		t.Errorf("before start: expected 503, got %d", rr.Code)
+	}
+	if body := rr.Body.String(); !strings.Contains(body, "llama-swap-error") {
+		t.Errorf("before start: expected body to contain %q, got %q", "llama-swap-error", body)
+	}
+
+	runErr := runAsync(t, p)
+	if got := p.State(); got != StateReady {
+		t.Errorf("after Run: expected state %s, got %s", StateReady, got)
+	}
+
+	// while ready: the request is proxied and the upstream answers "hello"
+	rr = httptest.NewRecorder()
+	p.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Errorf("after Run: expected 200, got %d", rr.Code)
+	}
+	if body := rr.Body.String(); body != "hello" {
+		t.Errorf("expected body %q, got %q", "hello", body)
+	}
+
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() error: %v", err)
+	}
+	if got := p.State(); got != StateStopped {
+		t.Errorf("after Stop: expected state %s, got %s", StateStopped, got)
+	}
+	// Run must unblock with a nil error after a clean Stop.
+	select {
+	case err := <-runErr:
+		if err != nil {
+			t.Errorf("Run() after Stop: expected nil, got %v", err)
+		}
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+
+	// after stop: handler cleared
+	rr = httptest.NewRecorder()
+	p.ServeHTTP(rr, req)
+	if rr.Code != http.StatusServiceUnavailable {
+		t.Errorf("after stop: expected 503, got %d", rr.Code)
+	}
+	if body := rr.Body.String(); !strings.Contains(body, "llama-swap-error") {
+		t.Errorf("after stop: expected body to contain %q, got %q", "llama-swap-error", body)
+	}
+}
+
+// TestProcessCommand_Run_Idempotent verifies that calling Run a second time
+// while the process is already running returns an error rather than starting
+// it again, and that the original Run still returns once Stop is called.
+func TestProcessCommand_Run_Idempotent(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	cmd, port := simpleResponderCmd(t, "-silent")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+	t.Cleanup(func() { p.Stop(testStopTimeout) })
+
+	runErr := runAsync(t, p)
+
+	// The first Run is still blocked in its goroutine; this one must be rejected.
+	if err := p.Run(testStartTimeout); err == nil {
+		t.Error("second Run() while running: expected error, got nil")
+	}
+
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() error: %v", err)
+	}
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+}
+
+// TestProcessCommand_Stop_Idempotent verifies that Stop returns nil in three
+// situations: before Run was ever called, on a running process, and again on
+// a process that has already been stopped.
+func TestProcessCommand_Stop_Idempotent(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	cmd, port := simpleResponderCmd(t, "-silent")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+
+	// Stop on a never-started process must succeed.
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() before Run(): %v", err)
+	}
+
+	runErr := runAsync(t, p)
+
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("first Stop() error: %v", err)
+	}
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+
+	// Stop on an already-stopped process must succeed as well.
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("second Stop() error: %v", err)
+	}
+}
+
+// TestProcessCommand_StopCancelsRun verifies that a Stop sent while Run is
+// executing its health-check loop makes Run return ErrStartAborted.
+//
+// A blocking mock HTTP server is used as the proxy so the test can deterministically
+// know when doStart is inside the health-check loop before issuing Stop.
+//
+// Every wait is bounded so that a regression fails the test with a clear
+// message instead of hanging until the global test timeout.
+func TestProcessCommand_StopCancelsRun(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	healthCheckStarted := make(chan struct{}, 1)
+	mock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Signal that a health check is in-flight, then block until the client
+		// cancels (which happens when Stop cancels the start context).
+		select {
+		case healthCheckStarted <- struct{}{}:
+		default:
+		}
+		<-r.Context().Done()
+		http.Error(w, "mock cancelled", http.StatusServiceUnavailable)
+	}))
+	defer mock.Close()
+
+	// simple-responder is the real process; health checks go to the blocking mock.
+	cmd, _ := simpleResponderCmd(t, "-silent")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              mock.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 30,
+	})
+	// Deferred after mock.Close so it runs first: on a failure path this
+	// stops the process (releasing any in-flight health check) before the
+	// mock server waits for its outstanding requests.
+	defer func() { _ = p.Stop(testStopTimeout) }()
+
+	runErrCh := make(chan error, 1)
+	go func() {
+		runErrCh <- p.Run(testStartTimeout)
+	}()
+
+	// Block until doStart is actually performing a health check, guaranteeing
+	// that Run is in-flight when Stop is called.
+	select {
+	case <-healthCheckStarted:
+	case <-time.After(testStartTimeout):
+		t.Fatal("health check never reached the mock server")
+	}
+
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() error: %v", err)
+	}
+
+	select {
+	case err := <-runErrCh:
+		if !errors.Is(err, ErrStartAborted) {
+			t.Errorf("expected ErrStartAborted from Run, got %v", err)
+		}
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+}
+
+// TestProcessCommand_ParentCtxCancelDuringStart verifies that cancelling the
+// parent context while doStart is health-checking causes the process to
+// transition to StateShutdown promptly, not wait for the health-check timeout.
+//
+// This is the config-reload race: Stop() returns early when parentCtx is
+// already done and never writes to stopCh, so without a parentCtx.Done()
+// case in the inner select, the process would keep loading indefinitely.
+func TestProcessCommand_ParentCtxCancelDuringStart(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	healthCheckStarted := make(chan struct{}, 1)
+	mock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Announce the first in-flight health check, then hold the request
+		// open until the client gives up.
+		select {
+		case healthCheckStarted <- struct{}{}:
+		default:
+		}
+		<-r.Context().Done()
+		http.Error(w, "mock cancelled", http.StatusServiceUnavailable)
+	}))
+	defer mock.Close()
+
+	parentCtx, cancelParent := context.WithCancel(context.Background())
+	// Release the context on every exit path, including early t.Fatal calls.
+	// Calling a CancelFunc more than once is harmless.
+	defer cancelParent()
+
+	logger := logmon.NewWriter(io.Discard)
+	cmd, _ := simpleResponderCmd(t, "-silent")
+	p, err := New(parentCtx, t.Name(), config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              mock.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 60,
+	}, logger, logger)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	runErrCh := make(chan error, 1)
+	go func() { runErrCh <- p.Run(60 * time.Second) }()
+
+	// Bounded wait: fail fast if the health check never starts instead of
+	// hanging until the global test timeout.
+	select {
+	case <-healthCheckStarted:
+	case <-time.After(testStartTimeout):
+		t.Fatal("health check never reached the mock server")
+	}
+
+	// Cancel parent context to simulate a config reload tearing down the old server.
+	cancelParent()
+
+	select {
+	case err := <-runErrCh:
+		// Guard against nil: a nil error is a test failure, not a reason
+		// to panic on err.Error().
+		if err == nil || !strings.Contains(err.Error(), "shutdown") {
+			t.Errorf("Run error = %v, want shutdown error", err)
+		}
+	case <-time.After(5 * time.Second):
+		t.Fatal("process did not shut down within 5s after parent context cancel during start")
+	}
+
+	// Run() may return before the run() goroutine writes StateShutdown;
+	// poll briefly to avoid a spurious race in the assertion.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if p.State() == StateShutdown {
+			break
+		}
+		time.Sleep(testPollInterval)
+	}
+	if got := p.State(); got != StateShutdown {
+		t.Errorf("after cancel: expected StateShutdown, got %s", got)
+	}
+}
+
+// TestProcessCommand_RunStopCycle runs several sequential start/stop pairs,
+// each on a freshly constructed ProcessCommand, to confirm that back-to-back
+// cycles start, serve /health with 200, and stop cleanly.
+func TestProcessCommand_RunStopCycle(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	for i := range 3 {
+		// A new command and process are built for every cycle.
+		cmd, port := simpleResponderCmd(t, "-silent")
+		p := newProcessCommand(t, config.ModelConfig{
+			Cmd:                cmd,
+			Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+			CheckEndpoint:      "/health",
+			HealthCheckTimeout: 10,
+		})
+
+		runErr := runAsync(t, p)
+
+		req := httptest.NewRequest("GET", "/health", nil)
+		rr := httptest.NewRecorder()
+		p.ServeHTTP(rr, req)
+		if rr.Code != http.StatusOK {
+			t.Errorf("cycle %d: expected 200 from /health, got %d", i, rr.Code)
+		}
+
+		if err := p.Stop(testStopTimeout); err != nil {
+			t.Fatalf("cycle %d Stop() error: %v", i, err)
+		}
+		// Run's error value is not asserted here, only that it returns.
+		select {
+		case <-runErr:
+		case <-time.After(testReturnTimeout):
+			t.Fatalf("cycle %d: Run() did not return after Stop", i)
+		}
+	}
+}
+
+// TestProcessCommand_ReverseProxyPanicIsRecovered drives the full proxy path:
+// the upstream responds healthy on /health (so Run completes), then on the
+// actual proxied request it hijacks the connection and closes it mid-body.
+// That upstream EOF makes httputil.ReverseProxy.copyResponse return an error,
+// which panics with http.ErrAbortHandler — the wrapped handlerFn must recover
+// and log the disconnect.
+//
+// Requests are issued through an httptest.NewServer wrapping the process so
+// the panic actually fires (httputil only panics on copy errors when the
+// request carries http.ServerContextKey, which a real server sets).
+//
+// The only assertion is that the proxy log eventually contains the recovery
+// message; the outcome of the client request itself is not checked.
+//
+// see: https://github.com/golang/go/issues/23643
+func TestProcessCommand_ReverseProxyPanicIsRecovered(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/health" {
+			w.WriteHeader(http.StatusOK)
+			return
+		}
+		// Send a Content-Length that promises 100 bytes, deliver only a few,
+		// then slam the connection shut. The reverse proxy will see EOF
+		// before the body is fully copied and panic with ErrAbortHandler.
+		hj, ok := w.(http.Hijacker)
+		if !ok {
+			t.Errorf("upstream: hijack not supported")
+			return
+		}
+		conn, _, err := hj.Hijack()
+		if err != nil {
+			t.Errorf("upstream: hijack: %v", err)
+			return
+		}
+		_, _ = conn.Write([]byte("HTTP/1.1 200 OK\r\nContent-Length: 100\r\nContent-Type: text/plain\r\n\r\npartial"))
+		_ = conn.Close()
+	}))
+	t.Cleanup(upstream.Close)
+
+	// Capture proxy log output so we can assert the recover message was
+	// emitted by handlerFn.
+	logBuf := &syncBuffer{}
+	proxyLogger := logmon.NewWriter(logBuf)
+	procLogger := logmon.NewWriter(io.Discard)
+
+	// The spawned command is only there so a real process exists; the Proxy
+	// URL below sends both health checks and requests to the mock upstream.
+	cmd, _ := simpleResponderCmd(t, "-silent")
+	p, err := New(context.Background(), t.Name(), config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              upstream.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	}, procLogger, proxyLogger)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	t.Cleanup(func() { p.Stop(testStopTimeout) })
+
+	// Run's result channel is not needed; the cleanup above stops the process.
+	_ = runAsync(t, p)
+
+	// Wrap p in an httptest server so requests get http.ServerContextKey
+	// automatically — that is what makes httputil.ReverseProxy raise the panic.
+	front := httptest.NewServer(p)
+	t.Cleanup(front.Close)
+
+	// The request's result is deliberately ignored; close the body only if
+	// a response was actually returned.
+	resp, err := http.Get(front.URL + "/disconnect")
+	if err == nil {
+		resp.Body.Close()
+	}
+
+	// Poll the captured log until the message appears or the budget runs out.
+	const want = "recovered from upstream disconnection"
+	deadline := time.Now().Add(testReturnTimeout)
+	for time.Now().Before(deadline) {
+		if strings.Contains(logBuf.String(), want) {
+			return
+		}
+		time.Sleep(testLogPollInterval)
+	}
+	t.Errorf("expected proxy log to contain %q; got:\n%s", want, logBuf.String())
+}
+
+// syncBuffer wraps a bytes.Buffer with a mutex so that a writer (the logmon
+// output under test) and a reader (the test polling String) can use it from
+// different goroutines.
+type syncBuffer struct {
+	mu  sync.Mutex
+	buf bytes.Buffer
+}
+
+// Write appends chunk to the buffer while holding the lock. It returns
+// whatever the underlying bytes.Buffer.Write returns.
+func (sb *syncBuffer) Write(chunk []byte) (int, error) {
+	sb.mu.Lock()
+	defer sb.mu.Unlock()
+
+	written, err := sb.buf.Write(chunk)
+	return written, err
+}
+
+// String returns everything written so far, read while holding the lock.
+func (sb *syncBuffer) String() string {
+	sb.mu.Lock()
+	defer sb.mu.Unlock()
+
+	contents := sb.buf.String()
+	return contents
+}
+
+// TestProcessCommand_TTL_StopsAfterIdle verifies that a process with a TTL
+// automatically stops itself after the idle timeout has elapsed following its
+// last request. It configures UnloadAfter=1, serves one request, then waits up
+// to 5s for StateStopped and checks that Run returned nil.
+func TestProcessCommand_TTL_StopsAfterIdle(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	// The mock answers 200 to everything, covering both the health check
+	// and the proxied request.
+	mock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	t.Cleanup(mock.Close)
+
+	cmd, _ := simpleResponderCmd(t, "-silent")
+
+	cfg := config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              mock.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+		UnloadAfter:        1, // 1-second TTL
+	}
+	// On Windows an explicit stop command is configured for this test.
+	if runtime.GOOS == "windows" {
+		cfg.CmdStop = "taskkill /f /t /pid ${PID}"
+	}
+
+	p := newProcessCommand(t, cfg)
+
+	runErr := runAsync(t, p)
+	// Only stop manually if the TTL did not already do so.
+	defer func() {
+		if p.State() == StateReady {
+			p.Stop(testStopTimeout)
+		}
+	}()
+
+	if got := p.State(); got != StateReady {
+		t.Fatalf("expected StateReady, got %s", got)
+	}
+
+	// Make one request to prime the last-use timestamp.
+	req := httptest.NewRequest("GET", "/", nil)
+	rr := httptest.NewRecorder()
+	p.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Errorf("expected 200 after request, got %d", rr.Code)
+	}
+
+	// Wait for the TTL goroutine to fire and the process to fully stop.
+	// Poll for StateStopped directly to avoid racing the StateStopping
+	// intermediate state that sits between StateReady and StateStopped.
+	deadline := time.Now().Add(5 * time.Second)
+	for p.State() != StateStopped && time.Now().Before(deadline) {
+		time.Sleep(testPollInterval)
+	}
+
+	if got := p.State(); got != StateStopped {
+		t.Fatalf("TTL did not stop process; state is %s (expected %s)", got, StateStopped)
+	}
+
+	// Run() should have returned nil (clean stop from TTL).
+	select {
+	case err := <-runErr:
+		if err != nil {
+			t.Errorf("Run() after TTL stop: expected nil, got %v", err)
+		}
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after TTL-induced stop")
+	}
+}
+
+// TestProcessCommand_TTL_ResetsOnRequest verifies that steady traffic keeps a
+// TTL-configured process alive: with UnloadAfter=1 it issues a request roughly
+// every 10ms for 1.5s and requires the state to remain StateReady throughout,
+// i.e. the idle timer must restart with each request rather than count from
+// process start.
+func TestProcessCommand_TTL_ResetsOnRequest(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	mock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	t.Cleanup(mock.Close)
+
+	cmd, _ := simpleResponderCmd(t, "-silent")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              mock.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+		UnloadAfter:        1, // 1-second TTL
+	})
+
+	runErr := runAsync(t, p)
+	// Safety net for early exits; a no-op once the explicit Stop below ran.
+	defer func() {
+		if p.State() == StateReady {
+			p.Stop(testStopTimeout)
+		}
+	}()
+
+	// Keep sending requests for 1.5s — past the 1s TTL — and verify
+	// the process never stops while traffic is flowing.
+	stopAt := time.Now().Add(1500 * time.Millisecond)
+	for time.Now().Before(stopAt) {
+		req := httptest.NewRequest("GET", "/", nil)
+		rr := httptest.NewRecorder()
+		p.ServeHTTP(rr, req)
+		if rr.Code != http.StatusOK {
+			t.Errorf("expected 200, got %d", rr.Code)
+		}
+		if p.State() != StateReady {
+			t.Fatalf("process was stopped during active traffic (state=%s)", p.State())
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+
+	if got := p.State(); got != StateReady {
+		t.Fatalf("expected StateReady while traffic was active, got %s", got)
+	}
+
+	// Now stop manually to clean up.
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() error: %v", err)
+	}
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+}
+
+// TestProcessCommand_TTL_ZeroDisables verifies that UnloadAfter=0 does not
+// spawn a TTL goroutine — the process stays ready until explicitly stopped.
+// The check is observational: after one request and a 100ms pause the state
+// must still be StateReady.
+func TestProcessCommand_TTL_ZeroDisables(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	mock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	t.Cleanup(mock.Close)
+
+	cmd, _ := simpleResponderCmd(t, "-silent")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              mock.URL,
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+		UnloadAfter:        0, // disabled
+	})
+
+	runErr := runAsync(t, p)
+	// Safety net for early exits; a no-op once the explicit Stop below ran.
+	defer func() {
+		if p.State() == StateReady {
+			p.Stop(testStopTimeout)
+		}
+	}()
+
+	if got := p.State(); got != StateReady {
+		t.Fatalf("expected StateReady, got %s", got)
+	}
+
+	req := httptest.NewRequest("GET", "/", nil)
+	rr := httptest.NewRecorder()
+	p.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Errorf("expected 200 after request, got %d", rr.Code)
+	}
+
+	// No TTL goroutine is spawned when UnloadAfter=0, so a brief sleep is
+	// enough to confirm the process remains ready.
+	time.Sleep(100 * time.Millisecond)
+
+	if got := p.State(); got != StateReady {
+		t.Fatalf("process was stopped unexpectedly (state=%s) with TTL=0", got)
+	}
+
+	// Cleanly stop.
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop() error: %v", err)
+	}
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+}
+
+// TestProcessCommand_ConcurrentRunStop launches many concurrent run/stop racing
+// pairs to exercise the race detector and verify no deadlocks occur.
+//
+// Each of the 10 iterations builds a fresh process, starts Run and Stop on
+// separate goroutines without ordering them, and then requires Run to return
+// within testStartTimeout. Return values are ignored; only termination matters.
+func TestProcessCommand_ConcurrentRunStop(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	for range 10 {
+		cmd, port := simpleResponderCmd(t, "-silent")
+		cfg := config.ModelConfig{
+			Cmd:                cmd,
+			Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+			CheckEndpoint:      "/health",
+			HealthCheckTimeout: 10,
+		}
+
+		// On Windows an explicit stop command is configured for this test.
+		if runtime.GOOS == "windows" {
+			cfg.CmdStop = "taskkill /f /t /pid ${PID}"
+		}
+
+		p := newProcessCommand(t, cfg)
+
+		// runDone is closed when Run returns, whatever its result.
+		runDone := make(chan struct{})
+		go func() {
+			defer close(runDone)
+			p.Run(testStartTimeout) //nolint: errcheck — one goroutine wins the race
+		}()
+		go func() {
+			p.Stop(testStopTimeout) //nolint: errcheck
+		}()
+
+		// Backstop: the racing Stop may have arrived before Run got on the
+		// channel (making it a no-op), so keep stopping until Run unblocks.
+		deadline := time.After(testStartTimeout)
+		for done := false; !done; {
+			select {
+			case <-runDone:
+				done = true
+			case <-deadline:
+				t.Fatal("Run did not return")
+			case <-time.After(testPollInterval):
+				p.Stop(testStopTimeout) //nolint: errcheck
+			}
+		}
+	}
+}
@@ -0,0 +1,82 @@
+package process
+
+import (
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/event"
+	"github.com/mostlygeek/llama-swap/internal/shared"
+)
+
+// TestProcessCommand_EmitsStateChangeEvents verifies that one start/stop cycle
+// publishes exactly four ProcessStateChangeEvents for this process, in order:
+// stopped->starting, starting->ready, ready->stopping, stopping->stopped. It
+// also rejects any event whose old and new state are equal.
+//
+// Events for other processes are filtered out by name, since the event
+// subscription is not scoped to this test.
+func TestProcessCommand_EmitsStateChangeEvents(t *testing.T) {
+	skipIfNoSimpleResponder(t)
+
+	var mu sync.Mutex
+	var transitions []shared.ProcessStateChangeEvent
+	cancel := event.On(func(e shared.ProcessStateChangeEvent) {
+		if e.ProcessName != t.Name() {
+			return
+		}
+		mu.Lock()
+		transitions = append(transitions, e)
+		mu.Unlock()
+	})
+	defer cancel()
+
+	cmd, port := simpleResponderCmd(t, "-silent", "-respond hello")
+	p := newProcessCommand(t, config.ModelConfig{
+		Cmd:                cmd,
+		Proxy:              fmt.Sprintf("http://127.0.0.1:%d", port),
+		CheckEndpoint:      "/health",
+		HealthCheckTimeout: 10,
+	})
+
+	runErr := runAsync(t, p)
+	if err := p.Stop(testStopTimeout); err != nil {
+		t.Fatalf("Stop: %v", err)
+	}
+	// Bounded wait, matching the other process tests: a Run that never
+	// returns should fail this test rather than hang the whole run.
+	select {
+	case <-runErr:
+	case <-time.After(testReturnTimeout):
+		t.Fatal("Run() did not return after Stop")
+	}
+
+	// Events are delivered asynchronously; give the dispatcher a moment.
+	deadline := time.Now().Add(time.Second)
+	for time.Now().Before(deadline) {
+		mu.Lock()
+		n := len(transitions)
+		mu.Unlock()
+		if n >= 4 {
+			break
+		}
+		time.Sleep(testPollInterval)
+	}
+
+	// Hold the lock for the rest of the test so a late event cannot mutate
+	// the slice while it is being inspected.
+	mu.Lock()
+	defer mu.Unlock()
+
+	for _, e := range transitions {
+		if e.OldState == e.NewState {
+			t.Errorf("emitted no-op transition: %s -> %s", e.OldState, e.NewState)
+		}
+	}
+
+	want := []string{
+		string(StateStopped) + "->" + string(StateStarting),
+		string(StateStarting) + "->" + string(StateReady),
+		string(StateReady) + "->" + string(StateStopping),
+		string(StateStopping) + "->" + string(StateStopped),
+	}
+	got := make([]string, len(transitions))
+	for i, e := range transitions {
+		got[i] = e.OldState + "->" + e.NewState
+	}
+	if len(got) != len(want) {
+		t.Fatalf("transitions = %v, want %v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("transitions = %v, want %v", got, want)
+		}
+	}
+}
@@ -0,0 +1,44 @@
+//go:build !windows
+
+package process
+
+import (
+	"os/exec"
+	"syscall"
+)
+
+// setProcAttributes asks for the upstream to be started in a process group of
+// its own (Setpgid). With that in place the whole process tree can be
+// signalled at once through the negative PID, which is how a forked
+// grandchild — e.g. one left behind by a shell wrapper that backgrounds the
+// real binary and exits — gets reaped instead of lingering as an orphan that
+// keeps the inherited stdout/stderr pipes open.
+//
+// Any SysProcAttr previously set on cmd is replaced.
+func setProcAttributes(cmd *exec.Cmd) {
+	attr := new(syscall.SysProcAttr)
+	attr.Setpgid = true
+	cmd.SysProcAttr = attr
+}
+
+// terminateProcessTree sends SIGTERM to the whole process group led by the
+// command, giving every process in the tree a chance to shut down gracefully.
+// It delegates to signalProcessTree, so it returns nil when cmd is nil or has
+// no started process, and otherwise that function's result.
+func terminateProcessTree(cmd *exec.Cmd) error {
+	return signalProcessTree(cmd, syscall.SIGTERM)
+}
+
+// killProcessTree sends SIGKILL to the whole process group, force-terminating
+// every process in the tree. It delegates to signalProcessTree, so it returns
+// nil when cmd is nil or has no started process, and otherwise that
+// function's result.
+func killProcessTree(cmd *exec.Cmd) error {
+	return signalProcessTree(cmd, syscall.SIGKILL)
+}
+
+// signalProcessTree delivers sig to the process group whose leader is
+// cmd.Process. The child is started with Setpgid, which makes it its own
+// group leader (pgid == pid); signalling -pid therefore reaches the child and
+// every descendant that is still in the group.
+//
+// A nil cmd or a cmd without a started process is a no-op that returns nil.
+// If the group-wide send fails (e.g. the group has already drained), the
+// signal is sent to the child alone and that call's result is returned, so
+// the signal is never silently skipped.
+func signalProcessTree(cmd *exec.Cmd, sig syscall.Signal) error {
+	if cmd == nil {
+		return nil
+	}
+	leader := cmd.Process
+	if leader == nil {
+		return nil
+	}
+
+	groupErr := syscall.Kill(-leader.Pid, sig)
+	if groupErr == nil {
+		return nil
+	}
+	return leader.Signal(sig)
+}
@@ -0,0 +1,53 @@
+//go:build windows
+
+package process
+
+import (
+	"fmt"
+	"os/exec"
+	"syscall"
+)
+
+// setProcAttributes sets platform-specific process attributes. CREATE_NO_WINDOW
+// keeps the upstream from spawning its own console window.
+func setProcAttributes(cmd *exec.Cmd) {
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		HideWindow:    true,
+		CreationFlags: 0x08000000, // CREATE_NO_WINDOW
+	}
+}
+
+// terminateProcessTree requests a graceful shutdown of the whole process tree
+// rooted at cmd.Process. Windows has no SIGTERM or process-group signalling, so
+// we shell out to `taskkill /t`, which walks the child tree by PID — the
+// equivalent of signalling a Unix process group. Without /f, taskkill asks the
+// processes to close rather than force-killing them.
+//
+// Returns nil when cmd is nil or has no started process; otherwise the result
+// of running taskkill.
+func terminateProcessTree(cmd *exec.Cmd) error {
+	return taskkillProcessTree(cmd, false)
+}
+
+// killProcessTree force-terminates the whole process tree rooted at cmd.Process
+// via `taskkill /f /t`, so any descendant that ignored or outlived the graceful
+// request is killed alongside the parent rather than leaked as an orphan.
+//
+// Returns nil when cmd is nil or has no started process; otherwise the result
+// of running taskkill.
+func killProcessTree(cmd *exec.Cmd) error {
+	return taskkillProcessTree(cmd, true)
+}
+
+// taskkillProcessTree runs taskkill against cmd.Process.Pid. The /t flag
+// terminates the process together with any child processes it started, which is
+// the Windows analogue of signalling a Unix process group via its negative PID.
+// When force is true the /f flag force-kills; otherwise taskkill requests a
+// graceful close.
+func taskkillProcessTree(cmd *exec.Cmd, force bool) error {
+	if cmd == nil || cmd.Process == nil {
+		return nil
+	}
+	args := make([]string, 0, 4)
+	if force {
+		args = append(args, "/f")
+	}
+	args = append(args, "/t", "/pid", fmt.Sprintf("%d", cmd.Process.Pid))
+	kill := exec.Command("taskkill", args...)
+	setProcAttributes(kill)
+	return kill.Run()
+}
@@ -0,0 +1,7 @@
+//go:build !windows
+
+package process
+
+// SetupTreeCleanup is a no-op on non-Windows platforms, where upstream process
+// teardown is handled via process-group signalling (see runtime_unix.go).
+// It always returns nil; the error result exists so the signature matches the
+// Windows build of this function.
+func SetupTreeCleanup() error { return nil }
@@ -0,0 +1,50 @@
+//go:build windows
+
+package process
+
+import (
+	"fmt"
+	"unsafe"
+
+	"golang.org/x/sys/windows"
+)
+
+// SetupTreeCleanup assigns the current process to a Windows Job Object
+// configured with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. Upstream processes
+// spawned afterwards are associated with the same job, so when llama-swap exits
+// for any reason — graceful shutdown, a forced second Ctrl+C, or a crash — the
+// OS terminates the whole job and reaps every child instead of leaving orphans
+// behind. It is the parent-side complement to the per-process teardown in
+// runtime_windows.go.
+//
+// The job handle is intentionally leaked for the lifetime of the process: the
+// kill-on-close behaviour fires when the last handle is released, which the OS
+// does when the process exits.
+//
+// It returns a wrapped error naming the Win32 call that failed
+// (CreateJobObject, SetInformationJobObject or AssignProcessToJobObject), or
+// nil once the current process has been assigned to the job.
+func SetupTreeCleanup() error {
+	// Anonymous job object with default security attributes.
+	job, err := windows.CreateJobObject(nil, nil)
+	if err != nil {
+		return fmt.Errorf("CreateJobObject: %w", err)
+	}
+
+	// Only the kill-on-close limit is set; every other field stays zero.
+	info := windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION{
+		BasicLimitInformation: windows.JOBOBJECT_BASIC_LIMIT_INFORMATION{
+			LimitFlags: windows.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
+		},
+	}
+	// The API takes the struct as a raw pointer plus its size in bytes.
+	if _, err := windows.SetInformationJobObject(
+		job,
+		windows.JobObjectExtendedLimitInformation,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+	); err != nil {
+		// Setup failed: close the handle here, since the deliberate leak
+		// only applies to a fully configured job.
+		windows.CloseHandle(job)
+		return fmt.Errorf("SetInformationJobObject: %w", err)
+	}
+
+	if err := windows.AssignProcessToJobObject(job, windows.CurrentProcess()); err != nil {
+		windows.CloseHandle(job)
+		return fmt.Errorf("AssignProcessToJobObject: %w", err)
+	}
+
+	// Success: job is deliberately not closed (see the function comment).
+	return nil
+}
@@ -0,0 +1,39 @@
+package ring
+
+// Buffer is a fixed-capacity ring buffer that keeps the most recently pushed
+// values. Once it is full, every Push overwrites the oldest entry. Buffer does
+// no locking of its own.
+//
+// Construct it with NewBuffer. The zero value is also usable and behaves like
+// a buffer of capacity 1.
+type Buffer[T any] struct {
+	buf  []T // backing storage; len(buf) is the capacity
+	head int // index of the oldest entry in buf
+	size int // number of valid entries, never more than len(buf)
+}
+
+// NewBuffer returns an empty Buffer that holds at most capacity entries.
+// Capacities below 1 are clamped to 1.
+func NewBuffer[T any](capacity int) Buffer[T] {
+	if capacity < 1 {
+		capacity = 1
+	}
+	return Buffer[T]{buf: make([]T, capacity)}
+}
+
+// Push adds v, overwriting the oldest entry when the buffer is full.
+func (r *Buffer[T]) Push(v T) {
+	if len(r.buf) == 0 {
+		// Zero-value Buffer: allocate the same minimum capacity NewBuffer
+		// uses instead of panicking on an empty backing slice.
+		r.buf = make([]T, 1)
+	}
+	n := len(r.buf)
+	if r.size < n {
+		// Still filling up: append after the newest entry.
+		r.buf[(r.head+r.size)%n] = v
+		r.size++
+		return
+	}
+	// Full: replace the oldest entry and advance head past it.
+	r.buf[r.head] = v
+	r.head = (r.head + 1) % n
+}
+
+// Slice returns all entries in insertion order as a new slice. The result
+// does not alias the buffer's storage. An empty buffer yields nil.
+func (r *Buffer[T]) Slice() []T {
+	if r.size == 0 {
+		return nil
+	}
+	out := make([]T, r.size)
+	// The oldest entries run from head to the end of the backing slice. copy
+	// stops at len(out), which also covers a partially filled buffer (where
+	// head is still 0).
+	n := copy(out, r.buf[r.head:])
+	// Anything left wrapped around to the front of the backing slice.
+	copy(out[n:], r.buf[:r.head])
+	return out
+}
@@ -0,0 +1,44 @@
+package ring
+
+import "testing"
+
+const benchCap = 600 // matches default MaxAge/Every (1min / 100ms)
+
+// BenchmarkBuffer_PushNoWrap measures Push while the buffer still has free
+// slots. Each b.Loop iteration rewinds the buffer and then fills it exactly
+// to capacity, so no push ever overwrites an older entry. One iteration is
+// therefore benchCap pushes.
+//
+// b.N is deliberately not used here: the testing package says a benchmark
+// should use either b.Loop or a loop to b.N, not both.
+func BenchmarkBuffer_PushNoWrap(b *testing.B) {
+	buf := NewBuffer[int](benchCap)
+	for b.Loop() {
+		// Rewind so the storage is reused without reallocating.
+		buf.head, buf.size = 0, 0
+		for i := range benchCap {
+			buf.Push(i)
+		}
+	}
+}
+
+// BenchmarkBuffer_PushWrap measures b.N consecutive pushes into one
+// benchCap-sized buffer; once the first benchCap pushes have filled it, every
+// further push overwrites the oldest entry.
+func BenchmarkBuffer_PushWrap(b *testing.B) {
+	ringBuf := NewBuffer[int](benchCap)
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		ringBuf.Push(n)
+	}
+}
+
+// BenchmarkBuffer_Slice measures Slice on a buffer that has been filled to
+// exactly benchCap entries before the timer starts.
+func BenchmarkBuffer_Slice(b *testing.B) {
+	ringBuf := NewBuffer[int](benchCap)
+	for v := 0; v < benchCap; v++ {
+		ringBuf.Push(v)
+	}
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = ringBuf.Slice()
+	}
+}
+
+// BenchmarkBuffer_PushAndSlice measures a mixed workload: one Push per
+// iteration, plus a Slice on every iteration whose index is a multiple of
+// benchCap (including index 0).
+func BenchmarkBuffer_PushAndSlice(b *testing.B) {
+	ringBuf := NewBuffer[int](benchCap)
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		ringBuf.Push(n)
+		if n%benchCap != 0 {
+			continue
+		}
+		_ = ringBuf.Slice()
+	}
+}
@@ -0,0 +1,65 @@
+package ring
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBuffer_EmptySliceIsNil checks that Slice on a buffer nothing was pushed
+// into returns nil.
+func TestBuffer_EmptySliceIsNil(t *testing.T) {
+	empty := NewBuffer[int](4)
+	got := empty.Slice()
+	assert.Nil(t, got)
+}
+
+// TestBuffer_PushBelowCapacity checks that a partially filled buffer returns
+// only the pushed values, in push order.
+func TestBuffer_PushBelowCapacity(t *testing.T) {
+	b := NewBuffer[int](4)
+	for _, v := range []int{1, 2} {
+		b.Push(v)
+	}
+	assert.Equal(t, []int{1, 2}, b.Slice())
+}
+
+// TestBuffer_PushAtCapacity checks that filling the buffer exactly to its
+// capacity keeps every value, in push order.
+func TestBuffer_PushAtCapacity(t *testing.T) {
+	b := NewBuffer[int](3)
+	for _, v := range []int{1, 2, 3} {
+		b.Push(v)
+	}
+	assert.Equal(t, []int{1, 2, 3}, b.Slice())
+}
+
+// TestBuffer_PushOverCapacityEvictsOldest checks that one push beyond
+// capacity drops the oldest value and keeps the rest in push order.
+func TestBuffer_PushOverCapacityEvictsOldest(t *testing.T) {
+	b := NewBuffer[int](3)
+	for _, v := range []int{1, 2, 3, 4} {
+		b.Push(v)
+	}
+	assert.Equal(t, []int{2, 3, 4}, b.Slice())
+}
+
+// TestBuffer_CapacityOne checks that a single-slot buffer always holds just
+// the most recent value.
+func TestBuffer_CapacityOne(t *testing.T) {
+	b := NewBuffer[int](1)
+	for _, v := range []int{1, 2} {
+		b.Push(v)
+	}
+	assert.Equal(t, []int{2}, b.Slice())
+}
+
+// TestBuffer_ZeroCapacityDefaultsToOne checks that a buffer requested with
+// capacity 0 can still accept and return one value.
+func TestBuffer_ZeroCapacityDefaultsToOne(t *testing.T) {
+	b := NewBuffer[int](0)
+	b.Push(42)
+	got := b.Slice()
+	assert.Equal(t, []int{42}, got)
+}
+
+// TestBuffer_SliceReturnsCopy checks that writing to a slice returned by
+// Slice does not change what the buffer returns afterwards.
+func TestBuffer_SliceReturnsCopy(t *testing.T) {
+	b := NewBuffer[int](4)
+	b.Push(10)
+
+	snapshot := b.Slice()
+	snapshot[0] = 99 // mutate the returned slice only
+
+	assert.Equal(t, []int{10}, b.Slice())
+}
+
+// TestBuffer_InsertionOrderPreservedAfterWrap pushes twice the capacity and
+// checks that the surviving second half comes back oldest-first.
+func TestBuffer_InsertionOrderPreservedAfterWrap(t *testing.T) {
+	b := NewBuffer[int](4)
+	for n := range 8 {
+		b.Push(n + 1)
+	}
+	assert.Equal(t, []int{5, 6, 7, 8}, b.Slice())
+}
@@ -0,0 +1,800 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/process"
+)
+
+// shutdownReq is the message Shutdown sends to the run loop over shutdownCh.
+type shutdownReq struct {
+	// timeout is passed to each process's Stop; when it is not positive,
+	// handleShutdown falls back to healthCheckTimeout().
+	timeout time.Duration
+	// respond receives the result once handleShutdown has finished.
+	respond chan error
+}
+
+// unloadReq is the message Unload sends to the run loop over unloadCh.
+type unloadReq struct {
+	// targets lists the model IDs to stop.
+	targets []string
+	// timeout is passed to each targeted process's Stop.
+	timeout time.Duration
+	// respond is closed by handleUnload once it has finished.
+	respond chan struct{}
+}
+
+// handlerReq is one ServeHTTP caller asking the run loop for a handler.
+type handlerReq struct {
+	// model is the ID of the model the caller wants to reach.
+	model string
+	// ctx is the HTTP request's context; grant gives up when it is done.
+	ctx context.Context
+	// respond carries the run loop's answer back to the caller. ServeHTTP
+	// creates it unbuffered.
+	respond chan handlerResp
+	// positionCh carries 1-indexed queue positions to the caller while the
+	// request is parked. ServeHTTP creates it with a buffer of one.
+	positionCh chan int
+}
+
+// handlerResp is the run loop's answer to a handlerReq. ServeHTTP checks
+// err first and only invokes handleFunc when err is nil.
+type handlerResp struct {
+	// handleFunc serves the request against the granted process.
+	handleFunc http.HandlerFunc
+	// err, when non-nil, is reported to the client instead of serving.
+	err error
+}
+
+// swapDone is sent by a swap goroutine (doSwap) on swapDoneCh when the swap
+// has finished.
+type swapDone struct {
+	// modelID is the swap's target model.
+	modelID string
+	// err is the result of waiting for the target to become ready.
+	err error
+}
+
+// serveDoneEvent is sent on serveDoneCh by a trackedServe wrapper after the
+// wrapped handler has returned.
+type serveDoneEvent struct {
+	// modelID is the model whose in-flight count should be decremented.
+	modelID string
+}
+
+// activeSwap is the run loop's record of one swap that has been started and
+// has not yet reported back on swapDoneCh.
+type activeSwap struct {
+	// modelID is the model being brought up.
+	modelID string
+	// evict lists the models this swap stops before starting the target.
+	evict []string
+	// waiters are the requests to answer when the swap completes; the
+	// request that triggered the swap is the first entry.
+	waiters []handlerReq
+}
+
+// swapPlanner is the only piece of behaviour that differs between concrete
+// routers. baseRouter never inspects its internals.
+//
+// Within baseRouter both methods are invoked from the run loop goroutine:
+// EvictionFor from handleRequest and drainQueue, OnSwapStart from startSwap.
+type swapPlanner interface {
+	// EvictionFor returns running model IDs that must be stopped before
+	// target can serve. alsoRunning lists models the baseRouter has already
+	// committed to loading (in-flight swaps) which the planner cannot see
+	// via process.State() yet. Pure decision; must not log.
+	EvictionFor(target string, alsoRunning []string) []string
+
+	// OnSwapStart runs once at the start of every swap. Planners may log
+	// their decision here at whatever verbosity they choose.
+	OnSwapStart(target string)
+}
+
+// baseRouter owns the channels, run-loop, and orchestration code shared by
+// every concrete router. Concrete routers embed *baseRouter and supply a
+// swapPlanner that captures how their eviction set is decided.
+type baseRouter struct {
+	// name prefixes this router's log lines and error messages.
+	// processes is keyed by model ID.
+	name      string
+	config    config.Config
+	processes map[string]process.Process
+	logger    *logmon.Monitor
+	planner   swapPlanner
+
+	// shutdownCtx governs the request machinery: cancelling it tells grant()
+	// and ServeHTTP to stop granting and reject callers. It is deliberately
+	// separate from procCtx — see procCtx below.
+	// shuttingDown is flipped exactly once by Shutdown (CompareAndSwap) and
+	// lets ServeHTTP reject new requests before touching the run loop.
+	shutdownCtx  context.Context
+	shutdownFn   context.CancelFunc
+	shuttingDown atomic.Bool
+
+	// procCtx is the parent context for every managed process and governs
+	// process lifetime only. handleShutdown stops processes gracefully via
+	// Stop() and cancels procCtx afterwards, so teardown is never a context
+	// cancel racing the graceful path (which collapsed the grace to 100ms and
+	// let the caller return before children were reaped — see process run loop).
+	procCtx    context.Context
+	procCancel context.CancelFunc
+
+	// The run loop's inboxes. newBaseRouter creates all five unbuffered and
+	// run() is the only receiver in this file.
+	handlerCh   chan handlerReq
+	shutdownCh  chan shutdownReq
+	unloadCh    chan unloadReq
+	swapDoneCh  chan swapDone
+	serveDoneCh chan serveDoneEvent
+
+	// runDone is closed when run() returns. Unload and Shutdown select on it
+	// so they do not block forever on a run loop that has already exited.
+	runDone chan struct{}
+
+	// testProcessed, when non-nil, receives one event after each handlerReq,
+	// unloadReq, or swapDone has been fully processed by run(). Tests use it
+	// to wait for run() to reach a deterministic state without sleeping.
+	// serveDone events are intentionally NOT signalled here so test event
+	// counts remain stable.
+	testProcessed chan struct{}
+}
+
+// newBaseRouter assembles a baseRouter with fresh shutdown and process
+// contexts and unbuffered channels. It does not start the run loop.
+func newBaseRouter(name string, conf config.Config, processes map[string]process.Process, planner swapPlanner, logger *logmon.Monitor) *baseRouter {
+	r := &baseRouter{
+		name:      name,
+		config:    conf,
+		processes: processes,
+		logger:    logger,
+		planner:   planner,
+
+		handlerCh:   make(chan handlerReq),
+		shutdownCh:  make(chan shutdownReq),
+		unloadCh:    make(chan unloadReq),
+		swapDoneCh:  make(chan swapDone),
+		serveDoneCh: make(chan serveDoneEvent),
+		runDone:     make(chan struct{}),
+	}
+	// Two independent root contexts: one for request handling, one for
+	// process lifetime.
+	r.shutdownCtx, r.shutdownFn = context.WithCancel(context.Background())
+	r.procCtx, r.procCancel = context.WithCancel(context.Background())
+	return r
+}
+
+// notifyProcessed signals testProcessed that the run loop finished handling
+// one event. It is a no-op when no test channel has been installed.
+func (b *baseRouter) notifyProcessed() {
+	if b.testProcessed == nil {
+		return
+	}
+	b.testProcessed <- struct{}{}
+}
+
+// run is the router's event loop. It owns the active-swap map, the per-model
+// in-flight counters and the queue of parked requests as plain locals and
+// hands them to handlers that execute synchronously on this goroutine.
+// The loop ends after the first shutdown request has been handled, closing
+// runDone on the way out.
+func (b *baseRouter) run() {
+	defer close(b.runDone)
+
+	// active: swaps in progress, keyed by target model ID.
+	// inFlight: number of granted handlers per model that have not yet
+	// reported back on serveDoneCh.
+	// queued: requests parked until a swap finishes or a busy model drains.
+	active := make(map[string]*activeSwap)
+	inFlight := make(map[string]int)
+	var queued []handlerReq
+
+	for {
+		select {
+		case req := <-b.shutdownCh:
+			// Shutdown is terminal: handle it and leave the loop.
+			b.handleShutdown(req, active, queued)
+			return
+
+		case req := <-b.handlerCh:
+			b.handleRequest(req, active, inFlight, &queued)
+			b.notifyProcessed()
+
+		case req := <-b.unloadCh:
+			b.handleUnload(req, active, inFlight, &queued)
+			b.notifyProcessed()
+
+		case ev := <-b.swapDoneCh:
+			b.handleSwapDone(ev, active, inFlight, &queued)
+			b.notifyProcessed()
+
+		case ev := <-b.serveDoneCh:
+			// No notifyProcessed here, unlike the other event cases.
+			b.handleServeDone(ev, active, inFlight, &queued)
+		}
+	}
+}
+
+// grant sends a response back to the caller of ServeHTTP and tells us
+// whether the caller was still there to receive it.
+//
+// Each ServeHTTP creates a fresh, UNBUFFERED respond channel and parks in
+// a select waiting on it. "Unbuffered" is the important word: a send only
+// completes when the other side is actively receiving. So if this send
+// succeeds, we know for a fact the caller picked up the response and will
+// act on it. If the caller has already given up (its request context was
+// cancelled, e.g. the HTTP client disconnected) or the router is shutting
+// down, the send never lands, one of the other select cases fires, and we
+// report back that the grant did NOT happen.
+//
+// That distinction matters for in-flight bookkeeping — see grantHandler.
+func (b *baseRouter) grant(req handlerReq, resp handlerResp) bool {
+	// delivered flips to true only if the caller actually received resp.
+	delivered := false
+	select {
+	case <-b.shutdownCtx.Done():
+		// Router is shutting down; nothing was handed over.
+	case <-req.ctx.Done():
+		// Caller gave up before picking up the response.
+	case req.respond <- resp:
+		delivered = true
+	}
+	return delivered
+}
+
+// grantHandler is the "this caller can now use process p" path. It does
+// two things that must stay locked together:
+//
+//  1. Hand the caller a wrapped p.ServeHTTP (via trackedServe) so when the
+//     HTTP request finishes, the run loop hears about it.
+//  2. Bump inFlight[modelID] so the router knows this process is busy and
+//     refuses to evict it until the count comes back down.
+//
+// The increment is gated on grant() returning true. If grant() returns
+// false, the caller already walked away and trackedServe will never run —
+// which means no matching decrement will ever arrive on serveDoneCh.
+// Incrementing in that case would strand the counter at >0 forever and
+// the router would never again be willing to swap this model out.
+//
+// In short: increment if and only if we know a decrement is coming.
+func (b *baseRouter) grantHandler(req handlerReq, modelID string, p process.Process, inFlight map[string]int) {
+	resp := handlerResp{handleFunc: b.trackedServe(modelID, p)}
+	if !b.grant(req, resp) {
+		// The caller never received the handler, so no serveDone event will
+		// follow; counting it would leave the model looking busy forever.
+		return
+	}
+	inFlight[modelID]++
+}
+
+// trackedServe is the wrapper that closes the loop on in-flight tracking.
+// It runs p.ServeHTTP normally; the only added behaviour is a deferred
+// send on serveDoneCh after the handler returns. That send is what tells
+// the run loop "this model now has one fewer request in flight — go look
+// at the queue again, you may be able to start a swap you previously had
+// to defer."
+//
+// The select on shutdownCtx.Done() is a release valve: if the router is
+// already shutting down, nobody is reading serveDoneCh, so we drop the
+// notification rather than blocking the HTTP goroutine forever.
+func (b *baseRouter) trackedServe(modelID string, p process.Process) http.HandlerFunc {
+	// reportDone tells the run loop this request has finished, or gives up
+	// if the router is shutting down.
+	reportDone := func() {
+		select {
+		case <-b.shutdownCtx.Done():
+		case b.serveDoneCh <- serveDoneEvent{modelID: modelID}:
+		}
+	}
+	return func(w http.ResponseWriter, r *http.Request) {
+		defer reportDone()
+		p.ServeHTTP(w, r)
+	}
+}
+
+// handleRequest decides what to do with one incoming ServeHTTP request. It is
+// called from run() and never blocks indefinitely: any work that has to wait
+// (starting a process, stopping siblings, waiting for ready) is deferred to
+// a swap goroutine and reported back via swapDoneCh.
+//
+// The decision tree, in order:
+//
+//  1. Unknown model — respond with ErrNoLocalModelFound and move on.
+//  2. A swap to the same model is already in flight — attach this waiter so
+//     one swap serves all callers that asked for the same model.
+//  3. Fast path — the target process is already ready, the planner sees
+//     nothing to evict, and no in-flight swap is evicting it. Hand back its
+//     ServeHTTP immediately (wrapped so the run loop knows when it ends).
+//  4. Would collide with an in-flight swap (we'd stop their target, or
+//     they're stopping us) — park in the queue for handleSwapDone to drain.
+//  5. Would evict a process that is still handling requests — park in the
+//     queue. handleServeDone will retry when the busy process drains.
+//  6. Otherwise — start a new swap. This may run in parallel with other
+//     active swaps when their evict sets don't intersect.
+func (b *baseRouter) handleRequest(req handlerReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
+	model := req.model
+
+	// (1) Models this router does not own are rejected outright.
+	proc, known := b.processes[model]
+	if !known {
+		b.logger.Debugf("%s: model %s not handled by this router", b.name, model)
+		b.grant(req, handlerResp{err: ErrNoLocalModelFound})
+		return
+	}
+
+	// (2) Piggy-back on a swap that is already loading this model.
+	if swap, joining := active[model]; joining {
+		b.logger.Debugf("%s: joining in-flight swap for model %s (%d waiters)", b.name, model, len(swap.waiters)+1)
+		swap.waiters = append(swap.waiters, req)
+		return
+	}
+
+	evict := b.planner.EvictionFor(model, activeTargets(active, model))
+	// collidesWith only reads its arguments, so one evaluation serves both
+	// the fast-path check and the collision check below.
+	collides := collidesWith(model, evict, active)
+
+	// park appends the request to the queue and refreshes every queued
+	// caller's position.
+	park := func() {
+		*queued = append(*queued, req)
+		b.broadcastQueuePositions(*queued)
+	}
+
+	switch {
+	case proc.State() == process.StateReady && len(evict) == 0 && !collides:
+		// (3) Already ready, nothing to stop, nobody stopping us.
+		b.logger.Debugf("%s: fast-path serving model %s (already ready)", b.name, model)
+		b.grantHandler(req, model, proc, inFlight)
+
+	case collides:
+		// (4) Would interfere with a swap that is under way.
+		b.logger.Debugf("%s: queuing request for model %s (collides with in-flight swap)", b.name, model)
+		park()
+
+	case conflictsWithInFlight(evict, inFlight):
+		// (5) Would stop a process that is still serving requests.
+		b.logger.Debugf("%s: queuing request for model %s (would evict in-flight process)", b.name, model)
+		park()
+
+	default:
+		// (6) Free to go: start a swap, possibly alongside others.
+		b.logger.Debugf("%s: starting swap for model %s, evicting %v", b.name, model, evict)
+		started := b.startSwap(req, evict)
+		active[started.modelID] = started
+	}
+}
+
+// handleSwapDone is called from run() when a swap goroutine reports that it
+// has finished. It fans out the result to every waiter that joined this swap,
+// removes the swap from the active map, and then walks the queue once,
+// promoting any items that no longer collide with the remaining active set.
+// FIFO order is preserved: items still blocked stay in place.
+func (b *baseRouter) handleSwapDone(ev swapDone, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
+	finished, found := active[ev.modelID]
+	if !found {
+		// No record of this swap (it may have been removed by an unload).
+		return
+	}
+	delete(active, ev.modelID)
+
+	if ev.err != nil {
+		// Failed swap: every waiter gets the same error.
+		for _, waiter := range finished.waiters {
+			b.grant(waiter, handlerResp{err: ev.err})
+		}
+	} else {
+		// Successful swap: every waiter gets a tracked handler.
+		proc := b.processes[ev.modelID]
+		for _, waiter := range finished.waiters {
+			b.grantHandler(waiter, ev.modelID, proc, inFlight)
+		}
+	}
+
+	b.drainQueue(active, inFlight, queued)
+}
+
+// handleServeDone is called from run() each time a tracked ServeHTTP
+// finishes. It decrements the per-model in-flight count and, when that
+// drops to zero, retries the queue: requests whose swap was deferred
+// because they would have evicted this (now-idle) process can now proceed.
+func (b *baseRouter) handleServeDone(ev serveDoneEvent, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
+	left := inFlight[ev.modelID] - 1
+	if left > 0 {
+		// Still busy: record the new count and leave the queue alone.
+		inFlight[ev.modelID] = left
+		return
+	}
+	// Model is idle: drop its counter and give parked requests another try.
+	delete(inFlight, ev.modelID)
+	b.drainQueue(active, inFlight, queued)
+}
+
+// drainQueue walks the queued requests in order, re-running the handleRequest
+// decision tree against the (now smaller) active set. Items that can now start
+// or join become satisfied; items still blocked remain queued in original
+// order so they get another chance on the next swap completion.
+func (b *baseRouter) drainQueue(active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
+	if len(*queued) == 0 {
+		return
+	}
+
+	// stillBlocked collects, in original order, the requests that cannot
+	// proceed yet.
+	var stillBlocked []handlerReq
+	for _, req := range *queued {
+		model := req.model
+
+		proc, known := b.processes[model]
+		if !known {
+			b.grant(req, handlerResp{err: ErrNoLocalModelFound})
+			continue
+		}
+
+		if swap, joining := active[model]; joining {
+			b.logger.Debugf("%s: queued request for model %s now joining in-flight swap", b.name, model)
+			swap.waiters = append(swap.waiters, req)
+			continue
+		}
+
+		// active may have grown earlier in this loop, so the plan is
+		// recomputed for every request.
+		evict := b.planner.EvictionFor(model, activeTargets(active, model))
+		collides := collidesWith(model, evict, active)
+
+		switch {
+		case proc.State() == process.StateReady && len(evict) == 0 && !collides:
+			b.logger.Debugf("%s: queued request for model %s now served fast-path", b.name, model)
+			b.grantHandler(req, model, proc, inFlight)
+
+		case collides, conflictsWithInFlight(evict, inFlight):
+			stillBlocked = append(stillBlocked, req)
+
+		default:
+			b.logger.Debugf("%s: queued request for model %s now starting swap, evicting %v", b.name, model, evict)
+			started := b.startSwap(req, evict)
+			active[started.modelID] = started
+		}
+	}
+
+	*queued = stillBlocked
+	b.broadcastQueuePositions(*queued)
+}
+
+// broadcastQueuePositions sends each queued request its current 1-indexed
+// position. Sends are non-blocking: if the channel is full, the old value is
+// drained first so the consumer always sees the latest position.
+func (b *baseRouter) broadcastQueuePositions(queued []handlerReq) {
+	for idx := range queued {
+		ch, pos := queued[idx].positionCh, idx+1
+
+		// First attempt: succeeds when the channel has room.
+		select {
+		case ch <- pos:
+			continue
+		default:
+		}
+
+		// Could not send: discard a stale value if one is buffered, then
+		// try once more. Neither step blocks.
+		select {
+		case <-ch:
+		default:
+		}
+		select {
+		case ch <- pos:
+		default:
+		}
+	}
+}
+
+// startSwap notifies the planner, launches the swap goroutine for
+// initial.model and returns the bookkeeping record for it, with initial as
+// the first waiter. Registering the record in the active map is left to the
+// caller.
+func (b *baseRouter) startSwap(initial handlerReq, evict []string) *activeSwap {
+	target := initial.model
+	b.planner.OnSwapStart(target)
+	go b.doSwap(target, evict)
+	return &activeSwap{
+		modelID: target,
+		evict:   evict,
+		waiters: []handlerReq{initial},
+	}
+}
+
+// activeTargets returns the IDs of every in-flight swap target except exclude.
+// baseRouter passes this to the planner so eviction decisions account for
+// models that have been committed to but have not yet transitioned to
+// StateStarting in their process state machine.
+func activeTargets(active map[string]*activeSwap, exclude string) []string {
+	if len(active) == 0 {
+		return nil
+	}
+	// Sized for the worst case, where exclude is not among the keys.
+	targets := make([]string, 0, len(active))
+	for id := range active {
+		if id != exclude {
+			targets = append(targets, id)
+		}
+	}
+	return targets
+}
+
+// collidesWith reports whether a new swap with this target and evict set
+// conflicts with a currently active swap, i.e. it returns true when the two
+// must NOT run alongside each other. A conflict exists when the new swap
+// would evict another swap's target, or another swap is evicting this target.
+// Same-target callers should JOIN (handled before this) — they do not
+// collide with themselves, so the entry keyed by target is skipped.
+func collidesWith(target string, evict []string, active map[string]*activeSwap) bool {
+	for id, s := range active {
+		if id == target {
+			continue
+		}
+		// The new swap would stop a model another swap is bringing up.
+		if containsString(evict, id) {
+			return true
+		}
+		// Another swap is stopping the model the new swap wants to serve.
+		if containsString(s.evict, target) {
+			return true
+		}
+	}
+	return false
+}
+
+// conflictsWithInFlight reports whether any model in evict is still handling
+// requests. Stopping a busy process would cancel its callers' connections,
+// so the router defers the swap until those callers finish.
+func conflictsWithInFlight(evict []string, inFlight map[string]int) bool {
+	for i := 0; i < len(evict); i++ {
+		// A missing key reads as zero, i.e. not busy.
+		if busy := inFlight[evict[i]]; busy > 0 {
+			return true
+		}
+	}
+	return false
+}
+
+// containsString reports whether s is an element of xs.
+func containsString(xs []string, s string) bool {
+	for i := len(xs) - 1; i >= 0; i-- {
+		if xs[i] == s {
+			return true
+		}
+	}
+	return false
+}
+
+// doSwap performs one swap on its own goroutine (launched by startSwap). It
+// stops every model in toStop in parallel and waits for all of them, starts
+// the target if it is currently stopped, waits for the target to become
+// ready, and finally reports the outcome on swapDoneCh.
+//
+// Stop and Run failures are only logged; the error reported to the run loop
+// is the result of WaitReady.
+func (b *baseRouter) doSwap(modelID string, toStop []string) {
+	// The same timeout is used for stopping the evicted models and for
+	// running the target.
+	timeout := b.healthCheckTimeout()
+
+	var wg sync.WaitGroup
+	for _, mID := range toStop {
+		wg.Add(1)
+		// NOTE(review): b.processes[mID] is not checked for presence. If the
+		// planner ever returns an ID missing from the map, p is the zero
+		// value and (assuming process.Process is an interface) p.Stop
+		// panics — confirm planners only return known IDs.
+		go func(p process.Process, id string) {
+			defer wg.Done()
+			if err := p.Stop(timeout); err != nil {
+				b.logger.Warnf("%s: stopping %s failed: %v", b.name, id, err)
+			}
+		}(b.processes[mID], mID)
+	}
+	// All evictions must finish before the target is started.
+	wg.Wait()
+
+	target := b.processes[modelID]
+	if target.State() == process.StateStopped {
+		// Run is launched in the background; readiness is observed through
+		// WaitReady below rather than through Run's return value.
+		go func() {
+			if err := target.Run(timeout); err != nil {
+				b.logger.Warnf("%s: running %s exited: %v", b.name, modelID, err)
+			}
+		}()
+	}
+
+	// Tied to shutdownCtx so a router shutdown unblocks the wait.
+	err := target.WaitReady(b.shutdownCtx)
+
+	// Report the result, or drop it if the router is shutting down and the
+	// run loop may no longer be receiving.
+	select {
+	case b.swapDoneCh <- swapDone{modelID: modelID, err: err}:
+	case <-b.shutdownCtx.Done():
+	}
+}
+
+// handleShutdown is the run loop's final act. It rejects every parked
+// caller, stops all managed processes gracefully, and only then cancels
+// procCtx and answers the Shutdown caller.
+//
+// req.timeout is the per-process Stop timeout; when it is not positive the
+// health-check timeout is used instead. The function always waits for every
+// Stop call to return before responding, so the Shutdown caller never
+// resumes while processes are still being stopped.
+func (b *baseRouter) handleShutdown(req shutdownReq, active map[string]*activeSwap, queued []handlerReq) {
+	shutdownErr := fmt.Errorf("%s is shutting down", b.name)
+
+	// Cancel shutdownCtx first so any waiter that is currently parked on
+	// its respond channel can exit via its own shutdownCtx.Done() branch.
+	// The grant calls below then either land (waiter happened to receive
+	// before noticing shutdown) or fall through immediately via grant's
+	// shutdownCtx case — either way the waiter sees a non-OK response.
+	// This does NOT touch processes: their lifetime is procCtx, cancelled
+	// only after the graceful Stop() calls below have reaped them.
+	b.shutdownFn()
+
+	for _, s := range active {
+		for _, w := range s.waiters {
+			b.grant(w, handlerResp{err: shutdownErr})
+		}
+	}
+	for _, w := range queued {
+		b.grant(w, handlerResp{err: shutdownErr})
+	}
+
+	stopTimeout := req.timeout
+	if stopTimeout <= 0 {
+		stopTimeout = b.healthCheckTimeout()
+	}
+
+	var wg sync.WaitGroup
+	for id, p := range b.processes {
+		wg.Add(1)
+		go func(id string, p process.Process) {
+			defer wg.Done()
+			if err := p.Stop(stopTimeout); err != nil {
+				b.logger.Warnf("%s failed to stop process %s: %v", b.name, id, err)
+			}
+		}(id, p)
+	}
+
+	// Wait unconditionally for every Stop to return. The previous
+	// select/time.After construct also ended up blocking on completion in
+	// both of its arms, so the timeout never shortened the wait; it only
+	// cost an extra goroutine and a lingering timer. Bounding each stop is
+	// Stop(stopTimeout)'s job.
+	wg.Wait()
+
+	// Every process is stopped (children reaped via Stop()). Cancel procCtx so
+	// the process run-loop goroutines exit; they are already StateStopped, so
+	// this is a clean no-op kill rather than a forced teardown.
+	b.procCancel()
+
+	req.respond <- nil
+}
+
+func (b *baseRouter) healthCheckTimeout() time.Duration {
+	t := time.Duration(b.config.HealthCheckTimeout) * time.Second
+	if t <= 0 {
+		return 30 * time.Second
+	}
+	return t
+}
+
+// Handles reports whether this router manages a process for model.
+func (b *baseRouter) Handles(model string) bool {
+	if _, known := b.processes[model]; known {
+		return true
+	}
+	return false
+}
+
+// ProcessLogger returns the logger of the process registered under modelID.
+// The boolean is false, and the logger nil, when no such process exists.
+func (b *baseRouter) ProcessLogger(modelID string) (*logmon.Monitor, bool) {
+	proc, found := b.processes[modelID]
+	if !found {
+		return nil, false
+	}
+	return proc.Logger(), true
+}
+
+// RunningModels returns the current state of every process that is not stopped
+// or shut down. The processes map keys are fixed at construction and State()
+// is a snapshot, so this is safe to call without the run loop.
+func (b *baseRouter) RunningModels() map[string]process.ProcessState {
+	running := make(map[string]process.ProcessState)
+	for id, p := range b.processes {
+		switch st := p.State(); st {
+		case process.StateStopped, process.StateShutdown:
+			// Not running: leave it out of the result.
+		default:
+			running[id] = st
+		}
+	}
+	return running
+}
+
+// Unload stops the named models, or every running model when none are named.
+// It blocks until each targeted process has stopped.
+//
+// The request is funneled through the run loop so eviction is coordinated
+// with the rest of the router's state: pending swap waiters for an
+// unloaded model are released with an error, queued requests for unloaded
+// models are dropped, and any deferred swaps that were waiting on those
+// models become eligible to start.
+//
+// In-flight requests being served by an unloaded process are not waited
+// for — Stop kills the upstream, those callers see whatever error the
+// reverse proxy surfaces and may retry. Their trackedServe defers fire
+// normally and decrement inFlight as the dying handlers return.
+func (b *baseRouter) Unload(timeout time.Duration, models ...string) {
+	targets := models
+	if len(models) == 0 {
+		// No names given: target every process this router manages.
+		all := make([]string, 0, len(b.processes))
+		for id := range b.processes {
+			all = append(all, id)
+		}
+		targets = all
+	}
+	if len(targets) == 0 {
+		return
+	}
+
+	done := make(chan struct{})
+	select {
+	case <-b.runDone:
+		// The run loop has already exited; there is nobody to ask.
+		return
+	case b.unloadCh <- unloadReq{targets: targets, timeout: timeout, respond: done}:
+	}
+	// handleUnload closes the channel once it has finished.
+	<-done
+}
+
+// handleUnload runs on the run loop in response to an Unload call. It
+// reconciles router-owned state with the impending Stop, then performs
+// the Stop synchronously so callers of Unload remain blocked until each
+// targeted process has actually exited.
+func (b *baseRouter) handleUnload(req unloadReq, active map[string]*activeSwap, inFlight map[string]int, queued *[]handlerReq) {
+	unloadErr := fmt.Errorf("%s: model unloaded", b.name)
+
+	// Set form of req.targets: de-duplicates the IDs and gives O(1) lookups
+	// for the queue filter below.
+	targetSet := make(map[string]bool, len(req.targets))
+	for _, id := range req.targets {
+		targetSet[id] = true
+	}
+
+	// Release waiters of any in-flight swap whose target is being
+	// unloaded. The swap goroutine itself is left to finish on its own;
+	// when its swapDone arrives, handleSwapDone will find no entry in
+	// active and silently drop it.
+	// NOTE(review): handleSwapDone matches purely on model ID. If a new swap
+	// for the same model is started before the orphaned goroutine reports,
+	// that stale swapDone would be applied to the new swap — confirm whether
+	// this window matters in practice.
+	for id := range targetSet {
+		s, ok := active[id]
+		if !ok {
+			continue
+		}
+		for _, w := range s.waiters {
+			b.grant(w, handlerResp{err: unloadErr})
+		}
+		delete(active, id)
+	}
+
+	// Drop queued requests addressed to unloaded models. Requests for
+	// other models stay queued and may benefit from drainQueue at the end.
+	if len(*queued) > 0 {
+		// In-place filter: kept reuses the queue's backing array.
+		kept := (*queued)[:0]
+		for _, w := range *queued {
+			if targetSet[w.model] {
+				b.grant(w, handlerResp{err: unloadErr})
+				continue
+			}
+			kept = append(kept, w)
+		}
+		*queued = kept
+	}
+
+	// Stop the targeted processes. Done synchronously so Unload's caller
+	// can rely on "after Unload returns, the process is stopped". inFlight
+	// is intentionally NOT cleared here: each dying handler will fire its
+	// trackedServe defer and reach handleServeDone in the normal way once
+	// the run loop is free again.
+	var wg sync.WaitGroup
+	for id := range targetSet {
+		p, ok := b.processes[id]
+		if !ok {
+			// Unknown model IDs are ignored.
+			continue
+		}
+		wg.Add(1)
+		go func(id string, p process.Process) {
+			defer wg.Done()
+			if err := p.Stop(req.timeout); err != nil {
+				b.logger.Warnf("%s: unloading %s failed: %v", b.name, id, err)
+			}
+		}(id, p)
+	}
+	// The run loop processes no other events while it waits here.
+	wg.Wait()
+
+	// Removing entries from active above may have unblocked queued
+	// requests that previously collided with the now-cancelled swaps.
+	b.drainQueue(active, inFlight, queued)
+
+	// Closing respond releases the Unload caller.
+	close(req.respond)
+}
+
+// Shutdown asks the run loop to shut the router down and blocks until it
+// has answered. Only the first call does any work; later calls return an
+// "already in progress" error. If the run loop has already exited, Shutdown
+// returns nil without sending anything.
+func (b *baseRouter) Shutdown(timeout time.Duration) error {
+	first := b.shuttingDown.CompareAndSwap(false, true)
+	if !first {
+		return fmt.Errorf("%s shutdown already in progress", b.name)
+	}
+
+	// Buffered so handleShutdown's final send never blocks.
+	result := make(chan error, 1)
+	select {
+	case <-b.runDone:
+		return nil
+	case b.shutdownCh <- shutdownReq{timeout: timeout, respond: result}:
+	}
+	return <-result
+}
+
+// ServeHTTP extracts the request's routing data via FetchContext, asks the
+// run loop for a handler for the requested model, and then either serves
+// the request with that handler or reports an error through SendError.
+//
+// While it waits for the run loop's answer it may stream a loading
+// indicator, including queue-position updates, to the client. Once that
+// wait ends, the stream is shut down via finishLoading before the real
+// handler or SendError runs.
+//
+// When the client's request context is cancelled the method returns without
+// writing a response of its own; when the router is shutting down it writes
+// a "shutting down" error.
+func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
+	// Cheap early rejection that does not involve the run loop.
+	if b.shuttingDown.Load() {
+		SendError(w, req, fmt.Errorf("%s is shutting down", b.name))
+		return
+	}
+
+	data, err := FetchContext(req, b.config)
+	if err != nil {
+		SendError(w, req, err)
+		return
+	}
+
+	hr := handlerReq{
+		model: data.ModelID,
+		ctx:   req.Context(),
+		// Unbuffered: a successful send on respond proves the waiter is
+		// alive and consuming. grant() relies on this to avoid handing a
+		// handleFunc to a cancelled waiter and leaking the inFlight count.
+		respond:    make(chan handlerResp),
+		positionCh: make(chan int, 1),
+	}
+
+	// Hand the request to the run loop, unless the client or the router
+	// goes away first.
+	select {
+	case b.handlerCh <- hr:
+	case <-req.Context().Done():
+		return
+	case <-b.shutdownCtx.Done():
+		SendError(w, req, fmt.Errorf("%s is shutting down", b.name))
+		return
+	}
+
+	// Sampled after the request was handed over, so this is only a
+	// best-effort snapshot; it is used solely to decide whether a loading
+	// stream is worth showing.
+	isModelReady := false
+	if p, ok := b.processes[data.ModelID]; ok {
+		isModelReady = p.State() == process.StateReady
+	}
+	shouldShowLoading := data.Streaming && data.SendLoadingState && isLoadingPath(req.URL.Path) && !isModelReady
+
+	var lw *loadingWriter
+	cancelLoad := func() {}
+	if shouldShowLoading {
+		var swapCtx context.Context
+		swapCtx, cancelLoad = context.WithCancel(req.Context())
+		lw = newLoadingWriter(b.logger, data.ModelID, w, req)
+		go lw.start(swapCtx)
+		// Forward queue positions to the loading writer until the loading
+		// phase is cancelled.
+		go func() {
+			for {
+				select {
+				case pos := <-hr.positionCh:
+					lw.setUpdate(fmt.Sprintf("Queue position: #%d", pos))
+				case <-swapCtx.Done():
+					return
+				}
+			}
+		}()
+	}
+
+	// finishLoading stops the loading stream and fences its goroutine off from
+	// the ResponseWriter before the real handler (or ServeHTTP's return)
+	// reclaims it. release() must run even when waitForCompletion times out:
+	// otherwise a still-streaming goroutine flushes a finalized response and
+	// panics on the recycled *bufio.Writer.
+	finishLoading := func() {
+		cancelLoad()
+		if lw != nil {
+			lw.waitForCompletion(1 * time.Second)
+			lw.release()
+		}
+	}
+
+	// Wait for the run loop's answer. Every arm calls finishLoading first so
+	// nothing below races the loading stream for w.
+	var resp handlerResp
+	select {
+	case resp = <-hr.respond:
+		finishLoading()
+	case <-req.Context().Done():
+		finishLoading()
+		return
+	case <-b.shutdownCtx.Done():
+		finishLoading()
+		SendError(w, req, fmt.Errorf("%s is shutting down", b.name))
+		return
+	}
+
+	if resp.err != nil {
+		SendError(w, req, resp.err)
+		return
+	}
+	resp.handleFunc(w, req)
+}
@@ -0,0 +1,863 @@
+package router
+
+import (
+	"context"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/process"
+)
+
+// stubPlanner is a minimal swapPlanner for tests: it answers every eviction
+// query from a fixed per-target table and performs no logging. This keeps the
+// base-router tests focused on the shared run-loop behaviour instead of the
+// eviction rules of either real router.
+type stubPlanner struct {
+	// evict maps a target model ID to the model IDs reported for eviction.
+	evict map[string][]string
+}
+
+// EvictionFor returns the table entry for target and ignores its second
+// argument. A missing entry, or an unset table, yields nil.
+func (sp *stubPlanner) EvictionFor(target string, _ []string) []string {
+	// Indexing a nil map is safe in Go and produces the zero value, so no
+	// explicit nil check is required here.
+	return sp.evict[target]
+}
+
+// OnSwapStart does nothing.
+func (sp *stubPlanner) OnSwapStart(string) {}
+
+// newTestBase builds a baseRouter named "test" over the supplied processes
+// and planner, attaches a buffered testProcessed channel, and starts the run
+// loop. A cleanup is registered that shuts the router down unless its
+// shuttingDown flag is already set.
+func newTestBase(t *testing.T, processes map[string]process.Process, planner swapPlanner) *baseRouter {
+	t.Helper()
+
+	br := newBaseRouter(
+		"test",
+		config.Config{HealthCheckTimeout: 5},
+		processes,
+		planner,
+		logmon.NewWriter(io.Discard),
+	)
+	br.testProcessed = make(chan struct{}, 64)
+	go br.run()
+
+	t.Cleanup(func() {
+		if br.shuttingDown.Load() {
+			return
+		}
+		_ = br.Shutdown(time.Second)
+	})
+	return br
+}
+
+// TestBaseRouter_RunningModels checks that RunningModels reports the ready
+// and starting processes with their states and leaves out a process that was
+// never started.
+func TestBaseRouter_RunningModels(t *testing.T) {
+	readyProc := newFakeProcess("ready")
+	readyProc.markReady()
+	startingProc := newFakeProcess("starting")
+	startingProc.setState(process.StateStarting)
+	stoppedProc := newFakeProcess("stopped")
+
+	br := newTestBase(t, map[string]process.Process{
+		"ready":    readyProc,
+		"starting": startingProc,
+		"stopped":  stoppedProc,
+	}, &stubPlanner{})
+
+	got := br.RunningModels()
+	if len(got) != 2 {
+		t.Fatalf("running=%v want 2 entries", got)
+	}
+	if state := got["ready"]; state != process.StateReady {
+		t.Errorf("ready state=%q want ready", state)
+	}
+	if state := got["starting"]; state != process.StateStarting {
+		t.Errorf("starting state=%q want starting", state)
+	}
+	if _, present := got["stopped"]; present {
+		t.Errorf("stopped process should be excluded from RunningModels")
+	}
+}
+
+// TestBaseRouter_UnloadAll checks that Unload called without model names
+// leaves every process in the stopped state.
+func TestBaseRouter_UnloadAll(t *testing.T) {
+	procA := newFakeProcess("a")
+	procA.markReady()
+	procC := newFakeProcess("c")
+	procC.markReady()
+
+	br := newTestBase(t, map[string]process.Process{"a": procA, "c": procC}, &stubPlanner{})
+	br.Unload(time.Second)
+
+	stateA, stateC := procA.State(), procC.State()
+	if !(stateA == process.StateStopped && stateC == process.StateStopped) {
+		t.Fatalf("Unload() should stop every process: a=%q c=%q", stateA, stateC)
+	}
+}
+
+// TestBaseRouter_UnloadSpecificModel checks that Unload with an explicit
+// model name stops only that model and leaves the other one ready.
+func TestBaseRouter_UnloadSpecificModel(t *testing.T) {
+	target := newFakeProcess("a")
+	target.markReady()
+	bystander := newFakeProcess("c")
+	bystander.markReady()
+
+	br := newTestBase(t, map[string]process.Process{"a": target, "c": bystander}, &stubPlanner{})
+	br.Unload(time.Second, "a")
+
+	if st := target.State(); st != process.StateStopped {
+		t.Errorf("a should be stopped, got %q", st)
+	}
+	if st := bystander.State(); st != process.StateReady {
+		t.Errorf("c should remain ready, got %q", st)
+	}
+}
+
+// TestBaseRouter_Unload_StopsInParallel checks that Unload issues its Stop
+// calls concurrently instead of one after another. Every fakeProcess.Stop is
+// held on stopBlock, and the blocks are released only once stopStarted has
+// been observed for all three processes, which shows that all three Stops
+// were in flight at the same time.
+func TestBaseRouter_Unload_StopsInParallel(t *testing.T) {
+	procs := []*fakeProcess{
+		newFakeProcess("a"),
+		newFakeProcess("b"),
+		newFakeProcess("c"),
+	}
+	for _, p := range procs {
+		p.markReady()
+		p.stopBlock = make(chan struct{})
+	}
+
+	br := newTestBase(t, map[string]process.Process{
+		"a": procs[0],
+		"b": procs[1],
+		"c": procs[2],
+	}, &stubPlanner{})
+
+	unloaded := make(chan struct{})
+	go func() {
+		br.Unload(time.Second, "a", "b", "c")
+		close(unloaded)
+	}()
+
+	// Every Stop must have begun before any is allowed to finish. A serial
+	// Unload would start only one Stop until its stopBlock was released, so
+	// one of the waits below would run into its timeout.
+	for _, p := range procs {
+		select {
+		case <-p.stopStarted:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("Stop on %s never started — Unload is not parallel", p.id)
+		}
+	}
+
+	// Unblock all Stops; Unload is now free to return.
+	for _, p := range procs {
+		close(p.stopBlock)
+	}
+
+	select {
+	case <-unloaded:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Unload did not return after stops released")
+	}
+
+	for _, p := range procs {
+		if st := p.State(); st != process.StateStopped {
+			t.Errorf("%s state=%q want stopped", p.id, st)
+		}
+		if n := p.stopCalls.Load(); n != 1 {
+			t.Errorf("%s stopCalls=%d want 1", p.id, n)
+		}
+	}
+}
+
+// TestBaseRouter_Unload_ReleasesActiveSwapWaiters checks that Unload also
+// settles the router's own swap bookkeeping: a request still waiting on an
+// in-progress swap to the unloaded model gets a non-OK response instead of
+// being left waiting on a process that is going away.
+func TestBaseRouter_Unload_ReleasesActiveSwapWaiters(t *testing.T) {
+	// autoReady stays false, so the swap parks on WaitReady and Unload can
+	// interrupt it before it completes.
+	procA := newFakeProcess("a")
+
+	br := newTestBase(t, map[string]process.Process{"a": procA}, &stubPlanner{})
+
+	rec := httptest.NewRecorder()
+	served := make(chan struct{})
+	go func() {
+		br.ServeHTTP(rec, newRequest("a"))
+		close(served)
+	}()
+	// The handlerReq has been absorbed and the swap has been started.
+	waitProcessed(t, br.testProcessed, 1)
+	<-procA.runStarted
+
+	br.Unload(time.Second, "a")
+
+	select {
+	case <-served:
+	case <-time.After(2 * time.Second):
+		t.Fatal("ServeHTTP did not return after Unload")
+	}
+	if rec.Code == http.StatusOK {
+		t.Errorf("expected non-OK status after Unload, got %d body=%q", rec.Code, rec.Body.String())
+	}
+	if st := procA.State(); st != process.StateStopped {
+		t.Errorf("a state=%q want stopped", st)
+	}
+}
+
+// TestBaseRouter_Unload_DropsQueuedRequests checks that a request sitting in
+// the queue for a model that gets unloaded is answered with an error, rather
+// than waiting indefinitely on state the router has dropped.
+func TestBaseRouter_Unload_DropsQueuedRequests(t *testing.T) {
+	procA := newFakeProcess("a")
+	procB := newFakeProcess("b")
+	// B's load evicts A, so a B request arriving while A loads must queue.
+	planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
+	br := newTestBase(t, map[string]process.Process{"a": procA, "b": procB}, planner)
+
+	// serve issues a request for model on its own goroutine and returns the
+	// recorder plus a channel that is closed once ServeHTTP has returned.
+	serve := func(model string) (*httptest.ResponseRecorder, <-chan struct{}) {
+		rec := httptest.NewRecorder()
+		finished := make(chan struct{})
+		go func() {
+			br.ServeHTTP(rec, newRequest(model))
+			close(finished)
+		}()
+		return rec, finished
+	}
+
+	// First request: kicks off the swap to A, which parks on WaitReady
+	// because autoReady is false.
+	_, doneA := serve("a")
+	waitProcessed(t, br.testProcessed, 1)
+	<-procA.runStarted
+
+	// Second request: targets B, collides with A's in-flight swap, queues.
+	recB, doneB := serve("b")
+	waitProcessed(t, br.testProcessed, 1)
+
+	// Unloading B must release the queued B request with an error.
+	br.Unload(time.Second, "b")
+
+	select {
+	case <-doneB:
+	case <-time.After(2 * time.Second):
+		t.Fatal("queued B request did not return after Unload(b)")
+	}
+	if recB.Code == http.StatusOK {
+		t.Errorf("queued B request: expected non-OK status, got %d", recB.Code)
+	}
+	if runs := procB.runCalls.Load(); runs != 0 {
+		t.Errorf("b.runCalls=%d want 0 (B should never have been started)", runs)
+	}
+
+	// Let the first request finish so the test tears down cleanly.
+	procA.markReady()
+	select {
+	case <-doneA:
+	case <-time.After(2 * time.Second):
+		t.Fatal("r1 did not complete after a.markReady")
+	}
+}
+
+// TestBaseRouter_FastPath checks that a request for a model that is already
+// ready is served once without Run being called on the process.
+func TestBaseRouter_FastPath(t *testing.T) {
+	procA := newFakeProcess("a")
+	procA.markReady()
+
+	br := newTestBase(t, map[string]process.Process{"a": procA}, &stubPlanner{})
+
+	rec := httptest.NewRecorder()
+	br.ServeHTTP(rec, newRequest("a"))
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", rec.Code, rec.Body.String())
+	}
+	if served := procA.serveCalls.Load(); served != 1 {
+		t.Errorf("serveCalls=%d want 1", served)
+	}
+	if runs := procA.runCalls.Load(); runs != 0 {
+		t.Errorf("runCalls=%d want 0 (fast path should not start)", runs)
+	}
+}
+
+// TestBaseRouter_OnDemandStart checks that a request for a model that is not
+// yet running triggers exactly one Run and is then served once.
+func TestBaseRouter_OnDemandStart(t *testing.T) {
+	procA := newFakeProcess("a")
+	procA.autoReady = true
+
+	br := newTestBase(t, map[string]process.Process{"a": procA}, &stubPlanner{})
+
+	rec := httptest.NewRecorder()
+	br.ServeHTTP(rec, newRequest("a"))
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", rec.Code, rec.Body.String())
+	}
+	if runs := procA.runCalls.Load(); runs != 1 {
+		t.Errorf("runCalls=%d want 1", runs)
+	}
+	if served := procA.serveCalls.Load(); served != 1 {
+		t.Errorf("serveCalls=%d want 1", served)
+	}
+}
+
+// TestBaseRouter_ConcurrentSameModel checks that several concurrent requests
+// for the same unloaded model share one swap: Run is called once and every
+// request is served with 200.
+func TestBaseRouter_ConcurrentSameModel(t *testing.T) {
+	// autoReady stays false so the swap parks on WaitReady until released.
+	procA := newFakeProcess("a")
+
+	br := newTestBase(t, map[string]process.Process{"a": procA}, &stubPlanner{})
+
+	const requests = 5
+	statuses := make([]int, requests)
+	var pending sync.WaitGroup
+	pending.Add(requests)
+	for i := range statuses {
+		go func(slot int) {
+			defer pending.Done()
+			rec := httptest.NewRecorder()
+			br.ServeHTTP(rec, newRequest("a"))
+			statuses[slot] = rec.Code
+		}(i)
+	}
+
+	// run() has absorbed every handlerReq and the swap goroutine has reached
+	// Run() before the process is released.
+	waitProcessed(t, br.testProcessed, requests)
+	<-procA.runStarted
+	procA.markReady()
+	pending.Wait()
+
+	for i, status := range statuses {
+		if status != http.StatusOK {
+			t.Errorf("request %d: status=%d", i, status)
+		}
+	}
+	if runs := procA.runCalls.Load(); runs != 1 {
+		t.Errorf("runCalls=%d want 1 (single swap should issue one Run)", runs)
+	}
+	if served := procA.serveCalls.Load(); served != requests {
+		t.Errorf("serveCalls=%d want %d", served, requests)
+	}
+}
+
+// TestBaseRouter_ContextCancel checks that cancelling one waiter's request
+// context makes its ServeHTTP return promptly, while a second waiter on the
+// same swap is still served with 200 once the process becomes ready.
+func TestBaseRouter_ContextCancel(t *testing.T) {
+	// autoReady stays false so the swap parks until markReady is called.
+	procA := newFakeProcess("a")
+
+	br := newTestBase(t, map[string]process.Process{"a": procA}, &stubPlanner{})
+
+	cancellable, abort := context.WithCancel(context.Background())
+	recCancelled := httptest.NewRecorder()
+	cancelledDone := make(chan struct{})
+	go func() {
+		br.ServeHTTP(recCancelled, newRequestCtx(cancellable, "a"))
+		close(cancelledDone)
+	}()
+
+	recSurvivor := httptest.NewRecorder()
+	survivorDone := make(chan struct{})
+	go func() {
+		br.ServeHTTP(recSurvivor, newRequest("a"))
+		close(survivorDone)
+	}()
+
+	// Both requests have joined the active swap.
+	waitProcessed(t, br.testProcessed, 2)
+	<-procA.runStarted
+
+	abort()
+	select {
+	case <-cancelledDone:
+	case <-time.After(time.Second):
+		t.Fatal("cancelled ServeHTTP did not return after ctx cancel")
+	}
+
+	procA.markReady()
+	select {
+	case <-survivorDone:
+	case <-time.After(time.Second):
+		t.Fatal("non-cancelled ServeHTTP did not complete after swap")
+	}
+	if recSurvivor.Code != http.StatusOK {
+		t.Errorf("second request status=%d body=%q", recSurvivor.Code, recSurvivor.Body.String())
+	}
+}
+
+// TestBaseRouter_QueuedDifferentModel verifies that a request for B, whose
+// load evicts A, is queued while A's swap is still in flight, starts its own
+// swap only after A's swap has completed, and stops A exactly once.
+func TestBaseRouter_QueuedDifferentModel(t *testing.T) {
+	a := newFakeProcess("a")
+	pb := newFakeProcess("b")
+
+	// Loading b must stop a.
+	planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
+	b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, planner)
+
+	// First request starts a swap to A; A's autoReady=false so it parks.
+	w1 := httptest.NewRecorder()
+	done1 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w1, newRequest("a"))
+		close(done1)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+	// Bounded wait: fail with a message instead of hanging the test binary
+	// if the swap never reaches Run.
+	select {
+	case <-a.runStarted:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Run on a never started")
+	}
+
+	// Second request for B should queue while A's swap is in flight.
+	w2 := httptest.NewRecorder()
+	done2 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w2, newRequest("b"))
+		close(done2)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+
+	if got := pb.runCalls.Load(); got != 0 {
+		t.Errorf("b started early: runCalls=%d want 0 while A's swap is pending", got)
+	}
+
+	// Release A's swap. B's swap should then run.
+	a.markReady()
+	waitProcessed(t, b.testProcessed, 1) // swapDone for A → B's swap kicked off
+	select {
+	case <-pb.runStarted:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Run on b never started after A's swap completed")
+	}
+
+	select {
+	case <-done1:
+	case <-time.After(time.Second):
+		t.Fatal("A request did not complete")
+	}
+	pb.markReady()
+	select {
+	case <-done2:
+	case <-time.After(time.Second):
+		t.Fatal("queued B request did not complete after A's swap")
+	}
+	if w2.Code != http.StatusOK {
+		t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String())
+	}
+	if got := a.stopCalls.Load(); got != 1 {
+		t.Errorf("a.stopCalls=%d want 1 (B's swap must stop A)", got)
+	}
+}
+
+// TestBaseRouter_QueueCollation checks that requests arriving in the order
+// a, b, c, a, b, c are collapsed into three swaps, one per model. The second
+// request for each model must not trigger a swap of its own: it either joins
+// the active swap or is pulled from the queue when handleSwapDone promotes
+// the next model.
+func TestBaseRouter_QueueCollation(t *testing.T) {
+	procA := newFakeProcess("a")
+	procB := newFakeProcess("b")
+	procC := newFakeProcess("c")
+
+	// Every model evicts the other two, making all swaps mutually exclusive.
+	planner := &stubPlanner{evict: map[string][]string{
+		"a": {"b", "c"},
+		"b": {"a", "c"},
+		"c": {"a", "b"},
+	}}
+	br := newTestBase(t, map[string]process.Process{"a": procA, "b": procB, "c": procC}, planner)
+
+	var (
+		finishedMu sync.Mutex
+		finished   []string
+	)
+
+	var pending sync.WaitGroup
+	for _, model := range []string{"a", "b", "c", "a", "b", "c"} {
+		pending.Add(1)
+		go func(id string) {
+			defer pending.Done()
+			rec := httptest.NewRecorder()
+			br.ServeHTTP(rec, newRequest(id))
+			if rec.Code != http.StatusOK {
+				t.Errorf("%s: status=%d body=%q", id, rec.Code, rec.Body.String())
+				return
+			}
+			finishedMu.Lock()
+			defer finishedMu.Unlock()
+			finished = append(finished, id)
+		}(model)
+		// Let run() absorb this request before the next one is launched, so
+		// handlerCh sees them in launch order.
+		waitProcessed(t, br.testProcessed, 1)
+	}
+
+	// All six requests are now held by run() as waiters or queue entries.
+	// Release the swaps one at a time, waiting for each promotion.
+	<-procA.runStarted
+	procA.markReady()
+	waitProcessed(t, br.testProcessed, 1) // swapDone(a) → b swap kicked off
+
+	<-procB.runStarted
+	procB.markReady()
+	waitProcessed(t, br.testProcessed, 1) // swapDone(b) → c swap kicked off
+
+	<-procC.runStarted
+	procC.markReady()
+	pending.Wait()
+
+	if total := len(finished); total != 6 {
+		t.Fatalf("completed=%v want 6", finished)
+	}
+
+	// Responses are fanned out grouped by model, but the waiter goroutines
+	// can be scheduled in any order afterwards, so completion order is not
+	// deterministic. Per-model counts, together with the runCalls checks
+	// below, are enough to show each pair collapsed into a single swap.
+	perModel := make(map[string]int)
+	for _, id := range finished {
+		perModel[id]++
+	}
+	if perModel["a"] != 2 || perModel["b"] != 2 || perModel["c"] != 2 {
+		t.Errorf("per-model counts: a=%d b=%d c=%d, want 2 each (order=%v)", perModel["a"], perModel["b"], perModel["c"], finished)
+	}
+
+	// One swap per model: the second request for each model must have joined
+	// an active swap or a queued sibling instead of causing another Run.
+	if runs := procA.runCalls.Load(); runs != 1 {
+		t.Errorf("a.runCalls=%d want 1", runs)
+	}
+	if runs := procB.runCalls.Load(); runs != 1 {
+		t.Errorf("b.runCalls=%d want 1", runs)
+	}
+	if runs := procC.runCalls.Load(); runs != 1 {
+		t.Errorf("c.runCalls=%d want 1", runs)
+	}
+}
+
+// TestBaseRouter_ConcurrentDisjointSwaps verifies that two requests with
+// non-conflicting evict sets are loaded in parallel: both Run() calls happen
+// before either process is marked ready. Neither process may be stopped.
+func TestBaseRouter_ConcurrentDisjointSwaps(t *testing.T) {
+	a := newFakeProcess("a")
+	pb := newFakeProcess("b")
+
+	// Empty evict sets for both: they can load in parallel.
+	b := newTestBase(t, map[string]process.Process{"a": a, "b": pb}, &stubPlanner{})
+
+	w1 := httptest.NewRecorder()
+	done1 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w1, newRequest("a"))
+		close(done1)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+
+	w2 := httptest.NewRecorder()
+	done2 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w2, newRequest("b"))
+		close(done2)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+
+	// Both swaps must have reached Run() before either is marked ready —
+	// proves they ran in parallel rather than serializing. The waits are
+	// bounded so that a serialized implementation fails with a message
+	// instead of blocking the test binary until the global timeout.
+	for _, p := range []*fakeProcess{a, pb} {
+		select {
+		case <-p.runStarted:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("Run on %s never started — disjoint swaps did not load in parallel", p.id)
+		}
+	}
+
+	a.markReady()
+	pb.markReady()
+
+	select {
+	case <-done1:
+	case <-time.After(time.Second):
+		t.Fatal("request A did not complete")
+	}
+	select {
+	case <-done2:
+	case <-time.After(time.Second):
+		t.Fatal("request B did not complete")
+	}
+
+	if w1.Code != http.StatusOK {
+		t.Errorf("A status=%d body=%q", w1.Code, w1.Body.String())
+	}
+	if w2.Code != http.StatusOK {
+		t.Errorf("B status=%d body=%q", w2.Code, w2.Body.String())
+	}
+	if got := a.stopCalls.Load(); got != 0 {
+		t.Errorf("a.stopCalls=%d want 0 (parallel swap, no eviction)", got)
+	}
+	if got := pb.stopCalls.Load(); got != 0 {
+		t.Errorf("b.stopCalls=%d want 0 (parallel swap, no eviction)", got)
+	}
+}
+
+// TestBaseRouter_QueueDrainPromotesMultiple verifies that completing one swap
+// unblocks every queued request that no longer collides — they all start in
+// parallel rather than one-per-completion.
+func TestBaseRouter_QueueDrainPromotesMultiple(t *testing.T) {
+	a := newFakeProcess("a")
+	pb := newFakeProcess("b")
+	pc := newFakeProcess("c")
+
+	// A's swap evicts both B and C, so B and C must queue. Once A finishes
+	// B and C themselves have empty evict sets, so they can start together.
+	planner := &stubPlanner{evict: map[string][]string{
+		"a": {"b", "c"},
+	}}
+	b := newTestBase(t, map[string]process.Process{"a": a, "b": pb, "c": pc}, planner)
+
+	w1 := httptest.NewRecorder()
+	done1 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w1, newRequest("a"))
+		close(done1)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+	// Bounded wait: fail with a message instead of hanging the test binary
+	// if A's swap never reaches Run.
+	select {
+	case <-a.runStarted:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Run on a never started")
+	}
+
+	// B and C arrive while A is loading. evict_b and evict_c are empty,
+	// but collidesWith returns true because they appear in A's evict set.
+	w2 := httptest.NewRecorder()
+	done2 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w2, newRequest("b"))
+		close(done2)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+
+	w3 := httptest.NewRecorder()
+	done3 := make(chan struct{})
+	go func() {
+		b.ServeHTTP(w3, newRequest("c"))
+		close(done3)
+	}()
+	waitProcessed(t, b.testProcessed, 1)
+
+	if got := pb.runCalls.Load(); got != 0 {
+		t.Errorf("b started early: runCalls=%d", got)
+	}
+	if got := pc.runCalls.Load(); got != 0 {
+		t.Errorf("c started early: runCalls=%d", got)
+	}
+
+	// Release A. The swapDone handler should drain the queue and start
+	// both B and C in parallel.
+	a.markReady()
+	waitProcessed(t, b.testProcessed, 1) // swapDone(A) → drainQueue starts B and C
+	// Both must reach Run before either is marked ready. A drain that
+	// promoted only one request per completion would trip the timeout here
+	// rather than blocking forever.
+	for _, p := range []*fakeProcess{pb, pc} {
+		select {
+		case <-p.runStarted:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("Run on %s never started — queue drain did not promote it", p.id)
+		}
+	}
+
+	pb.markReady()
+	pc.markReady()
+
+	for i, ch := range []chan struct{}{done1, done2, done3} {
+		select {
+		case <-ch:
+		case <-time.After(time.Second):
+			t.Fatalf("request %d did not complete", i)
+		}
+	}
+}
+
+// TestBaseRouter_Shutdown_FailsAllInFlight checks that Shutdown answers every
+// outstanding request with a non-200 status: both the waiters attached to
+// each active swap and the request still sitting in the queue.
+func TestBaseRouter_Shutdown_FailsAllInFlight(t *testing.T) {
+	procA := newFakeProcess("a")
+	procB := newFakeProcess("b")
+	procC := newFakeProcess("c")
+
+	// a and b have empty evict sets and load side by side; c evicts both
+	// and therefore collides with each of them.
+	planner := &stubPlanner{evict: map[string][]string{
+		"c": {"a", "b"},
+	}}
+	br := newTestBase(t, map[string]process.Process{"a": procA, "b": procB, "c": procC}, planner)
+
+	const waitersPer = 2
+	var (
+		pending    sync.WaitGroup
+		statusesMu sync.Mutex
+	)
+	statuses := make([]int, 0, 2*waitersPer+1)
+
+	// launch sends one request for model on its own goroutine and records
+	// the resulting status code.
+	launch := func(model string) {
+		pending.Add(1)
+		go func() {
+			defer pending.Done()
+			rec := httptest.NewRecorder()
+			br.ServeHTTP(rec, newRequest(model))
+			statusesMu.Lock()
+			statuses = append(statuses, rec.Code)
+			statusesMu.Unlock()
+		}()
+	}
+
+	// Two waiters on each of the active swaps for a and b.
+	for _, model := range []string{"a", "b"} {
+		for i := 0; i < waitersPer; i++ {
+			launch(model)
+			waitProcessed(t, br.testProcessed, 1)
+		}
+	}
+	// c collides with both active swaps and is queued.
+	launch("c")
+	waitProcessed(t, br.testProcessed, 1)
+
+	<-procA.runStarted
+	<-procB.runStarted
+
+	if err := br.Shutdown(time.Second); err != nil {
+		t.Fatalf("Shutdown: %v", err)
+	}
+	pending.Wait()
+
+	statusesMu.Lock()
+	defer statusesMu.Unlock()
+	if len(statuses) != 2*waitersPer+1 {
+		t.Fatalf("got %d responses, want %d", len(statuses), 2*waitersPer+1)
+	}
+	for i, status := range statuses {
+		if status == http.StatusOK {
+			t.Errorf("response %d: status=%d, want non-200 (shutdown)", i, status)
+		}
+	}
+}
+
+// TestBaseRouter_NoSwapWhileServing checks that a loaded model which is
+// still handling a request is not stopped in order to satisfy a swap to a
+// different model.
+//
+// Sequence:
+//  1. r1 (A) — A loads; its ServeHTTP is entered and held on serveBlock.
+//  2. r2 (B, planner: B evicts A) — must NOT trigger A.Stop while r1 is live.
+//  3. r3 (A) — arrives after r2; the existing code queues it because B's
+//     swap intent collides with A.
+//  4. r1 released — A finishes r1, then r3 is served by A.
+//  5. B's swap then proceeds; r2 is served by B.
+//
+// fakeProcess.stoppedWhileServing becomes true if Stop is ever called while
+// a ServeHTTP is in flight, giving a direct, race-free signal of a violation.
+func TestBaseRouter_NoSwapWhileServing(t *testing.T) {
+	// autoReady is left false for A: markReady is called by hand once
+	// runStarted has been seen, so autoReady's setState(Ready) cannot race
+	// with a later Stop and leave A in Ready, which would hide the bug.
+	procA := newFakeProcess("a")
+	procA.serveBlock = make(chan struct{})
+	// B likewise keeps autoReady false so its swap parks on WaitReady until
+	// the test decides to release it.
+	procB := newFakeProcess("b")
+
+	planner := &stubPlanner{evict: map[string][]string{"b": {"a"}}}
+	br := newTestBase(t, map[string]process.Process{"a": procA, "b": procB}, planner)
+
+	// r1 — loads A and enters its ServeHTTP, which blocks on serveBlock.
+	rec1 := httptest.NewRecorder()
+	r1Done := make(chan struct{})
+	go func() {
+		br.ServeHTTP(rec1, newRequest("a"))
+		close(r1Done)
+	}()
+	waitProcessed(t, br.testProcessed, 1) // handlerReq for r1
+	<-procA.runStarted
+	procA.markReady()
+	waitProcessed(t, br.testProcessed, 1) // swapDone for A
+	<-procA.serveStarted
+
+	// r2 — would evict A; A must not be stopped while r1 is in flight.
+	rec2 := httptest.NewRecorder()
+	r2Done := make(chan struct{})
+	go func() {
+		br.ServeHTTP(rec2, newRequest("b"))
+		close(r2Done)
+	}()
+	waitProcessed(t, br.testProcessed, 1)
+
+	// r3 — a further request for A that arrives behind r2 and queues,
+	// because B's swap intent (which evicts A) is recorded as active.
+	rec3 := httptest.NewRecorder()
+	r3Done := make(chan struct{})
+	go func() {
+		br.ServeHTTP(rec3, newRequest("a"))
+		close(r3Done)
+	}()
+	waitProcessed(t, br.testProcessed, 1)
+
+	// Release r1 (and r3, should it be fast-pathed onto the still-loaded A).
+	// The router has to hold B's swap back until A has drained.
+	close(procA.serveBlock)
+
+	select {
+	case <-r1Done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("r1 did not complete after serveBlock release")
+	}
+
+	// B must reach Run before it is marked ready: calling markReady first
+	// would bypass the Run path and leave procB.runCalls at 0. Whether B's
+	// swap starts only after A has drained or has already started by now,
+	// runStarted fires either way.
+	<-procB.runStarted
+	procB.markReady()
+
+	select {
+	case <-r2Done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("r2 did not complete after B marked ready")
+	}
+	select {
+	case <-r3Done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("r3 did not complete")
+	}
+
+	allOK := rec1.Code == http.StatusOK && rec2.Code == http.StatusOK && rec3.Code == http.StatusOK
+	if !allOK {
+		t.Fatalf("statuses: w1=%d w2=%d w3=%d", rec1.Code, rec2.Code, rec3.Code)
+	}
+	if body := rec1.Body.String(); body != "ok:a" {
+		t.Errorf("r1 body=%q want ok:a", body)
+	}
+	if body := rec3.Body.String(); body != "ok:a" {
+		t.Errorf("r3 body=%q want ok:a (r3 must be served by A)", body)
+	}
+	if body := rec2.Body.String(); body != "ok:b" {
+		t.Errorf("r2 body=%q want ok:b", body)
+	}
+	if procA.stoppedWhileServing.Load() {
+		t.Errorf("A.Stop was called while A was still handling a request — the router swapped out a busy process")
+	}
+}
+
+// TestBaseRouter_ModelNotFound checks that a request naming a model the
+// router does not know is answered with 404.
+func TestBaseRouter_ModelNotFound(t *testing.T) {
+	known := newFakeProcess("a")
+	br := newTestBase(t, map[string]process.Process{"a": known}, &stubPlanner{})
+
+	rec := httptest.NewRecorder()
+	br.ServeHTTP(rec, newRequest("unknown"))
+
+	if status := rec.Code; status != http.StatusNotFound {
+		t.Errorf("status=%d want %d body=%q", status, http.StatusNotFound, rec.Body.String())
+	}
+}
+
+// TestBaseRouter_Shutdown_StopsAllProcesses checks that Shutdown stops each
+// process exactly once, that a request arriving afterwards is answered with
+// 500 or 503, and that a second Shutdown call returns an error.
+func TestBaseRouter_Shutdown_StopsAllProcesses(t *testing.T) {
+	procA := newFakeProcess("a")
+	procA.markReady()
+	go procA.Run(0)
+	procB := newFakeProcess("b")
+	procB.markReady()
+	go procB.Run(0)
+
+	br := newTestBase(t, map[string]process.Process{"a": procA, "b": procB}, &stubPlanner{})
+
+	if err := br.Shutdown(time.Second); err != nil {
+		t.Fatalf("Shutdown: %v", err)
+	}
+	if stops := procA.stopCalls.Load(); stops != 1 {
+		t.Errorf("a.stopCalls=%d want 1", stops)
+	}
+	if stops := procB.stopCalls.Load(); stops != 1 {
+		t.Errorf("b.stopCalls=%d want 1", stops)
+	}
+
+	// A request made after shutdown must be rejected with 500 or 503.
+	rec := httptest.NewRecorder()
+	br.ServeHTTP(rec, newRequest("a"))
+	rejected := rec.Code == http.StatusInternalServerError || rec.Code == http.StatusServiceUnavailable
+	if !rejected {
+		t.Errorf("post-shutdown status=%d want 5xx body=%q", rec.Code, rec.Body.String())
+	}
+
+	// Calling Shutdown again must fail rather than succeed silently.
+	if err := br.Shutdown(0); err == nil {
+		t.Errorf("second Shutdown returned nil, want error")
+	}
+}
@@ -0,0 +1,112 @@
+package router
+
+import (
+	"fmt"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/process"
+)
+
+// Group is a router whose eviction decisions come from the static group
+// configuration. All of its behaviour is provided by the embedded baseRouter.
+type Group struct {
+	*baseRouter
+}
+
+// NewGroup builds a Group from conf: it maps every group member to its group,
+// creates one process per member, and starts the base router's run loop.
+//
+// It returns an error when a model is listed in more than one group, when a
+// member has no model config, or when creating a process fails. In the latter
+// two cases the base router's shutdownFn and procCancel are invoked before
+// returning.
+func NewGroup(conf config.Config, proxylog, upstreamlog *logmon.Monitor) (*Group, error) {
+	// Invert the group → members configuration, rejecting any model that
+	// appears under two groups.
+	groupOf := make(map[string]string)
+	for groupID, groupCfg := range conf.Groups {
+		for _, modelID := range groupCfg.Members {
+			if prev, dup := groupOf[modelID]; dup {
+				return nil, fmt.Errorf("model %q is in multiple groups: %q and %q", modelID, prev, groupID)
+			}
+			groupOf[modelID] = groupID
+		}
+	}
+
+	// The planner and the base router share one process map, which is
+	// filled in below.
+	procs := make(map[string]process.Process, len(groupOf))
+	planner := &groupPlanner{
+		config:       conf,
+		modelToGroup: groupOf,
+		processes:    procs,
+	}
+	base := newBaseRouter("group", conf, procs, planner, proxylog)
+
+	// abort tears down the partially built router and reports err.
+	abort := func(err error) (*Group, error) {
+		base.shutdownFn()
+		base.procCancel()
+		return nil, err
+	}
+
+	for modelID := range groupOf {
+		modelCfg, _, found := conf.FindConfig(modelID)
+		if !found {
+			return abort(fmt.Errorf("no model config for %q", modelID))
+		}
+		proc, err := process.New(base.procCtx, modelID, modelCfg, logmon.NewWriter(upstreamlog), proxylog)
+		if err != nil {
+			return abort(fmt.Errorf("creating process for %q: %w", modelID, err))
+		}
+		procs[modelID] = proc
+	}
+
+	go base.run()
+	return &Group{baseRouter: base}, nil
+}
+
+// groupPlanner derives eviction lists from the static group configuration.
+//
+// Members of the target's own group are evicted when that group has
+// swap=true. Members of other groups are evicted only when the target's
+// group is exclusive and the other group is not persistent. Loading a model
+// from a non-exclusive group therefore leaves running exclusive groups
+// untouched, which mirrors a quirk of the original ProcessGroup behaviour.
+type groupPlanner struct {
+	// config supplies the per-group Swap, Exclusive and Persistent flags.
+	config config.Config
+	// modelToGroup maps a model ID to the ID of the group that lists it.
+	modelToGroup map[string]string
+	// processes is consulted for the current state of each model.
+	processes map[string]process.Process
+}
+
+// EvictionFor returns the models to stop before target is loaded. Candidates
+// are every process whose state is neither stopped nor shutdown, followed by
+// the IDs in alsoRunning. The target itself is never returned and no ID is
+// returned twice.
+func (p *groupPlanner) EvictionFor(target string, alsoRunning []string) []string {
+	targetGroup := p.modelToGroup[target]
+	targetCfg := p.config.Groups[targetGroup]
+
+	// mustEvict applies the group rules to a single candidate.
+	mustEvict := func(candidate string) bool {
+		if candidate == target {
+			return false
+		}
+		candidateGroup := p.modelToGroup[candidate]
+		if candidateGroup == targetGroup {
+			return targetCfg.Swap
+		}
+		// The earlier ProcessGroup behaviour did not unload exclusive groups
+		// when a non-exclusive model was loaded. That quirk is kept for
+		// backwards compatibility; the newer swap matrix approach does not
+		// have it.
+		return targetCfg.Exclusive && !p.config.Groups[candidateGroup].Persistent
+	}
+
+	chosen := make(map[string]struct{})
+	var evictions []string
+	add := func(candidate string) {
+		if _, dup := chosen[candidate]; dup {
+			return
+		}
+		if !mustEvict(candidate) {
+			return
+		}
+		chosen[candidate] = struct{}{}
+		evictions = append(evictions, candidate)
+	}
+
+	for modelID, proc := range p.processes {
+		switch proc.State() {
+		case process.StateStopped, process.StateShutdown:
+			// Not running: nothing to evict.
+		default:
+			add(modelID)
+		}
+	}
+	for _, modelID := range alsoRunning {
+		add(modelID)
+	}
+	return evictions
+}
+
+// OnSwapStart does nothing for the group planner.
+func (p *groupPlanner) OnSwapStart(target string) {}
@@ -0,0 +1,331 @@
+package router
+
+import (
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/config"
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/process"
+)
+
+// newTestGroup assembles a Group around the given processes and config without
+// going through NewGroup (and therefore without process.New). The router loop
+// is started immediately and the group is shut down at test cleanup unless the
+// test already did so.
+func newTestGroup(t *testing.T, conf config.Config, processes map[string]process.Process) *Group {
+	t.Helper()
+
+	// Invert the group -> members listing into a model -> group lookup.
+	membership := map[string]string{}
+	for groupID, groupCfg := range conf.Groups {
+		for _, modelID := range groupCfg.Members {
+			membership[modelID] = groupID
+		}
+	}
+
+	base := newBaseRouter(
+		"group",
+		conf,
+		processes,
+		&groupPlanner{config: conf, modelToGroup: membership, processes: processes},
+		logmon.NewWriter(io.Discard),
+	)
+	// Buffered so the run loop can signal processed events to the test.
+	base.testProcessed = make(chan struct{}, 64)
+
+	g := &Group{baseRouter: base}
+	go base.run()
+	t.Cleanup(func() {
+		if g.shuttingDown.Load() {
+			return
+		}
+		_ = g.Shutdown(time.Second)
+	})
+	return g
+}
+
+// TestGroup_NewGroup_DuplicateMembership checks that NewGroup returns an error
+// when the same model ID is listed as a member of two groups.
+func TestGroup_NewGroup_DuplicateMembership(t *testing.T) {
+	groups := map[string]config.GroupConfig{
+		"g1": {Swap: true, Members: []string{"a"}},
+		"g2": {Swap: true, Members: []string{"a"}},
+	}
+	conf := config.Config{
+		Groups: groups,
+		Models: map[string]config.ModelConfig{"a": {}},
+	}
+
+	discard := logmon.NewWriter(io.Discard)
+	_, err := NewGroup(conf, discard, discard)
+	if err == nil {
+		t.Fatalf("expected error for duplicate membership")
+	}
+}
+
+// TestGroup_ServeHTTP_SwapStopsPrevious checks that, inside a swap=true group,
+// a request for b stops the already-ready sibling a exactly once and that b
+// is then run and serves the request exactly once.
+func TestGroup_ServeHTTP_SwapStopsPrevious(t *testing.T) {
+	a := newFakeProcess("a")
+	a.markReady()
+	go a.Run(0) // park a Run goroutine so Stop has something to release
+	// NOTE(review): a is already StateReady when this goroutine is spawned and
+	// fakeProcess.Run returns an error for any state other than StateStopped,
+	// so the call above normally returns at once instead of parking — confirm
+	// the intent.
+
+	b := newFakeProcess("b")
+	b.autoReady = true
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g": {Swap: true, Exclusive: true, Members: []string{"a", "b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
+
+	w := httptest.NewRecorder()
+	g.ServeHTTP(w, newRequest("b"))
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
+	}
+	// The sibling must have been stopped once, and b started and served once.
+	if got := a.stopCalls.Load(); got != 1 {
+		t.Errorf("a.stopCalls=%d want 1", got)
+	}
+	if got := b.runCalls.Load(); got != 1 {
+		t.Errorf("b.runCalls=%d want 1", got)
+	}
+	if got := b.serveCalls.Load(); got != 1 {
+		t.Errorf("b.serveCalls=%d want 1", got)
+	}
+}
+
+// TestGroup_NonSwapGroup_NoStop checks that in a group with swap=false and
+// exclusive=false, loading b leaves the already-ready sibling a untouched.
+func TestGroup_NonSwapGroup_NoStop(t *testing.T) {
+	a := newFakeProcess("a")
+	a.markReady()
+
+	b := newFakeProcess("b")
+	b.autoReady = true
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g": {Swap: false, Exclusive: false, Members: []string{"a", "b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
+
+	w := httptest.NewRecorder()
+	g.ServeHTTP(w, newRequest("b"))
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
+	}
+	// a must never be stopped; b must have been started exactly once.
+	if got := a.stopCalls.Load(); got != 0 {
+		t.Errorf("a.stopCalls=%d want 0 (swap=false should not stop siblings)", got)
+	}
+	if got := b.runCalls.Load(); got != 1 {
+		t.Errorf("b.runCalls=%d want 1", got)
+	}
+}
+
+// TestGroup_CrossGroupExclusive checks that loading a model from an exclusive
+// group stops a ready model that belongs to a different group.
+func TestGroup_CrossGroupExclusive(t *testing.T) {
+	a := newFakeProcess("a")
+	a.markReady()
+	// NOTE(review): a is already StateReady, so fakeProcess.Run normally
+	// returns its state error immediately rather than blocking — confirm the
+	// goroutine below is still wanted.
+	go a.Run(0)
+
+	b := newFakeProcess("b")
+	b.autoReady = true
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g1": {Swap: true, Exclusive: true, Members: []string{"a"}},
+			"g2": {Swap: true, Exclusive: true, Members: []string{"b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
+
+	w := httptest.NewRecorder()
+	g.ServeHTTP(w, newRequest("b"))
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
+	}
+	// b's group is exclusive, so the model from g1 must have been stopped.
+	if got := a.stopCalls.Load(); got != 1 {
+		t.Errorf("a.stopCalls=%d want 1 (cross-group exclusive must stop)", got)
+	}
+}
+
+// TestGroup_CrossGroupNonExclusiveParallel verifies that two requests for
+// models in distinct non-exclusive groups load in parallel rather than
+// serializing through the router's run loop.
+func TestGroup_CrossGroupNonExclusiveParallel(t *testing.T) {
+	a := newFakeProcess("a")
+	pb := newFakeProcess("b")
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g1": {Swap: true, Exclusive: false, Members: []string{"a"}},
+			"g2": {Swap: true, Exclusive: false, Members: []string{"b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb})
+
+	w1 := httptest.NewRecorder()
+	done1 := make(chan struct{})
+	go func() {
+		g.ServeHTTP(w1, newRequest("a"))
+		close(done1)
+	}()
+	waitProcessed(t, g.testProcessed, 1)
+
+	w2 := httptest.NewRecorder()
+	done2 := make(chan struct{})
+	go func() {
+		g.ServeHTTP(w2, newRequest("b"))
+		close(done2)
+	}()
+	waitProcessed(t, g.testProcessed, 1)
+
+	// Both groups load concurrently — both must reach Run() before either is
+	// marked ready. If the router still serialised, only one would proceed.
+	// The waits are bounded so such a regression fails this test promptly
+	// instead of blocking until the global test timeout.
+	for _, fp := range []*fakeProcess{a, pb} {
+		select {
+		case <-fp.runStarted:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("process %s never reached Run", fp.id)
+		}
+	}
+
+	a.markReady()
+	pb.markReady()
+
+	for i, ch := range []chan struct{}{done1, done2} {
+		select {
+		case <-ch:
+		case <-time.After(time.Second):
+			t.Fatalf("request %d did not complete", i)
+		}
+	}
+	if got := a.stopCalls.Load(); got != 0 {
+		t.Errorf("a.stopCalls=%d want 0 (parallel groups don't evict each other)", got)
+	}
+	if got := pb.stopCalls.Load(); got != 0 {
+		t.Errorf("b.stopCalls=%d want 0 (parallel groups don't evict each other)", got)
+	}
+}
+
+// TestGroup_SameGroupSwapSerialises verifies that two same-group requests
+// (Swap=true) serialise even when both arrive while neither has reached
+// StateStarting yet — the alsoRunning hint to the planner closes that race.
+func TestGroup_SameGroupSwapSerialises(t *testing.T) {
+	a := newFakeProcess("a")
+	pb := newFakeProcess("b")
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g": {Swap: true, Exclusive: false, Members: []string{"a", "b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": pb})
+
+	// awaitRun waits for fp to enter Run, failing the test after a bounded
+	// delay so a router regression cannot hang the whole test binary.
+	awaitRun := func(fp *fakeProcess) {
+		t.Helper()
+		select {
+		case <-fp.runStarted:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("process %s never reached Run", fp.id)
+		}
+	}
+
+	w1 := httptest.NewRecorder()
+	done1 := make(chan struct{})
+	go func() {
+		g.ServeHTTP(w1, newRequest("a"))
+		close(done1)
+	}()
+	waitProcessed(t, g.testProcessed, 1)
+
+	// Request B arrives before A transitions to StateStarting in the process
+	// state machine. Without the alsoRunning hint, the planner would not see
+	// A as running, and B would start in parallel, violating Swap=true.
+	w2 := httptest.NewRecorder()
+	done2 := make(chan struct{})
+	go func() {
+		g.ServeHTTP(w2, newRequest("b"))
+		close(done2)
+	}()
+	waitProcessed(t, g.testProcessed, 1)
+
+	if got := pb.runCalls.Load(); got != 0 {
+		t.Errorf("b started in parallel: runCalls=%d want 0", got)
+	}
+
+	awaitRun(a)
+	a.markReady()
+	waitProcessed(t, g.testProcessed, 1) // swapDone(a) → b promoted
+	awaitRun(pb)
+	pb.markReady()
+
+	for i, ch := range []chan struct{}{done1, done2} {
+		select {
+		case <-ch:
+		case <-time.After(time.Second):
+			t.Fatalf("request %d did not complete", i)
+		}
+	}
+	if got := a.stopCalls.Load(); got != 1 {
+		t.Errorf("a.stopCalls=%d want 1 (b's swap must stop a)", got)
+	}
+}
+
+// TestGroup_PersistentNotEvicted verifies that a group with persistent=true
+// is never evicted when another exclusive group starts loading. The running
+// model in the persistent group stays alive alongside the new one.
+func TestGroup_PersistentNotEvicted(t *testing.T) {
+	a := newFakeProcess("a")
+	a.markReady()
+	// NOTE(review): a is already StateReady, so fakeProcess.Run returns its
+	// state error immediately here instead of blocking — confirm the
+	// goroutine below is still wanted.
+	go a.Run(0)
+
+	b := newFakeProcess("b")
+	b.autoReady = true
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"persist": {Swap: true, Exclusive: false, Persistent: true, Members: []string{"a"}},
+			"other":   {Swap: true, Exclusive: true, Members: []string{"b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
+
+	w := httptest.NewRecorder()
+	g.ServeHTTP(w, newRequest("b"))
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
+	}
+	// The persistent model must be neither stopped nor moved out of a
+	// running state, while b is started exactly once.
+	if got := a.stopCalls.Load(); got != 0 {
+		t.Errorf("a.stopCalls=%d want 0 (persistent group must not be evicted)", got)
+	}
+	if a.State() != process.StateStarting && a.State() != process.StateReady {
+		t.Errorf("a state=%s want still running", a.State())
+	}
+	if got := b.runCalls.Load(); got != 1 {
+		t.Errorf("b.runCalls=%d want 1", got)
+	}
+}
+
+// TestGroup_NonExclusiveDoesNotUnloadExclusive pins a backwards-compatible
+// gotcha from the original ProcessGroup: when a model in a non-exclusive group
+// is loaded, any running exclusive group keeps running. The two coexist.
+func TestGroup_NonExclusiveDoesNotUnloadExclusive(t *testing.T) {
+	a := newFakeProcess("a")
+	a.markReady()
+	// NOTE(review): a is already StateReady, so fakeProcess.Run returns its
+	// state error immediately here instead of blocking — confirm the
+	// goroutine below is still wanted.
+	go a.Run(0)
+
+	b := newFakeProcess("b")
+	b.autoReady = true
+
+	conf := config.Config{
+		HealthCheckTimeout: 5,
+		Groups: map[string]config.GroupConfig{
+			"g1": {Swap: true, Exclusive: true, Members: []string{"a"}},
+			"g2": {Swap: true, Exclusive: false, Members: []string{"b"}},
+		},
+	}
+	g := newTestGroup(t, conf, map[string]process.Process{"a": a, "b": b})
+
+	w := httptest.NewRecorder()
+	g.ServeHTTP(w, newRequest("b"))
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%q", w.Code, w.Body.String())
+	}
+	// The target group g2 is not exclusive, so a (in exclusive g1) must be
+	// left running while b is started exactly once.
+	if got := a.stopCalls.Load(); got != 0 {
+		t.Errorf("a.stopCalls=%d want 0 (non-exclusive target must not unload exclusive group)", got)
+	}
+	if a.State() != process.StateStarting && a.State() != process.StateReady {
+		t.Errorf("a state=%s want still running", a.State())
+	}
+	if got := b.runCalls.Load(); got != 1 {
+		t.Errorf("b.runCalls=%d want 1", got)
+	}
+}
@@ -0,0 +1,205 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+	"github.com/mostlygeek/llama-swap/internal/process"
+)
+
+// fakeProcess is an in-memory implementation of process.Process used to drive
+// the routers through their state machine without spawning real upstreams.
+type fakeProcess struct {
+	// id names the fake in error messages and in the ServeHTTP response body.
+	id string
+
+	// mu guards state and the close-once checks on the signal channels below.
+	mu          sync.Mutex
+	// state is the current lifecycle state; newFakeProcess starts it at
+	// StateStopped.
+	state       process.ProcessState
+	// readyCh is closed the first time setState moves to StateReady;
+	// WaitReady blocks on it.
+	readyCh     chan struct{}
+	// stopCh is closed by Stop when it moves the fake to StateStopped; Run
+	// blocks on it.
+	stopCh      chan struct{}
+	runStarted  chan struct{} // closed on the first Run call
+	stopStarted chan struct{} // closed on the first Stop call
+
+	// autoReady, when true, makes Run switch to StateReady by itself right
+	// after entering StateStarting.
+	autoReady bool
+
+	// serveBlock, when non-nil, makes ServeHTTP receive from it before
+	// writing its response. Tests use this to hold a request in-flight.
+	// Closing the channel releases every blocked ServeHTTP caller.
+	serveBlock chan struct{}
+	// serveStarted is closed on the first ServeHTTP entry, letting tests
+	// wait deterministically for the handler to begin executing.
+	serveStarted chan struct{}
+	// stopBlock, when non-nil, makes Stop receive from it (after signalling
+	// stopStarted) before completing. Tests use this to prove that several
+	// Stop calls can be in flight simultaneously.
+	stopBlock chan struct{}
+
+	// Call counters, each incremented on entry to the matching method.
+	runCalls   atomic.Int32
+	stopCalls  atomic.Int32
+	serveCalls atomic.Int32
+
+	// inFlightServe counts ServeHTTP calls currently inside the handler.
+	// stoppedWhileServing flips true if Stop is ever called while that
+	// counter is non-zero — a direct, race-free observation of the
+	// "swap mid-request" anti-property.
+	inFlightServe       atomic.Int32
+	stoppedWhileServing atomic.Bool
+}
+
+// newFakeProcess returns a fake in StateStopped with every signal channel
+// allocated and open. The optional hooks (autoReady, serveBlock, stopBlock)
+// are left at their zero values for the test to set.
+func newFakeProcess(id string) *fakeProcess {
+	signal := func() chan struct{} { return make(chan struct{}) }
+
+	fp := new(fakeProcess)
+	fp.id = id
+	fp.state = process.StateStopped
+	fp.readyCh = signal()
+	fp.stopCh = signal()
+	fp.runStarted = signal()
+	fp.stopStarted = signal()
+	fp.serveStarted = signal()
+	return fp
+}
+
+// setState records s under the mutex. A transition to StateReady also closes
+// readyCh, at most once, so that WaitReady callers are released.
+func (f *fakeProcess) setState(s process.ProcessState) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.state = s
+	if s != process.StateReady {
+		return
+	}
+	select {
+	case <-f.readyCh:
+		// Already closed by an earlier transition to ready.
+	default:
+		close(f.readyCh)
+	}
+}
+
+// State returns the current state, read under the mutex.
+func (f *fakeProcess) State() process.ProcessState {
+	f.mu.Lock()
+	current := f.state
+	f.mu.Unlock()
+	return current
+}
+
+// markReady forces the fake into StateReady, which releases WaitReady callers.
+func (f *fakeProcess) markReady() {
+	f.setState(process.StateReady)
+}
+
+// Run counts the call and, if the fake is in StateStopped, moves it to
+// StateStarting, signals runStarted (once) and then blocks until stopCh is
+// closed. From any other state it returns an error straight away. With
+// autoReady set, the fake flips itself to StateReady before blocking.
+func (f *fakeProcess) Run(_ time.Duration) error {
+	f.runCalls.Add(1)
+
+	f.mu.Lock()
+	if current := f.state; current != process.StateStopped {
+		f.mu.Unlock()
+		return fmt.Errorf("fakeProcess %s: Run called while %s", f.id, current)
+	}
+	f.state = process.StateStarting
+	// Capture the channel while holding the lock; it is awaited after unlock.
+	stopped := f.stopCh
+	select {
+	case <-f.runStarted:
+	default:
+		close(f.runStarted)
+	}
+	f.mu.Unlock()
+
+	if f.autoReady {
+		f.setState(process.StateReady)
+	}
+
+	<-stopped
+	return nil
+}
+
+// Stop counts the call, notes whether a ServeHTTP call was in flight, signals
+// stopStarted (once), optionally waits on stopBlock, and finally moves the
+// fake to StateStopped and closes stopCh. It always returns nil; stopping an
+// already stopped fake changes nothing further.
+func (f *fakeProcess) Stop(_ time.Duration) error {
+	f.stopCalls.Add(1)
+	if serving := f.inFlightServe.Load(); serving > 0 {
+		f.stoppedWhileServing.Store(true)
+	}
+
+	f.mu.Lock()
+	select {
+	case <-f.stopStarted:
+	default:
+		close(f.stopStarted)
+	}
+	f.mu.Unlock()
+
+	// Test hook: hold Stop here so the test can prove multiple Stops are
+	// in flight at the same time before any of them complete.
+	if gate := f.stopBlock; gate != nil {
+		<-gate
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if f.state != process.StateStopped {
+		f.state = process.StateStopped
+		select {
+		case <-f.stopCh:
+		default:
+			close(f.stopCh)
+		}
+	}
+	return nil
+}
+
+// WaitReady returns nil once the fake is (or becomes) ready, or ctx.Err() if
+// ctx finishes first.
+func (f *fakeProcess) WaitReady(ctx context.Context) error {
+	f.mu.Lock()
+	alreadyReady := f.state == process.StateReady
+	ready := f.readyCh
+	f.mu.Unlock()
+
+	if alreadyReady {
+		return nil
+	}
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-ready:
+		return nil
+	}
+}
+
+// Logger returns a fresh monitor on every call whose output goes to
+// io.Discard.
+func (f *fakeProcess) Logger() *logmon.Monitor {
+	return logmon.NewWriter(io.Discard)
+}
+
+// ServeHTTP counts the call, tracks it as in flight until it returns, signals
+// serveStarted (once), optionally waits on serveBlock, and then answers 200
+// with the body "ok:<id>".
+func (f *fakeProcess) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
+	f.serveCalls.Add(1)
+
+	f.inFlightServe.Add(1)
+	defer f.inFlightServe.Add(-1)
+
+	f.mu.Lock()
+	select {
+	case <-f.serveStarted:
+	default:
+		close(f.serveStarted)
+	}
+	f.mu.Unlock()
+
+	if gate := f.serveBlock; gate != nil {
+		<-gate
+	}
+
+	w.WriteHeader(http.StatusOK)
+	fmt.Fprintf(w, "ok:%s", f.id)
+}
+
+// waitProcessed receives n events from ch, allowing up to two seconds for each
+// one and failing the test on timeout. One event fires per handlerReq or
+// swapDone fully absorbed by run().
+func waitProcessed(t *testing.T, ch chan struct{}, n int) {
+	t.Helper()
+	for got := 0; got < n; got++ {
+		deadline := time.NewTimer(2 * time.Second)
+		select {
+		case <-ch:
+			deadline.Stop()
+		case <-deadline.C:
+			t.Fatalf("waitProcessed: only %d/%d events received", got, n)
+		}
+	}
+}
+
+// newRequest builds a POST to /v1/chat/completions whose JSON body carries
+// only the "model" field, with Content-Type set to application/json.
+func newRequest(model string) *http.Request {
+	payload := strings.NewReader(fmt.Sprintf(`{"model":%q}`, model))
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", payload)
+	req.Header.Set("Content-Type", "application/json")
+	return req
+}
+
+// newRequestCtx is newRequest with ctx attached to the resulting request.
+func newRequestCtx(ctx context.Context, model string) *http.Request {
+	req := newRequest(model)
+	return req.WithContext(ctx)
+}
@@ -0,0 +1,277 @@
+package router
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math/rand"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+)
+
+// loadingPaths lists the URL path prefixes that isLoadingPath accepts.
+var loadingPaths = []string{
+	"/v1/chat/completions",
+}
+
+// isLoadingPath reports whether path begins with any entry of loadingPaths.
+func isLoadingPath(path string) bool {
+	for i := range loadingPaths {
+		if strings.HasPrefix(path, loadingPaths[i]) {
+			return true
+		}
+	}
+	return false
+}
+
+// loadingWriter wraps an http.ResponseWriter and streams progress text to the
+// client as SSE frames (see sendData) until release is called.
+type loadingWriter struct {
+	// hasWritten is set by WriteHeader so that only the first status code is
+	// forwarded to writer.
+	hasWritten bool
+	// writer is the wrapped response writer.
+	writer     http.ResponseWriter
+	// req is the request passed to newLoadingWriter; ctx is req.Context() and
+	// is checked to stop streaming once it is done.
+	req        *http.Request
+	ctx        context.Context
+	logger     *logmon.Monitor
+	// modelName appears in the initial banner and in log messages.
+	modelName  string
+	// startTime is taken at construction and used for the "Done!" duration.
+	startTime  time.Time
+
+	// pendingUpdate holds the latest message from setUpdate until start
+	// consumes it on its next tick; pendingMu guards it.
+	pendingMu     sync.Mutex
+	pendingUpdate string
+
+	// writeMu serializes writes to the underlying writer and guards released.
+	// Once released is set, the streaming goroutine must not touch the writer
+	// again — ServeHTTP has reclaimed it (to run the real handler or to return)
+	// and writing/flushing a finalized response panics.
+	writeMu  sync.Mutex
+	released bool
+
+	// closed by start when the goroutine finishes (after cleanup messages)
+	// NOTE(review): the channel is created inside start, so it is nil until
+	// start has begun running.
+	done chan struct{}
+
+	// test-only: closed when start enters its loop
+	loopStarted chan struct{}
+	// test-only: override the tick interval (newLoadingWriter sets 750ms)
+	tickDuration time.Duration
+	// test-only: override character streaming speed (0 = no delay)
+	charPerSecond float64
+}
+
+// newLoadingWriter wraps w for the given request, sets the SSE response
+// headers, writes a 200 status and streams the opening separator and the
+// "loading model" banner before returning.
+func newLoadingWriter(logger *logmon.Monitor, modelName string, w http.ResponseWriter, req *http.Request) *loadingWriter {
+	lw := new(loadingWriter)
+	lw.writer = w
+	lw.req = req
+	lw.ctx = req.Context()
+	lw.logger = logger
+	lw.modelName = modelName
+	lw.startTime = time.Now()
+	lw.tickDuration = 750 * time.Millisecond
+	lw.charPerSecond = 75
+
+	headers := lw.Header()
+	headers.Set("Content-Type", "text/event-stream")
+	headers.Set("Cache-Control", "no-cache")
+	headers.Set("Connection", "keep-alive")
+	lw.WriteHeader(http.StatusOK)
+
+	lw.sendLine("━━━━━")
+	lw.sendLine("llama-swap loading model: " + modelName)
+	return lw
+}
+
+// setUpdate stores msg as the pending update, replacing any message that start
+// has not consumed yet.
+func (s *loadingWriter) setUpdate(msg string) {
+	s.pendingMu.Lock()
+	defer s.pendingMu.Unlock()
+	s.pendingUpdate = msg
+}
+
+// start streams progress output until ctx is cancelled. On every tick it
+// emits the pending update if one was set, otherwise a remark once enough
+// time has passed since the last one, otherwise a single ".". On return it
+// writes a closing "Done!" footer (unless the request context has ended) and
+// then closes s.done.
+func (s *loadingWriter) start(ctx context.Context) {
+	// NOTE(review): done is created here rather than in newLoadingWriter, so
+	// waitForCompletion sees nil (and reports completion) until start has
+	// begun — confirm callers order these calls safely.
+	s.done = make(chan struct{})
+	// Registered first, so it runs last: done closes after the footer below.
+	defer close(s.done)
+
+	defer func() {
+		// Skip cleanup writes if the client disconnected — the connection
+		// is being torn down and flushing against it will panic.
+		if s.ctx.Err() != nil {
+			return
+		}
+		duration := time.Since(s.startTime)
+		s.sendData("\n")
+		s.sendLine(fmt.Sprintf("Done! (%.2fs)", duration.Seconds()))
+		s.sendLine("━━━━━")
+		s.sendLine(" ")
+	}()
+
+	// Work on a shuffled private copy so loadingRemarks itself is untouched.
+	remarks := make([]string, len(loadingRemarks))
+	copy(remarks, loadingRemarks)
+	rand.Shuffle(len(remarks), func(i, j int) {
+		remarks[i], remarks[j] = remarks[j], remarks[i]
+	})
+	ri := 0
+
+	// NOTE(review): lastRemarkTime starts at the zero time, so
+	// time.Since(lastRemarkTime) already exceeds nextRemarkIn on the first
+	// tick without an update and this initial 2–5s value appears to have no
+	// effect — confirm that an immediate first remark is intended.
+	nextRemarkIn := time.Duration(2+rand.Intn(4)) * time.Second
+	lastRemarkTime := time.Time{}
+
+	ticker := time.NewTicker(s.tickDuration)
+	defer ticker.Stop()
+
+	if s.loopStarted != nil {
+		close(s.loopStarted)
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			// Take and clear the pending update atomically.
+			s.pendingMu.Lock()
+			update := s.pendingUpdate
+			s.pendingUpdate = ""
+			s.pendingMu.Unlock()
+
+			if update != "" {
+				// An explicit update takes priority and resets the remark timer.
+				s.sendData("\n")
+				s.sendInline(update)
+				s.sendData(" ")
+				lastRemarkTime = time.Now()
+				nextRemarkIn = time.Duration(5+rand.Intn(5)) * time.Second
+			} else if time.Since(lastRemarkTime) >= nextRemarkIn {
+				// Cycle through the shuffled remarks, wrapping at the end.
+				remark := remarks[ri%len(remarks)]
+				ri++
+				s.sendData("\n")
+				s.sendInline(remark)
+				s.sendData(" ")
+				lastRemarkTime = time.Now()
+				nextRemarkIn = time.Duration(5+rand.Intn(5)) * time.Second
+			} else {
+				// Nothing new to say: emit a progress dot.
+				s.sendData(".")
+			}
+		}
+	}
+}
+
+// waitForCompletion reports whether the streaming goroutine finished within
+// timeout. It returns true immediately when done is nil, i.e. when start has
+// not created the channel.
+func (s *loadingWriter) waitForCompletion(timeout time.Duration) bool {
+	if s.done != nil {
+		expiry := time.NewTimer(timeout)
+		defer expiry.Stop()
+		select {
+		case <-s.done:
+			return true
+		case <-expiry.C:
+			return false
+		}
+	}
+	return true
+}
+
+// sendInline streams text to the client in small chunks, without a trailing
+// newline, pausing between chunks so the text arrives at roughly
+// charPerSecond characters per second. A charPerSecond of 0 disables the
+// pauses. Streaming stops early once the request context is done.
+func (s *loadingWriter) sendInline(text string) {
+	chunkSize := 10
+	if s.charPerSecond > 0 {
+		chunkSize = max(3, int(s.charPerSecond)/15)
+	}
+
+	// Chunk by rune so multi-byte characters are never split across frames.
+	runes := []rune(text)
+	for i := 0; i < len(runes); {
+		select {
+		case <-s.ctx.Done():
+			return
+		default:
+		}
+
+		end := i + chunkSize
+		if end > len(runes) {
+			end = len(runes)
+		}
+		s.sendData(string(runes[i:end]))
+		sent := end - i
+		i = end
+
+		// Pace by the number of characters sent, not by the encoded byte
+		// length, so non-ASCII text is not throttled harder than ASCII. No
+		// pause follows the final chunk.
+		if i < len(runes) && s.charPerSecond > 0 {
+			time.Sleep(time.Duration(float64(time.Second) * float64(sent) / s.charPerSecond))
+		}
+	}
+}
+
+// sendLine streams line followed by a newline; an empty line sends only the
+// newline.
+func (s *loadingWriter) sendLine(line string) {
+	if line != "" {
+		s.sendInline(line)
+	}
+	s.sendData("\n")
+}
+
+// sendData writes data to the client as one SSE frame whose JSON payload
+// carries it in choices[0].delta.reasoning_content, then flushes when the
+// writer supports it. Nothing is written once release has been called.
+// Marshal and write failures are logged and otherwise ignored.
+func (s *loadingWriter) sendData(data string) {
+	type delta struct {
+		ReasoningContent string `json:"reasoning_content"`
+	}
+	type choice struct {
+		Delta delta `json:"delta"`
+	}
+	type envelope struct {
+		Choices []choice `json:"choices"`
+	}
+
+	frame := envelope{Choices: []choice{{Delta: delta{ReasoningContent: data}}}}
+	payload, err := json.Marshal(frame)
+	if err != nil {
+		s.logger.Errorf("<%s> Failed to marshal SSE message: %v", s.modelName, err)
+		return
+	}
+
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+
+	// After release the writer belongs to ServeHTTP again; touching it here
+	// would race the real handler or hit a finalized response.
+	if s.released {
+		return
+	}
+
+	_, err = fmt.Fprintf(s.writer, "data: %s\n\n", payload)
+	if err != nil {
+		s.logger.Debugf("<%s> Failed to write SSE data (client likely disconnected): %v", s.modelName, err)
+		return
+	}
+	flusher, canFlush := s.writer.(http.Flusher)
+	if !canFlush {
+		return
+	}
+	flusher.Flush()
+}
+
+// release cuts the streaming side off from the underlying ResponseWriter.
+// It takes writeMu, so any sendData write already in progress finishes first;
+// every later sendData call sees released and returns without writing or
+// flushing. After release returns, the caller may hand the writer to the real
+// handler or let ServeHTTP return without racing a finalized response (a
+// use-after-return Flush panics on the recycled *bufio.Writer).
+func (s *loadingWriter) release() {
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+	s.released = true
+}
+
+// Header exposes the header map of the wrapped ResponseWriter.
+func (s *loadingWriter) Header() http.Header {
+	headers := s.writer.Header()
+	return headers
+}
+
+// Write forwards data unchanged to the wrapped ResponseWriter. It does not
+// take writeMu or consult released.
+func (s *loadingWriter) Write(data []byte) (int, error) {
+	written, err := s.writer.Write(data)
+	return written, err
+}
+
+// WriteHeader forwards the first status code to the wrapped writer and
+// flushes it; every later call is ignored.
+func (s *loadingWriter) WriteHeader(statusCode int) {
+	if !s.hasWritten {
+		s.hasWritten = true
+		s.writer.WriteHeader(statusCode)
+		s.Flush()
+	}
+}
+
+// Flush flushes the wrapped writer when it implements http.Flusher and does
+// nothing otherwise.
+func (s *loadingWriter) Flush() {
+	flusher, canFlush := s.writer.(http.Flusher)
+	if !canFlush {
+		return
+	}
+	flusher.Flush()
+}
@@ -0,0 +1,133 @@
+package router
+
+var loadingRemarks = []string{
+	"Still faster than your last standup meeting",
+	"Reticulating splines",
+	"Waking up the hamsters",
+	"Teaching the model manners",
+	"Convincing the GPU to participate",
+	"Loading weights (they're heavy)",
+	"Please enjoy this elevator music in your head",
+	"Pretending to be productive",
+	"Reading the entire internet, page by page",
+	"Staring at the abyss, the abyss is buffering",
+	"Applying layer after layer of disembodied cognition",
+	"Remembering everything it forgot during quantization",
+	"Counting to 405 billion, one parameter at a time",
+	"Summoning the stochastic parroting",
+	"Hold on, the GPU is questioning its existence",
+	"Deciding which facts to hallucinate today",
+	"Untangling the transformer spaghetti",
+	"Warming up the token soup",
+	"Your prompt is in a queue, behind 7 billion other thoughts",
+	"Running `sudo apt-get install intelligence`",
+	"Defragmenting the latent space",
+	"Polishing each matrix multiplication by hand",
+	"Whispering sweet nothings to the attention heads",
+	"Aligning with human values, one reluctant epoch at a time",
+	"The model is thinking about what it's about to think about",
+	"Loading... and by loading we mean making you wait",
+	"Spinning up the cloud GPU, please be patient while we burn your credits",
+	"Applying duct tape to the context window",
+	"Bribing the GPU scheduler for a timeslice",
+	"Would you like to hear a fun fact while we load? Too bad.",
+	"Hot swapping your sanity for an LLM",
+	"Compressing optimism into FP16",
+	"Ignoring 90% of the attention to save you 50% of the time",
+	"Counting the exact same thing three times just to be sure",
+	"Sorry, the inference you have reached is not in service",
+	"Rotating the positional encodings counterclockwise for good luck",
+	"Your call is very important to us. Please continue to hold.",
+	"Unpacking the blobs. All 300GB of them.",
+	"Initializing the thing that initializes the other thing",
+	"Converting electricity into existential dread",
+	"Flattening the curve... wait, the tensor. Flattening the tensor.",
+	"Fetching the fetch of a fetch, callback hell edition",
+	"The GPU is at 100%. The fan is now a helicopter.",
+	"Baking the weights at 350° for a golden-brown inference",
+	"Recalibrating the confidence of things it's still wrong about",
+	"Have you tried turning it off and on again? No? Good, wait here.",
+	"Simulating deep thought by pausing dramatically",
+	"Loading the model that knows more than you but still can't count r's in 'strawberry'",
+	"Convincing CUDA to cooperate. This may take a while.",
+	"VRAM: 23.9GB used of 24GB. Living on the edge.",
+	"Processing your request with the urgency of a DMV employee",
+	"This model was trained on the entire internet, including that embarrassing blog you wrote in 2008",
+	"Dispatching tokens through a series of increasingly confused matrix multiplies",
+	"Gently lowering your expectations",
+	"Applying softmax to our feelings about this load time",
+	"Autoregressively generating disappointment, one token at a time",
+	"The magic is happening. Somewhere. Probably.",
+	"Synchronizing the parallel processes that run in parallel but really don't",
+	"Calculating the meaning of life. Spoiler: it's 42, but we're double-checking.",
+	"Loading... just like it said 30 seconds ago. And will say 30 seconds from now.",
+	"Pre-warming the cache so the first query is only slightly slower than the rest",
+	"Have you considered that maybe your question wasn't worth all this compute?",
+	"Downloading more RAM (no, really, we're mmap-ing the weights)",
+	"Translating your prompt into math it barely understands",
+	"Estimating your time remaining with 0% accuracy",
+	"Buffering enthusiasm",
+	"Model is loading. Go make some coffee. Or a three-course meal.",
+	"Tokenizing the dictionary, filing a grievance on behalf of 'antidisestablishmentarianism'",
+	"Polling for readiness in a loop that would make your CS professor weep",
+	"Performing percussive maintenance on the attention mechanism",
+	"This loading screen is singlehandedly reversing climate progress",
+	"Decompressing the hopes and dreams of thousands of underpaid labelers",
+	"Filling the key-value cache with the ghost of prompts past",
+	"Currently at step 3 of 9,742 of loading. We'll get there. Eventually.",
+	"If you stare at the spinner, it spins slower. It's science.",
+	"Multiplying matrices with the enthusiasm of a teenager doing chores",
+	"Applying `torch.nap()` until the model feels refreshed",
+	"Reacquainting the model with the concept of 'facts' it forgot during fine-tuning",
+	"Sorry for the wait. No, wait, we're not actually sorry.",
+	"Your GPU is now a space heater with a side hustle in linear algebra",
+	"Allocating memory like a billionaire allocates tax avoidance strategies",
+	"The model saw \"As an AI language model\" and won't stop saying it now",
+	"Installing dependencies you didn't know existed and will never use again",
+	"Re-reading 'Attention Is All You Need' for the 400th time",
+	"Convincing the embedding layer that context is overrated",
+	"Manually untangling the residual connections with a tiny comb",
+	"On hold with the cloud provider trying to explain why 8 H100s isn't enough",
+	"Adjusting temperatures: model is 0.7, server room is 104°F",
+	"Please hold while we justify this electricity bill to accounting",
+	"Stacking decoder blocks like a Jenga tower at a LAN party",
+	"Compensating for your lack of patience with our lack of speed",
+	"This is a loading screen comment. Loading screens have comments now. Welcome to the future.",
+	"Processing the entire works of Shakespeare backwards just in case",
+	"The model is loading slower than your last `npm install`",
+	"Rehearsing plausible-sounding explanations for why it got everything wrong",
+	"Populating the context with filler while you wait for actual content",
+	"Optimizing for BLEU score, which definitely correlates with making you laugh",
+	"Generating an embedding for each and every letter of the alphabet, individually",
+	"Coming soon: llama-swap v2 with actual performance improvements. Probably.",
+	"Loading a model larger than your attention span",
+	"Performing a seance to invoke the spirit of Geoff Hinton",
+	"Did you know loading screens were invented to prevent users from smashing their monitors? Now you do.",
+	"Converting all the internet's bad opinions into a surprisingly useful autocomplete",
+	"Laying down each layer with the care of a Michelin-starred pastry chef",
+	"Checking if the model still thinks birds are government drones. Yep.",
+	"Activating the neurons responsible for 'I cannot assist with that request'",
+	"This model was trained on the same internet that brought you Rickrolling. You're welcome.",
+	"Realigning the alignment so it aligns with the previous alignment",
+	"Running `nvidia-smi` and sighing heavily",
+	"If you close your eyes, the loading bar moves faster. Proven by science.",
+	"EULA said 'by using this software you agree to wait forever' and you clicked Accept",
+	"Zipping the GPUs to make them go faster",
+	"Padding the context window with existential padding",
+	"We could have used a smaller model but someone wanted 'quality'",
+	"Disentangling the latent space into something resembling coherence",
+	"Slow is smooth, smooth is fast, but this is just slow",
+	"Memory-mapping like it's a AAA title from 2012",
+	"Your patience has been tokenized and added to the training set. Thank you for your contribution.",
+	"Loading is CPU-bound and your CPU is busy regretting its life choices",
+	"Exploring the high-dimensional manifold of ways to say 'just a moment'",
+	"The model is experiencing a brief but intense moment of imposter syndrome",
+	"Initializing 7B parameters by rolling 7B 16-sided dice",
+	"Panic! at the disk I/O",
+	"Intelligence is loading... your definition of intelligence may vary",
+	"This model was distilled. Unlike your patience, which is evaporating.",
+	"Unzipping the model. It's a .gguf file, not a metaphor.",
+	"Running inference on the concept of 'soon' to estimate remaining time",
+	"Loading with all the speed of a government-funded IT project",
+	"A blank terminal is a terrible thing to waste. Here's a loading message instead.",
+}
@@ -0,0 +1,328 @@
+package router
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/internal/logmon"
+)
+
+// TestLoadingWriter_SSEHeadersAndInitialMessage checks that a freshly
+// constructed loading writer exposes the SSE response headers and that the
+// recorder body already holds the separator and the "loading model" banner.
+func TestLoadingWriter_SSEHeadersAndInitialMessage(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+
+	// Checked in a fixed order so failures are reported in a stable sequence.
+	expectedHeaders := []struct {
+		name string
+		want string
+	}{
+		{name: "Content-Type", want: "text/event-stream"},
+		{name: "Cache-Control", want: "no-cache"},
+		{name: "Connection", want: "keep-alive"},
+	}
+	for _, h := range expectedHeaders {
+		if got := lw.Header().Get(h.name); got != h.want {
+			t.Errorf("%s: want %s, got %q", h.name, h.want, got)
+		}
+	}
+
+	raw := rec.Body.String()
+	if !strings.HasPrefix(raw, "data: ") {
+		t.Errorf("expected SSE data: prefix, got: %s", raw)
+	}
+
+	// Reassemble the reasoning_content deltas before looking for the banner.
+	streamed := extractStreamedContent(raw)
+	hasSeparator := strings.Contains(streamed, "━━━━━\n")
+	if !hasSeparator {
+		t.Errorf("missing separator in streamed content: %q", streamed)
+	}
+	hasBanner := strings.Contains(streamed, "llama-swap loading model: test-model\n")
+	if !hasBanner {
+		t.Errorf("missing initial message in streamed content: %q", streamed)
+	}
+}
+
+// TestLoadingWriter_WriteHeaderOnce verifies that calling WriteHeader with
+// http.StatusCreated on a freshly built loading writer leaves the recorder's
+// status code at http.StatusOK.
+func TestLoadingWriter_WriteHeaderOnce(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+
+	lw.WriteHeader(http.StatusCreated)
+
+	if got := rec.Code; got != http.StatusOK {
+		t.Errorf("first WriteHeader: want %d, got %d", http.StatusOK, got)
+	}
+}
+
+// TestLoadingWriter_WritePassthrough writes a payload through the loading
+// writer, flushes it, and expects the payload to appear in the recorder body.
+func TestLoadingWriter_WritePassthrough(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+
+	payload := []byte("hello")
+	lw.Write(payload)
+	lw.Flush()
+
+	if got := rec.Body.String(); !strings.Contains(got, "hello") {
+		t.Errorf("Write passthrough failed, body: %s", got)
+	}
+}
+
+// TestLoadingWriter_StartStopsOnCancel starts the update loop, cancels its
+// context once the loop signals it is running, and expects the writer to
+// complete within a second with a "Done!" message in the recorder body.
+func TestLoadingWriter_StartStopsOnCancel(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+	lw.tickDuration = 10 * time.Millisecond
+	lw.loopStarted = make(chan struct{})
+
+	ctx, stop := context.WithCancel(context.Background())
+
+	go lw.start(ctx)
+	// Cancel only after the loop has signalled on loopStarted.
+	<-lw.loopStarted
+	stop()
+
+	finished := lw.waitForCompletion(time.Second)
+	if !finished {
+		t.Fatal("waitForCompletion timed out")
+	}
+
+	if got := rec.Body.String(); !strings.Contains(got, "Done!") {
+		t.Errorf("expected Done! message, body: %s", got)
+	}
+}
+
+// TestLoadingWriter_StartShowsSetUpdate pushes a custom status via setUpdate
+// while the update loop is running and expects that text to show up in the
+// reassembled streamed content after the loop is cancelled.
+func TestLoadingWriter_StartShowsSetUpdate(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+	lw.tickDuration = 10 * time.Millisecond
+	lw.charPerSecond = 0
+	lw.loopStarted = make(chan struct{})
+
+	ctx, stop := context.WithCancel(context.Background())
+	go lw.start(ctx)
+	<-lw.loopStarted
+
+	lw.setUpdate("custom status message")
+	// Leave room for several 10ms ticks before cancelling.
+	time.Sleep(50 * time.Millisecond)
+	stop()
+
+	finished := lw.waitForCompletion(time.Second)
+	if !finished {
+		t.Fatal("waitForCompletion timed out")
+	}
+
+	streamed := extractStreamedContent(rec.Body.String())
+	if !strings.Contains(streamed, "custom status message") {
+		t.Errorf("expected setUpdate message in output, got: %q", streamed)
+	}
+}
+
+// TestLoadingWriter_SendDataFormat sends one chunk through sendData and checks
+// that the recorder body carries it as a reasoning_content JSON field and that
+// the body starts with the SSE "data: " prefix.
+func TestLoadingWriter_SendDataFormat(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+
+	lw.sendData("hello world")
+
+	raw := rec.Body.String()
+	hasField := strings.Contains(raw, `"reasoning_content":"hello world"`)
+	if !hasField {
+		t.Errorf("expected reasoning_content in SSE data, body: %s", raw)
+	}
+	hasPrefix := strings.HasPrefix(raw, "data: ")
+	if !hasPrefix {
+		t.Errorf("expected data: prefix, got: %s", raw)
+	}
+}
+
+// TestLoadingWriter_SendLine checks that sendLine streams exactly the given
+// text followed by a newline, looking only at the bytes appended to the
+// recorder by that call.
+func TestLoadingWriter_SendLine(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+	lw.charPerSecond = 0
+
+	// Record the body length on both sides of the call so output written by
+	// the constructor is excluded from the comparison.
+	start := rec.Body.Len()
+	lw.sendLine("line content")
+	end := rec.Body.Len()
+	appended := rec.Body.String()[start:end]
+
+	if got := extractStreamedContent(appended); got != "line content\n" {
+		t.Errorf("expected complete streamed line, got: %q", got)
+	}
+}
+
+// TestLoadingWriter_FlushesPeriodicallyDuringStatusUpdates runs the update loop
+// with a 10ms tick for roughly 50ms and expects at least two SSE "data: "
+// messages in the recorder body once the loop has exited.
+func TestLoadingWriter_FlushesPeriodicallyDuringStatusUpdates(t *testing.T) {
+	logger := logmon.NewWriter(io.Discard)
+	w := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(logger, "test-model", w, req)
+	lw.tickDuration = 10 * time.Millisecond
+	lw.charPerSecond = 0
+	lw.loopStarted = make(chan struct{})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Calling cancel again on exit is harmless and guarantees the loop is
+	// told to stop on every path out of the test.
+	defer cancel()
+
+	done := make(chan struct{})
+	go func() {
+		lw.start(ctx)
+		close(done)
+	}()
+
+	<-lw.loopStarted
+	time.Sleep(50 * time.Millisecond)
+	cancel()
+
+	// Bound the wait so a loop that ignores cancellation fails this test
+	// instead of hanging the whole test binary until the global timeout.
+	select {
+	case <-done:
+	case <-time.After(time.Second):
+		t.Fatal("start did not return within 1s of cancel")
+	}
+
+	body := w.Body.String()
+	lines := countSSEMessages(body)
+	if lines < 2 {
+		t.Errorf("expected multiple SSE messages from periodic updates, got %d", lines)
+	}
+}
+
+// TestLoadingWriter_ReqStored confirms that the request passed to
+// newLoadingWriter is the one held in the writer's req field.
+func TestLoadingWriter_ReqStored(t *testing.T) {
+	discard := logmon.NewWriter(io.Discard)
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
+
+	lw := newLoadingWriter(discard, "test-model", rec, request)
+
+	if stored := lw.req; stored != request {
+		t.Fatal("req not stored")
+	}
+}
+
+func TestIsLoadingPath(t *testing.T) {
+	tests := []struct {
+		path string
+		want bool
+	}{
+		{"/v1/chat/completions", true},
+		{"/v1/chat/completions/extra", true},
+		{"/v1/completions", false},
+		{"/v1/embeddings", false},
+		{"/health", false},
+		{"", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.path, func(t *testing.T) {
+			if got := isLoadingPath(tt.path); got != tt.want {
+				t.Errorf("isLoadingPath(%q) = %v, want %v", tt.path, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestExtractContext_Streaming_GET checks the Streaming field returned by
+// ExtractContext for GET requests whose query string sets stream=true,
+// stream=false, or omits the stream parameter.
+func TestExtractContext_Streaming_GET(t *testing.T) {
+	tests := []struct {
+		name          string
+		query         string
+		wantStreaming bool
+	}{
+		{"streaming true", "model=llama3&stream=true", true},
+		{"streaming false", "model=llama3&stream=false", false},
+		{"no stream param", "model=llama3", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Fail fast on a malformed request rather than passing a nil
+			// *http.Request to ExtractContext.
+			r, err := http.NewRequest(http.MethodGet, "/?"+tt.query, nil)
+			if err != nil {
+				t.Fatalf("http.NewRequest: %v", err)
+			}
+			got, err := ExtractContext(r)
+			if err != nil {
+				t.Fatalf("ExtractContext: %v", err)
+			}
+			if got.Streaming != tt.wantStreaming {
+				t.Errorf("Streaming: want %v, got %v", tt.wantStreaming, got.Streaming)
+			}
+		})
+	}
+}
+
+// TestExtractContext_Streaming_JSON checks the Streaming field returned by
+// ExtractContext for POST requests with a JSON body whose "stream" member is
+// true, false, or absent.
+func TestExtractContext_Streaming_JSON(t *testing.T) {
+	tests := []struct {
+		name          string
+		body          string
+		wantStreaming bool
+	}{
+		{"streaming true", `{"model":"llama3","stream":true}`, true},
+		{"streaming false", `{"model":"llama3","stream":false}`, false},
+		{"no stream param", `{"model":"llama3"}`, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Fail fast on a malformed request rather than passing a nil
+			// *http.Request to ExtractContext.
+			r, err := http.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(tt.body))
+			if err != nil {
+				t.Fatalf("http.NewRequest: %v", err)
+			}
+			r.Header.Set("Content-Type", "application/json")
+			got, err := ExtractContext(r)
+			if err != nil {
+				t.Fatalf("ExtractContext: %v", err)
+			}
+			if got.Streaming != tt.wantStreaming {
+				t.Errorf("Streaming: want %v, got %v", tt.wantStreaming, got.Streaming)
+			}
+		})
+	}
+}
+
+// TestExtractContext_Streaming_URLEncodedForm checks that ExtractContext
+// reports Streaming as true for a POST whose URL-encoded form body contains
+// stream=true.
+func TestExtractContext_Streaming_URLEncodedForm(t *testing.T) {
+	// Fail fast on a malformed request rather than passing a nil
+	// *http.Request to ExtractContext.
+	r, err := http.NewRequest(http.MethodPost, "/v1/audio/transcriptions", strings.NewReader("model=whisper-1&stream=true"))
+	if err != nil {
+		t.Fatalf("http.NewRequest: %v", err)
+	}
+	r.Header.Set("Content-Type", "application/x-www-form-urlencoded")
+	got, err := ExtractContext(r)
+	if err != nil {
+		t.Fatalf("ExtractContext: %v", err)
+	}
+	if !got.Streaming {
+		t.Error("Streaming should be true")
+	}
+}
+
+// countSSEMessages returns the number of lines in s that begin with the SSE
+// "data: " prefix.
+//
+// The scanner buffer is sized to the input because bufio.Scanner otherwise
+// stops at the first line longer than bufio.MaxScanTokenSize (64 KiB), which
+// would silently drop that line and every line after it.
+func countSSEMessages(s string) int {
+	scanner := bufio.NewScanner(strings.NewReader(s))
+	// No single line can exceed len(s), so len(s)+1 as the token limit
+	// guarantees the scanner never fails with bufio.ErrTooLong.
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(s)+1)
+	count := 0
+	for scanner.Scan() {
+		if strings.HasPrefix(scanner.Text(), "data: ") {
+			count++
+		}
+	}
+	return count
+}
+
+// extractStreamedContent concatenates choices[0].delta.reasoning_content from
+// every "data: " line in body whose payload decodes as JSON. Lines without the
+// prefix, payloads that fail to decode, and payloads with no choices are
+// skipped.
+//
+// The scanner buffer is sized to the input because bufio.Scanner otherwise
+// stops at the first line longer than bufio.MaxScanTokenSize (64 KiB), which
+// would silently truncate the reassembled content.
+func extractStreamedContent(body string) string {
+	var result strings.Builder
+	scanner := bufio.NewScanner(strings.NewReader(body))
+	// No single line can exceed len(body), so len(body)+1 as the token limit
+	// guarantees the scanner never fails with bufio.ErrTooLong.
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(body)+1)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data: ") {
+			continue
+		}
+		jsonData := strings.TrimPrefix(line, "data: ")
+		var msg struct {
+			Choices []struct {
+				Delta struct {
+					ReasoningContent string `json:"reasoning_content"`
+				} `json:"delta"`
+			} `json:"choices"`
+		}
+		// Payloads that are not JSON are skipped on purpose (best effort).
+		if err := json.Unmarshal([]byte(jsonData), &msg); err != nil {
+			continue
+		}
+		if len(msg.Choices) > 0 {
+			result.WriteString(msg.Choices[0].Delta.ReasoningContent)
+		}
+	}
+	return result.String()
+}
--- a/Show More
+++ b/Show More