proxy: support timings for /infill from llama-server

fixes: #463
2026-02-07 16:54:02 -08:00
71 changed files with 1283 additions and 5054 deletions
@@ -4,7 +4,7 @@ early_access: false
 reviews:
  profile: "chill"
  request_changes_workflow: false
-  high_level_summary: false
+  high_level_summary: true
  poem: false
  review_status: true
  collapse_walkthrough: false
@@ -4,15 +4,11 @@ on:
  pull_request:
    paths:
      - "config-schema.json"
-      - "config.example.yaml"
-      - ".github/workflows/config-schema.yml"
  push:
    branches:
      - main
    paths:
      - "config-schema.json"
-      - "config.example.yaml"
-      - ".github/workflows/config-schema.yml"

  workflow_dispatch:

@@ -43,14 +39,3 @@ jobs:
          fi

          echo "✓ config-schema.json is valid"
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.x"
-
-      - name: Install check-jsonschema
-        run: pip install check-jsonschema
-
-      - name: Validate config.example.yaml against schema
-        run: check-jsonschema --schemafile config-schema.json config.example.yaml
@@ -17,19 +17,12 @@ on:
      - 'docker/build-container.sh'
      - 'docker/*.Containerfile'

-# grant permissions on GITHUB_TOKEN to publish packages
-# ref: https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#publishing-a-package-using-an-action
-permissions:
-  contents: read
-  packages: write
-  id-token: write
-
 jobs:
  build-and-push:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        platform: [intel, cuda, cuda13, vulkan, cpu, musa, rocm]
+        platform: [intel, cuda, vulkan, cpu, musa, rocm]
      fail-fast: false
    steps:
      - name: Checkout code
@@ -36,7 +36,7 @@ jobs:
    - name: Set up Go
      uses: actions/setup-go@v4
      with:
-        go-version-file: go.mod
+        go-version: '1.23'

    # Only run in this linux based runner
    - name: Check Formatting
@@ -51,7 +51,7 @@ jobs:
      uses: actions/cache/restore@v4
      with:
        path: ./build
-        key: ${{ runner.os }}-simple-responder-${{ hashFiles('cmd/simple-responder/simple-responder.go') }}
+        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}

    # necessary for testing proxy/Process swapping
    - name: Create simple-responder
@@ -67,4 +67,4 @@ jobs:
        key: ${{ runner.os }}-simple-responder-${{ hashFiles('misc/simple-responder/simple-responder.go') }}

    - name: Test all
-      run: make test-all
+      run: make test-all
@@ -1,133 +0,0 @@
-name: Build Unified Docker Image
-
-on:
-  schedule:
-    - cron: "37 5 * * *"
-
-  workflow_dispatch:
-    inputs:
-      llama_cpp_ref:
-        description: "llama.cpp commit hash, tag, or branch"
-        required: false
-        default: "master"
-      whisper_ref:
-        description: "whisper.cpp commit hash, tag, or branch"
-        required: false
-        default: "master"
-      sd_ref:
-        description: "stable-diffusion.cpp commit hash, tag, or branch"
-        required: false
-        default: "master"
-      ik_llama_ref:
-        description: "ik_llama.cpp commit hash, tag, or branch (CUDA only)"
-        required: false
-        default: "main"
-      llama_swap_version:
-        description: "llama-swap version (e.g. v198, latest, main)"
-        required: false
-        default: "main"
-      build_cuda:
-        description: "Build CUDA image"
-        type: boolean
-        required: false
-        default: true
-      build_vulkan:
-        description: "Build Vulkan image"
-        type: boolean
-        required: false
-        default: true
-
-permissions:
-  contents: read
-  packages: write
-
-jobs:
-  setup:
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-      - id: set-matrix
-        run: |
-          backends=()
-          # schedule uses defaults (build both); workflow_dispatch respects inputs
-          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_cuda }}" == "true" ]]; then
-            backends+=("cuda")
-          fi
-          if [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${{ inputs.build_vulkan }}" == "true" ]]; then
-            backends+=("vulkan")
-          fi
-          matrix=$(printf '%s\n' "${backends[@]}" | jq -R . | jq -sc .)
-          echo "matrix=$matrix" >> $GITHUB_OUTPUT
-
-  build:
-    needs: setup
-    if: ${{ needs.setup.outputs.matrix != '[]' }}
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        backend: ${{ fromJSON(needs.setup.outputs.matrix) }}
-        variant:
-          - name: root
-            uid: "0"
-            suffix: ""
-          - name: rootless
-            uid: "10001"
-            suffix: "-rootless"
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Free up disk space
-        run: |
-          echo "Before cleanup:"
-          df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /opt/hostedtoolcache/CodeQL
-          sudo docker system prune -af
-          echo "After cleanup:"
-          df -h
-
-      # On GitHub Actions runners, create a fresh builder.
-      # When running locally under act, skip this and reuse the existing
-      # llama-swap-builder (which has ccache warm) to avoid exhausting disk.
-      - name: Set up Docker Buildx
-        if: ${{ !env.ACT }}
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        if: ${{ !env.ACT }}
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build unified Docker image (${{ matrix.backend }}, ${{ matrix.variant.name }})
-        env:
-          LLAMA_REF: ${{ inputs.llama_cpp_ref || 'master' }}
-          WHISPER_REF: ${{ inputs.whisper_ref || 'master' }}
-          SD_REF: ${{ inputs.sd_ref || 'master' }}
-          IK_LLAMA_REF: ${{ inputs.ik_llama_ref || 'main' }}
-          LS_VERSION: ${{ inputs.llama_swap_version || 'main' }}
-          RUN_UID: ${{ matrix.variant.uid }}
-          DOCKER_IMAGE_TAG: ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}${{ matrix.variant.suffix }}
-          # When running under act, use the local builder that has warm ccache.
-          # On GitHub Actions, BUILDX_BUILDER is unset so docker uses the builder
-          # created by setup-buildx-action above.
-          BUILDX_BUILDER: ${{ env.ACT == 'true' && 'llama-swap-builder' || '' }}
-        run: |
-          chmod +x docker/unified/build-image.sh
-          docker/unified/build-image.sh --${{ matrix.backend }}
-
-      - name: Push to GitHub Container Registry
-        if: ${{ !env.ACT }}
-        run: |
-          TAG="ghcr.io/mostlygeek/llama-swap:unified-${{ matrix.backend }}${{ matrix.variant.suffix }}"
-          docker push "${TAG}"
-          DATE_TAG=$(date -u +%Y-%m-%d)
-          docker tag "${TAG}" "${TAG}-${DATE_TAG}"
-          docker push "${TAG}-${DATE_TAG}"
@@ -1,51 +0,0 @@
-## Project Description:
-
-llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-
-## Tech stack
-
- golang
- typescript, vite and svelt5 for UI (located in ui/)
-
-## Workflow Tasks
-
- when summarizing changes only include details that require further action
- just say "Done." when there is no further action
- use the github CLI `gh` to create pull requests and work with github
- Rules for creating pull requests:
-  - keep them short and focused on changes.
-  - never include a test plan
-  - write the summary using the same style rules as commit message
-
-## Testing
-
- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
- Run `gofmt -l .` before committing to verify formatting. Fix any reported files with `gofmt -w <file>`.
- Use `make test-dev` after running new tests for a quick over all test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
- Use `make test-all` before completing work. This includes long running concurrency tests.
-
-### Commit message example format:
-
-```
-proxy: add new feature
-
-Add new feature that implements functionality X and Y.
-
- key change 1
- key change 2
- key change 3
-
-fixes #123
-```
-
-## Code Reviews
-
- use three levels High, Medium, Low severity
- label each discovered issue with a label like H1, M2, L3 respectively
- High severity are must fix issues (security, race conditions, critical bugs)
- Medium severity are recommended improvements (coding style, missing functionality, inconsistencies)
- Low severity are nice to have changes and nits
- Include a suggestion with each discovered item
- Limit your code review to three items with the highest priority first
- Double check your discovered items and recommended remediations
@@ -1 +1,49 @@
-@AGENTS.md
+## Project Description:
+
+llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
+
+## Tech stack
+
+- golang
+- typescript, vite and react for UI (located in ui/)
+
+## Workflow Tasks
+
+- when summarizing changes only include details that require further action
+- just say "Done." when there is no further action
+- use `gh` to create PRs and load issues
+- do include Co-Authored-By or created by when committing changes or creating PRs
+- keep PR descriptions short and focused on changes.
+  - never include a test plan
+
+## Testing
+
+- Follow test naming conventions like `TestProxyManager_<test name>`, `TestProcessGroup_<test name>`, etc.
+- Use `go test -v -run <name pattern for new tests>` to run any new tests you've written.
+- Use `make test-dev` after running new tests for a quick overall test run. This runs `go test` and `staticcheck`. Fix any static checking errors. Use this only when changes are made to any code under the `proxy/` directory
+- Use `make test-all` before completing work. This includes long running concurrency tests.
+
+### Commit message example format:
+
+```
+proxy: add new feature
+
+Add new feature that implements functionality X and Y.
+
+- key change 1
+- key change 2
+- key change 3
+
+fixes #123
+```
+
+## Code Reviews
+
+- use three levels High, Medium, Low severity
+- label each discovered issue with a label like H1, M2, L3 respectively
+- High severity are must fix issues (security, race conditions, critical bugs)
+- Medium severity are recommended improvements (coding style, missing functionality, inconsistencies)
+- Low severity are nice to have changes and nits
+- Include a suggestion with each discovered item
+- Limit your code review to three items with the highest priority first
+- Double check your discovered items and recommended remediations
@@ -51,7 +51,7 @@ mac: ui
 linux: ui
 	@echo "Building Linux binary..."
 	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
-#GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64
+	GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64

 # Build Windows binary
 windows: ui
@@ -5,7 +5,7 @@

 # llama-swap

-Run multiple generative AI models on your machine and hot-swap between them on demand. llama-swap works with any OpenAI and Anthropic API compatible server and is used by thousands of people to power their local AI workflows. 
+Run multiple LLM models on your machine and hot-swap between them as needed. llama-swap works with any OpenAI API-compatible server, giving you the flexibility to switch models without restarting your applications.

 Built in Go for performance and simplicity, llama-swap has zero dependencies and is incredibly easy to set up. Get started in minutes - just one binary and one configuration file.

@@ -32,10 +32,6 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
  - `v1/rerank`, `v1/reranking`, `/rerank`
  - `/infill` - for code infilling
  - `/completion` - for completion endpoint
- ✅ SDAPI via [stable-diffusion.cpp's server](https://github.com/leejet/stable-diffusion.cpp/tree/master/examples/server)
-  - `/sdapi/v1/txt2img`
-  - `/sdapi/v1/img2img`
-  - `/sdapi/v1/loras` - requires `model` in request body to fetch the correct loras
 - ✅ llama-swap API
  - `/ui` - web UI
  - `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
@@ -52,27 +48,13 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and

 ### Web UI

-llama-swap includes a real time web interface with a playground for testing out all sorts of local models: 
+llama-swap includes a real-time web interface for monitoring logs and controlling models:

-<img width="1125" height="876" alt="image" src="https://github.com/user-attachments/assets/8ee41947-97af-463d-b0f0-8e9c478fac07" />
+<img width="1164" height="745" alt="image" src="https://github.com/user-attachments/assets/bacf3f9d-819f-430b-9ed2-1bfaa8d54579" />

-View detailed token metrics:
-
-<img width="1111" height="515" alt="image" src="https://github.com/user-attachments/assets/64bfb280-d7a3-4126-971a-a128fd40410c" />
-
-Inspect request and responses:
-
-<img width="1111" height="720" alt="image" src="https://github.com/user-attachments/assets/24fe4aca-1448-4d7c-b9e8-a967589bda6c" />
-
-Manually load and unload models: 
-
-<img width="1109" height="719" alt="image" src="https://github.com/user-attachments/assets/02b1e1f2-abd0-4050-84ae-facd66ff01c4" />
-
-
-Real time log streaming: 
-
-<img width="1107" height="559" alt="image" src="https://github.com/user-attachments/assets/39669a10-cff2-409e-836a-5bad8bd0140c" />
+The Activity Page shows recent requests:

+<img width="1360" height="963" alt="image" src="https://github.com/user-attachments/assets/5f3edee6-d03a-4ae5-ae06-b20ac1f135bd" />

 ## Installation

@@ -274,43 +274,6 @@ func main() {
 		c.String(200, fmt.Sprintf("%s %s", c.Request.Method, c.Request.URL.Path))
 	})

-	// SD API endpoints
-	r.POST("/sdapi/v1/txt2img", func(c *gin.Context) {
-		body, err := io.ReadAll(c.Request.Body)
-		if err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
-			return
-		}
-		defer c.Request.Body.Close()
-
-		modelName := gjson.GetBytes(body, "model").String()
-		c.JSON(http.StatusOK, gin.H{
-			"model":  modelName,
-			"images": []string{},
-		})
-	})
-
-	r.POST("/sdapi/v1/img2img", func(c *gin.Context) {
-		body, err := io.ReadAll(c.Request.Body)
-		if err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
-			return
-		}
-		defer c.Request.Body.Close()
-
-		modelName := gjson.GetBytes(body, "model").String()
-		c.JSON(http.StatusOK, gin.H{
-			"model":  modelName,
-			"images": []string{},
-		})
-	})
-
-	r.GET("/sdapi/v1/loras", func(c *gin.Context) {
-		c.JSON(http.StatusOK, gin.H{
-			"loras": []string{},
-		})
-	})
-
 	address := "127.0.0.1:" + *port // Address with the specified port

 	srv := &http.Server{
@@ -39,43 +39,6 @@
            },
            "default": {},
            "description": "A dictionary of string substitutions. Macros are reusable snippets used in model cmd, cmdStop, proxy, checkEndpoint, filters.stripParams. Macro names must be <64 chars, match ^[a-zA-Z0-9_-]+$, and not be PORT or MODEL_ID. Values can be string, number, or boolean. Macros can reference other macros defined before them."
-        },
-        "timeouts": {
-            "type": "object",
-            "properties": {
-                "connect": {
-                    "type": "integer",
-                    "minimum": 0,
-                    "default": 30,
-                    "description": "TCP connection timeout in seconds. Set to 0 to disable (not recommended)."
-                },
-                "responseHeader": {
-                    "type": "integer",
-                    "minimum": 0,
-                    "default": 60,
-                    "description": "Time to wait for response headers in seconds. Set to 0 to disable (not recommended)."
-                },
-                "tlsHandshake": {
-                    "type": "integer",
-                    "minimum": 0,
-                    "default": 10,
-                    "description": "TLS handshake timeout in seconds. Set to 0 to disable (not recommended)."
-                },
-                "expectContinue": {
-                    "type": "integer",
-                    "minimum": 0,
-                    "default": 1,
-                    "description": "Expect-Continue timeout in seconds. Set to 0 to disable (not recommended)."
-                },
-                "idleConn": {
-                    "type": "integer",
-                    "minimum": 0,
-                    "default": 90,
-                    "description": "Idle connection timeout in seconds. Set to 0 to disable (not recommended)."
-                }
-            },
-            "additionalProperties": false,
-            "description": "Timeout settings for proxy connections."
        }
    },
    "properties": {
@@ -85,12 +48,6 @@
            "default": 120,
            "description": "Number of seconds to wait for a model to be ready to serve requests."
        },
-        "globalTTL": {
-            "type": "integer",
-            "minimum": 0,
-            "default": 0,
-            "description": "Default TTL for all models in seconds, 0 means no TTL and models will never be automatically unloaded"
-        },
        "logLevel": {
            "type": "string",
            "enum": [
@@ -220,9 +177,9 @@
                    },
                    "ttl": {
                        "type": "integer",
-                        "minimum": -1,
-                        "default": -1,
-                        "description": "Automatically unload the model after ttl seconds. -1 uses the global TTL value, 0 disables unloading. Must be >0 to enable."
+                        "minimum": 0,
+                        "default": 0,
+                        "description": "Automatically unload the model after ttl seconds. 0 disables unloading. Must be >0 to enable."
                    },
                    "useModelName": {
                        "type": "string",
@@ -243,20 +200,11 @@
                                "additionalProperties": true,
                                "default": {},
                                "description": "Dictionary of parameters to set/override in requests. Useful for enforcing specific parameter values. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
-                            },
-                            "setParamsByID": {
-                                "type": "object",
-                                "additionalProperties": {
-                                    "type": "object",
-                                    "additionalProperties": true
-                                },
-                                "default": {},
-                                "description": "Dictionary mapping requested model IDs (or aliases) to parameters to set/override in requests. Applied after setParams and can override those values. Useful with aliases to vary behaviour depending on which alias the client used (e.g. different reasoning_effort per alias). Keys support ${MODEL_ID} macro substitution. Protected params like 'model' cannot be overridden."
                            }
                        },
                        "additionalProperties": false,
                        "default": {},
-                        "description": "Dictionary of filter settings. Supports stripParams, setParams, and setParamsByID."
+                        "description": "Dictionary of filter settings. Supports stripParams and setParams."
                    },
                    "metadata": {
                        "type": "object",
@@ -278,9 +226,6 @@
                        "type": "boolean",
                        "default": false,
                        "description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
-                    },
-                    "timeouts": {
-                        "$ref": "#/definitions/timeouts"
                    }
                }
            }
@@ -407,37 +352,6 @@
                        "additionalProperties": false,
                        "default": {},
                        "description": "Dictionary of filter settings for peer requests. Supports stripParams and setParams."
-                    },
-                    "timeouts": {
-                        "type": "object",
-                        "properties": {
-                            "connect": {
-                                "type": "integer",
-                                "minimum": 1,
-                                "default": 30,
-                                "description": "TCP connection timeout in seconds."
-                            },
-                            "responseHeader": {
-                                "type": "integer",
-                                "minimum": 1,
-                                "default": 60,
-                                "description": "Time to wait for response headers in seconds."
-                            },
-                            "tlsHandshake": {
-                                "type": "integer",
-                                "minimum": 1,
-                                "default": 10,
-                                "description": "TLS handshake timeout in seconds."
-                            },
-                            "idleConn": {
-                                "type": "integer",
-                                "minimum": 1,
-                                "default": 90,
-                                "description": "Idle connection timeout in seconds."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "description": "Timeout settings for proxy connections to this peer."
                    }
                }
            },
@@ -445,4 +359,4 @@
            "description": "A dictionary of remote peers and models they provide. Peers can be another llama-swap or any server that provides the /v1/ generative API endpoints supported by llama-swap."
        }
    }
-}
+}
@@ -75,11 +75,6 @@ sendLoadingState: true
 #   all fields except for Id so chat UIs can use the alias equivalent to the original.
 includeAliasesInList: false

-# globalTTL: the default TTL in seconds before unloading a model
-# - optional, default: 0 (never automatically unload)
-# - must be >= 0
-globalTTL: 0
-
 # macros: a dictionary of string substitutions
 # - optional, default: empty dictionary
 # - macros are reusable snippets
@@ -131,7 +126,7 @@ apiKeys:
 # - below are examples of the all the settings a model can have
 models:
  # keys are the model names used in API requests
-  "gpt-oss-120b":
+  "llama":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
@@ -148,7 +143,7 @@ models:
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
-      --model path/to/gpt-oss-120B.gguf
+      --model path/to/llama-8B-Q4_K_M.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

@@ -156,13 +151,13 @@ models:
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    name: "gpt-oss 120B"
+    name: "llama 3.1 8B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    description: "A thinking model from OpenAI"
+    description: "A small but capable model used for quick testing"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
@@ -177,6 +172,14 @@ models:
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

+    # aliases: alternative model names that this model configuration is used for
+    # - optional, default: empty array
+    # - aliases must be unique globally
+    # - useful for impersonating a specific model
+    aliases:
+      - "gpt-4o-mini"
+      - "gpt-3.5-turbo"
+
    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
@@ -185,10 +188,8 @@ models:
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
-    # - optional, default: -1 (use global default)
-    # - ttl values must be a value greater than or equal to 0
-    # - a ttl of -1 will use the global TTL value as the default
-    # - a ttl of 0 will mean never unload
+    # - optional, default: 0
+    # - ttl must be greater than 0 to enable automatic unloading
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

@@ -196,7 +197,7 @@ models:
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
-    useModelName: "openai/gpt-oss-120B"
+    useModelName: "qwen:qwq"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
@@ -215,38 +216,11 @@ models:
      # - useful for enforcing specific parameter values
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
-      # - always runs for the model
      setParams:
        # Example: enforce specific sampling parameters
        temperature: 0.7
        top_p: 0.9

-      # setParamsByID: a dictionary of parameters to set based the model ID
-      # - optional, default: empty dictionary
-      # - combine with aliases to create variant behaviour without reloading the model
-      # - parameters are set in the request body JSON
-      # - run after setParams so it will override any settings
-      # - protected params like "model" cannot be overridden
-      # - values can be strings, numbers, booleans, arrays, or objects
-      # - model aliases will be automatically created for each key
-      setParamsByID:
-        "${MODEL_ID}":
-          chat_template_kwargs:
-            reasoning_effort: medium
-        "${MODEL_ID}:high":
-          chat_template_kwargs:
-            reasoning_effort: high
-        "${MODEL_ID}:low":
-          chat_template_kwargs:
-            reasoning_effort: low
-
-    # aliases: alternative model names that this model configuration is used for
-    # - optional, default: empty array
-    # - aliases must be unique globally
-    # - useful for impersonating a specific model
-    aliases:
-      - "gpt-4o-mini"
-
    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contains complex types it is recommended to keep it simple
@@ -284,21 +258,6 @@ models:
    # - optional, default: undefined (use global setting)
    sendLoadingState: false

-    # timeouts: configure proxy connection timeouts for this model
-    # - optional, defaults shown below
-    # - useful for models running on slower hardware that need longer timeouts
-    # - connect: TCP connection timeout in seconds
-    # - responseHeader: time to wait for response headers in seconds
-    #   (increasing this helps avoid 502 errors on slow hardware)
-    # - tlsHandshake: TLS handshake timeout in seconds
-    # - idleConn: idle connection timeout in seconds
-    # - set any value to 0 to disable that timeout (not recommended)
-    timeouts:
-      connect: 30
-      responseHeader: 60
-      tlsHandshake: 10
-      idleConn: 90
-
  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
@@ -441,16 +400,6 @@ peers:
      - z-ai/glm-4.7
      - moonshotai/kimi-k2-0905
      - minimax/minimax-m2.1
-    # timeouts: configure proxy connection timeouts for this peer
-    # - optional, defaults shown below
-    # - useful when the peer runs on slower hardware
-    # - set any value to 0 to disable that timeout (not recommended)
-    timeouts:
-      connect: 30
-      responseHeader: 60
-      tlsHandshake: 10
-      idleConn: 90
-
    # filters: a dictionary of filter settings for peer requests
    # - optional, default: empty dictionary
    # - same capabilities as model filters (stripParams, setParams)
@@ -27,7 +27,7 @@ ARCH=$1
 PUSH_IMAGES=${2:-false}

 # List of allowed architectures
-ALLOWED_ARCHS=("intel" "vulkan" "musa" "cuda" "cuda13" "cpu" "rocm")
+ALLOWED_ARCHS=("intel" "vulkan" "musa" "cuda" "cpu" "rocm")

 # Check if ARCH is in the allowed list
 if [[ ! " ${ALLOWED_ARCHS[@]} " =~ " ${ARCH} " ]]; then
@@ -142,7 +142,7 @@ for CONTAINER_TYPE in non-root root; do
  fi

  log_info "Building $CONTAINER_TYPE $CONTAINER_TAG $LS_VER"
-  docker build --provenance=false -f llama-swap.Containerfile --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
+  docker build -f llama-swap.Containerfile --build-arg BASE_TAG=${BASE_TAG} --build-arg LS_VER=${LS_VER} --build-arg UID=${USER_UID} \
    --build-arg LS_REPO=${LS_REPO} --build-arg GID=${USER_GID} --build-arg USER_HOME=${USER_HOME} -t ${CONTAINER_TAG} -t ${CONTAINER_LATEST} \
    --build-arg BASE_IMAGE=${BASE_IMAGE} .

@@ -150,7 +150,7 @@ for CONTAINER_TYPE in non-root root; do
  case "$ARCH" in
    "musa" | "vulkan")
      log_info "Adding sd-server to $CONTAINER_TAG"
-      docker build --provenance=false -f llama-swap-sd.Containerfile \
+      docker build -f llama-swap-sd.Containerfile \
        --build-arg BASE=${CONTAINER_TAG} \
        --build-arg SD_IMAGE=${SD_IMAGE} --build-arg SD_TAG=${SD_TAG} \
        --build-arg UID=${USER_UID} --build-arg GID=${USER_GID} \
@@ -1,305 +0,0 @@
-#!/bin/bash
-#
-# Build script for llama-swap-docker with commit hash pinning
-#
-# Usage:
-#   ./build-image.sh --cuda                    # Build CUDA image
-#   ./build-image.sh --vulkan                  # Build Vulkan image
-#   ./build-image.sh --cuda --no-cache         # Build CUDA image without cache
-#   LLAMA_COMMIT_HASH=abc123 ./build-image.sh --cuda      # Override llama.cpp commit
-#   LLAMA_COMMIT_HASH=b8429 ./build-image.sh --vulkan    # Override llama.cpp release tag (vulkan uses prebuilt binaries)
-#   WHISPER_COMMIT_HASH=def456 ./build-image.sh --vulkan  # Override whisper.cpp commit
-#   SD_COMMIT_HASH=ghi789 ./build-image.sh --cuda        # Override stable-diffusion.cpp commit
-#
-# Features:
-#   - Auto-detects latest commit hashes from git repos
-#   - Builds llama-swap from local source code
-#   - Allows environment variable overrides for reproducible builds
-#   - Cache-friendly: changing commit hash busts cache appropriately
-#   - Supports both CUDA and Vulkan backends (requires explicit flag)
-#
-
-set -euo pipefail
-
-# Parse command line arguments
-BACKEND=""
-NO_CACHE=false
-
-if [[ $# -eq 0 ]]; then
-    echo "Error: No backend specified. Please use --cuda or --vulkan."
-    echo ""
-    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
-    echo ""
-    echo "Options:"
-    echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
-    echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
-    echo "  --no-cache  Force rebuild without using Docker cache"
-    echo "  --help, -h  Show this help message"
-    echo ""
-    echo "Environment variables:"
-    echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)"
-    echo "  LLAMA_COMMIT_HASH    Override llama.cpp commit hash"
-    echo "  WHISPER_COMMIT_HASH  Override whisper.cpp commit hash"
-    echo "  SD_COMMIT_HASH       Override stable-diffusion.cpp commit hash"
-    exit 1
-fi
-
-for arg in "$@"; do
-    case $arg in
-        --cuda)
-            BACKEND="cuda"
-            ;;
-        --vulkan)
-            BACKEND="vulkan"
-            ;;
-        --no-cache)
-            NO_CACHE=true
-            ;;
-        --help|-h)
-            echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
-            echo ""
-            echo "Options:"
-            echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
-            echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
-            echo "  --no-cache  Force rebuild without using Docker cache"
-            echo "  --help, -h  Show this help message"
-            echo ""
-            echo "Environment variables:"
-            echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)"
-            echo "  LLAMA_COMMIT_HASH    Override llama.cpp commit hash"
-            echo "  WHISPER_COMMIT_HASH  Override whisper.cpp commit hash"
-            echo "  SD_COMMIT_HASH       Override stable-diffusion.cpp commit hash"
-            exit 0
-            ;;
-    esac
-done
-
-# Validate backend selection
-if [[ -z "$BACKEND" ]]; then
-    echo "Error: No backend specified. Please use --cuda or --vulkan."
-    exit 1
-fi
-
-# Configuration
-if [[ -n "${DOCKER_IMAGE_TAG:-}" ]]; then
-    # User provided a custom tag, use it as-is
-    :
-elif [[ "$BACKEND" == "vulkan" ]]; then
-    DOCKER_IMAGE_TAG="llama-swap:vulkan"
-else
-    DOCKER_IMAGE_TAG="llama-swap:cuda"
-fi
-DOCKER_BUILDKIT="${DOCKER_BUILDKIT:-1}"
-
-# Single unified Dockerfile, backend selected via build arg
-DOCKERFILE="Dockerfile"
-if [[ "$BACKEND" == "vulkan" ]]; then
-    echo "Building for: Vulkan (AMD GPUs and compatible hardware)"
-else
-    echo "Building for: CUDA (NVIDIA GPUs)"
-fi
-
-# Git repository URLs
-LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
-WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
-SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
-
-# Function to get the latest commit hash from a git repo's default branch
-get_latest_commit() {
-    local repo_url="$1"
-    local branch="${2:-master}"
-
-    # Try to get the latest commit hash for the specified branch
-    git ls-remote --heads "${repo_url}" "${branch}" 2>/dev/null | head -1 | cut -f1
-}
-
-# Function to get the default branch name (master or main)
-get_default_branch() {
-    local repo_url="$1"
-
-    # Check for master first
-    if git ls-remote --heads "${repo_url}" master &>/dev/null; then
-        echo "master"
-    elif git ls-remote --heads "${repo_url}" main &>/dev/null; then
-        echo "main"
-    else
-        echo "master"  # fallback
-    fi
-}
-
-# Function to get the latest release tag from a GitHub repo
-get_latest_release_tag() {
-    local owner_repo="$1"
-    curl -fsSL "https://api.github.com/repos/${owner_repo}/releases/latest" \
-        | grep '"tag_name"' | head -1 | cut -d'"' -f4
-}
-
-echo "=========================================="
-echo "llama-swap-docker Build Script"
-echo "=========================================="
-echo ""
-
-# Determine commit hashes / release tags - use env vars or auto-detect
-# For vulkan builds, llama and sd use GitHub release tags (prebuilt binaries).
-# For cuda builds (or whisper on any backend), use git commit hashes.
-if [[ -n "${LLAMA_COMMIT_HASH:-}" ]]; then
-    LLAMA_HASH="${LLAMA_COMMIT_HASH}"
-    echo "llama.cpp: Using provided version: ${LLAMA_HASH}"
-elif [[ "$BACKEND" == "vulkan" ]]; then
-    LLAMA_HASH=$(get_latest_release_tag "ggml-org/llama.cpp")
-    if [[ -z "${LLAMA_HASH}" ]]; then
-        echo "ERROR: Could not determine latest release tag for llama.cpp" >&2
-        exit 1
-    fi
-    echo "llama.cpp: Auto-detected latest release tag: ${LLAMA_HASH}"
-else
-    LLAMA_BRANCH=$(get_default_branch "${LLAMA_REPO}")
-    LLAMA_HASH=$(get_latest_commit "${LLAMA_REPO}" "${LLAMA_BRANCH}")
-    if [[ -z "${LLAMA_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for llama.cpp" >&2
-        exit 1
-    fi
-    echo "llama.cpp: Auto-detected latest commit (${LLAMA_BRANCH}): ${LLAMA_HASH}"
-fi
-
-if [[ -n "${WHISPER_COMMIT_HASH:-}" ]]; then
-    WHISPER_HASH="${WHISPER_COMMIT_HASH}"
-    echo "whisper.cpp: Using provided commit hash: ${WHISPER_HASH}"
-else
-    WHISPER_BRANCH=$(get_default_branch "${WHISPER_REPO}")
-    WHISPER_HASH=$(get_latest_commit "${WHISPER_REPO}" "${WHISPER_BRANCH}")
-    if [[ -z "${WHISPER_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for whisper.cpp" >&2
-        exit 1
-    fi
-    echo "whisper.cpp: Auto-detected latest commit (${WHISPER_BRANCH}): ${WHISPER_HASH}"
-fi
-
-if [[ -n "${SD_COMMIT_HASH:-}" ]]; then
-    SD_HASH="${SD_COMMIT_HASH}"
-    echo "stable-diffusion.cpp: Using provided version: ${SD_HASH}"
-elif [[ "$BACKEND" == "vulkan" ]]; then
-    SD_HASH=$(get_latest_release_tag "leejet/stable-diffusion.cpp")
-    if [[ -z "${SD_HASH}" ]]; then
-        echo "ERROR: Could not determine latest release tag for stable-diffusion.cpp" >&2
-        exit 1
-    fi
-    echo "stable-diffusion.cpp: Auto-detected latest release tag: ${SD_HASH}"
-else
-    SD_BRANCH=$(get_default_branch "${SD_REPO}")
-    SD_HASH=$(get_latest_commit "${SD_REPO}" "${SD_BRANCH}")
-    if [[ -z "${SD_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2
-        exit 1
-    fi
-    echo "stable-diffusion.cpp: Auto-detected latest commit (${SD_BRANCH}): ${SD_HASH}"
-fi
-
-echo ""
-echo "=========================================="
-echo "Starting Docker build..."
-echo "=========================================="
-echo ""
-
-# Build the Docker image with commit hashes as build args
-# Build context is the repository root (..) so the Dockerfile can access Go source
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
-BUILD_ARGS=(
-    --build-arg "BACKEND=${BACKEND}"
-    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
-    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
-    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
-    -t "${DOCKER_IMAGE_TAG}"
-    -f "${SCRIPT_DIR}/${DOCKERFILE}"
-)
-
-if [[ "$NO_CACHE" == true ]]; then
-    BUILD_ARGS+=(--no-cache)
-    echo "Note: Building without cache"
-fi
-
-# Use docker buildx with a custom builder for parallelism control
-# The legacy DOCKER_BUILDKIT=1 docker build doesn't respect BUILDKIT_MAX_PARALLELISM env var
-# We need to use a custom builder with a buildkitd.toml config file
-BUILDER_NAME="llama-swap-builder"
-
-# Check if our custom builder exists with the right config, create/update if needed
-if ! docker buildx inspect "$BUILDER_NAME" >/dev/null 2>&1; then
-    echo "Creating custom buildx builder with max-parallelism=1..."
-    
-    # Create buildkitd.toml config file
-    cat > buildkitd.toml << 'BUILDKIT_EOF'
-[worker.oci]
-  max-parallelism = 1
-BUILDKIT_EOF
-    
-    # Create the builder with the config
-    docker buildx create --name "$BUILDER_NAME" \
-        --driver docker-container \
-        --buildkitd-config buildkitd.toml \
-        --use
-else
-    # Switch to our builder
-    docker buildx use "$BUILDER_NAME"
-fi
-
-echo "Building with sequential stages (one at a time), each using all CPU cores..."
-echo "Using builder: $BUILDER_NAME"
-
-# Use docker buildx build with --load to load the image into Docker
-# The --builder flag ensures we use our custom builder with max-parallelism=1
-# Build context is the repository root so we can access Go source files
-docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" "${REPO_ROOT}"
-
-echo ""
-echo "=========================================="
-echo "Verifying build artifacts..."
-echo "=========================================="
-echo ""
-
-# Verify all expected binaries exist in the image
-MISSING_BINARIES=()
-
-for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do
-    if ! docker run --rm "${DOCKER_IMAGE_TAG}" which "${binary}" >/dev/null 2>&1; then
-        MISSING_BINARIES+=("${binary}")
-    fi
-done
-
-if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
-    echo "ERROR: Build succeeded but the following binaries are missing from the image:"
-    for binary in "${MISSING_BINARIES[@]}"; do
-        echo "  - ${binary}"
-    done
-    echo ""
-    echo "This usually indicates a build stage failure. Try running with --no-cache flag:"
-    echo "  ./build-image.sh --vulkan --no-cache"
-    exit 1
-fi
-
-echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
-
-echo ""
-echo "=========================================="
-echo "Build complete!"
-echo "=========================================="
-echo ""
-echo "Image tag: ${DOCKER_IMAGE_TAG}"
-echo ""
-echo "Built with:"
-echo "  llama.cpp:           ${LLAMA_HASH}"
-echo "  whisper.cpp:         ${WHISPER_HASH}"
-echo "  stable-diffusion.cpp: ${SD_HASH}"
-echo "  llama-swap:          $(docker run --rm "${DOCKER_IMAGE_TAG}" cat /versions.txt | grep llama-swap | cut -d' ' -f2-)"
-echo ""
-if [[ "$BACKEND" == "vulkan" ]]; then
-    echo "Run with:"
-    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
-    echo ""
-    echo "Note: For AMD GPUs, you may also need to mount render devices:"
-    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
-else
-    echo "Run with:"
-    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
-fi
@@ -1,203 +0,0 @@
-# Unified multi-stage Dockerfile for AI inference tools
-# Supports CUDA and Vulkan backends via BACKEND build arg
-#
-# Usage:
-#   docker buildx build --build-arg BACKEND=cuda -t llama-swap:unified-cuda .
-#   docker buildx build --build-arg BACKEND=vulkan -t llama-swap:unified-vulkan .
-#   docker buildx build --build-arg BACKEND=cuda --build-arg CMAKE_CUDA_ARCHITECTURES="86;89" -t llama-swap:unified-cuda .
-#
-# Each project has its own install script that handles cloning, building,
-# and installing binaries. Build stages are independent for cache efficiency.
-
-ARG BACKEND=cuda
-
-# ── Builder bases ──────────────────────────────────────────────────────
-
-FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder-base-cuda
-
-ARG CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
-ENV DEBIAN_FRONTEND=noninteractive
-ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
-ENV CCACHE_DIR=/ccache
-ENV CCACHE_MAXSIZE=2G
-ENV PATH="/usr/lib/ccache:${PATH}"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake git python3 python3-pip libssl-dev \
-    curl ca-certificates ccache make wget \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /build
-
-# ──
-
-FROM ubuntu:24.04 AS builder-base-vulkan
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV CCACHE_DIR=/ccache
-ENV CCACHE_MAXSIZE=2G
-ENV PATH="/usr/lib/ccache:${PATH}"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake git python3 python3-pip libssl-dev \
-    curl ca-certificates ccache make wget software-properties-common \
-    libvulkan-dev glslang-tools spirv-tools vulkan-validationlayers glslc \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /build
-
-# ── Select builder base by BACKEND ────────────────────────────────────
-
-FROM builder-base-${BACKEND} AS builder-base
-
-# ── Build whisper.cpp (fastest build, run first) ──────────────────────
-
-FROM builder-base AS whisper-build
-ARG BACKEND=cuda
-ARG WHISPER_COMMIT_HASH=master
-COPY install-whisper.sh /build/
-RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
-    --mount=type=cache,id=whisper-${BACKEND},target=/src/whisper.cpp/build \
-    BACKEND=${BACKEND} bash /build/install-whisper.sh "${WHISPER_COMMIT_HASH}"
-
-# ── Build stable-diffusion.cpp ────────────────────────────────────────
-
-FROM builder-base AS sd-build
-ARG BACKEND=cuda
-ARG SD_COMMIT_HASH=master
-COPY install-sd.sh /build/
-RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
-    --mount=type=cache,id=sd-${BACKEND},target=/src/stable-diffusion.cpp/build \
-    BACKEND=${BACKEND} bash /build/install-sd.sh "${SD_COMMIT_HASH}"
-
-# ── Build llama.cpp (slowest build, run last) ─────────────────────────
-
-FROM builder-base AS llama-build
-ARG BACKEND=cuda
-ARG LLAMA_COMMIT_HASH=master
-COPY install-llama.sh /build/
-RUN --mount=type=cache,id=ccache-${BACKEND},target=/ccache \
-    --mount=type=cache,id=llama-${BACKEND},target=/src/llama.cpp/build \
-    BACKEND=${BACKEND} bash /build/install-llama.sh "${LLAMA_COMMIT_HASH}"
-
-# ── Build ik_llama.cpp (CUDA only) ────────────────────────────────────
-#
-# Two named stages allow ARG BACKEND to select at build time:
-#   - ik-llama-cuda  : real build (from builder-base-cuda)
-#   - ik-llama-vulkan: no-op (empty /install/bin, skips CUDA pull entirely)
-# BuildKit only evaluates the selected branch, so vulkan builds never
-# pull nvidia/cuda:*-devel or compile ik_llama.cpp.
-
-FROM builder-base-vulkan AS ik-llama-vulkan
-RUN mkdir -p /install/bin
-
-FROM builder-base-cuda AS ik-llama-cuda
-ARG IK_LLAMA_COMMIT_HASH=main
-COPY install-ik-llama.sh /build/
-RUN --mount=type=cache,id=ccache-cuda,target=/ccache \
-    --mount=type=cache,id=ik-llama-cuda,target=/src/ik_llama.cpp/build \
-    bash /build/install-ik-llama.sh "${IK_LLAMA_COMMIT_HASH}"
-
-ARG BACKEND=cuda
-FROM ik-llama-${BACKEND} AS ik-llama-build
-
-# ── Download llama-swap release binary ────────────────────────────────
-
-FROM builder-base AS llama-swap-download
-ARG LS_VERSION=latest
-COPY install-llama-swap.sh /build/
-RUN bash /build/install-llama-swap.sh "${LS_VERSION}"
-
-# ── Runtime bases ─────────────────────────────────────────────────────
-
-FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS runtime-cuda
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
-ENV PATH="/usr/local/bin:${PATH}"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgomp1 python3 curl ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-# CUDA stub drivers for container compatibility
-COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so
-COPY --from=builder-base-cuda /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-
-# ──
-
-FROM ubuntu:24.04 AS runtime-vulkan
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PATH="/usr/local/bin:${PATH}"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgomp1 libvulkan1 mesa-vulkan-drivers \
-    python3 curl ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-# ── Select runtime base by BACKEND ────────────────────────────────────
-
-FROM runtime-${BACKEND} AS runtime
-
-ARG BACKEND=cuda
-ARG LLAMA_COMMIT_HASH=unknown
-ARG WHISPER_COMMIT_HASH=unknown
-ARG SD_COMMIT_HASH=unknown
-ARG IK_LLAMA_COMMIT_HASH=unknown
-ARG RUN_UID=0
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3-numpy python3-sentencepiece \
-    && rm -rf /var/lib/apt/lists/*
-
-# Create non-root user when RUN_UID != 0
-RUN if [ "$RUN_UID" != "0" ]; then \
-      groupadd --system --gid $RUN_UID llama-swap && \
-      useradd --system --uid $RUN_UID --gid $RUN_UID \
-        --home /app --shell /sbin/nologin llama-swap; \
-    fi && \
-    mkdir -p /etc/llama-swap/config && \
-    chown -R ${RUN_UID}:${RUN_UID} /etc/llama-swap
-
-WORKDIR /app
-
-# Copy whisper.cpp binaries and libraries
-COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/
-COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/
-COPY --from=whisper-build /install/lib/ /usr/local/lib/
-
-# Copy stable-diffusion.cpp binaries and libraries
-COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
-COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
-COPY --from=sd-build /install/lib/ /usr/local/lib/
-
-# Copy llama.cpp binaries (statically linked)
-COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
-COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
-
-# Copy ik-llama-server (CUDA only; empty copy for vulkan)
-COPY --from=ik-llama-build /install/bin/ /usr/local/bin/
-
-# Copy llama-swap binary
-COPY --from=llama-swap-download /install/bin/llama-swap /usr/local/bin/
-COPY --from=llama-swap-download /install/llama-swap-version /tmp/
-
-RUN ldconfig
-
-COPY config.example.yaml /etc/llama-swap/config/config.yaml
-
-# Version tracking
-RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
-    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
-    echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
-    echo "ik_llama.cpp: ${IK_LLAMA_COMMIT_HASH}" >> /versions.txt && \
-    echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \
-    echo "backend: ${BACKEND}" >> /versions.txt && \
-    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
-
-RUN mkdir -p /models && chown ${RUN_UID}:${RUN_UID} /models
-WORKDIR /models
-USER ${RUN_UID}
-ENTRYPOINT ["llama-swap"]
-CMD ["-config", "/etc/llama-swap/config/config.yaml", "-listen", "0.0.0.0:8080"]
@@ -1,8 +0,0 @@
-# Unified Docker Container
-
-These scripts create a custom llama-swap container that contains:
-
- llama-server for LLMs, rerank and embedding model support
- sd-server (stable-diffusion.cpp) for image generation
- whisper.cpp for ASR
-
@@ -1,283 +0,0 @@
-#!/bin/bash
-#
-# Build script for unified container with version pinning
-#
-# Usage:
-#   ./build-image.sh --cuda                              # Build CUDA image
-#   ./build-image.sh --vulkan                            # Build Vulkan image
-#   ./build-image.sh --cuda --no-cache                   # Build without cache
-#   LLAMA_REF=b1234 ./build-image.sh --vulkan            # Pin llama.cpp to a commit hash
-#   LLAMA_REF=v1.2.3 ./build-image.sh --cuda             # Pin llama.cpp to a tag
-#   WHISPER_REF=v1.0.0 ./build-image.sh --vulkan         # Pin whisper.cpp to a tag
-#   SD_REF=master ./build-image.sh --cuda                # Pin stable-diffusion.cpp to a branch
-#   LS_VERSION=170 ./build-image.sh --cuda               # Override llama-swap version
-#   IK_LLAMA_REF=main ./build-image.sh --cuda            # Pin ik_llama.cpp to main branch (CUDA only)
-#
-
-set -euo pipefail
-
-BACKEND=""
-NO_CACHE=false
-
-for arg in "$@"; do
-    case $arg in
-        --cuda)
-            BACKEND="cuda"
-            ;;
-        --vulkan)
-            BACKEND="vulkan"
-            ;;
-        --no-cache)
-            NO_CACHE=true
-            ;;
-        --help|-h)
-            echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
-            echo ""
-            echo "Options:"
-            echo "  --cuda      Build CUDA image (NVIDIA GPUs)"
-            echo "  --vulkan    Build Vulkan image (AMD GPUs and compatible hardware)"
-            echo "  --no-cache  Force rebuild without using Docker cache"
-            echo "  --help, -h  Show this help message"
-            echo ""
-            echo "Environment variables:"
-            echo "  DOCKER_IMAGE_TAG     Set custom image tag (default: llama-swap:unified-cuda or llama-swap:unified-vulkan)"
-            echo "  LLAMA_REF            Pin llama.cpp to a commit, tag, or branch"
-            echo "  WHISPER_REF          Pin whisper.cpp to a commit, tag, or branch"
-            echo "  SD_REF               Pin stable-diffusion.cpp to a commit, tag, or branch"
-            echo "  IK_LLAMA_REF         Pin ik_llama.cpp to a commit, tag, or branch (CUDA only)"
-            echo "  LS_VERSION           Override llama-swap version (e.g., '170' or 'latest')"
-            exit 0
-            ;;
-    esac
-done
-
-if [[ -z "$BACKEND" ]]; then
-    echo "Error: No backend specified. Please use --cuda or --vulkan."
-    echo ""
-    echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]"
-    exit 1
-fi
-
-DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-llama-swap:unified-${BACKEND}}"
-
-# Git repository URLs
-LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git"
-WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git"
-SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git"
-LLAMA_SWAP_REPO="https://github.com/mostlygeek/llama-swap.git"
-IK_LLAMA_REPO="https://github.com/ikawrakow/ik_llama.cpp.git"
-
-# Resolve a git ref (commit hash, tag, or branch) to a full commit hash.
-# Requires only: git, network access to the remote.
-resolve_ref() {
-    local repo_url="$1"
-    local ref="$2"
-
-    # Full 40-char SHA — use as-is
-    if [[ "${ref}" =~ ^[0-9a-f]{40}$ ]]; then
-        echo "${ref}"
-        return
-    fi
-
-    # Try tag then branch (exact match)
-    local hash
-    hash=$(git ls-remote "${repo_url}" "refs/tags/${ref}" "refs/heads/${ref}" 2>/dev/null | head -1 | cut -f1)
-    if [[ -n "${hash}" ]]; then
-        echo "${hash}"
-        return
-    fi
-
-    # Short hash (7+ chars): scan all refs for a SHA with this prefix
-    if [[ "${ref}" =~ ^[0-9a-f]{7,}$ ]]; then
-        hash=$(git ls-remote "${repo_url}" 2>/dev/null | grep "^${ref}" | head -1 | cut -f1)
-        if [[ -n "${hash}" ]]; then
-            echo "${hash}"
-            return
-        fi
-    fi
-
-    echo "ERROR: Could not resolve ref '${ref}' for ${repo_url}" >&2
-    if [[ "${ref}" =~ ^[0-9a-f]+$ && ${#ref} -lt 7 ]]; then
-        echo "  Short hashes must be at least 7 characters (got ${#ref})." >&2
-    else
-        echo "  Tried: tag, branch, git ls-remote prefix match" >&2
-    fi
-    echo "  Use a full 40-char SHA, a tag name, a branch name, or a 7+ char short hash." >&2
-    return 1
-}
-
-# Resolve HEAD of a repo without needing to know the default branch name.
-get_latest_hash() {
-    git ls-remote "${1}" HEAD 2>/dev/null | head -1 | cut -f1
-}
-
-echo "=========================================="
-echo "llama-swap Unified Build (${BACKEND})"
-echo "=========================================="
-echo ""
-
-# Resolve llama.cpp ref
-if [[ -n "${LLAMA_REF:-}" ]]; then
-    LLAMA_HASH=$(resolve_ref "${LLAMA_REPO}" "${LLAMA_REF}") || exit 1
-    echo "llama.cpp: ${LLAMA_REF} -> ${LLAMA_HASH}"
-else
-    LLAMA_HASH=$(get_latest_hash "${LLAMA_REPO}")
-    if [[ -z "${LLAMA_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for llama.cpp" >&2
-        exit 1
-    fi
-    echo "llama.cpp: latest HEAD: ${LLAMA_HASH}"
-fi
-
-# Resolve whisper.cpp ref
-if [[ -n "${WHISPER_REF:-}" ]]; then
-    WHISPER_HASH=$(resolve_ref "${WHISPER_REPO}" "${WHISPER_REF}") || exit 1
-    echo "whisper.cpp: ${WHISPER_REF} -> ${WHISPER_HASH}"
-else
-    WHISPER_HASH=$(get_latest_hash "${WHISPER_REPO}")
-    if [[ -z "${WHISPER_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for whisper.cpp" >&2
-        exit 1
-    fi
-    echo "whisper.cpp: latest HEAD: ${WHISPER_HASH}"
-fi
-
-# Resolve stable-diffusion.cpp ref
-if [[ -n "${SD_REF:-}" ]]; then
-    SD_HASH=$(resolve_ref "${SD_REPO}" "${SD_REF}") || exit 1
-    echo "stable-diffusion.cpp: ${SD_REF} -> ${SD_HASH}"
-else
-    SD_HASH=$(get_latest_hash "${SD_REPO}")
-    if [[ -z "${SD_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2
-        exit 1
-    fi
-    echo "stable-diffusion.cpp: latest HEAD: ${SD_HASH}"
-fi
-
-# Resolve ik_llama.cpp ref (CUDA only)
-if [[ "$BACKEND" == "cuda" ]]; then
-    if [[ -n "${IK_LLAMA_REF:-}" ]]; then
-        IK_LLAMA_HASH=$(resolve_ref "${IK_LLAMA_REPO}" "${IK_LLAMA_REF}") || exit 1
-        echo "ik_llama.cpp: ${IK_LLAMA_REF} -> ${IK_LLAMA_HASH}"
-    else
-        IK_LLAMA_HASH=$(get_latest_hash "${IK_LLAMA_REPO}")
-        if [[ -z "${IK_LLAMA_HASH}" ]]; then
-            echo "ERROR: Could not determine latest commit for ik_llama.cpp" >&2
-            exit 1
-        fi
-        echo "ik_llama.cpp: latest HEAD: ${IK_LLAMA_HASH}"
-    fi
-else
-    IK_LLAMA_HASH="n/a"
-    echo "ik_llama.cpp: skipped (vulkan build)"
-fi
-
-# Resolve llama-swap ref
-if [[ -n "${LS_VERSION:-}" ]]; then
-    LS_HASH=$(resolve_ref "${LLAMA_SWAP_REPO}" "${LS_VERSION}") || exit 1
-    echo "llama-swap: ${LS_VERSION} -> ${LS_HASH}"
-else
-    LS_HASH=$(get_latest_hash "${LLAMA_SWAP_REPO}")
-    if [[ -z "${LS_HASH}" ]]; then
-        echo "ERROR: Could not determine latest commit for llama-swap" >&2
-        exit 1
-    fi
-    echo "llama-swap: latest HEAD: ${LS_HASH}"
-fi
-
-echo ""
-echo "=========================================="
-echo "Starting Docker build..."
-echo "=========================================="
-echo ""
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-BUILD_ARGS=(
-    --build-arg "BACKEND=${BACKEND}"
-    --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}"
-    --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}"
-    --build-arg "SD_COMMIT_HASH=${SD_HASH}"
-    --build-arg "IK_LLAMA_COMMIT_HASH=${IK_LLAMA_HASH}"
-    --build-arg "LS_VERSION=${LS_HASH}"
-    --build-arg "RUN_UID=${RUN_UID:-0}"
-    -t "${DOCKER_IMAGE_TAG}"
-    -f "${SCRIPT_DIR}/Dockerfile"
-)
-
-if [[ "$NO_CACHE" == true ]]; then
-    BUILD_ARGS+=(--no-cache)
-    echo "Note: Building without cache"
-elif [[ "${GITHUB_ACTIONS:-}" == "true" && "${ACT:-}" != "true" ]]; then
-    CACHE_REF="ghcr.io/mostlygeek/llama-swap:unified-${BACKEND}-cache"
-    BUILD_ARGS+=(
-        --cache-from "type=registry,ref=${CACHE_REF}"
-        --cache-to "type=registry,ref=${CACHE_REF},mode=max"
-    )
-    echo "Note: Using registry cache (${CACHE_REF})"
-fi
-
-DOCKER_BUILDKIT=1 docker buildx build --load "${BUILD_ARGS[@]}" "${SCRIPT_DIR}"
-
-echo ""
-echo "=========================================="
-echo "Verifying build artifacts..."
-echo "=========================================="
-echo ""
-
-EXPECTED_BINARIES=(llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap)
-if [[ "$BACKEND" == "cuda" ]]; then
-    EXPECTED_BINARIES+=(ik-llama-server)
-fi
-
-MISSING_BINARIES=()
-for binary in "${EXPECTED_BINARIES[@]}"; do
-    if ! docker run --rm --entrypoint which "${DOCKER_IMAGE_TAG}" "${binary}" >/dev/null 2>&1; then
-        MISSING_BINARIES+=("${binary}")
-    fi
-done
-
-if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then
-    echo "ERROR: Build succeeded but the following binaries are missing:"
-    for binary in "${MISSING_BINARIES[@]}"; do
-        echo "  - ${binary}"
-    done
-    echo ""
-    echo "Try running with --no-cache flag:"
-    echo "  ./build-image.sh --${BACKEND} --no-cache"
-    exit 1
-fi
-
-VERIFIED_LIST="llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap"
-if [[ "$BACKEND" == "cuda" ]]; then
-    VERIFIED_LIST="${VERIFIED_LIST}, ik-llama-server"
-fi
-echo "All expected binaries verified: ${VERIFIED_LIST}"
-
-echo ""
-echo "=========================================="
-echo "Build complete!"
-echo "=========================================="
-echo ""
-echo "Image tag: ${DOCKER_IMAGE_TAG}"
-echo ""
-echo "Built with:"
-echo "  llama.cpp:            ${LLAMA_HASH}"
-echo "  whisper.cpp:          ${WHISPER_HASH}"
-echo "  stable-diffusion.cpp: ${SD_HASH}"
-if [[ "$BACKEND" == "cuda" ]]; then
-    echo "  ik_llama.cpp:         ${IK_LLAMA_HASH}"
-fi
-echo "  llama-swap:           $(docker run --rm --entrypoint cat "${DOCKER_IMAGE_TAG}" /versions.txt | grep llama-swap | cut -d' ' -f2-)"
-echo ""
-if [[ "$BACKEND" == "vulkan" ]]; then
-    echo "Run with:"
-    echo "  docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}"
-    echo ""
-    echo "Note: For AMD GPUs, you may also need:"
-    echo "  docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}"
-else
-    echo "Run with:"
-    echo "  docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}"
-fi
@@ -1,33 +0,0 @@
-# placeholder example configuration
-healthCheckTimeout: 300
-logRequests: true
-
-models:
-  "llama":
-    cmd: >
-      llama-server
-      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
-      --port ${PORT}
-
-  "whisper":
-    checkEndpoint: /v1/audio/transcriptions/
-    cmd: >
-      whisper-server
-      --port ${PORT}
-      --m /models/whisper.bin
-      --flash-attn
-      --request-path /v1/audio/transcriptions --inference-path ""
-
-  "image":
-    checkEndpoint: /
-    cmd: |
-      /app/sd-server
-      --listen-port 9999
-      --diffusion-fa
-      --diffusion-model /models/z_image_turbo-Q8_0.gguf
-      --vae /models/ae.safetensors
-      --llm /models/qwen3-4b-instruct-2507-q8_0.gguf
-      --offload-to-cpu
-      --cfg-scale 1.0
-      --height 512 --width 512
-      --steps 8
@@ -1,48 +0,0 @@
-#!/bin/bash
-# Install ik_llama.cpp - clone, build, and install binaries
-# Usage: ./install-ik-llama.sh <commit_hash>
-# Note: CUDA only; always built against builder-base-cuda
-set -e
-
-COMMIT_HASH="${1:-main}"
-
-mkdir -p /install/bin
-
-# Clone and checkout (init-based so cache-mounted build dir doesn't break clone)
-echo "=== Cloning ik_llama.cpp at ${COMMIT_HASH} ==="
-mkdir -p /src/ik_llama.cpp
-cd /src/ik_llama.cpp
-if [ ! -d .git ]; then
-    git init
-    git remote add origin https://github.com/ikawrakow/ik_llama.cpp.git
-fi
-git fetch --depth=1 origin "${COMMIT_HASH}"
-git checkout FETCH_HEAD
-
-CMAKE_FLAGS=(
-    -DGGML_NATIVE=OFF
-    -DBUILD_SHARED_LIBS=OFF
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_C_COMPILER_LAUNCHER=ccache
-    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-    -DGGML_CUDA=ON
-    "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
-    "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
-    "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda -Wl,--allow-shlib-undefined"
-)
-
-rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
-
-echo "=== Building ik_llama.cpp ==="
-cmake -B build "${CMAKE_FLAGS[@]}"
-cmake --build build --config Release -j"$(nproc)" --target llama-server
-
-if [ ! -f "build/bin/llama-server" ]; then
-    echo "FATAL: llama-server not found in build/bin/" >&2
-    exit 1
-fi
-
-# Install as ik-llama-server to avoid collision with llama.cpp's llama-server
-cp "build/bin/llama-server" "/install/bin/ik-llama-server"
-echo "=== ik_llama.cpp build complete ==="
-ls -la /install/bin/
@@ -1,59 +0,0 @@
-#!/bin/bash
-# Install llama-swap - download latest release binary from GitHub
-# Usage: ./install-llama-swap.sh [version]
-#   version: release version number (e.g., "170") or "latest" (default)
-set -e
-
-VERSION="${1:-latest}"
-REPO="mostlygeek/llama-swap"
-
-mkdir -p /install/bin
-
-# If a full commit hash is given, find the release tag that points to it
-if echo "${VERSION}" | grep -qE '^[0-9a-f]{40}$'; then
-    echo "=== Resolving commit ${VERSION:0:7} to release tag ==="
-    TAG=$(git ls-remote --tags "https://github.com/${REPO}.git" 2>/dev/null \
-        | grep "^${VERSION}" | sed 's|.*refs/tags/||' | grep -v '\^{}' | head -1)
-    if [ -n "${TAG}" ]; then
-        echo "Resolved to tag: ${TAG}"
-        VERSION="${TAG#v}"
-    else
-        echo "No release tag found for commit ${VERSION:0:7}, using latest"
-        VERSION="latest"
-    fi
-fi
-
-# Strip leading 'v' prefix so both "198" and "v198" work
-VERSION="${VERSION#v}"
-
-# Resolve "latest" to actual version number
-if [ "$VERSION" = "latest" ]; then
-    echo "=== Resolving latest llama-swap release ==="
-    VERSION=$(curl -fsSL "https://api.github.com/repos/${REPO}/releases/latest" \
-        | grep '"tag_name"' | head -1 | cut -d'"' -f4 | sed 's/^v//')
-    if [ -z "$VERSION" ]; then
-        echo "FATAL: Could not determine latest release version" >&2
-        exit 1
-    fi
-    echo "Latest version: ${VERSION}"
-fi
-
-# Download and extract
-URL="https://github.com/${REPO}/releases/download/v${VERSION}/llama-swap_${VERSION}_linux_amd64.tar.gz"
-echo "=== Downloading llama-swap v${VERSION} ==="
-echo "URL: $URL"
-curl -fSL -o /tmp/llama-swap.tar.gz "$URL"
-tar -xzf /tmp/llama-swap.tar.gz -C /install/bin/
-rm /tmp/llama-swap.tar.gz
-
-# Validate
-if [ ! -x "/install/bin/llama-swap" ]; then
-    echo "FATAL: llama-swap binary not found or not executable" >&2
-    ls -la /install/bin/ >&2
-    exit 1
-fi
-
-echo "$VERSION" > /install/llama-swap-version
-
-echo "=== llama-swap v${VERSION} installed ==="
-ls -la /install/bin/llama-swap
@@ -1,63 +0,0 @@
-#!/bin/bash
-# Install llama.cpp - clone, build, and install binaries
-# Usage: BACKEND=cuda|vulkan ./install-llama.sh <commit_hash>
-set -e
-
-COMMIT_HASH="${1:-master}"
-BACKEND="${BACKEND:-cuda}"
-
-mkdir -p /install/bin
-
-# Clone and checkout (init-based so cache-mounted /src/llama.cpp/build dir doesn't break clone)
-echo "=== Cloning llama.cpp at ${COMMIT_HASH} ==="
-mkdir -p /src/llama.cpp
-cd /src/llama.cpp
-if [ ! -d .git ]; then
-    git init
-    git remote add origin https://github.com/ggml-org/llama.cpp.git
-fi
-git fetch --depth=1 origin "${COMMIT_HASH}"
-git checkout FETCH_HEAD
-
-# Common cmake flags
-CMAKE_FLAGS=(
-    -DGGML_NATIVE=OFF
-    -DBUILD_SHARED_LIBS=OFF
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_C_COMPILER_LAUNCHER=ccache
-    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-    -DLLAMA_BUILD_TESTS=OFF
-)
-
-if [ "$BACKEND" = "cuda" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=ON
-        -DGGML_VULKAN=OFF
-        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
-        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
-        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
-    )
-elif [ "$BACKEND" = "vulkan" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=OFF
-        -DGGML_VULKAN=ON
-    )
-fi
-
-TARGETS=(llama-cli llama-server)
-
-rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
-
-echo "=== Building llama.cpp for ${BACKEND} ==="
-cmake -B build "${CMAKE_FLAGS[@]}"
-cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
-
-for bin in "${TARGETS[@]}"; do
-    if [ ! -f "build/bin/$bin" ]; then
-        echo "FATAL: $bin not found in build/bin/" >&2
-        exit 1
-    fi
-    cp "build/bin/$bin" "/install/bin/"
-done
-echo "=== llama.cpp build complete ==="
-ls -la /install/bin/
@@ -1,68 +0,0 @@
-#!/bin/bash
-# Install stable-diffusion.cpp - clone, build, and install binaries and library
-# Usage: BACKEND=cuda|vulkan ./install-sd.sh <commit_hash>
-set -e
-
-COMMIT_HASH="${1:-master}"
-BACKEND="${BACKEND:-cuda}"
-
-mkdir -p /install/bin /install/lib
-
-# Clone and checkout (init-based so cache-mounted /src/stable-diffusion.cpp/build dir doesn't break clone)
-echo "=== Cloning stable-diffusion.cpp at ${COMMIT_HASH} ==="
-mkdir -p /src/stable-diffusion.cpp
-cd /src/stable-diffusion.cpp
-if [ ! -d .git ]; then
-    git init
-    git remote add origin https://github.com/leejet/stable-diffusion.cpp.git
-fi
-git fetch --depth=1 origin "${COMMIT_HASH}"
-git checkout FETCH_HEAD
-git submodule update --init --recursive --depth=1
-
-# Common cmake flags
-CMAKE_FLAGS=(
-    -DGGML_NATIVE=OFF
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_C_COMPILER_LAUNCHER=ccache
-    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-    -DSD_BUILD_EXAMPLES=ON
-)
-
-if [ "$BACKEND" = "cuda" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=ON
-        -DGGML_VULKAN=OFF
-        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
-        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
-        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
-        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
-        -DSD_CUDA=ON
-    )
-elif [ "$BACKEND" = "vulkan" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=OFF
-        -DGGML_VULKAN=ON
-        -DSD_VULKAN=ON
-    )
-fi
-
-TARGETS=(stable-diffusion sd-cli sd-server)
-
-rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
-
-echo "=== Building stable-diffusion.cpp for ${BACKEND} ==="
-cmake -B build "${CMAKE_FLAGS[@]}"
-cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
-
-for bin in sd-cli sd-server; do
-    if [ ! -f "build/bin/$bin" ]; then
-        echo "FATAL: $bin not found in build/bin/" >&2
-        exit 1
-    fi
-    cp "build/bin/$bin" "/install/bin/"
-done
-find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
-
-echo "=== stable-diffusion.cpp build complete ==="
-ls -la /install/bin/ /install/lib/
@@ -1,64 +0,0 @@
-#!/bin/bash
-# Install whisper.cpp - clone, build, and install binaries
-# Usage: BACKEND=cuda|vulkan ./install-whisper.sh <commit_hash>
-set -e
-
-COMMIT_HASH="${1:-master}"
-BACKEND="${BACKEND:-cuda}"
-
-mkdir -p /install/bin /install/lib
-
-# Clone and checkout (init-based so cache-mounted /src/whisper.cpp/build dir doesn't break clone)
-echo "=== Cloning whisper.cpp at ${COMMIT_HASH} ==="
-mkdir -p /src/whisper.cpp
-cd /src/whisper.cpp
-if [ ! -d .git ]; then
-    git init
-    git remote add origin https://github.com/ggml-org/whisper.cpp.git
-fi
-git fetch --depth=1 origin "${COMMIT_HASH}"
-git checkout FETCH_HEAD
-
-# Common cmake flags
-CMAKE_FLAGS=(
-    -DGGML_NATIVE=OFF
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_C_COMPILER_LAUNCHER=ccache
-    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-)
-
-if [ "$BACKEND" = "cuda" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=ON
-        -DGGML_VULKAN=OFF
-        "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES:?CMAKE_CUDA_ARCHITECTURES must be set}"
-        "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
-        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
-        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda"
-    )
-elif [ "$BACKEND" = "vulkan" ]; then
-    CMAKE_FLAGS+=(
-        -DGGML_CUDA=OFF
-        -DGGML_VULKAN=ON
-    )
-fi
-
-TARGETS=(whisper-cli whisper-server)
-
-rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true
-
-echo "=== Building whisper.cpp for ${BACKEND} ==="
-cmake -B build "${CMAKE_FLAGS[@]}"
-cmake --build build --config Release -j"$(nproc)" --target "${TARGETS[@]}"
-
-for bin in "${TARGETS[@]}"; do
-    if [ ! -f "build/bin/$bin" ]; then
-        echo "FATAL: $bin not found in build/bin/" >&2
-        exit 1
-    fi
-    cp "build/bin/$bin" "/install/bin/"
-done
-find build -name "*.so*" -type f -exec cp {} /install/lib/ \;
-
-echo "=== whisper.cpp build complete ==="
-ls -la /install/bin/
@@ -319,29 +319,6 @@ models:
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

-    # timeouts: configure proxy connection timeouts for this model
-    # - optional, defaults shown below
-    # - useful for models on slower hardware that need longer timeouts
-    # - increase responseHeader to avoid "timeout awaiting response headers" errors
-    # - set any value to 0 to disable that timeout (not recommended)
-    timeouts:
-      # connect: TCP connection timeout in seconds
-      # - default: 30
-      connect: 30
-
-      # responseHeader: time to wait for response headers in seconds
-      # - default: 60
-      # - for slow image generation or large models, consider increasing to 300+ seconds
-      responseHeader: 60
-
-      # tlsHandshake: TLS handshake timeout in seconds
-      # - default: 10
-      tlsHandshake: 10
-
-      # idleConn: idle connection timeout in seconds
-      # - default: 90
-      idleConn: 90
-
    # sendLoadingState: overrides the global sendLoadingState setting for this model
    # - optional, default: undefined (use global setting)
    sendLoadingState: false
@@ -467,17 +444,6 @@ peers:
    # - required
    # - requested path to llama-swap will be appended to the end of the proxy value
    proxy: http://192.168.1.23
-
-    # timeouts: configure proxy connection timeouts for this peer
-    # - optional, defaults shown below
-    # - useful when the peer runs on slower hardware
-    # - set any value to 0 to disable that timeout (not recommended)
-    timeouts:
-      connect: 30
-      responseHeader: 60
-      tlsHandshake: 10
-      idleConn: 90
-
    # models: a list of models served by the peer
    # - required
    models:
@@ -1,6 +1,6 @@
 module github.com/mostlygeek/llama-swap

-go 1.26.1
+go 1.25.4

 require (
 	github.com/billziss-gh/golib v0.2.0
@@ -124,7 +124,6 @@ type Config struct {
 	LogToStdout        string                 `yaml:"logToStdout"`
 	MetricsMaxInMemory int                    `yaml:"metricsMaxInMemory"`
 	CaptureBuffer      int                    `yaml:"captureBuffer"`
-	GlobalTTL          int                    `yaml:"globalTTL"`
 	Models             map[string]ModelConfig `yaml:"models"` /* key is model ID */
 	Profiles           map[string][]string    `yaml:"profiles"`
 	Groups             map[string]GroupConfig `yaml:"groups"` /* key is group ID */
@@ -204,7 +203,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		LogToStdout:        LogToStdoutProxy,
 		MetricsMaxInMemory: 1000,
 		CaptureBuffer:      5,
-		GlobalTTL:          0,
 	}
 	if err = yaml.Unmarshal([]byte(yamlStr), &config); err != nil {
 		return Config{}, err
@@ -218,10 +216,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		return Config{}, fmt.Errorf("startPort must be greater than 1")
 	}

-	if config.GlobalTTL < 0 {
-		return Config{}, fmt.Errorf("globalTTL must be >= 0")
-	}
-
 	switch config.LogToStdout {
 	case LogToStdoutProxy, LogToStdoutUpstream, LogToStdoutBoth, LogToStdoutNone:
 	default:
@@ -261,15 +255,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		modelConfig.Cmd = StripComments(modelConfig.Cmd)
 		modelConfig.CmdStop = StripComments(modelConfig.CmdStop)

-		// set model TTL to globalTTL it is the default value
-		if modelConfig.UnloadAfter == MODEL_CONFIG_DEFAULT_TTL {
-			modelConfig.UnloadAfter = config.GlobalTTL
-		}
-
-		if modelConfig.UnloadAfter < 0 {
-			return Config{}, fmt.Errorf("model %s: invalid TTL value %d", modelId, modelConfig.UnloadAfter)
-		}
-
 		// Validate model macros
 		for _, macro := range modelConfig.Macros {
 			if err = validateMacro(macro.Name, macro.Value); err != nil {
@@ -308,26 +293,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
 			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroStr)
 			modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroStr)
-			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
-			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)
-
-			// Substitute macros in SetParamsByID keys and values
-			if len(modelConfig.Filters.SetParamsByID) > 0 {
-				newSetParamsByID := make(map[string]map[string]any, len(modelConfig.Filters.SetParamsByID))
-				for key, paramMap := range modelConfig.Filters.SetParamsByID {
-					newKey := strings.ReplaceAll(key, macroSlug, macroStr)
-					newValAny, err := substituteMacroInValue(any(paramMap), entry.Name, entry.Value)
-					if err != nil {
-						return Config{}, fmt.Errorf("model %s filters.setParamsByID: %s", modelId, err.Error())
-					}
-					newParamMap, ok := newValAny.(map[string]any)
-					if !ok {
-						return Config{}, fmt.Errorf("model %s filters.setParamsByID: unexpected type after macro substitution", modelId)
-					}
-					newSetParamsByID[newKey] = newParamMap
-				}
-				modelConfig.Filters.SetParamsByID = newSetParamsByID
-			}

 			// Substitute in metadata (type-preserving)
 			if len(modelConfig.Metadata) > 0 {
@@ -353,8 +318,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroStr)
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroStr)
 			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroStr)
-			modelConfig.Name = strings.ReplaceAll(modelConfig.Name, macroSlug, macroStr)
-			modelConfig.Description = strings.ReplaceAll(modelConfig.Description, macroSlug, macroStr)

 			if len(modelConfig.Metadata) > 0 {
 				result, err := substituteMacroInValue(modelConfig.Metadata, "PORT", nextPort)
@@ -374,8 +337,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			"proxy":               modelConfig.Proxy,
 			"checkEndpoint":       modelConfig.CheckEndpoint,
 			"filters.stripParams": modelConfig.Filters.StripParams,
-			"name":                modelConfig.Name,
-			"description":         modelConfig.Description,
 		}

 		for fieldName, fieldValue := range fieldMap {
@@ -398,34 +359,6 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			}
 		}

-		// Validate SetParamsByID keys and values
-		for key, paramMap := range modelConfig.Filters.SetParamsByID {
-			if matches := macroPatternRegex.FindAllStringSubmatch(key, -1); len(matches) > 0 {
-				return Config{}, fmt.Errorf("unknown macro '${%s}' found in model %s filters.setParamsByID key", matches[0][1], modelId)
-			}
-			if err := validateNestedForUnknownMacros(any(paramMap), fmt.Sprintf("model %s filters.setParamsByID[%s]", modelId, key)); err != nil {
-				return Config{}, err
-			}
-		}
-
-		// Auto-register setParamsByID keys as aliases (skip the model's own ID)
-		for key := range modelConfig.Filters.SetParamsByID {
-			if key == modelId {
-				continue
-			}
-			if _, exists := config.Models[key]; exists {
-				return Config{}, fmt.Errorf("model %s filters.setParamsByID: key '%s' conflicts with an existing model ID", modelId, key)
-			}
-			if existingModel, exists := config.aliases[key]; exists {
-				if existingModel != modelId {
-					return Config{}, fmt.Errorf("duplicate alias '%s' in model %s filters.setParamsByID, already used by model %s", key, modelId, existingModel)
-				}
-				continue // already registered as explicit alias for this model
-			}
-			config.aliases[key] = modelId
-			modelConfig.Aliases = append(modelConfig.Aliases, key)
-		}
-
 		if _, err := url.Parse(modelConfig.Proxy); err != nil {
 			return Config{}, fmt.Errorf("model %s: invalid proxy URL: %w", modelId, err)
 		}
@@ -187,13 +187,6 @@ groups:
 				Name:             "Model 1",
 				Description:      "This is model 1",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model2": {
 				Cmd:              "path/to/server --arg1 one",
@@ -202,13 +195,6 @@ groups:
 				Env:              []string{},
 				CheckEndpoint:    "/",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model3": {
 				Cmd:              "path/to/cmd --arg1 one",
@@ -217,13 +203,6 @@ groups:
 				Env:              []string{},
 				CheckEndpoint:    "/",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model4": {
 				Cmd:              "path/to/cmd --arg1 one",
@@ -232,13 +211,6 @@ groups:
 				Aliases:          []string{},
 				Env:              []string{},
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 		},
 		HealthCheckTimeout: 15,
@@ -6,7 +6,6 @@ import (
 	"testing"

 	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 )

 func TestConfig_GroupMemberIsUnique(t *testing.T) {
@@ -849,71 +848,6 @@ func TestConfig_APIKeys_EnvMacros(t *testing.T) {
 	})
 }

-func TestConfig_GlobalTTL(t *testing.T) {
-	t.Run("globalTTL sets default for models", func(t *testing.T) {
-		content := `
-globalTTL: 300
-models:
-  model1:
-    cmd: server --port ${PORT}
-`
-		config, err := LoadConfigFromReader(strings.NewReader(content))
-		assert.NoError(t, err)
-		assert.Equal(t, 300, config.GlobalTTL)
-		assert.Equal(t, 300, config.Models["model1"].UnloadAfter)
-	})
-
-	t.Run("model ttl=0 overrides globalTTL", func(t *testing.T) {
-		content := `
-globalTTL: 300
-models:
-  model1:
-    cmd: server --port ${PORT}
-    ttl: 0
-`
-		config, err := LoadConfigFromReader(strings.NewReader(content))
-		assert.NoError(t, err)
-		assert.Equal(t, 0, config.Models["model1"].UnloadAfter)
-	})
-
-	t.Run("model explicit ttl overrides globalTTL", func(t *testing.T) {
-		content := `
-globalTTL: 300
-models:
-  model1:
-    cmd: server --port ${PORT}
-    ttl: 600
-`
-		config, err := LoadConfigFromReader(strings.NewReader(content))
-		assert.NoError(t, err)
-		assert.Equal(t, 600, config.Models["model1"].UnloadAfter)
-	})
-
-	t.Run("globalTTL defaults to 0", func(t *testing.T) {
-		content := `
-models:
-  model1:
-    cmd: server --port ${PORT}
-`
-		config, err := LoadConfigFromReader(strings.NewReader(content))
-		assert.NoError(t, err)
-		assert.Equal(t, 0, config.GlobalTTL)
-		assert.Equal(t, 0, config.Models["model1"].UnloadAfter)
-	})
-
-	t.Run("negative globalTTL rejected", func(t *testing.T) {
-		content := `
-globalTTL: -1
-models:
-  model1:
-    cmd: server --port ${PORT}
-`
-		_, err := LoadConfigFromReader(strings.NewReader(content))
-		assert.Error(t, err)
-		assert.Contains(t, err.Error(), "globalTTL must be >= 0")
-	})
-}
-
 func TestConfig_EnvMacros(t *testing.T) {
 	t.Run("basic env substitution in cmd", func(t *testing.T) {
 		t.Setenv("TEST_MODEL_PATH", "/opt/models")
@@ -1439,108 +1373,3 @@ models:
 	})

 }
-
-func TestConfig_TimeoutsParsing(t *testing.T) {
-	configYaml := `
-models:
-  model1:
-    cmd: test-server --port ${PORT}
-    timeouts:
-      connect: 45
-      responseHeader: 120
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(configYaml))
-	require.NoError(t, err)
-
-	modelConfig, found := config.Models["model1"]
-	require.True(t, found, "model1 should exist in config")
-
-	assert.Equal(t, 45, modelConfig.Timeouts.Connect)
-	assert.Equal(t, 120, modelConfig.Timeouts.ResponseHeader)
-}
-
-func TestConfig_TimeoutsDefaults(t *testing.T) {
-	configYaml := `
-models:
-  model1:
-    cmd: test-server --port ${PORT}
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(configYaml))
-	require.NoError(t, err)
-
-	modelConfig, found := config.Models["model1"]
-	require.True(t, found, "model1 should exist in config")
-
-	// Default values should be set during unmarshaling
-	assert.Equal(t, 30, modelConfig.Timeouts.Connect)
-	assert.Equal(t, 60, modelConfig.Timeouts.ResponseHeader)
-	assert.Equal(t, 10, modelConfig.Timeouts.TLSHandshake)
-	assert.Equal(t, 1, modelConfig.Timeouts.ExpectContinue)
-	assert.Equal(t, 90, modelConfig.Timeouts.IdleConn)
-}
-
-func TestConfig_TimeoutsZeroAllowed(t *testing.T) {
-	configYaml := `
-models:
-  model1:
-    cmd: test-server --port ${PORT}
-    timeouts:
-      connect: 0
-      responseHeader: 0
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(configYaml))
-	require.NoError(t, err)
-
-	modelConfig, found := config.Models["model1"]
-	require.True(t, found, "model1 should exist in config")
-
-	// Explicit 0 should be preserved (disables timeout)
-	assert.Equal(t, 0, modelConfig.Timeouts.Connect)
-	assert.Equal(t, 0, modelConfig.Timeouts.ResponseHeader)
-}
-
-func TestConfig_PeerTimeoutsParsing(t *testing.T) {
-	configYaml := `
-peers:
-  peer1:
-    proxy: http://example.com
-    models: [model1]
-    timeouts:
-      connect: 45
-      responseHeader: 120
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(configYaml))
-	require.NoError(t, err)
-
-	peerConfig, found := config.Peers["peer1"]
-	require.True(t, found, "peer1 should exist in config")
-
-	assert.Equal(t, 45, peerConfig.Timeouts.Connect)
-	assert.Equal(t, 120, peerConfig.Timeouts.ResponseHeader)
-}
-
-func TestConfig_PeerTimeoutsDefaults(t *testing.T) {
-	configYaml := `
-peers:
-  peer1:
-    proxy: http://example.com
-    models: [model1]
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(configYaml))
-	require.NoError(t, err)
-
-	peerConfig, found := config.Peers["peer1"]
-	require.True(t, found, "peer1 should exist in config")
-
-	// Default values should be set during unmarshaling
-	assert.Equal(t, 30, peerConfig.Timeouts.Connect)
-	assert.Equal(t, 60, peerConfig.Timeouts.ResponseHeader)
-	assert.Equal(t, 10, peerConfig.Timeouts.TLSHandshake)
-	assert.Equal(t, 1, peerConfig.Timeouts.ExpectContinue)
-	assert.Equal(t, 90, peerConfig.Timeouts.IdleConn)
-}
@@ -173,13 +173,6 @@ groups:
 				Env:              []string{"VAR1=value1", "VAR2=value2"},
 				CheckEndpoint:    "/health",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model2": {
 				Cmd:              "path/to/server --arg1 one",
@@ -189,13 +182,6 @@ groups:
 				Env:              []string{},
 				CheckEndpoint:    "/",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model3": {
 				Cmd:              "path/to/cmd --arg1 one",
@@ -205,13 +191,6 @@ groups:
 				Env:              []string{},
 				CheckEndpoint:    "/",
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 			"model4": {
 				Cmd:              "path/to/cmd --arg1 one",
@@ -221,13 +200,6 @@ groups:
 				Aliases:          []string{},
 				Env:              []string{},
 				SendLoadingState: &modelLoadingState,
-				Timeouts: TimeoutsConfig{
-					Connect:        30,
-					ResponseHeader: 60,
-					TLSHandshake:   10,
-					ExpectContinue: 1,
-					IdleConn:       90,
-				},
 			},
 		},
 		HealthCheckTimeout: 15,
@@ -20,12 +20,6 @@ type Filters struct {
 	// SetParams is a dictionary of parameters to set/override in requests
 	// Protected params (like "model") cannot be set
 	SetParams map[string]any `yaml:"setParams"`
-
-	// SetParamsByID maps requested model IDs to parameters to set/override in requests.
-	// Useful with aliases: a single loaded model can behave differently depending on
-	// which alias the client used. Applied after SetParams, so it can override those values.
-	// Protected params (like "model") cannot be set.
-	SetParamsByID map[string]map[string]any `yaml:"setParamsByID"`
 }

 // SanitizedStripParams returns a sorted list of parameters to strip,
@@ -57,33 +51,6 @@ func (f Filters) SanitizedStripParams() []string {
 	return cleaned
 }

-// SanitizedSetParamsByID returns the params to set for the given requestedModelID,
-// with protected params removed and keys sorted for consistent iteration order.
-// Returns nil if the ID has no entry or all its params are protected.
-func (f Filters) SanitizedSetParamsByID(requestedModelID string) (map[string]any, []string) {
-	if len(f.SetParamsByID) == 0 {
-		return nil, nil
-	}
-	params, found := f.SetParamsByID[requestedModelID]
-	if !found || len(params) == 0 {
-		return nil, nil
-	}
-	result := make(map[string]any, len(params))
-	keys := make([]string, 0, len(params))
-	for key, value := range params {
-		if slices.Contains(ProtectedParams, key) {
-			continue
-		}
-		result[key] = value
-		keys = append(keys, key)
-	}
-	sort.Strings(keys)
-	if len(result) == 0 {
-		return nil, nil
-	}
-	return result, keys
-}
-
 // SanitizedSetParams returns a copy of SetParams with protected params removed
 // and keys sorted for consistent iteration order
 func (f Filters) SanitizedSetParams() (map[string]any, []string) {
@@ -162,123 +162,6 @@ func TestFilters_SanitizedSetParams(t *testing.T) {
 	}
 }

-func TestFilters_SanitizedSetParamsByID(t *testing.T) {
-	tests := []struct {
-		name             string
-		setParamsByID    map[string]map[string]any
-		requestedModelID string
-		wantParams       map[string]any
-		wantKeys         []string
-	}{
-		{
-			name:             "empty SetParamsByID returns nil",
-			setParamsByID:    nil,
-			requestedModelID: "model1",
-			wantParams:       nil,
-			wantKeys:         nil,
-		},
-		{
-			name:             "empty map returns nil",
-			setParamsByID:    map[string]map[string]any{},
-			requestedModelID: "model1",
-			wantParams:       nil,
-			wantKeys:         nil,
-		},
-		{
-			name: "non-matching model ID returns nil",
-			setParamsByID: map[string]map[string]any{
-				"model2": {"temperature": 0.9},
-			},
-			requestedModelID: "model1",
-			wantParams:       nil,
-			wantKeys:         nil,
-		},
-		{
-			name: "matching model ID returns correct params",
-			setParamsByID: map[string]map[string]any{
-				"model1": {"temperature": 0.7, "top_p": 0.9},
-				"model2": {"temperature": 0.5},
-			},
-			requestedModelID: "model1",
-			wantParams: map[string]any{
-				"temperature": 0.7,
-				"top_p":       0.9,
-			},
-			wantKeys: []string{"temperature", "top_p"},
-		},
-		{
-			name: "protected param model is filtered out",
-			setParamsByID: map[string]map[string]any{
-				"model1": {
-					"model":       "should-be-filtered",
-					"temperature": 0.7,
-				},
-			},
-			requestedModelID: "model1",
-			wantParams: map[string]any{
-				"temperature": 0.7,
-			},
-			wantKeys: []string{"temperature"},
-		},
-		{
-			name: "only protected param returns nil",
-			setParamsByID: map[string]map[string]any{
-				"model1": {
-					"model": "should-be-filtered",
-				},
-			},
-			requestedModelID: "model1",
-			wantParams:       nil,
-			wantKeys:         nil,
-		},
-		{
-			name: "keys are sorted",
-			setParamsByID: map[string]map[string]any{
-				"model1": {
-					"z_param": "z",
-					"a_param": "a",
-					"m_param": "m",
-				},
-			},
-			requestedModelID: "model1",
-			wantParams: map[string]any{
-				"z_param": "z",
-				"a_param": "a",
-				"m_param": "m",
-			},
-			wantKeys: []string{"a_param", "m_param", "z_param"},
-		},
-		{
-			name: "alias style key lookup",
-			setParamsByID: map[string]map[string]any{
-				"model1:high": {"reasoning_effort": "high"},
-				"model1:low":  {"reasoning_effort": "low"},
-			},
-			requestedModelID: "model1:high",
-			wantParams: map[string]any{
-				"reasoning_effort": "high",
-			},
-			wantKeys: []string{"reasoning_effort"},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			f := Filters{SetParamsByID: tt.setParamsByID}
-			gotParams, gotKeys := f.SanitizedSetParamsByID(tt.requestedModelID)
-
-			if tt.wantParams == nil {
-				assert.Nil(t, gotParams)
-				assert.Nil(t, gotKeys)
-				return
-			}
-
-			assert.Equal(t, tt.wantKeys, gotKeys)
-			assert.Equal(t, tt.wantParams, gotParams)
-		})
-	}
-}
-
 func TestProtectedParams(t *testing.T) {
 	// Verify that "model" is protected
 	assert.Contains(t, ProtectedParams, "model")
@@ -104,62 +104,6 @@ models:
 	assert.Contains(t, err.Error(), "self-reference")
 }

-// Test macro substitution in name and description fields
-func TestConfig_MacroInNameAndDescription(t *testing.T) {
-	content := `
-startPort: 10000
-macros:
-  "VARIANT": "Q4_K_M"
-  "FAMILY": "llama"
-
-models:
-  my-model:
-    cmd: echo ok
-    proxy: http://localhost:8080
-    name: "${FAMILY} ${VARIANT}"
-    description: "A ${FAMILY} model in ${VARIANT} format"
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.NoError(t, err)
-	assert.Equal(t, "llama Q4_K_M", config.Models["my-model"].Name)
-	assert.Equal(t, "A llama model in Q4_K_M format", config.Models["my-model"].Description)
-}
-
-// Test MODEL_ID macro in name and description fields
-func TestConfig_ModelIDInNameAndDescription(t *testing.T) {
-	content := `
-startPort: 10000
-models:
-  llama-3b:
-    cmd: echo ok
-    proxy: http://localhost:8080
-    name: "Model: ${MODEL_ID}"
-    description: "Running ${MODEL_ID}"
-`
-
-	config, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.NoError(t, err)
-	assert.Equal(t, "Model: llama-3b", config.Models["llama-3b"].Name)
-	assert.Equal(t, "Running llama-3b", config.Models["llama-3b"].Description)
-}
-
-// Test unknown macro in name or description returns an error
-func TestConfig_UnknownMacroInNameDescription(t *testing.T) {
-	content := `
-startPort: 10000
-models:
-  test:
-    cmd: echo ok
-    proxy: http://localhost:8080
-    name: "Model ${UNDEFINED}"
-`
-
-	_, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.Error(t, err)
-	assert.Contains(t, err.Error(), "UNDEFINED")
-}
-
 // Test undefined macro reference error
 func TestConfig_UndefinedMacroReference(t *testing.T) {
 	content := `
@@ -5,19 +5,6 @@ import (
 	"runtime"
 )

-const (
-	MODEL_CONFIG_DEFAULT_TTL = -1
-)
-
-// TimeoutsConfig holds timeout settings for proxy connections
-type TimeoutsConfig struct {
-	Connect        int `yaml:"connect"`        // seconds, 0 = no timeout (not recommended)
-	ResponseHeader int `yaml:"responseHeader"` // seconds, 0 = no timeout (not recommended)
-	TLSHandshake   int `yaml:"tlsHandshake"`   // seconds, 0 = no timeout (not recommended)
-	ExpectContinue int `yaml:"expectContinue"` // seconds, 0 = no timeout (not recommended)
-	IdleConn       int `yaml:"idleConn"`       // seconds, 0 = no timeout (not recommended)
-}
-
 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
 	CmdStop       string   `yaml:"cmdStop"`
@@ -49,9 +36,6 @@ type ModelConfig struct {

 	// override global setting
 	SendLoadingState *bool `yaml:"sendLoadingState"`
-
-	// Timeout settings for proxy connections
-	Timeouts TimeoutsConfig `yaml:"timeouts"`
 }

 func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -63,19 +47,12 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
 		Aliases:          []string{},
 		Env:              []string{},
 		CheckEndpoint:    "/health",
-		UnloadAfter:      MODEL_CONFIG_DEFAULT_TTL, // use GlobalTTL
+		UnloadAfter:      0,
 		Unlisted:         false,
 		UseModelName:     "",
 		ConcurrencyLimit: 0,
 		Name:             "",
 		Description:      "",
-		Timeouts: TimeoutsConfig{
-			Connect:        30,
-			ResponseHeader: 60,
-			TLSHandshake:   10,
-			ExpectContinue: 1,
-			IdleConn:       90,
-		},
 	}

 	// the default cmdStop to taskkill /f /t /pid ${PID}
@@ -73,72 +73,6 @@ models:
 	}
 }

-func TestConfig_SetParamsByIDAutoAlias(t *testing.T) {
-	content := `
-models:
-  model1:
-    cmd: path/to/cmd --port ${PORT}
-    filters:
-      setParamsByID:
-        "${MODEL_ID}:high":
-          reasoning_effort: high
-        "${MODEL_ID}:low":
-          reasoning_effort: low
-`
-	cfg, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.NoError(t, err)
-
-	// Keys (other than the model's own ID) should be registered as aliases
-	realName, found := cfg.RealModelName("model1:high")
-	assert.True(t, found, "model1:high should be an auto-registered alias")
-	assert.Equal(t, "model1", realName)
-
-	realName, found = cfg.RealModelName("model1:low")
-	assert.True(t, found, "model1:low should be an auto-registered alias")
-	assert.Equal(t, "model1", realName)
-
-	// Auto-aliases should also appear in modelConfig.Aliases
-	aliases := cfg.Models["model1"].Aliases
-	assert.Contains(t, aliases, "model1:high")
-	assert.Contains(t, aliases, "model1:low")
-}
-
-func TestConfig_SetParamsByIDAutoAliasConflictWithModelID(t *testing.T) {
-	content := `
-models:
-  model1:
-    cmd: path/to/cmd --port ${PORT}
-    filters:
-      setParamsByID:
-        model2:
-          reasoning_effort: high
-  model2:
-    cmd: path/to/cmd --port ${PORT}
-`
-	_, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.ErrorContains(t, err, "conflicts with an existing model ID")
-}
-
-func TestConfig_SetParamsByIDAutoAliasConflictWithOtherModel(t *testing.T) {
-	content := `
-models:
-  model1:
-    cmd: path/to/cmd --port ${PORT}
-    filters:
-      setParamsByID:
-        "shared-alias":
-          reasoning_effort: high
-  model2:
-    cmd: path/to/cmd --port ${PORT}
-    filters:
-      setParamsByID:
-        "shared-alias":
-          reasoning_effort: low
-`
-	_, err := LoadConfigFromReader(strings.NewReader(content))
-	assert.ErrorContains(t, err, "duplicate alias")
-}
-
 func TestConfig_ModelFiltersWithSetParams(t *testing.T) {
 	content := `
 models:
@@ -12,9 +12,6 @@ type PeerConfig struct {
 	ApiKey   string   `yaml:"apiKey"`
 	Models   []string `yaml:"models"`
 	Filters  Filters  `yaml:"filters"`
-
-	// Timeout settings for proxy connections
-	Timeouts TimeoutsConfig `yaml:"timeouts"`
 }

 func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -24,13 +21,6 @@ func (c *PeerConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
 		ApiKey:  "",
 		Models:  []string{},
 		Filters: Filters{},
-		Timeouts: TimeoutsConfig{
-			Connect:        30,
-			ResponseHeader: 60,
-			TLSHandshake:   10,
-			ExpectContinue: 1,
-			IdleConn:       90,
-		},
 	}

 	if err := unmarshal(&defaults); err != nil {
@@ -8,7 +8,6 @@ const ConfigFileChangedEventID = 0x03
 const LogDataEventID = 0x04
 const TokenMetricsEventID = 0x05
 const ModelPreloadedEventID = 0x06
-const InFlightRequestsEventID = 0x07

 type ProcessStateChangeEvent struct {
 	ProcessName string
@@ -59,11 +58,3 @@ type ModelPreloadedEvent struct {
 func (e ModelPreloadedEvent) Type() uint32 {
 	return ModelPreloadedEventID
 }
-
-type InFlightRequestsEvent struct {
-	Total int
-}
-
-func (e InFlightRequestsEvent) Type() uint32 {
-	return InFlightRequestsEventID
-}
@@ -350,11 +350,6 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok
 			usage := parsed.Get("usage")
 			timings := parsed.Get("timings")

-			// v1/responses format nests usage under response.usage
-			if !usage.Exists() {
-				usage = parsed.Get("response.usage")
-			}
-
 			if usage.Exists() || timings.Exists() {
 				return parseMetrics(modelID, start, usage, timings)
 			}
@@ -365,8 +360,6 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok
 }

 func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) {
-	wallDurationMs := int(time.Since(start).Milliseconds())
-
 	// default values
 	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
@@ -375,7 +368,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
 	// timings data
 	tokensPerSecond := -1.0
 	promptPerSecond := -1.0
-	durationMs := wallDurationMs
+	durationMs := int(time.Since(start).Milliseconds())

 	if usage.Exists() {
 		if pt := usage.Get("prompt_tokens"); pt.Exists() {
@@ -404,10 +397,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
 		outputTokens = int(timings.Get("predicted_n").Int())
 		promptPerSecond = timings.Get("prompt_per_second").Float()
 		tokensPerSecond = timings.Get("predicted_per_second").Float()
-		timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
-		if timingsDurationMs > durationMs {
-			durationMs = timingsDurationMs
-		}
+		durationMs = int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())

 		if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
 			cachedTokens = int(cachedValue.Int())
@@ -513,9 +503,9 @@ func filterAcceptEncoding(acceptEncoding string) string {
 	supported := map[string]bool{"gzip": true, "deflate": true}
 	var filtered []string

-	for part := range strings.SplitSeq(acceptEncoding, ",") {
+	for _, part := range strings.Split(acceptEncoding, ",") {
 		// Parse encoding and optional quality value (e.g., "gzip;q=1.0")
-		encoding, _, _ := strings.Cut(strings.TrimSpace(part), ";")
+		encoding := strings.TrimSpace(strings.Split(part, ";")[0])
 		if supported[strings.ToLower(encoding)] {
 			filtered = append(filtered, strings.TrimSpace(part))
 		}
@@ -14,7 +14,6 @@ import (
 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/event"
 	"github.com/stretchr/testify/assert"
-	"github.com/tidwall/gjson"
 )

 func TestMetricsMonitor_AddMetrics(t *testing.T) {
@@ -571,27 +570,6 @@ func TestMetricsMonitor_Concurrent(t *testing.T) {
 }

 func TestMetricsMonitor_ParseMetrics(t *testing.T) {
-	t.Run("keeps wall clock duration when timings underreport request time", func(t *testing.T) {
-		start := time.Now().Add(-5 * time.Second)
-		usage := gjson.Parse(`{"prompt_tokens": 5, "completion_tokens": 1}`)
-		timings := gjson.Parse(`{
-			"prompt_n": 5,
-			"predicted_n": 1,
-			"prompt_per_second": 10.0,
-			"predicted_per_second": 2.0,
-			"prompt_ms": 5.0,
-			"predicted_ms": 15.0
-		}`)
-
-		metrics, err := parseMetrics("test-model", start, usage, timings)
-		assert.NoError(t, err)
-		assert.Equal(t, 5, metrics.InputTokens)
-		assert.Equal(t, 1, metrics.OutputTokens)
-		assert.Equal(t, 10.0, metrics.PromptPerSecond)
-		assert.Equal(t, 2.0, metrics.TokensPerSecond)
-		assert.GreaterOrEqual(t, metrics.DurationMs, 5000)
-	})
-
 	t.Run("prefers timings over usage data", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)

@@ -731,35 +709,6 @@ data: [DONE]
 		assert.Equal(t, 0, metrics[0].OutputTokens)
 	})

-	t.Run("v1/responses format with nested response.usage", func(t *testing.T) {
-		mm := newMetricsMonitor(testLogger, 10, 0)
-
-		// v1/responses SSE format: usage is nested under response.usage
-		responseBody := "event: response.completed\n" +
-			`data: {"type":"response.completed","response":{"id":"resp_abc","object":"response","created_at":1773416985,"status":"completed","model":"test-model","output":[],"usage":{"input_tokens":17,"output_tokens":23,"total_tokens":40}}}` +
-			"\n\n"
-
-		nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
-			w.Header().Set("Content-Type", "text/event-stream")
-			w.WriteHeader(http.StatusOK)
-			w.Write([]byte(responseBody))
-			return nil
-		}
-
-		req := httptest.NewRequest("POST", "/v1/responses", nil)
-		rec := httptest.NewRecorder()
-		ginCtx, _ := gin.CreateTestContext(rec)
-
-		err := mm.wrapHandler("test-model", ginCtx.Writer, req, nextHandler)
-		assert.NoError(t, err)
-
-		metrics := mm.getMetrics()
-		assert.Equal(t, 1, len(metrics))
-		assert.Equal(t, "test-model", metrics[0].Model)
-		assert.Equal(t, 17, metrics[0].InputTokens)
-		assert.Equal(t, 23, metrics[0].OutputTokens)
-	})
-
 	t.Run("handles empty streaming response records minimal metrics", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)

@@ -34,25 +34,23 @@ func NewPeerProxy(peers config.PeerDictionaryConfig, proxyLogger *LogMonitor) (*
 	}
 	sort.Strings(peerIDs)

+	// Create a shared transport with reasonable timeouts for peer connections
+	// these can be tuned with feedback later
+	peerTransport := &http.Transport{
+		DialContext: (&net.Dialer{
+			Timeout:   30 * time.Second, // Connection timeout
+			KeepAlive: 30 * time.Second,
+		}).DialContext,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ResponseHeaderTimeout: 60 * time.Second, // Time to wait for response headers
+		ExpectContinueTimeout: 1 * time.Second,
+		MaxIdleConns:          100,
+		MaxIdleConnsPerHost:   10,
+		IdleConnTimeout:       90 * time.Second,
+	}
+
 	for _, peerID := range peerIDs {
 		peer := peers[peerID]
-
-		// Create a transport with per-peer timeout configuration
-		peerTransport := &http.Transport{
-			Proxy: http.ProxyFromEnvironment,
-			DialContext: (&net.Dialer{
-				Timeout:   time.Duration(peer.Timeouts.Connect) * time.Second,
-				KeepAlive: 30 * time.Second,
-			}).DialContext,
-			TLSHandshakeTimeout:   time.Duration(peer.Timeouts.TLSHandshake) * time.Second,
-			ResponseHeaderTimeout: time.Duration(peer.Timeouts.ResponseHeader) * time.Second,
-			ExpectContinueTimeout: time.Duration(peer.Timeouts.ExpectContinue) * time.Second,
-			ForceAttemptHTTP2:     true,
-			MaxIdleConns:          100,
-			MaxIdleConnsPerHost:   10,
-			IdleConnTimeout:       time.Duration(peer.Timeouts.IdleConn) * time.Second,
-		}
-
 		// Create reverse proxy for this peer
 		reverseProxy := httputil.NewSingleHostReverseProxy(peer.ProxyURL)
 		reverseProxy.Transport = peerTransport
@@ -6,7 +6,6 @@ import (
 	"net/url"
 	"strings"
 	"testing"
-	"time"

 	"github.com/mostlygeek/llama-swap/proxy/config"
 	"github.com/stretchr/testify/assert"
@@ -267,45 +266,3 @@ func TestProxyRequest_SSEHeaderModification(t *testing.T) {
 	// The X-Accel-Buffering header should be set to "no" for SSE
 	assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering"))
 }
-
-func TestNewPeerProxy_CustomTimeouts(t *testing.T) {
-	proxyURL, _ := url.Parse("http://localhost:8080")
-
-	peers := config.PeerDictionaryConfig{
-		"test-peer": config.PeerConfig{
-			Proxy:    "http://localhost:8080",
-			ProxyURL: proxyURL,
-			Models:   []string{"model1"},
-			Timeouts: config.TimeoutsConfig{
-				Connect:        45,
-				ResponseHeader: 300,
-				TLSHandshake:   15,
-				ExpectContinue: 2,
-				IdleConn:       120,
-			},
-		},
-	}
-
-	peerProxy, err := NewPeerProxy(peers, testLogger)
-
-	assert.NoError(t, err)
-	assert.NotNil(t, peerProxy)
-	assert.True(t, peerProxy.HasPeerModel("model1"))
-
-	// Verify the timeout values are actually applied to the transport
-	member, found := peerProxy.proxyMap["model1"]
-	require.True(t, found, "model1 should exist in proxyMap")
-	assert.NotNil(t, member.reverseProxy)
-	assert.NotNil(t, member.reverseProxy.Transport)
-
-	transport, ok := member.reverseProxy.Transport.(*http.Transport)
-	require.True(t, ok, "Transport should be *http.Transport")
-
-	// Verify all timeout values are correctly applied
-	assert.Equal(t, 300*time.Second, transport.ResponseHeaderTimeout)
-	assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
-	assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
-	assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
-	// ForceAttemptHTTP2 should be enabled
-	assert.True(t, transport.ForceAttemptHTTP2)
-}
@@ -96,24 +96,6 @@ func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, pr
 	var reverseProxy *httputil.ReverseProxy
 	if proxyURL != nil {
 		reverseProxy = httputil.NewSingleHostReverseProxy(proxyURL)
-
-		// Create custom transport with configured timeouts
-		transport := &http.Transport{
-			Proxy: http.ProxyFromEnvironment,
-			DialContext: (&net.Dialer{
-				Timeout:   time.Duration(config.Timeouts.Connect) * time.Second,
-				KeepAlive: 30 * time.Second,
-			}).DialContext,
-			TLSHandshakeTimeout:   time.Duration(config.Timeouts.TLSHandshake) * time.Second,
-			ResponseHeaderTimeout: time.Duration(config.Timeouts.ResponseHeader) * time.Second,
-			ExpectContinueTimeout: time.Duration(config.Timeouts.ExpectContinue) * time.Second,
-			ForceAttemptHTTP2:     true,
-			MaxIdleConns:          100,
-			MaxIdleConnsPerHost:   10,
-			IdleConnTimeout:       time.Duration(config.Timeouts.IdleConn) * time.Second,
-		}
-		reverseProxy.Transport = transport
-
 		reverseProxy.ModifyResponse = func(resp *http.Response) error {
 			// prevent nginx from buffering streaming responses (e.g., SSE)
 			if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
@@ -2,7 +2,6 @@ package proxy

 import (
 	"fmt"
-	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -118,12 +117,12 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	}

 	expectedMessage := "I_sense_imminent_danger"
-	conf := getTestSimpleResponderConfig(expectedMessage)
-	assert.Equal(t, config.MODEL_CONFIG_DEFAULT_TTL, conf.UnloadAfter)
-	conf.UnloadAfter = 3 // seconds
-	assert.Equal(t, 3, conf.UnloadAfter)
+	config := getTestSimpleResponderConfig(expectedMessage)
+	assert.Equal(t, 0, config.UnloadAfter)
+	config.UnloadAfter = 3 // seconds
+	assert.Equal(t, 3, config.UnloadAfter)

-	process := NewProcess("ttl_test", 2, conf, debugLogger, debugLogger)
+	process := NewProcess("ttl_test", 2, config, debugLogger, debugLogger)
 	defer process.Stop()

 	// this should take 4 seconds
@@ -160,12 +159,12 @@ func TestProcess_LowTTLValue(t *testing.T) {
 		t.Skip("skipping test, edit process_test.go to run it ")
 	}

-	conf := getTestSimpleResponderConfig("fast_ttl")
-	assert.Equal(t, config.MODEL_CONFIG_DEFAULT_TTL, conf.UnloadAfter)
-	conf.UnloadAfter = 1 // second
-	assert.Equal(t, 1, conf.UnloadAfter)
+	config := getTestSimpleResponderConfig("fast_ttl")
+	assert.Equal(t, 0, config.UnloadAfter)
+	config.UnloadAfter = 1 // second
+	assert.Equal(t, 1, config.UnloadAfter)

-	process := NewProcess("ttl", 2, conf, debugLogger, debugLogger)
+	process := NewProcess("ttl", 2, config, debugLogger, debugLogger)
 	defer process.Stop()

 	for i := 0; i < 100; i++ {
@@ -570,39 +569,3 @@ func (w *panicOnWriteResponseWriter) Write(b []byte) (int, error) {
 	}
 	return w.ResponseRecorder.Write(b)
 }
-
-func TestProcess_CustomTimeouts(t *testing.T) {
-	modelConfig := config.ModelConfig{
-		Cmd:           "echo test",
-		Proxy:         "http://localhost:8080",
-		CheckEndpoint: "/health",
-		Timeouts: config.TimeoutsConfig{
-			Connect:        45,
-			ResponseHeader: 120,
-			TLSHandshake:   15,
-			ExpectContinue: 2,
-			IdleConn:       120,
-		},
-	}
-
-	debugLogger := NewLogMonitorWriter(io.Discard)
-	process := NewProcess("test-model", 30, modelConfig, debugLogger, debugLogger)
-
-	// Verify the process was created successfully
-	assert.NotNil(t, process)
-	assert.Equal(t, "test-model", process.ID)
-	assert.NotNil(t, process.reverseProxy)
-	assert.NotNil(t, process.reverseProxy.Transport)
-
-	// Verify it's using http.Transport (not some other type)
-	transport, ok := process.reverseProxy.Transport.(*http.Transport)
-	assert.True(t, ok, "Transport should be *http.Transport")
-	assert.NotNil(t, transport)
-
-	// Verify the timeouts are correctly applied
-	assert.Equal(t, 120*time.Second, transport.ResponseHeaderTimeout)
-	assert.Equal(t, 15*time.Second, transport.TLSHandshakeTimeout)
-	assert.Equal(t, 2*time.Second, transport.ExpectContinueTimeout)
-	assert.Equal(t, 120*time.Second, transport.IdleConnTimeout)
-	assert.True(t, transport.ForceAttemptHTTP2)
-}
@@ -28,40 +28,6 @@ const (

 type proxyCtxKey string

-type InflightCounter struct {
-	mu    sync.Mutex
-	total int
-}
-
-func newInflightCounter() *InflightCounter {
-	return &InflightCounter{}
-}
-
-func (ic *InflightCounter) Current() int {
-	ic.mu.Lock()
-	total := ic.total
-	ic.mu.Unlock()
-	return total
-}
-
-func (ic *InflightCounter) Increment() int {
-	ic.mu.Lock()
-	ic.total++
-	total := ic.total
-	ic.mu.Unlock()
-	return total
-}
-
-func (ic *InflightCounter) Decrement() int {
-	ic.mu.Lock()
-	if ic.total > 0 {
-		ic.total--
-	}
-	total := ic.total
-	ic.mu.Unlock()
-	return total
-}
-
 type ProxyManager struct {
 	sync.Mutex

@@ -77,8 +43,6 @@ type ProxyManager struct {

 	processGroups map[string]*ProcessGroup

-	inFlightCounter *InflightCounter
-
 	// shutdown signaling
 	shutdownCtx    context.Context
 	shutdownCancel context.CancelFunc
@@ -191,8 +155,6 @@ func New(proxyConfig config.Config) *ProxyManager {

 		processGroups: make(map[string]*ProcessGroup),

-		inFlightCounter: newInflightCounter(),
-
 		shutdownCtx:    shutdownCtx,
 		shutdownCancel: shutdownCancel,

@@ -314,42 +276,37 @@ func (pm *ProxyManager) setupGinEngine() {

 	// Set up routes using the Gin engine
 	// Protected routes use pm.apiKeyAuth() middleware
-	pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.proxyInferenceHandler)
 	// Support legacy /v1/completions api, see issue #12
-	pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler)
 	// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
-	pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.proxyInferenceHandler)
 	// Support anthropic count_tokens API (Also added in the above PR)
-	pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.proxyInferenceHandler)

 	// Support embeddings and reranking
-	pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.proxyInferenceHandler)

 	// llama-server's /reranking endpoint + aliases
-	pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler)

 	// llama-server's /infill endpoint for code infilling
-	pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.proxyInferenceHandler)

 	// llama-server's /completion endpoint
-	pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.proxyInferenceHandler)

 	// Support audio/speech endpoint
-	pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyGETModelHandler)
-	pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
-	pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
-
-	// sd.cpp /sdapi/v1 endpoints
-	pm.ginEngine.POST("/sdapi/v1/txt2img", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/sdapi/v1/img2img", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.GET("/sdapi/v1/loras", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyGETModelHandler)
+	pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyGETModelHandler)
+	pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler)
+	pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler)

 	pm.ginEngine.GET("/v1/models", pm.apiKeyAuth(), pm.listModelsHandler)

@@ -368,7 +325,7 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.GET("/upstream", func(c *gin.Context) {
 		c.Redirect(http.StatusFound, "/ui/models")
 	})
-	pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyToUpstream)
+	pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.proxyToUpstream)
 	pm.ginEngine.GET("/unload", pm.apiKeyAuth(), pm.unloadAllModelsHandler)
 	pm.ginEngine.GET("/running", pm.apiKeyAuth(), pm.listRunningProcessesHandler)
 	pm.ginEngine.GET("/health", func(c *gin.Context) {
@@ -432,14 +389,6 @@ func (pm *ProxyManager) setupGinEngine() {
 	gin.DisableConsoleColor()
 }

-func (pm *ProxyManager) trackInflight() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Increment()})
-		defer event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Decrement()})
-		c.Next()
-	}
-}
-
 // ServeHTTP implements http.Handler interface
 func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	pm.ginEngine.ServeHTTP(w, r)
@@ -725,17 +674,6 @@ func (pm *ProxyManager) proxyInferenceHandler(c *gin.Context) {
 			}
 		}

-		// setParamsByID: set params based on the requested model ID (runs after setParams, can override it)
-		setParamsByIDParams, setParamsByIDKeys := pm.config.Models[modelID].Filters.SanitizedSetParamsByID(requestedModel)
-		for _, key := range setParamsByIDKeys {
-			pm.proxyLogger.Debugf("<%s> setting param by id: %s", requestedModel, key)
-			bodyBytes, err = sjson.SetBytes(bodyBytes, key, setParamsByIDParams[key])
-			if err != nil {
-				pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error setting parameter %s in request", key))
-				return
-			}
-		}
-
 		pm.proxyLogger.Debugf("ProxyManager using local Process for model: %s", requestedModel)
 		nextHandler = processGroup.ProxyRequest
 	} else if pm.peerProxy != nil && pm.peerProxy.HasPeerModel(requestedModel) {
@@ -14,13 +14,12 @@ import (
 )

 type Model struct {
-	Id          string   `json:"id"`
-	Name        string   `json:"name"`
-	Description string   `json:"description"`
-	State       string   `json:"state"`
-	Unlisted    bool     `json:"unlisted"`
-	PeerID      string   `json:"peerID"`
-	Aliases     []string `json:"aliases,omitempty"`
+	Id          string `json:"id"`
+	Name        string `json:"name"`
+	Description string `json:"description"`
+	State       string `json:"state"`
+	Unlisted    bool   `json:"unlisted"`
+	PeerID      string `json:"peerID"`
 }

 func addApiHandlers(pm *ProxyManager) {
@@ -84,7 +83,6 @@ func (pm *ProxyManager) getModelStatus() []Model {
 			Description: pm.config.Models[modelID].Description,
 			State:       state,
 			Unlisted:    pm.config.Models[modelID].Unlisted,
-			Aliases:     pm.config.Models[modelID].Aliases,
 		})
 	}

@@ -109,7 +107,6 @@ const (
 	msgTypeModelStatus messageType = "modelStatus"
 	msgTypeLogData     messageType = "logData"
 	msgTypeMetrics     messageType = "metrics"
-	msgTypeInFlight    messageType = "inflight"
 )

 type messageEnvelope struct {
@@ -169,18 +166,6 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
 		}
 	}

-	sendInFlight := func(total int) {
-		jsonData, err := json.Marshal(gin.H{"total": total})
-		if err == nil {
-			select {
-			case sendBuffer <- messageEnvelope{Type: msgTypeInFlight, Data: string(jsonData)}:
-			case <-ctx.Done():
-				return
-			default:
-			}
-		}
-	}
-
 	/**
 	 * Send updated models list
 	 */
@@ -208,19 +193,11 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
 		sendMetrics([]TokenMetrics{e.Metrics})
 	})()

-	/**
-	 * Send in-flight request stats related to token stats "Waiting: N" count.
-	 */
-	defer event.On(func(e InFlightRequestsEvent) {
-		sendInFlight(e.Total)
-	})()
-
 	// send initial batch of data
 	sendLogData("proxy", pm.proxyLogger.GetHistory())
 	sendLogData("upstream", pm.upstreamLogger.GetHistory())
 	sendModels()
 	sendMetrics(pm.metricsMonitor.getMetrics())
-	sendInFlight(pm.inFlightCounter.Current())

 	for {
 		select {
@@ -730,7 +730,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
 		// Verify extended fields are present
 		assert.NotEmpty(t, response.Running[0].Cmd, "cmd should be populated")
 		assert.NotEmpty(t, response.Running[0].Proxy, "proxy should be populated")
-		assert.Equal(t, -1, response.Running[0].TTL, "ttl should default to -1 (use globalTTL)")
+		assert.Equal(t, 0, response.Running[0].TTL, "ttl should default to 0")
 	})
 }

@@ -1046,61 +1046,6 @@ func TestProxyManager_FiltersStripParams(t *testing.T) {
 	// t.Logf("%v", response)
 }

-func TestProxyManager_FiltersSetParamsByID(t *testing.T) {
-	// no explicit aliases — setParamsByID keys are auto-registered as aliases
-	configStr := strings.Replace(`
-logLevel: error
-models:
-  model1:
-    cmd: 'SRPATH --port ${PORT} --silent --respond model1'
-    proxy: "http://127.0.0.1:${PORT}"
-    filters:
-      setParams:
-        reasoning_effort: medium
-      setParamsByID:
-        "${MODEL_ID}:high":
-          reasoning_effort: high
-        "${MODEL_ID}:low":
-          reasoning_effort: low
-`, "SRPATH", simpleResponderPath, -1)
-
-	cfg, err := config.LoadConfigFromReader(strings.NewReader(configStr))
-	if !assert.NoError(t, err, "invalid test configuration") {
-		return
-	}
-
-	proxy := New(cfg)
-	defer proxy.StopProcesses(StopWaitForInflightRequest)
-
-	tests := []struct {
-		requestedModel string
-		wantEffort     string
-	}{
-		// setParams applies, no setParamsByID match
-		{requestedModel: "model1", wantEffort: "medium"},
-		// setParamsByID overrides setParams
-		{requestedModel: "model1:high", wantEffort: "high"},
-		{requestedModel: "model1:low", wantEffort: "low"},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.requestedModel, func(t *testing.T) {
-			reqBody := fmt.Sprintf(`{"model":%q}`, tt.requestedModel)
-			req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
-			w := CreateTestResponseRecorder()
-			proxy.ServeHTTP(w, req)
-			assert.Equal(t, http.StatusOK, w.Code)
-
-			var response map[string]interface{}
-			assert.NoError(t, json.Unmarshal(w.Body.Bytes(), &response))
-
-			requestBody, _ := response["request_body"].(string)
-			gotEffort := gjson.Get(requestBody, "reasoning_effort").String()
-			assert.Equal(t, tt.wantEffort, gotEffort, "reasoning_effort mismatch for model %s", tt.requestedModel)
-		})
-	}
-}
-
 func TestProxyManager_HealthEndpoint(t *testing.T) {
 	config := config.AddDefaultGroupToConfig(config.Config{
 		HealthCheckTimeout: 15,
@@ -1659,82 +1604,3 @@ models:
 		assert.Equal(t, "no", w.Header().Get("X-Accel-Buffering"))
 	})
 }
-
-func TestProxyManager_SdApiTxt2ImgRouting(t *testing.T) {
-	conf := config.AddDefaultGroupToConfig(config.Config{
-		HealthCheckTimeout: 15,
-		Models: map[string]config.ModelConfig{
-			"sd-model": getTestSimpleResponderConfig("sd-model"),
-		},
-		LogLevel: "error",
-	})
-
-	proxy := New(conf)
-	defer proxy.StopProcesses(StopWaitForInflightRequest)
-
-	t.Run("successful txt2img with model", func(t *testing.T) {
-		reqBody := `{"model":"sd-model","prompt":"a cat"}`
-		req := httptest.NewRequest("POST", "/sdapi/v1/txt2img", bytes.NewBufferString(reqBody))
-		w := CreateTestResponseRecorder()
-
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusOK, w.Code)
-		assert.Contains(t, w.Body.String(), "sd-model")
-	})
-
-	t.Run("successful img2img with model", func(t *testing.T) {
-		reqBody := `{"model":"sd-model","prompt":"a cat","init_images":[]}`
-		req := httptest.NewRequest("POST", "/sdapi/v1/img2img", bytes.NewBufferString(reqBody))
-		w := CreateTestResponseRecorder()
-
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusOK, w.Code)
-		assert.Contains(t, w.Body.String(), "sd-model")
-	})
-
-	t.Run("missing model returns 400", func(t *testing.T) {
-		reqBody := `{"prompt":"a cat"}`
-		req := httptest.NewRequest("POST", "/sdapi/v1/txt2img", bytes.NewBufferString(reqBody))
-		w := CreateTestResponseRecorder()
-
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusBadRequest, w.Code)
-		assert.Contains(t, w.Body.String(), "missing or invalid 'model' key")
-	})
-}
-
-func TestProxyManager_SdApiGetLoras(t *testing.T) {
-	conf := config.AddDefaultGroupToConfig(config.Config{
-		HealthCheckTimeout: 15,
-		Models: map[string]config.ModelConfig{
-			"sd-model": getTestSimpleResponderConfig("sd-model"),
-		},
-		LogLevel: "error",
-	})
-
-	proxy := New(conf)
-	defer proxy.StopProcesses(StopWaitForInflightRequest)
-
-	t.Run("successful GET loras with model query param", func(t *testing.T) {
-		req := httptest.NewRequest("GET", "/sdapi/v1/loras?model=sd-model", nil)
-		w := CreateTestResponseRecorder()
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusOK, w.Code)
-	})
-
-	t.Run("missing model query param returns 400", func(t *testing.T) {
-		req := httptest.NewRequest("GET", "/sdapi/v1/loras", nil)
-		w := CreateTestResponseRecorder()
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusBadRequest, w.Code)
-		assert.Contains(t, w.Body.String(), "missing required 'model' query parameter")
-	})
-
-	t.Run("unknown model returns 400", func(t *testing.T) {
-		req := httptest.NewRequest("GET", "/sdapi/v1/loras?model=nonexistent", nil)
-		w := CreateTestResponseRecorder()
-		proxy.ServeHTTP(w, req)
-		assert.Equal(t, http.StatusBadRequest, w.Code)
-		assert.Contains(t, w.Body.String(), "could not find suitable handler")
-	})
-}
@@ -1 +0,0 @@
-legacy-peer-deps=true
@@ -12,18 +12,18 @@
    "test:watch": "vitest"
  },
  "devDependencies": {
-    "@sveltejs/vite-plugin-svelte": "^7.0.0",
+    "@sveltejs/vite-plugin-svelte": "^5.0.3",
    "@tailwindcss/vite": "^4.1.8",
    "@tsconfig/svelte": "^5.0.4",
    "@types/hast": "^3.0.4",
    "@types/node": "^25.1.0",
-    "svelte": "^5.46.4",
+    "svelte": "^5.19.0",
    "svelte-check": "^4.1.4",
    "tailwindcss": "^4.1.8",
    "typescript": "~5.8.3",
-    "vite": "^8.0.0",
-    "vite-plugin-compression2": "^2.5.1",
-    "vitest": "^4.1.0"
+    "vite": "^6.3.5",
+    "vite-plugin-compression2": "^2.4.0",
+    "vitest": "^4.0.18"
  },
  "dependencies": {
    "highlight.js": "^11.11.1",
@@ -6,28 +6,23 @@
  import Models from "./routes/Models.svelte";
  import Activity from "./routes/Activity.svelte";
  import Playground from "./routes/Playground.svelte";
-  import PlaygroundStub from "./routes/PlaygroundStub.svelte";
  import { enableAPIEvents } from "./stores/api";
  import { initScreenWidth, isDarkMode, appTitle, connectionState } from "./stores/theme";
-  import { currentRoute } from "./stores/route";

  const routes = {
-    "/": PlaygroundStub,
+    "/": Playground,
    "/models": Models,
    "/logs": LogViewer,
    "/activity": Activity,
-    "*": PlaygroundStub,
+    "*": Playground,
  };

-  function handleRouteLoaded(event: { detail: { route: string | RegExp } }) {
-    const route = event.detail.route;
-    currentRoute.set(typeof route === "string" ? route : "/");
-  }
-
+  // Sync theme to document attribute
  $effect(() => {
    document.documentElement.setAttribute("data-theme", $isDarkMode ? "dark" : "light");
  });

+  // Sync title to document
  $effect(() => {
    const icon = $connectionState === "connecting" ? "\u{1F7E1}" : $connectionState === "connected" ? "\u{1F7E2}" : "\u{1F534}";
    document.title = `${icon} ${$appTitle}`;
@@ -48,11 +43,6 @@
  <Header />

  <main class="flex-1 overflow-auto p-4">
-    <div class="h-full" class:hidden={$currentRoute !== "/"}>
-      <Playground />
-    </div>
-    <div class="h-full" class:hidden={$currentRoute === "/"}>
-      <Router {routes} on:routeLoaded={handleRouteLoaded} />
-    </div>
+    <Router {routes} />
  </main>
 </div>
@@ -1,8 +1,6 @@
 <script lang="ts">
-  import { link } from "svelte-spa-router";
+  import { link, location } from "svelte-spa-router";
  import { screenWidth, toggleTheme, isDarkMode, appTitle, isNarrow } from "../stores/theme";
-  import { currentRoute } from "../stores/route";
-  import { playgroundActivity } from "../stores/playgroundActivity";
  import ConnectionStatus from "./ConnectionStatus.svelte";

  function handleTitleChange(newTitle: string): void {
@@ -24,10 +22,9 @@
    handleTitleChange(target.textContent || "(set title)");
  }

-  function isActive(path: string, current: string): boolean {
-    return path === "/" ? current === "/" : current.startsWith(path);
+  function isActive(path: string, currentLocation: string): boolean {
+    return path === "/" ? currentLocation === "/" : currentLocation.startsWith(path);
  }
-
 </script>

 <header
@@ -50,7 +47,8 @@
    <a
      href="/"
      use:link
-      class="p-1 whitespace-nowrap {isActive('/', $currentRoute) ? 'font-semibold' : ''} {$playgroundActivity ? 'activity-link' : 'text-gray-600 hover:text-black dark:text-gray-300 dark:hover:text-gray-100'}"
+      class="text-gray-600 hover:text-black dark:text-gray-300 dark:hover:text-gray-100 p-1 whitespace-nowrap"
+      class:font-semibold={isActive("/", $location)}
    >
      Playground
    </a>
@@ -58,7 +56,7 @@
      href="/models"
      use:link
      class="text-gray-600 hover:text-black dark:text-gray-300 dark:hover:text-gray-100 p-1 whitespace-nowrap"
-      class:font-semibold={isActive("/models", $currentRoute)}
+      class:font-semibold={isActive("/models", $location)}
    >
      Models
    </a>
@@ -66,7 +64,7 @@
      href="/activity"
      use:link
      class="text-gray-600 hover:text-black dark:text-gray-300 dark:hover:text-gray-100 p-1 whitespace-nowrap"
-      class:font-semibold={isActive("/activity", $currentRoute)}
+      class:font-semibold={isActive("/activity", $location)}
    >
      Activity
    </a>
@@ -74,7 +72,7 @@
      href="/logs"
      use:link
      class="text-gray-600 hover:text-black dark:text-gray-300 dark:hover:text-gray-100 p-1 whitespace-nowrap"
-      class:font-semibold={isActive("/logs", $currentRoute)}
+      class:font-semibold={isActive("/logs", $location)}
    >
      Logs
    </a>
@@ -98,23 +96,3 @@
    <ConnectionStatus />
  </menu>
 </header>
-
-<style>
-  .activity-link {
-    background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7, #8b5cf6, #6366f1);
-    background-size: 200% 100%;
-    -webkit-background-clip: text;
-    background-clip: text;
-    -webkit-text-fill-color: transparent;
-    animation: gradient-shift 2s linear infinite;
-  }
-
-  @keyframes gradient-shift {
-    0% {
-      background-position: 0% 50%;
-    }
-    100% {
-      background-position: 200% 50%;
-    }
-  }
-</style>
@@ -65,17 +65,10 @@
  });

  let preElement: HTMLPreElement;
-  let userScrolledUp = $state(false);

-  function handleScroll() {
-    if (!preElement) return;
-    const { scrollTop, scrollHeight, clientHeight } = preElement;
-    userScrolledUp = scrollHeight - scrollTop - clientHeight > 40;
-  }
-
-  // Auto scroll to bottom when logs change, unless user has scrolled up
+  // Auto scroll to bottom when logs change
  $effect(() => {
-    if (preElement && filteredLogs && !userScrolledUp) {
+    if (preElement && filteredLogs) {
      preElement.scrollTop = preElement.scrollHeight;
    }
  });
@@ -89,7 +82,7 @@
      <div class="flex gap-2 items-center">
        <button class="btn border-0" onclick={toggleFontSize} title="Change font size">
          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor" class="w-4 h-4">
-            <path d="M2 4v3h5v12h3V7h5V4H2zm19 5h-9v3h3v7h3v-7h3V9z"/>
+            <path fill-rule="evenodd" d="M10.5 3.75a6 6 0 0 0-5.98 6.496A5.25 5.25 0 0 0 6.75 20.25H18a4.5 4.5 0 0 0 2.206-8.423 3.75 3.75 0 0 0-4.133-4.303A6.001 6.001 0 0 0 10.5 3.75Zm2.25 6a.75.75 0 0 0-1.5 0v4.94l-1.72-1.72a.75.75 0 0 0-1.06 1.06l3 3a.75.75 0 0 0 1.06 0l3-3a.75.75 0 1 0-1.06-1.06l-1.72 1.72V9.75Z" clip-rule="evenodd" />
          </svg>
        </button>
        <button class="btn border-0" onclick={toggleWrapText} title="Toggle text wrap">
@@ -134,6 +127,6 @@
    {/if}
  </div>
  <div class="rounded-lg bg-background font-mono text-sm flex-1 overflow-hidden">
-    <pre bind:this={preElement} onscroll={handleScroll} class="{textWrapClass} {fontSizeClass} h-full overflow-auto p-4">{filteredLogs}</pre>
+    <pre bind:this={preElement} class="{textWrapClass} {fontSizeClass} h-full overflow-auto p-4">{filteredLogs}</pre>
  </div>
 </div>
@@ -165,9 +165,6 @@
              {#if model.description}
                <p class={model.unlisted ? "text-opacity-70" : ""}><em>{model.description}</em></p>
              {/if}
-              {#if model.aliases && model.aliases.length > 0}
-                <p class="text-xs text-txtsecondary">Aliases: {model.aliases.join(", ")}</p>
-              {/if}
            </td>
            <td class="w-12">
              {#if model.state === "stopped"}
@@ -1,5 +1,5 @@
 <script lang="ts">
-  import { inFlightRequests, metrics } from "../stores/api";
+  import { metrics } from "../stores/api";
  import TokenHistogram from "./TokenHistogram.svelte";

  interface HistogramData {
@@ -15,14 +15,7 @@
  let stats = $derived.by(() => {
    const totalRequests = $metrics.length;
    if (totalRequests === 0) {
-      return {
-        totalRequests: 0,
-        totalInputTokens: 0,
-        totalOutputTokens: 0,
-        inFlightRequests: $inFlightRequests,
-        tokenStats: { p99: "0", p95: "0", p50: "0" },
-        histogramData: null,
-      };
+      return { totalRequests: 0, totalInputTokens: 0, totalOutputTokens: 0, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null };
    }

    const totalInputTokens = $metrics.reduce((sum, m) => sum + m.input_tokens, 0);
@@ -31,14 +24,7 @@
    // Calculate token statistics using output_tokens and duration_ms
    const validMetrics = $metrics.filter((m) => m.duration_ms > 0 && m.output_tokens > 0);
    if (validMetrics.length === 0) {
-      return {
-        totalRequests,
-        totalInputTokens,
-        totalOutputTokens,
-        inFlightRequests: $inFlightRequests,
-        tokenStats: { p99: "0", p95: "0", p50: "0" },
-        histogramData: null,
-      };
+      return { totalRequests, totalInputTokens, totalOutputTokens, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null };
    }

    // Calculate tokens/second for each valid metric
@@ -77,7 +63,6 @@
      totalRequests,
      totalInputTokens,
      totalOutputTokens,
-      inFlightRequests: $inFlightRequests,
      tokenStats: {
        p99: p99.toFixed(2),
        p95: p95.toFixed(2),
@@ -110,12 +95,7 @@

      <tbody class="bg-surface divide-y divide-card-border-inner">
        <tr class="hover:bg-secondary">
-          <td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">
-            <div class="flex flex-col gap-1">
-              <span class="text-xs font-medium text-gray-500 dark:text-gray-400">Completed: {nf.format(stats.totalRequests)}</span>
-              <span class="text-xs font-medium text-gray-500 dark:text-gray-400">Waiting: {nf.format(stats.inFlightRequests)}</span>
-            </div>
-          </td>
+          <td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">{stats.totalRequests}</td>

          <td class="px-4 py-4 text-sm text-gray-700 dark:text-gray-300 border-l border-gray-200 dark:border-white/10">
            <div class="flex items-center gap-2">
@@ -2,7 +2,6 @@
  import { models } from "../../stores/api";
  import { persistentStore } from "../../stores/persistent";
  import { transcribeAudio } from "../../lib/audioApi";
-  import { playgroundStores } from "../../stores/playgroundActivity";
  import ModelSelector from "./ModelSelector.svelte";

  const selectedModelStore = persistentStore<string>("playground-audio-model", "");
@@ -16,22 +15,18 @@
  let fileInput = $state<HTMLInputElement | null>(null);
  let copied = $state(false);

-  const ACCEPTED_FORMATS = ['.mp3', '.wav', '.ogg'];
+  const ACCEPTED_FORMATS = ['.mp3', '.wav'];
  const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB

  let hasModels = $derived($models.some((m) => !m.unlisted));

  let canTranscribe = $derived(selectedFile !== null && $selectedModelStore !== "" && !isTranscribing);

-  $effect(() => {
-    playgroundStores.audioTranscribing.set(isTranscribing);
-  });
-
  function validateFile(file: File): { valid: boolean; error?: string } {
    const ext = '.' + file.name.split('.').pop()?.toLowerCase();

    if (!ACCEPTED_FORMATS.includes(ext)) {
-      return { valid: false, error: 'Invalid file type. Accepted: MP3, WAV, OGG' };
+      return { valid: false, error: 'Invalid file type. Accepted: MP3, WAV' };
    }

    if (file.size > MAX_FILE_SIZE) {
@@ -208,7 +203,7 @@
          <div>
            <p class="mb-2">Drag and drop an audio file here</p>
            <p class="text-sm">or use the Browse button below</p>
-            <p class="text-xs mt-4">Accepted formats: MP3, WAV, OGG (max 25MB)</p>
+            <p class="text-xs mt-4">Accepted formats: MP3, WAV (max 25MB)</p>
          </div>
        </div>
      {/if}
@@ -218,7 +213,7 @@
    <div class="shrink-0 flex gap-2">
      <input
        type="file"
-        accept=".mp3,.wav,.ogg"
+        accept=".mp3,.wav"
        class="hidden"
        onchange={handleFileSelect}
        bind:this={fileInput}
@@ -2,7 +2,6 @@
  import { models } from "../../stores/api";
  import { persistentStore } from "../../stores/persistent";
  import { streamChatCompletion } from "../../lib/chatApi";
-  import { playgroundStores } from "../../stores/playgroundActivity";
  import type { ChatMessage, ContentPart } from "../../lib/types";
  import ChatMessageComponent from "./ChatMessage.svelte";
  import ModelSelector from "./ModelSelector.svelte";
@@ -12,16 +11,7 @@
  const systemPromptStore = persistentStore<string>("playground-system-prompt", "");
  const temperatureStore = persistentStore<number>("playground-temperature", 0.7);

-  function loadMessages(): ChatMessage[] {
-    try {
-      const saved = localStorage.getItem("playground-messages");
-      return saved ? JSON.parse(saved) : [];
-    } catch {
-      return [];
-    }
-  }
-
-  let messages = $state<ChatMessage[]>(loadMessages());
+  let messages = $state<ChatMessage[]>([]);
  let userInput = $state("");
  let isStreaming = $state(false);
  let isReasoning = $state(false);
@@ -34,52 +24,21 @@
  let imageError = $state<string | null>(null);

  let hasModels = $derived($models.some((m) => !m.unlisted));
-  let userScrolledUp = $state(false);

+  // Auto-scroll when messages change
  $effect(() => {
-    playgroundStores.chatStreaming.set(isStreaming);
-  });
-
-  function handleMessagesScroll() {
-    if (!messagesContainer) return;
-    const { scrollTop, scrollHeight, clientHeight } = messagesContainer;
-    // Consider "at bottom" if within 40px of the bottom
-    userScrolledUp = scrollHeight - scrollTop - clientHeight > 40;
-  }
-
-  // Auto-scroll when messages change — skip if user scrolled up
-  $effect(() => {
-    if (messages.length > 0 && messagesContainer && !userScrolledUp) {
+    if (messages.length > 0 && messagesContainer) {
      messagesContainer.scrollTo({
        top: messagesContainer.scrollHeight,
-        behavior: isStreaming ? "instant" : "smooth",
+        behavior: "smooth",
      });
    }
  });

-  // Persist messages to localStorage (throttled to once per 2s)
-  let lastSaveTime = 0;
-  $effect(() => {
-    const json = JSON.stringify(messages);
-    const elapsed = Date.now() - lastSaveTime;
-    const save = () => {
-      try { localStorage.setItem("playground-messages", json); } catch {}
-      lastSaveTime = Date.now();
-    };
-    if (elapsed >= 2000) {
-      save();
-      return;
-    }
-    const timer = setTimeout(save, 2000 - elapsed);
-    return () => clearTimeout(timer);
-  });
-
  async function sendMessage() {
    const trimmedInput = userInput.trim();
    if ((!trimmedInput && attachedImages.length === 0) || !$selectedModelStore || isStreaming) return;

-    userScrolledUp = false;
-
    // Build message content (multimodal if images attached)
    let content: string | ContentPart[];
    if (attachedImages.length > 0) {
@@ -362,7 +321,6 @@
    <div
      class="flex-1 overflow-y-auto mb-4 px-2"
      bind:this={messagesContainer}
-      onscroll={handleMessagesScroll}
    >
      {#if messages.length === 0}
        <div class="h-full flex items-center justify-center text-txtsecondary">
@@ -1,6 +1,5 @@
 <script lang="ts">
-  import { renderMarkdown, escapeHtml, renderStreamingMarkdown, createStreamingCache } from "../../lib/markdown";
-  import type { RenderedBlock } from "../../lib/markdown";
+  import { renderMarkdown, escapeHtml } from "../../lib/markdown";
  import { Copy, Check, Pencil, X, Save, RefreshCw, ChevronDown, ChevronRight, Brain, Code } from "lucide-svelte";
  import { getTextContent, getImageUrls } from "../../lib/types";
  import type { ContentPart } from "../../lib/types";
@@ -23,17 +22,11 @@
  let hasImages = $derived(imageUrls.length > 0);
  let canEdit = $derived(onEdit !== undefined && !hasImages);

-  let streamingCache = createStreamingCache();
-  let renderedParts = $derived.by(() => {
-    if (role !== "assistant") {
-      return { blocks: [{ id: -1, html: escapeHtml(textContent).replace(/\n/g, '<br>') }] as RenderedBlock[], pendingHtml: "" };
-    }
-    if (!isStreaming) {
-      streamingCache = createStreamingCache();
-      return { blocks: [{ id: -1, html: renderMarkdown(textContent) }] as RenderedBlock[], pendingHtml: "" };
-    }
-    return renderStreamingMarkdown(textContent, streamingCache);
-  });
+  let renderedContent = $derived(
+    role === "assistant" && !isStreaming
+      ? renderMarkdown(textContent)
+      : escapeHtml(textContent).replace(/\n/g, '<br>')
+  );
  let copied = $state(false);
  let showRaw = $state(false);
  let isEditing = $state(false);
@@ -116,54 +109,13 @@
      cancelEdit();
    }
  }
-
-  const COPY_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="14" height="14" x="8" y="8" rx="2" ry="2"/><path d="M4 16c-1.1 0-2-.9-2-2V4c0-1.1.9-2 2-2h10c1.1 0 2 .9 2 2"/></svg>`;
-  const CHECK_SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M20 6 9 17l-5-5"/></svg>`;
-
-  function codeBlockCopy(node: HTMLElement) {
-    function attachButtons() {
-      node.querySelectorAll<HTMLPreElement>('pre:not([data-copy-btn])').forEach(pre => {
-        pre.setAttribute('data-copy-btn', 'true');
-        const btn = document.createElement('button');
-        btn.className = 'code-copy-btn';
-        btn.title = 'Copy code';
-        btn.innerHTML = COPY_SVG;
-        btn.addEventListener('click', async () => {
-          const text = pre.querySelector('code')?.textContent ?? pre.textContent ?? '';
-          try {
-            if (navigator.clipboard && window.isSecureContext) {
-              await navigator.clipboard.writeText(text);
-            } else {
-              const ta = document.createElement('textarea');
-              ta.value = text;
-              ta.style.cssText = 'position:fixed;left:-9999px';
-              document.body.appendChild(ta);
-              ta.select();
-              document.execCommand('copy');
-              document.body.removeChild(ta);
-            }
-            btn.innerHTML = CHECK_SVG;
-            btn.classList.add('copied');
-            setTimeout(() => { btn.innerHTML = COPY_SVG; btn.classList.remove('copied'); }, 2000);
-          } catch (e) {
-            console.error('copy failed', e);
-          }
-        });
-        pre.appendChild(btn);
-      });
-    }
-    attachButtons();
-    const mo = new MutationObserver(attachButtons);
-    mo.observe(node, { childList: true, subtree: true });
-    return { destroy: () => mo.disconnect() };
-  }
 </script>

 <div class="flex {role === 'user' ? 'justify-end' : 'justify-start'} mb-4">
  <div
-    class="relative group rounded-lg px-4 py-2 {role === 'user'
-      ? 'max-w-[85%] bg-primary text-btn-primary-text'
-      : 'w-full sm:w-4/5 bg-surface border border-gray-200 dark:border-white/10'}"
+    class="relative group max-w-[85%] rounded-lg px-4 py-2 {role === 'user'
+      ? 'bg-primary text-btn-primary-text'
+      : 'bg-surface border border-gray-200 dark:border-white/10'}"
  >
    {#if role === "assistant"}
      {#if reasoning_content || isReasoning}
@@ -215,11 +167,8 @@
      {#if showRaw}
        <div class="whitespace-pre-wrap font-mono text-sm">{textContent}</div>
      {:else}
-        <div class="prose prose-sm dark:prose-invert max-w-none" use:codeBlockCopy>
-          {#each renderedParts.blocks as block (block.id)}
-            {@html block.html}
-          {/each}
-          {@html renderedParts.pendingHtml}
+        <div class="prose prose-sm dark:prose-invert max-w-none">
+          {@html renderedContent}
          {#if isStreaming && !isReasoning}
            <span class="inline-block w-2 h-4 bg-current animate-pulse ml-0.5"></span>
          {/if}
@@ -340,42 +289,14 @@

 <style>
  .prose :global(pre) {
-    position: relative;
    background-color: var(--color-surface);
    border: 1px solid var(--color-border, rgba(128, 128, 128, 0.2));
    border-radius: 0.375rem;
    padding: 0.75rem;
-    padding-right: 2.5rem;
    overflow-x: auto;
    margin: 0.5rem 0;
  }

-  .prose :global(.code-copy-btn) {
-    position: absolute;
-    top: 0.375rem;
-    right: 0.375rem;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    padding: 0.25rem;
-    border-radius: 0.25rem;
-    border: 1px solid var(--color-border);
-    background: var(--color-surface);
-    color: var(--color-txtsecondary);
-    cursor: pointer;
-    transition: background-color 0.15s;
-    line-height: 0;
-  }
-
-  .prose :global(.code-copy-btn:hover) {
-    background: var(--color-secondary);
-  }
-
-  .prose :global(.code-copy-btn.copied) {
-    color: var(--color-success);
-    opacity: 1;
-  }
-
  .prose :global(code) {
    font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
    font-size: 0.875em;
@@ -2,90 +2,20 @@
  import { models } from "../../stores/api";
  import { persistentStore } from "../../stores/persistent";
  import { generateImage } from "../../lib/imageApi";
-  import { generateSdImage, fetchSdLoras } from "../../lib/sdApi";
-  import { playgroundStores } from "../../stores/playgroundActivity";
  import ModelSelector from "./ModelSelector.svelte";
  import ExpandableTextarea from "./ExpandableTextarea.svelte";
-  import type { ImageApiMode, SdApiLora, SdApiLoraRef } from "../../lib/types";

  const selectedModelStore = persistentStore<string>("playground-image-model", "");
  const selectedSizeStore = persistentStore<string>("playground-image-size", "1024x1024");
-  const apiModeStore = persistentStore<ImageApiMode>("playground-image-api-mode", "openai");
-
-  // SDAPI persistent settings
-  const sdNegativePromptStore = persistentStore<string>("playground-sdapi-negative-prompt", "");
-  const sdStepsStore = persistentStore<number>("playground-sdapi-steps", 20);
-  const sdCfgScaleStore = persistentStore<number>("playground-sdapi-cfg-scale", 7);
-  const sdSeedStore = persistentStore<number>("playground-sdapi-seed", -1);
-  const sdSamplerStore = persistentStore<string>("playground-sdapi-sampler", "");
-  const sdSchedulerStore = persistentStore<string>("playground-sdapi-scheduler", "");
-  const sdBatchSizeStore = persistentStore<number>("playground-sdapi-batch-size", 1);

  let prompt = $state("");
  let isGenerating = $state(false);
-  let generatedImages = $state<string[]>([]);
+  let generatedImage = $state<string | null>(null);
  let error = $state<string | null>(null);
  let abortController = $state<AbortController | null>(null);
  let showFullscreen = $state(false);
-  let fullscreenIndex = $state(0);
-  let showSettings = $state(false);
-
-  // SDAPI lora state
-  let availableLoras = $state<SdApiLora[]>([]);
-  let selectedLoras = $state<SdApiLoraRef[]>([]);
-  let isLoadingLoras = $state(false);
-  let lorasLoaded = $state(false);
-  let loraError = $state<string | null>(null);

  let hasModels = $derived($models.some((m) => !m.unlisted));
-  let isSdapi = $derived($apiModeStore === "sdapi");
-
-  $effect(() => {
-    playgroundStores.imageGenerating.set(isGenerating);
-  });
-
-  async function loadLoras() {
-    if (!$selectedModelStore || isLoadingLoras) return;
-    isLoadingLoras = true;
-    loraError = null;
-    try {
-      const loras = await fetchSdLoras($selectedModelStore);
-      availableLoras = loras;
-      lorasLoaded = true;
-    } catch (err) {
-      availableLoras = [];
-      loraError = err instanceof Error ? err.message : "Failed to load LoRAs";
-      lorasLoaded = false;
-    } finally {
-      isLoadingLoras = false;
-    }
-  }
-
-  function addLora(event: Event) {
-    const select = event.target as HTMLSelectElement;
-    const path = select.value;
-    if (!path) return;
-
-    const lora = availableLoras.find((l) => l.path === path);
-    if (lora && !selectedLoras.some((l) => l.path === path)) {
-      selectedLoras = [...selectedLoras, { path: lora.path, multiplier: 1.0 }];
-    }
-    select.value = "";
-  }
-
-  function removeLora(path: string) {
-    selectedLoras = selectedLoras.filter((l) => l.path !== path);
-  }
-
-  function updateLoraMultiplier(path: string, multiplier: number) {
-    selectedLoras = selectedLoras.map((l) =>
-      l.path === path ? { ...l, multiplier } : l
-    );
-  }
-
-  function getLoraName(path: string): string {
-    return availableLoras.find((l) => l.path === path)?.name ?? path;
-  }

  async function generate() {
    const trimmedPrompt = prompt.trim();
@@ -96,44 +26,19 @@
    abortController = new AbortController();

    try {
-      if (isSdapi) {
-        const [w, h] = $selectedSizeStore.split("x").map(Number);
-        const request = {
-          model: $selectedModelStore,
-          prompt: trimmedPrompt,
-          negative_prompt: $sdNegativePromptStore || undefined,
-          width: w,
-          height: h,
-          steps: $sdStepsStore,
-          cfg_scale: $sdCfgScaleStore,
-          seed: $sdSeedStore,
-          batch_size: $sdBatchSizeStore,
-          sampler_name: $sdSamplerStore || undefined,
-          scheduler: $sdSchedulerStore || undefined,
-          lora: selectedLoras.length > 0 ? selectedLoras : undefined,
-        };
+      const response = await generateImage(
+        $selectedModelStore,
+        trimmedPrompt,
+        $selectedSizeStore,
+        abortController.signal
+      );

-        const response = await generateSdImage(request, abortController.signal);
-        if (response.images && response.images.length > 0) {
-          generatedImages = response.images.map(
-            (img) => `data:image/png;base64,${img}`
-          );
-        }
-      } else {
-        const response = await generateImage(
-          $selectedModelStore,
-          trimmedPrompt,
-          $selectedSizeStore,
-          abortController.signal
-        );
-
-        if (response.data && response.data.length > 0) {
-          const imageData = response.data[0];
-          if (imageData.b64_json) {
-            generatedImages = [`data:image/png;base64,${imageData.b64_json}`];
-          } else if (imageData.url) {
-            generatedImages = [imageData.url];
-          }
+      if (response.data && response.data.length > 0) {
+        const imageData = response.data[0];
+        if (imageData.b64_json) {
+          generatedImage = `data:image/png;base64,${imageData.b64_json}`;
+        } else if (imageData.url) {
+          generatedImage = imageData.url;
        }
      }
    } catch (err) {
@@ -153,29 +58,28 @@
  }

  function clearImage() {
-    generatedImages = [];
+    generatedImage = null;
    error = null;
    prompt = "";
  }

-  function downloadImage(index: number = 0) {
-    const img = generatedImages[index];
-    if (!img) return;
+  function downloadImage() {
+    if (!generatedImage) return;

    const link = document.createElement("a");
-    link.href = img;
-    link.download = `generated-image-${Date.now()}-${index}.png`;
+    link.href = generatedImage;
+    link.download = `generated-image-${Date.now()}.png`;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
  }

-  function openFullscreen(index: number = 0) {
-    fullscreenIndex = index;
+  function openFullscreen() {
    showFullscreen = true;
  }

  function closeFullscreen(event?: MouseEvent) {
+    // Only close if clicking the background, not the image
    if (event && event.target !== event.currentTarget) {
      return;
    }
@@ -191,19 +95,9 @@
 </script>

 <div class="flex flex-col h-full">
-  <!-- Model selector and mode toggle -->
+  <!-- Model selector -->
  <div class="shrink-0 flex flex-wrap gap-2 mb-4">
    <ModelSelector bind:value={$selectedModelStore} placeholder="Select an image model..." disabled={isGenerating} />
-
-    <select
-      class="px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-      bind:value={$apiModeStore}
-      disabled={isGenerating}
-    >
-      <option value="openai">OpenAI</option>
-      <option value="sdapi">SDAPI</option>
-    </select>
-
    <select
      class="px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
      bind:value={$selectedSizeStore}
@@ -224,166 +118,8 @@
        <option value="1024x1792">1024x1792 (SDXL)</option>
      </optgroup>
    </select>
-
-    {#if isSdapi}
-      <button
-        class="px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-surface hover:bg-secondary-hover transition-colors"
-        onclick={() => showSettings = !showSettings}
-      >
-        {showSettings ? "Hide Settings" : "Settings"}
-      </button>
-    {/if}
  </div>

-  <!-- SDAPI Settings Panel -->
-  {#if isSdapi && showSettings}
-    <div class="shrink-0 mb-4 p-4 rounded border border-gray-200 dark:border-white/10 bg-surface">
-      <div class="grid grid-cols-2 md:grid-cols-4 gap-3 mb-3">
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">Steps</span>
-          <input
-            type="number"
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdStepsStore}
-            min="1"
-            max="150"
-          />
-        </label>
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">CFG Scale</span>
-          <input
-            type="number"
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdCfgScaleStore}
-            min="1"
-            max="30"
-            step="0.5"
-          />
-        </label>
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">Seed (-1 = random)</span>
-          <input
-            type="number"
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdSeedStore}
-            min="-1"
-          />
-        </label>
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">Batch Size</span>
-          <input
-            type="number"
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdBatchSizeStore}
-            min="1"
-            max="8"
-          />
-        </label>
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">Sampler</span>
-          <select
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdSamplerStore}
-          >
-            <option value="">Default</option>
-            <option value="euler_a">euler_a</option>
-            <option value="euler">euler</option>
-            <option value="heun">heun</option>
-            <option value="dpm2">dpm2</option>
-            <option value="dpmpp2s_a">dpmpp2s_a</option>
-            <option value="dpmpp2m">dpmpp2m</option>
-            <option value="dpmpp2mv2">dpmpp2mv2</option>
-            <option value="ipndm">ipndm</option>
-            <option value="ipndm_v">ipndm_v</option>
-            <option value="lcm">lcm</option>
-            <option value="ddim_trailing">ddim_trailing</option>
-            <option value="tcd">tcd</option>
-          </select>
-        </label>
-        <label class="flex flex-col gap-1">
-          <span class="text-xs text-txtsecondary">Scheduler</span>
-          <select
-            class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-            bind:value={$sdSchedulerStore}
-          >
-            <option value="">Auto for model</option>
-            <option value="discrete">discrete</option>
-            <option value="karras">karras</option>
-            <option value="exponential">exponential</option>
-            <option value="ays">ays</option>
-            <option value="gits">gits</option>
-          </select>
-        </label>
-      </div>
-
-      <label class="flex flex-col gap-1 mb-3">
-        <span class="text-xs text-txtsecondary">Negative Prompt</span>
-        <textarea
-          class="px-2 py-1 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary resize-y text-sm"
-          bind:value={$sdNegativePromptStore}
-          rows="2"
-          placeholder="Elements to avoid..."
-        ></textarea>
-      </label>
-
-      <!-- LoRA Selection -->
-      <div>
-        <span class="text-xs text-txtsecondary block mb-1">LoRAs</span>
-        <div class="flex items-center gap-2 mb-2">
-          <button
-            class="px-3 py-1.5 text-sm rounded border border-gray-200 dark:border-white/10 bg-surface hover:bg-secondary-hover transition-colors disabled:opacity-50"
-            onclick={loadLoras}
-            disabled={!$selectedModelStore || isLoadingLoras}
-          >
-            {isLoadingLoras ? "Loading..." : lorasLoaded ? "Reload LoRAs" : "Load LoRAs"}
-          </button>
-          {#if lorasLoaded && availableLoras.length > 0}
-            <select
-              class="flex-1 px-2 py-1.5 text-sm rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-              onchange={addLora}
-            >
-              <option value="">Add a LoRA...</option>
-              {#each availableLoras.filter((l) => !selectedLoras.some((s) => s.path === l.path)) as lora}
-                <option value={lora.path}>{lora.name}</option>
-              {/each}
-            </select>
-          {/if}
-        </div>
-        {#if loraError}
-          <p class="text-xs text-red-500 mb-1">{loraError}</p>
-        {/if}
-        {#if lorasLoaded && availableLoras.length === 0}
-          <p class="text-xs text-txtsecondary">No LoRAs available</p>
-        {/if}
-        {#if selectedLoras.length > 0}
-          <div class="flex flex-col gap-1.5">
-            {#each selectedLoras as lora}
-              <div class="flex items-center gap-2 text-sm">
-                <span class="flex-1 truncate">{getLoraName(lora.path)}</span>
-                <input
-                  type="number"
-                  class="w-20 px-1.5 py-1 text-xs rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-1 focus:ring-primary"
-                  value={lora.multiplier}
-                  oninput={(e) => updateLoraMultiplier(lora.path, parseFloat((e.target as HTMLInputElement).value) || 1)}
-                  min="0"
-                  max="2"
-                  step="0.1"
-                />
-                <button
-                  class="px-1.5 py-0.5 text-xs rounded border border-gray-200 dark:border-white/10 hover:bg-red-500 hover:text-white hover:border-red-500 transition-colors"
-                  onclick={() => removeLora(lora.path)}
-                  aria-label="Remove LoRA"
-                >
-                  x
-                </button>
-              </div>
-            {/each}
-          </div>
-        {/if}
-      </div>
-    </div>
-  {/if}
-
  <!-- Empty state for no models configured -->
  {#if !hasModels}
    <div class="flex-1 flex items-center justify-center text-txtsecondary">
@@ -402,50 +138,22 @@
          <p class="font-medium">Error</p>
          <p class="text-sm mt-1">{error}</p>
        </div>
-      {:else if generatedImages.length > 1}
-        <!-- Grid for multiple images (batch) -->
-        <div class="grid grid-cols-2 gap-2 p-2 w-full h-full overflow-auto">
-          {#each generatedImages as img, i}
-            <div class="relative flex items-center justify-center">
-              <button
-                class="p-0 border-0 bg-transparent cursor-pointer"
-                onclick={() => openFullscreen(i)}
-                aria-label="View fullscreen"
-              >
-                <img
-                  src={img}
-                  alt="AI generated content {i + 1}"
-                  class="max-w-full max-h-full object-contain hover:opacity-90 transition-opacity"
-                />
-              </button>
-              <button
-                class="absolute bottom-2 right-2 p-1.5 bg-black/60 hover:bg-black/80 text-white rounded-full transition-colors"
-                onclick={(e) => { e.stopPropagation(); downloadImage(i); }}
-                aria-label="Download image"
-              >
-                <svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
-                  <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-4l-4 4m0 0l-4-4m4 4V4"></path>
-                </svg>
-              </button>
-            </div>
-          {/each}
-        </div>
-      {:else if generatedImages.length === 1}
+      {:else if generatedImage}
        <div class="relative max-w-full max-h-full flex items-center justify-center">
          <button
            class="p-0 border-0 bg-transparent cursor-pointer"
-            onclick={() => openFullscreen(0)}
+            onclick={openFullscreen}
            aria-label="View fullscreen"
          >
            <img
-              src={generatedImages[0]}
+              src={generatedImage}
              alt="AI generated content"
              class="max-w-full max-h-full object-contain hover:opacity-90 transition-opacity"
            />
          </button>
          <button
            class="absolute bottom-2 right-2 p-2 bg-black/60 hover:bg-black/80 text-white rounded-full transition-colors"
-            onclick={(e) => { e.stopPropagation(); downloadImage(0); }}
+            onclick={(e) => { e.stopPropagation(); downloadImage(); }}
            aria-label="Download image"
          >
            <svg class="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
@@ -485,7 +193,7 @@
          <button
            class="btn flex-1 md:flex-none"
            onclick={clearImage}
-            disabled={generatedImages.length === 0 && !error && !prompt.trim()}
+            disabled={!generatedImage && !error && !prompt.trim()}
          >
            Clear
          </button>
@@ -496,7 +204,7 @@
 </div>

 <!-- Fullscreen dialog -->
-{#if showFullscreen && generatedImages[fullscreenIndex]}
+{#if showFullscreen && generatedImage}
  <div
    class="fixed inset-0 bg-black/90 z-50 flex items-center justify-center p-4"
    onclick={(e) => closeFullscreen(e)}
@@ -513,7 +221,7 @@
      ×
    </button>
    <img
-      src={generatedImages[fullscreenIndex]}
+      src={generatedImage}
      alt="AI generated content"
      class="max-w-full max-h-full object-contain pointer-events-none"
    />
@@ -25,11 +25,6 @@
      <optgroup label="Local">
        {#each grouped.local as model (model.id)}
          <option value={model.id}>{model.id}</option>
-          {#if model.aliases}
-            {#each model.aliases as alias (alias)}
-              <option value={alias}>  ↳ {alias}</option>
-            {/each}
-          {/if}
        {/each}
      </optgroup>
    {/if}
@@ -1,406 +0,0 @@
-<script lang="ts">
-  import { models } from "../../stores/api";
-  import { persistentStore } from "../../stores/persistent";
-  import { rerank } from "../../lib/rerankApi";
-  import { playgroundStores } from "../../stores/playgroundActivity";
-  import ModelSelector from "./ModelSelector.svelte";
-
-  type RerankRow = { doc: string; score: number | null };
-  type SortOrder = "none" | "asc" | "desc";
-  type EditorMode = "table" | "json";
-
-  const selectedModelStore = persistentStore<string>("playground-rerank-model", "");
-
-  const defaultQuery = "How do LLM's work?";
-  const defaultDocs = [
-    "Large language models (LLMs) use transformer architectures to predict the next token in a sequence based on massive amounts of text data.",
-    "LLMs are trained on diverse internet text, learning statistical patterns of language that allow them to generate coherent responses.",
-    "During training, LLMs minimize a loss function that measures the difference between predicted and actual tokens across billions of examples.",
-    "Attention mechanisms in transformers enable LLMs to weigh the importance of different words when generating output.",
-    "Fine\u2011tuning allows a pre\u2011trained LLM to adapt to a specific downstream task with a smaller dataset.",
-    "Neural networks consist of layers of interconnected neurons that adjust their weights during back\u2011propagation.",
-    "The history of the Roman Empire spanned over a thousand years.",
-    "Soccer is the most popular sport in many countries around the world.",
-    "Quantum computing uses qubits to perform calculations that are intractable for classical computers.",
-  ];
-
-  let query = $state(defaultQuery);
-  let rows = $state<RerankRow[]>([
-    ...defaultDocs.map((doc) => ({ doc, score: null })),
-    { doc: "", score: null },
-  ]);
-  let isLoading = $state(false);
-  let error = $state<string | null>(null);
-  let usage = $state<{ prompt_tokens: number; total_tokens: number } | null>(null);
-  let abortController: AbortController | null = null;
-  let sortOrder = $state<SortOrder>("desc");
-  let editorMode = $state<EditorMode>("table");
-  let jsonText = $state("");
-  let jsonError = $state<string | null>(null);
-
-  let hasModels = $derived($models.some((m) => !m.unlisted));
-
-  let canSubmit = $derived((() => {
-    if (!$selectedModelStore || isLoading) return false;
-    if (editorMode === "json") {
-      try {
-        const parsed = JSON.parse(jsonText) as Record<string, unknown>;
-        return (
-          typeof parsed.query === "string" &&
-          parsed.query.trim() !== "" &&
-          Array.isArray(parsed.documents) &&
-          (parsed.documents as unknown[]).some(
-            (d) => typeof d === "string" && (d as string).trim() !== ""
-          )
-        );
-      } catch {
-        return false;
-      }
-    }
-    return query.trim() !== "" && rows.some((r) => r.doc.trim() !== "");
-  })());
-
-  // Display rows with sort applied (display-only transform, rows[] is never mutated by sorting)
-  let displayRows = $derived((() => {
-    const indexed = rows.map((row, i) => ({ row, i }));
-    if (sortOrder === "none") return indexed;
-    return [...indexed].sort((a, b) => {
-      if (a.row.score === null && b.row.score === null) return 0;
-      if (a.row.score === null) return 1;
-      if (b.row.score === null) return -1;
-      return sortOrder === "desc"
-        ? b.row.score - a.row.score
-        : a.row.score - b.row.score;
-    });
-  })());
-
-  // Auto-add a new empty row when the last row gets content (table mode only)
-  $effect(() => {
-    if (editorMode === "table" && rows[rows.length - 1]?.doc.trim() !== "") {
-      rows = [...rows, { doc: "", score: null }];
-    }
-  });
-
-  // Sync loading state to activity store
-  $effect(() => {
-    playgroundStores.rerankLoading.set(isLoading);
-  });
-
-  function switchToJson() {
-    if (editorMode === "json") return;
-    const docs = rows.filter((r) => r.doc.trim() !== "").map((r) => r.doc);
-    jsonText = JSON.stringify({ query, documents: docs }, null, 2);
-    jsonError = null;
-    editorMode = "json";
-  }
-
-  function switchToTable() {
-    if (editorMode === "table") return;
-    if (jsonText.trim() === "") {
-      query = "";
-      rows = [{ doc: "", score: null }];
-      jsonError = null;
-      editorMode = "table";
-      return;
-    }
-    try {
-      const parsed = JSON.parse(jsonText) as unknown;
-      if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
-        throw new Error("Expected a JSON object");
-      }
-      const obj = parsed as Record<string, unknown>;
-      if (typeof obj.query !== "string") throw new Error('"query" must be a string');
-      if (!Array.isArray(obj.documents)) throw new Error('"documents" must be an array');
-      query = obj.query;
-      const newRows: RerankRow[] = (obj.documents as unknown[]).map((d) => ({
-        doc: typeof d === "string" ? d : String(d),
-        score: null,
-      }));
-      if (newRows.length === 0 || newRows[newRows.length - 1].doc.trim() !== "") {
-        newRows.push({ doc: "", score: null });
-      }
-      rows = newRows;
-      jsonError = null;
-      editorMode = "table";
-    } catch (err) {
-      jsonError = err instanceof Error ? err.message : "Invalid JSON";
-    }
-  }
-
-  function cycleSortOrder() {
-    sortOrder = sortOrder === "none" ? "desc" : sortOrder === "desc" ? "asc" : "none";
-  }
-
-  function sortIndicator(): string {
-    if (sortOrder === "desc") return " ↓";
-    if (sortOrder === "asc") return " ↑";
-    return "";
-  }
-
-  async function submit() {
-    if (!canSubmit) return;
-
-    let submitQuery: string;
-    let nonEmptyEntries: { originalIndex: number; doc: string }[];
-
-    if (editorMode === "json") {
-      // Parse JSON, sync state to table, then submit
-      try {
-        const parsed = JSON.parse(jsonText) as Record<string, unknown>;
-        submitQuery = parsed.query as string;
-        const docs = (parsed.documents as string[]).filter((d) => d.trim() !== "");
-        const newRows: RerankRow[] = docs.map((d) => ({ doc: d, score: null }));
-        newRows.push({ doc: "", score: null });
-        rows = newRows;
-        query = submitQuery;
-        editorMode = "table";
-      } catch {
-        error = "Invalid JSON — fix before submitting";
-        return;
-      }
-      nonEmptyEntries = rows
-        .map((r, i) => ({ originalIndex: i, doc: r.doc }))
-        .filter((e) => e.doc.trim() !== "");
-    } else {
-      submitQuery = query;
-      nonEmptyEntries = rows
-        .map((r, i) => ({ originalIndex: i, doc: r.doc }))
-        .filter((e) => e.doc.trim() !== "");
-    }
-
-    isLoading = true;
-    error = null;
-    usage = null;
-
-    // Clear previous scores
-    rows = rows.map((r) => ({ ...r, score: null }));
-
-    abortController = new AbortController();
-
-    try {
-      const response = await rerank(
-        $selectedModelStore,
-        submitQuery,
-        nonEmptyEntries.map((e) => e.doc),
-        abortController.signal
-      );
-
-      usage = response.usage;
-
-      // Map result.index (position in submitted docs array) back to original rows[] index
-      const updated = rows.map((r) => ({ ...r }));
-      for (const result of response.results) {
-        const entry = nonEmptyEntries[result.index];
-        if (entry !== undefined) {
-          updated[entry.originalIndex].score = result.relevance_score;
-        }
-      }
-      rows = updated;
-    } catch (err) {
-      if (err instanceof Error && err.name === "AbortError") {
-        // User cancelled
-      } else {
-        error = err instanceof Error ? err.message : "An error occurred";
-      }
-    } finally {
-      isLoading = false;
-      abortController = null;
-    }
-  }
-
-  function cancel() {
-    abortController?.abort();
-  }
-
-  function clear() {
-    query = defaultQuery;
-    rows = [...defaultDocs.map((doc) => ({ doc, score: null })), { doc: "", score: null }];
-    error = null;
-    usage = null;
-    sortOrder = "desc";
-    jsonText = "";
-    jsonError = null;
-  }
-
-  function deleteRow(originalIndex: number) {
-    if (rows.length <= 1) return;
-    rows = rows.filter((_, i) => i !== originalIndex);
-  }
-
-  function updateDoc(originalIndex: number, value: string) {
-    const updated = rows.map((r) => ({ ...r }));
-    updated[originalIndex].doc = value;
-    rows = updated;
-  }
-
-  function scoreColor(score: number | null): string {
-    if (score === null) return "text-txtsecondary";
-    if (score > 0) return "text-green-600 dark:text-green-400";
-    return "text-red-500 dark:text-red-400";
-  }
-
-  function formatScore(score: number | null): string {
-    if (score === null) return "—";
-    return score.toFixed(3);
-  }
-
-  function handleKeyDown(e: KeyboardEvent) {
-    if (e.key === "Enter" && !e.shiftKey) {
-      e.preventDefault();
-      submit();
-    }
-  }
-
-  let isCleared = $derived(
-    query === defaultQuery &&
-    rows.every((r, i) => r.score === null && r.doc === (defaultDocs[i] ?? "")) &&
-    rows.length === defaultDocs.length + 1 &&
-    !jsonText.trim() &&
-    !error &&
-    !usage
-  );
-</script>
-
-<div class="flex flex-col h-full">
-  <!-- Top bar: model selector + query input (table mode) + mode toggle -->
-  <div class="shrink-0 flex flex-wrap gap-2 mb-4">
-    <ModelSelector bind:value={$selectedModelStore} placeholder="Select a rerank model..." disabled={isLoading} />
-    {#if editorMode === "table"}
-      <input
-        type="text"
-        class="min-w-0 flex-1 basis-48 px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary"
-        placeholder="Query..."
-        bind:value={query}
-        disabled={isLoading}
-        onkeydown={handleKeyDown}
-      />
-    {/if}
-    <!-- Table / JSON toggle -->
-    <div class="flex rounded border border-gray-200 dark:border-white/10 overflow-hidden shrink-0">
-      <button
-        class="px-3 py-1.5 text-sm transition-colors {editorMode === 'table'
-          ? 'bg-primary text-btn-primary-text'
-          : 'bg-surface hover:bg-secondary-hover'}"
-        onclick={switchToTable}
-        disabled={isLoading}
-      >
-        Table
-      </button>
-      <button
-        class="px-3 py-1.5 text-sm border-l border-gray-200 dark:border-white/10 transition-colors {editorMode === 'json'
-          ? 'bg-primary text-btn-primary-text'
-          : 'bg-surface hover:bg-secondary-hover'}"
-        onclick={switchToJson}
-        disabled={isLoading}
-      >
-        JSON
-      </button>
-    </div>
-  </div>
-
-  {#if !hasModels}
-    <div class="flex-1 flex items-center justify-center text-txtsecondary">
-      <p>No models configured. Add models to your configuration to use reranking.</p>
-    </div>
-  {:else if editorMode === "json"}
-    <!-- JSON editor -->
-    <div class="flex-1 flex flex-col min-h-0 mb-4">
-      <textarea
-        class="flex-1 w-full font-mono text-sm px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-surface focus:outline-none focus:ring-2 focus:ring-primary resize-none"
-        bind:value={jsonText}
-        disabled={isLoading}
-        placeholder={'{\n  "query": "your search query",\n  "documents": [\n    "document one",\n    "document two"\n  ]\n}'}
-        spellcheck={false}
-      ></textarea>
-      {#if jsonError}
-        <p class="mt-1 text-sm text-red-500">{jsonError}</p>
-      {/if}
-    </div>
-  {:else}
-    <!-- Document table -->
-    <div class="flex-1 overflow-y-auto mb-4 border border-gray-200 dark:border-white/10 rounded">
-      <table class="w-full border-collapse table-fixed">
-        <colgroup>
-          <col class="w-auto" />
-          <col style="width: 120px" />
-          <col style="width: 40px" />
-        </colgroup>
-        <thead class="sticky top-0 bg-surface border-b border-gray-200 dark:border-white/10">
-          <tr>
-            <th class="px-3 py-2 text-left text-sm font-medium text-txtsecondary">Document</th>
-            <th
-              class="px-3 py-2 text-right text-sm font-medium text-txtsecondary cursor-pointer select-none hover:text-txtprimary transition-colors"
-              onclick={cycleSortOrder}
-            >
-              Score{sortIndicator()}
-            </th>
-            <th class="px-2 py-2"></th>
-          </tr>
-        </thead>
-        <tbody>
-          {#each displayRows as { row, i } (i)}
-            <tr class="border-b border-gray-100 dark:border-white/5 last:border-0">
-              <td class="px-3 py-1.5">
-                <input
-                  type="text"
-                  class="w-full bg-transparent focus:outline-none focus:ring-1 focus:ring-primary rounded px-1 py-0.5"
-                  placeholder={i === rows.length - 1 ? "Add document..." : "Document text..."}
-                  value={row.doc}
-                  oninput={(e) => updateDoc(i, (e.target as HTMLInputElement).value)}
-                  disabled={isLoading}
-                  onkeydown={handleKeyDown}
-                />
-              </td>
-              <td class="px-3 py-1.5 text-right font-mono text-sm {scoreColor(row.score)}">
-                {#if isLoading && row.score === null && row.doc.trim() !== ""}
-                  <span class="inline-block w-4 h-4 border-2 border-current border-t-transparent rounded-full animate-spin align-middle"></span>
-                {:else}
-                  {formatScore(row.score)}
-                {/if}
-              </td>
-              <td class="px-2 py-1.5 text-center">
-                <button
-                  class="w-7 h-7 flex items-center justify-center text-txtsecondary hover:text-red-500 transition-colors rounded disabled:opacity-30 disabled:cursor-not-allowed"
-                  onclick={() => deleteRow(i)}
-                  disabled={rows.length <= 1}
-                  tabindex="-1"
-                  aria-label="Remove row"
-                >
-                  ×
-                </button>
-              </td>
-            </tr>
-          {/each}
-        </tbody>
-      </table>
-    </div>
-  {/if}
-
-  <!-- Bottom toolbar -->
-  {#if hasModels}
-    <div class="shrink-0 flex flex-wrap items-center gap-2">
-      {#if isLoading}
-        <button class="btn bg-red-500 hover:bg-red-600 text-white" onclick={cancel}>
-          Cancel
-        </button>
-      {:else}
-        <button
-          class="btn bg-primary text-btn-primary-text hover:opacity-90"
-          onclick={submit}
-          disabled={!canSubmit}
-        >
-          Rerank
-        </button>
-        <button class="btn" onclick={clear} disabled={isCleared}>
-          Clear
-        </button>
-      {/if}
-
-      {#if error}
-        <span class="text-sm text-red-500 ml-2">{error}</span>
-      {:else if usage}
-        <span class="text-sm text-txtsecondary ml-2">{usage.total_tokens} tokens</span>
-      {/if}
-    </div>
-  {/if}
-</div>
@@ -2,7 +2,6 @@
  import { models } from "../../stores/api";
  import { persistentStore } from "../../stores/persistent";
  import { generateSpeech } from "../../lib/speechApi";
-  import { playgroundStores } from "../../stores/playgroundActivity";
  import ModelSelector from "./ModelSelector.svelte";
  import ExpandableTextarea from "./ExpandableTextarea.svelte";

@@ -21,9 +20,11 @@
  let availableVoices = $state<string[]>(["coral", "alloy", "echo", "fable", "onyx", "nova", "shimmer"]);
  let isLoadingVoices = $state(false);

+  // Default voices to fall back to if API call fails
  const defaultVoices = ["coral", "alloy", "echo", "fable", "onyx", "nova", "shimmer"];
  const CACHE_KEY = "playground-speech-voices-cache";

+  // Load voices cache from localStorage
  function getVoicesCache(): Record<string, string[]> {
    if (typeof window === "undefined") return {};
    try {
@@ -34,6 +35,7 @@
    }
  }

+  // Save voices cache to localStorage
  function saveVoicesCache(cache: Record<string, string[]>) {
    if (typeof window === "undefined") return;
    try {
@@ -45,12 +47,9 @@

  let hasModels = $derived($models.some((m) => !m.unlisted));

+  // Track if this is the initial page load to avoid fetching on refresh
  let isInitialLoad = $state(true);

-  $effect(() => {
-    playgroundStores.speechGenerating.set(isGenerating);
-  });
-
  // On page load, restore cached voices for the selected model if available
  $effect(() => {
    const model = $selectedModelStore;
@@ -1,5 +1,5 @@
 import { describe, it, expect } from "vitest";
-import { renderMarkdown, escapeHtml, splitCompleteBlocks, closePendingBlock, normalizeLatexDelimiters, renderStreamingMarkdown, createStreamingCache } from "./markdown";
+import { renderMarkdown, escapeHtml } from "./markdown";

 describe("renderMarkdown", () => {
  describe("basic markdown", () => {
@@ -130,35 +130,6 @@ More text here.
      expect(result).toContain("katex");
      expect(result).toContain("sqrt");
    });
-
-    it("renders \\[...\\] display math", () => {
-      const result = renderMarkdown("\\[\nx^2 + y^2 = z^2\n\\]");
-      expect(result).toContain("katex");
-    });
-
-    it("renders \\(...\\) inline math", () => {
-      const result = renderMarkdown("The equation \\(E = mc^2\\) is famous.");
-      expect(result).toContain("katex");
-    });
-  });
-
-  describe("normalizeLatexDelimiters", () => {
-    it("converts \\[...\\] to $$...$$", () => {
-      expect(normalizeLatexDelimiters("\\[\nx^2\n\\]")).toBe("$$\nx^2\n$$");
-    });
-
-    it("converts \\(...\\) to $...$", () => {
-      expect(normalizeLatexDelimiters("\\(x^2\\)")).toBe("$x^2$");
-    });
-
-    it("leaves $$ and $ delimiters unchanged", () => {
-      const text = "$$x^2$$ and $y$";
-      expect(normalizeLatexDelimiters(text)).toBe(text);
-    });
-
-    it("handles multiple occurrences", () => {
-      expect(normalizeLatexDelimiters("\\(a\\) and \\(b\\)")).toBe("$a$ and $b$");
-    });
  });

  describe("escapeHtml", () => {
@@ -187,237 +158,3 @@ More text here.
    });
  });
 });
-
-describe("splitCompleteBlocks", () => {
-  it("returns everything as pending when no blank line", () => {
-    const result = splitCompleteBlocks("Hello world");
-    expect(result.complete).toBe("");
-    expect(result.pending).toBe("Hello world");
-  });
-
-  it("returns empty for empty input", () => {
-    const result = splitCompleteBlocks("");
-    expect(result.complete).toBe("");
-    expect(result.pending).toBe("");
-  });
-
-  it("splits on blank line between paragraphs", () => {
-    const result = splitCompleteBlocks("First paragraph.\n\nSecond paragraph");
-    expect(result.complete).toBe("First paragraph.\n");
-    expect(result.pending).toBe("Second paragraph");
-  });
-
-  it("splits multiple paragraphs at last blank line", () => {
-    const result = splitCompleteBlocks("Para 1.\n\nPara 2.\n\nPara 3");
-    expect(result.complete).toBe("Para 1.\n\nPara 2.\n");
-    expect(result.pending).toBe("Para 3");
-  });
-
-  it("treats closed code fence as complete boundary", () => {
-    const text = "```js\nconst x = 1;\n```\nMore text";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("```js\nconst x = 1;\n```");
-    expect(result.pending).toBe("More text");
-  });
-
-  it("treats unclosed code fence as pending", () => {
-    const text = "Done paragraph.\n\n```js\nconst x = 1;";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("Done paragraph.\n");
-    expect(result.pending).toBe("```js\nconst x = 1;");
-  });
-
-  it("does not split on blank lines inside code fences", () => {
-    const text = "```\nline1\n\nline2\n```";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("```\nline1\n\nline2\n```");
-    expect(result.pending).toBe("");
-  });
-
-  it("handles tilde fences", () => {
-    const text = "~~~py\nprint('hi')\n~~~\nAfter";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("~~~py\nprint('hi')\n~~~");
-    expect(result.pending).toBe("After");
-  });
-
-  it("does not close backtick fence with tilde fence", () => {
-    const text = "```\ncode\n~~~\nstill code";
-    const result = splitCompleteBlocks(text);
-    // The ~~~ should not close a backtick fence, so everything from ``` onward is pending
-    expect(result.complete).toBe("");
-    expect(result.pending).toBe("```\ncode\n~~~\nstill code");
-  });
-
-  it("treats closed math block as complete boundary", () => {
-    const text = "$$\nx^2\n$$\nAfter";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("$$\nx^2\n$$");
-    expect(result.pending).toBe("After");
-  });
-
-  it("treats unclosed math block as pending", () => {
-    const text = "Before.\n\n$$\nx^2";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("Before.\n");
-    expect(result.pending).toBe("$$\nx^2");
-  });
-
-  it("treats closed \\[...\\] math block as complete boundary", () => {
-    const text = "\\[\nx^2\n\\]\nAfter";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("\\[\nx^2\n\\]");
-    expect(result.pending).toBe("After");
-  });
-
-  it("treats unclosed \\[ math block as pending", () => {
-    const text = "Before.\n\n\\[\nx^2";
-    const result = splitCompleteBlocks(text);
-    expect(result.complete).toBe("Before.\n");
-    expect(result.pending).toBe("\\[\nx^2");
-  });
-
-  it("handles trailing blank line making everything complete", () => {
-    const text = "Hello world.\n";
-    const result = splitCompleteBlocks(text);
-    // Last line is empty string after split, which is a blank line
-    expect(result.complete).toBe("Hello world.\n");
-    expect(result.pending).toBe("");
-  });
-});
-
-describe("closePendingBlock", () => {
-  it("returns empty string for empty input", () => {
-    expect(closePendingBlock("")).toBe("");
-  });
-
-  it("returns plain text unchanged", () => {
-    expect(closePendingBlock("Hello world")).toBe("Hello world");
-  });
-
-  it("closes an open backtick code fence", () => {
-    const result = closePendingBlock("```python\nprint('hi')");
-    expect(result).toBe("```python\nprint('hi')\n```");
-  });
-
-  it("closes an open tilde code fence", () => {
-    const result = closePendingBlock("~~~js\nconst x = 1;");
-    expect(result).toBe("~~~js\nconst x = 1;\n~~~");
-  });
-
-  it("does not modify already-closed code fence", () => {
-    const text = "```py\ncode\n```";
-    expect(closePendingBlock(text)).toBe(text);
-  });
-
-  it("closes an open math block", () => {
-    const result = closePendingBlock("$$\nx^2 + y^2");
-    expect(result).toBe("$$\nx^2 + y^2\n$$");
-  });
-
-  it("does not modify already-closed math block", () => {
-    const text = "$$\nx^2\n$$";
-    expect(closePendingBlock(text)).toBe(text);
-  });
-
-  it("closes an open \\[ math block with \\]", () => {
-    const result = closePendingBlock("\\[\nx^2 + y^2");
-    expect(result).toBe("\\[\nx^2 + y^2\n\\]");
-  });
-
-  it("does not modify already-closed \\[...\\] math block", () => {
-    const text = "\\[\nx^2\n\\]";
-    expect(closePendingBlock(text)).toBe(text);
-  });
-
-  it("closes code fence when preceded by regular text", () => {
-    const result = closePendingBlock("Some text\n```\ncode");
-    expect(result).toBe("Some text\n```\ncode\n```");
-  });
-
-  it("leaves headers unchanged", () => {
-    expect(closePendingBlock("## Hello")).toBe("## Hello");
-  });
-
-  it("leaves tables unchanged", () => {
-    const table = "| a | b |\n| --- | --- |\n| 1 | 2 |";
-    expect(closePendingBlock(table)).toBe(table);
-  });
-
-  it("leaves lists unchanged", () => {
-    expect(closePendingBlock("- item 1\n- item 2")).toBe("- item 1\n- item 2");
-  });
-});
-
-describe("renderStreamingMarkdown", () => {
-  it("renders complete blocks and pending as markdown", () => {
-    const cache = createStreamingCache();
-    const text = "# Hello\n\nWorld";
-    const { blocks, pendingHtml } = renderStreamingMarkdown(text, cache);
-    expect(blocks).toHaveLength(1);
-    expect(blocks[0].html).toContain("<h1>Hello</h1>");
-    expect(pendingHtml).toContain("World");
-    expect(pendingHtml).toContain("<p>");
-  });
-
-  it("preserves existing blocks when complete portion is unchanged", () => {
-    const cache = createStreamingCache();
-    renderStreamingMarkdown("# Hello\n\nWor", cache);
-    const firstBlocks = cache.blocks;
-
-    const { blocks } = renderStreamingMarkdown("# Hello\n\nWorld", cache);
-    // Same block array reference — nothing changed in the complete section
-    expect(blocks).toBe(firstBlocks);
-    expect(cache.completeKey).toBe("# Hello\n");
-  });
-
-  it("appends a new block when a new section completes", () => {
-    const cache = createStreamingCache();
-    renderStreamingMarkdown("# Hello\n\nParagraph", cache);
-    expect(cache.blocks).toHaveLength(1);
-    const firstBlock = cache.blocks[0];
-
-    renderStreamingMarkdown("# Hello\n\nParagraph.\n\nMore", cache);
-    expect(cache.blocks).toHaveLength(2);
-    // First block is preserved with the same id and html
-    expect(cache.blocks[0].id).toBe(firstBlock.id);
-    expect(cache.blocks[0].html).toBe(firstBlock.html);
-    // Second block contains the new paragraph
-    expect(cache.blocks[1].html).toContain("Paragraph.");
-  });
-
-  it("assigns unique stable ids to each block", () => {
-    const cache = createStreamingCache();
-    renderStreamingMarkdown("A.\n\nB.\n\nC", cache);
-    expect(cache.blocks).toHaveLength(1);
-    const id0 = cache.blocks[0].id;
-
-    renderStreamingMarkdown("A.\n\nB.\n\nC.\n\nD", cache);
-    expect(cache.blocks).toHaveLength(2);
-    expect(cache.blocks[0].id).toBe(id0);
-    expect(cache.blocks[1].id).toBe(id0 + 1);
-  });
-
-  it("renders pending code block with syntax highlighting", () => {
-    const cache = createStreamingCache();
-    const text = "Done.\n\n```python\nprint('hello')";
-    const { pendingHtml } = renderStreamingMarkdown(text, cache);
-    expect(pendingHtml).toContain("<code");
-    expect(pendingHtml).toContain("hljs");
-  });
-
-  it("renders pending table as markdown", () => {
-    const cache = createStreamingCache();
-    const text = "Done.\n\n| a | b |\n| --- | --- |\n| 1 | 2 |";
-    const { pendingHtml } = renderStreamingMarkdown(text, cache);
-    expect(pendingHtml).toContain("<table>");
-    expect(pendingHtml).toContain("<td>");
-  });
-
-  it("renders pending portion through markdown pipeline", () => {
-    const cache = createStreamingCache();
-    const text = "Done.\n\nSome **bold** text";
-    const { pendingHtml } = renderStreamingMarkdown(text, cache);
-    expect(pendingHtml).toContain("<strong>bold</strong>");
-  });
-});
@@ -69,189 +69,13 @@ const processor = unified()
  .use(rehypeHighlight)
  .use(rehypeStringify, { allowDangerousHtml: true });

-export function splitCompleteBlocks(text: string): { complete: string; pending: string } {
-  if (!text) {
-    return { complete: "", pending: "" };
-  }
-
-  const lines = text.split("\n");
-  let lastCompleteBoundary = -1; // index of last line that ends a complete block
-  let inFence = false;
-  let fenceChar = "";
-  let inMathBlock = false;
-
-  for (let i = 0; i < lines.length; i++) {
-    const trimmed = lines[i].trimEnd();
-
-    if (inFence) {
-      // Check for closing fence: same character, at least 3, no other content
-      if (new RegExp(`^\\s*${fenceChar.replace(/~/g, "\\~")}{3,}\\s*$`).test(trimmed)) {
-        inFence = false;
-        fenceChar = "";
-        lastCompleteBoundary = i;
-      }
-      continue;
-    }
-
-    if (inMathBlock) {
-      if (trimmed === "$$" || trimmed === "\\]") {
-        inMathBlock = false;
-        lastCompleteBoundary = i;
-      }
-      continue;
-    }
-
-    // Check for opening fence
-    const fenceMatch = trimmed.match(/^(\s*)(```|~~~)/);
-    if (fenceMatch) {
-      // Check if it's an opening fence (may have language info after)
-      // A line with just ``` or ~~~ could be opening or closing, but since we're not in a fence it's opening
-      fenceChar = fenceMatch[2][0]; // '`' or '~'
-      inFence = true;
-      continue;
-    }
-
-    // Check for opening math block
-    if (trimmed === "$$" || trimmed === "\\[") {
-      inMathBlock = true;
-      continue;
-    }
-
-    // Outside fences/math: blank line marks a complete boundary
-    if (trimmed === "") {
-      lastCompleteBoundary = i;
-    }
-  }
-
-  if (lastCompleteBoundary < 0) {
-    return { complete: "", pending: text };
-  }
-
-  const completeLines = lines.slice(0, lastCompleteBoundary + 1);
-  const pendingLines = lines.slice(lastCompleteBoundary + 1);
-
-  return {
-    complete: completeLines.join("\n"),
-    pending: pendingLines.join("\n"),
-  };
-}
-
-export function closePendingBlock(pending: string): string {
-  if (!pending) return "";
-
-  const lines = pending.split("\n");
-  let inFence = false;
-  let fenceStr = "";
-  let inMathBlock = false;
-  let mathClose = "";
-
-  for (const line of lines) {
-    const trimmed = line.trimEnd();
-
-    if (inFence) {
-      if (new RegExp(`^\\s*${fenceStr[0] === "~" ? "~~~" : "\\`\\`\\`"}\\s*$`).test(trimmed)) {
-        inFence = false;
-        fenceStr = "";
-      }
-      continue;
-    }
-
-    if (inMathBlock) {
-      if (trimmed === "$$" || trimmed === "\\]") {
-        inMathBlock = false;
-        mathClose = "";
-      }
-      continue;
-    }
-
-    const fenceMatch = trimmed.match(/^(\s*)(```|~~~)/);
-    if (fenceMatch) {
-      fenceStr = fenceMatch[2];
-      inFence = true;
-      continue;
-    }
-
-    if (trimmed === "$$") {
-      inMathBlock = true;
-      mathClose = "$$";
-      continue;
-    }
-
-    if (trimmed === "\\[") {
-      inMathBlock = true;
-      mathClose = "\\]";
-      continue;
-    }
-  }
-
-  if (inFence) return pending + "\n" + fenceStr;
-  if (inMathBlock) return pending + "\n" + mathClose;
-  return pending;
-}
-
-export interface RenderedBlock {
-  id: number;
-  html: string;
-}
-
-export interface StreamingCache {
-  blocks: RenderedBlock[];
-  nextId: number;
-  completeKey: string;
-}
-
-export function createStreamingCache(): StreamingCache {
-  return { blocks: [], nextId: 0, completeKey: "" };
-}
-
-export function renderStreamingMarkdown(
-  text: string,
-  cache: StreamingCache,
-): { blocks: RenderedBlock[]; pendingHtml: string } {
-  const { complete, pending } = splitCompleteBlocks(text);
-
-  if (complete) {
-    if (cache.completeKey !== complete) {
-      if (complete.startsWith(cache.completeKey) && cache.completeKey.length > 0) {
-        // Complete section grew — render only the new part as a new block
-        const newPart = complete.slice(cache.completeKey.length);
-        cache.blocks = [...cache.blocks, { id: cache.nextId++, html: renderMarkdown(newPart) }];
-      } else {
-        // Complete section changed unexpectedly — re-render as single block
-        cache.blocks = [{ id: cache.nextId++, html: renderMarkdown(complete) }];
-      }
-      cache.completeKey = complete;
-    }
-  } else if (cache.blocks.length > 0) {
-    cache.blocks = [];
-    cache.completeKey = "";
-  }
-
-  let pendingHtml = "";
-  if (pending) {
-    const closed = closePendingBlock(pending);
-    pendingHtml = renderMarkdown(closed);
-  }
-
-  return { blocks: cache.blocks, pendingHtml };
-}
-
-// Convert \[...\] to $$...$$ and \(...\) to $...$
-export function normalizeLatexDelimiters(text: string): string {
-  // Display math: \[...\] → $$...$$  (may span multiple lines)
-  text = text.replace(/\\\[([\s\S]*?)\\\]/g, (_match, inner) => `$$${inner}$$`);
-  // Inline math: \(...\) → $...$
-  text = text.replace(/\\\(([\s\S]*?)\\\)/g, (_match, inner) => `$${inner}$`);
-  return text;
-}
-
 export function renderMarkdown(content: string): string {
  if (!content) {
    return "";
  }

  try {
-    const result = processor.processSync(normalizeLatexDelimiters(content));
+    const result = processor.processSync(content);
    return String(result);
  } catch {
    // Fallback to escaped plain text if markdown parsing fails
@@ -1,27 +0,0 @@
-export interface RerankResult {
-  index: number;
-  relevance_score: number;
-}
-
-export interface RerankResponse {
-  model: string;
-  object: string;
-  usage: { prompt_tokens: number; total_tokens: number };
-  results: RerankResult[];
-}
-
-export async function rerank(
-  model: string,
-  query: string,
-  documents: string[],
-  signal: AbortSignal
-): Promise<RerankResponse> {
-  const response = await fetch("/v1/rerank", {
-    method: "POST",
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify({ model, query, documents }),
-    signal,
-  });
-  if (!response.ok) throw new Error(`${response.status} ${response.statusText}`);
-  return response.json();
-}
@@ -1,39 +0,0 @@
-import type { SdApiTxt2ImgRequest, SdApiResponse, SdApiLora } from "./types";
-
-export async function generateSdImage(
-  request: SdApiTxt2ImgRequest,
-  signal?: AbortSignal
-): Promise<SdApiResponse> {
-  const response = await fetch("/sdapi/v1/txt2img", {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-    },
-    body: JSON.stringify(request),
-    signal,
-  });
-
-  if (!response.ok) {
-    const errorText = await response.text();
-    throw new Error(`SDAPI error: ${response.status} - ${errorText}`);
-  }
-
-  return response.json();
-}
-
-export async function fetchSdLoras(
-  model: string,
-  signal?: AbortSignal
-): Promise<SdApiLora[]> {
-  const response = await fetch(
-    `/sdapi/v1/loras?model=${encodeURIComponent(model)}`,
-    { signal }
-  );
-
-  if (!response.ok) {
-    const errorText = await response.text();
-    throw new Error(`SDAPI loras error: ${response.status} - ${errorText}`);
-  }
-
-  return response.json();
-}
@@ -9,7 +9,6 @@ export interface Model {
  description: string;
  unlisted: boolean;
  peerID: string;
-  aliases?: string[];
 }

 export interface Metrics {
@@ -39,12 +38,8 @@ export interface LogData {
  data: string;
 }

-export interface InFlightStats {
-  total: number;
-}
-
 export interface APIEventEnvelope {
-  type: "modelStatus" | "logData" | "metrics" | "inflight";
+  type: "modelStatus" | "logData" | "metrics";
  data: string;
 }

@@ -115,40 +110,6 @@ export interface ImageGenerationResponse {
  }>;
 }

-// SDAPI types (stable-diffusion.cpp)
-export type ImageApiMode = "openai" | "sdapi";
-
-export interface SdApiLora {
-  name: string;
-  path: string;
-}
-
-export interface SdApiLoraRef {
-  path: string;
-  multiplier: number;
-}
-
-export interface SdApiTxt2ImgRequest {
-  model?: string;
-  prompt: string;
-  negative_prompt?: string;
-  width?: number;
-  height?: number;
-  steps?: number;
-  cfg_scale?: number;
-  seed?: number;
-  batch_size?: number;
-  sampler_name?: string;
-  scheduler?: string;
-  lora?: SdApiLoraRef[];
-}
-
-export interface SdApiResponse {
-  images: string[];
-  parameters: Record<string, unknown>;
-  info: string;
-}
-
 export interface AudioTranscriptionRequest {
  file: File;
  model: string;
@@ -4,9 +4,8 @@
  import ImageInterface from "../components/playground/ImageInterface.svelte";
  import AudioInterface from "../components/playground/AudioInterface.svelte";
  import SpeechInterface from "../components/playground/SpeechInterface.svelte";
-  import RerankInterface from "../components/playground/RerankInterface.svelte";

-  type Tab = "chat" | "images" | "speech" | "audio" | "rerank";
+  type Tab = "chat" | "images" | "speech" | "audio";

  const selectedTabStore = persistentStore<Tab>("playground-selected-tab", "chat");
  let mobileMenuOpen = $state(false);
@@ -16,7 +15,6 @@
    { id: "images", label: "Images" },
    { id: "speech", label: "Speech" },
    { id: "audio", label: "Transcription" },
-    { id: "rerank", label: "Rerank" },
  ];

  function selectTab(tab: Tab) {
@@ -91,9 +89,6 @@
    <div class="h-full" class:tab-hidden={$selectedTabStore !== "audio"}>
      <AudioInterface />
    </div>
-    <div class="h-full" class:tab-hidden={$selectedTabStore !== "rerank"}>
-      <RerankInterface />
-    </div>
  </div>
 </div>

@@ -1 +0,0 @@
-<!-- empty: real Playground is always mounted in App.svelte -->
@@ -1,5 +1,5 @@
 import { writable } from "svelte/store";
-import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture, InFlightStats } from "../lib/types";
+import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture } from "../lib/types";
 import { connectionState } from "./theme";

 const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
@@ -9,7 +9,6 @@ export const models = writable<Model[]>([]);
 export const proxyLogs = writable<string>("");
 export const upstreamLogs = writable<string>("");
 export const metrics = writable<Metrics[]>([]);
-export const inFlightRequests = writable<number>(0);
 export const versionInfo = writable<VersionInfo>({
  build_date: "unknown",
  commit: "unknown",
@@ -30,7 +29,6 @@ export function enableAPIEvents(enabled: boolean): void {
    apiEventSource?.close();
    apiEventSource = null;
    metrics.set([]);
-    inFlightRequests.set(0);
    return;
  }

@@ -48,7 +46,6 @@ export function enableAPIEvents(enabled: boolean): void {
      proxyLogs.set("");
      upstreamLogs.set("");
      metrics.set([]);
-      inFlightRequests.set(0);
      models.set([]);
      retryCount = 0;
      connectionState.set("connected");
@@ -62,7 +59,7 @@ export function enableAPIEvents(enabled: boolean): void {
            const newModels = JSON.parse(message.data) as Model[];
            // Sort models by name and id
            newModels.sort((a, b) => {
-              return (a.name + a.id).localeCompare(b.name + b.id, undefined, { numeric : true} );
+              return (a.name + a.id).localeCompare(b.name + b.id);
            });
            models.set(newModels);
            break;
@@ -86,11 +83,6 @@ export function enableAPIEvents(enabled: boolean): void {
            metrics.update((prevMetrics) => [...newMetrics, ...prevMetrics]);
            break;
          }
-          case "inflight": {
-            const stats = JSON.parse(message.data) as InFlightStats;
-            inFlightRequests.set(stats.total ?? 0);
-            break;
-          }
        }
      } catch (err) {
        console.error(e.data, err);
@@ -1,20 +0,0 @@
-import { writable, derived } from "svelte/store";
-
-const chatStreaming = writable(false);
-const imageGenerating = writable(false);
-const speechGenerating = writable(false);
-const audioTranscribing = writable(false);
-const rerankLoading = writable(false);
-
-export const playgroundActivity = derived(
-  [chatStreaming, imageGenerating, speechGenerating, audioTranscribing, rerankLoading],
-  ([$chat, $image, $speech, $audio, $rerank]) => $chat || $image || $speech || $audio || $rerank
-);
-
-export const playgroundStores = {
-  chatStreaming,
-  imageGenerating,
-  speechGenerating,
-  audioTranscribing,
-  rerankLoading,
-};
@@ -1,3 +0,0 @@
-import { writable } from "svelte/store";
-
-export const currentRoute = writable("/");
@@ -32,7 +32,6 @@ export default defineConfig({
      "/upstream": "http://localhost:8080",
      "/unload": "http://localhost:8080",
      "/v1": "http://localhost:8080",
-      "/sdapi": "http://localhost:8080",
    },
  },
 });
				`@@ -1 +0,0 @@`
				`<!-- empty: real Playground is always mounted in App.svelte -->`