proxy: implement setParamsByID filter (#535 )

Add setParamsByID filter that applies different request parameters based on the requested model ID, enabling per-alias behaviour for a single loaded model. - add SetParamsByID field to Filters struct and SanitizedSetParamsByID method - substitute ${MODEL_ID} and other macros in setParamsByID keys and values - validate no unknown macros remain in keys or values after substitution - apply setParamsByID in proxyInferenceHandler after setParams (can override it) - update config-schema.json with setParamsByID definition - update UI to show aliases and make them selectable in the Playground closes #534
ui: smart auto-scroll in LogPanel (#530 )
2026-02-19 22:21:10 -08:00 · 2026-02-18 19:47:37 -08:00 · 2026-02-16 09:41:15 -08:00
16 changed files with 530 additions and 48 deletions
@@ -200,11 +200,20 @@
                                "additionalProperties": true,
                                "default": {},
                                "description": "Dictionary of parameters to set/override in requests. Useful for enforcing specific parameter values. Protected params like 'model' cannot be overridden. Values can be strings, numbers, booleans, arrays, or objects."
                            },
                            "setParamsByID": {
                                "type": "object",
                                "additionalProperties": {
                                    "type": "object",
                                    "additionalProperties": true
                                },
                                "default": {},
                                "description": "Dictionary mapping requested model IDs (or aliases) to parameters to set/override in requests. Applied after setParams and can override those values. Useful with aliases to vary behaviour depending on which alias the client used (e.g. different reasoning_effort per alias). Keys support ${MODEL_ID} macro substitution. Protected params like 'model' cannot be overridden."
                            }
                        },
                        "additionalProperties": false,
                        "default": {},
-                        "description": "Dictionary of filter settings. Supports stripParams and setParams."
+                        "description": "Dictionary of filter settings. Supports stripParams, setParams, and setParamsByID."
                    },
                    "metadata": {
                        "type": "object",
@@ -126,7 +126,7 @@ apiKeys:
 # - below are examples of all the settings a model can have
 models:
  # keys are the model names used in API requests
-  "llama":
+  "gpt-oss-120b":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
@@ -143,7 +143,7 @@ models:
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
-      --model path/to/llama-8B-Q4_K_M.gguf
+      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}
@@ -151,13 +151,13 @@ models:
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    name: "llama 3.1 8B"
+    name: "gpt-oss 120B"
    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
-    description: "A small but capable model used for quick testing"
+    description: "A thinking model from OpenAI"
    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
@@ -172,14 +172,6 @@ models:
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999
    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"
      - "gpt-3.5-turbo"
    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
@@ -197,7 +189,7 @@ models:
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
-    useModelName: "qwen:qwq"
+    useModelName: "openai/gpt-oss-120B"
    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
@@ -216,11 +208,38 @@ models:
      # - useful for enforcing specific parameter values
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - always runs for the model
      setParams:
        # Example: enforce specific sampling parameters
        temperature: 0.7
        top_p: 0.9
      # setParamsByID: a dictionary of parameters to set based on the requested model ID
      # - optional, default: empty dictionary
      # - combine with aliases to create variant behaviour without reloading the model
      # - parameters are set in the request body JSON
      # - runs after setParams, so it overrides any matching settings from setParams
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - model aliases will be automatically created for each key
      setParamsByID:
        "${MODEL_ID}":
          chat_template_kwargs:
            reasoning_effort: medium
        "${MODEL_ID}:high":
          chat_template_kwargs:
            reasoning_effort: high
        "${MODEL_ID}:low":
          chat_template_kwargs:
            reasoning_effort: low
    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"
    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contain complex types it is recommended to keep it simple
@@ -294,6 +294,24 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroStr)
 			modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroStr)
 			// Substitute macros in SetParamsByID keys and values
 			if len(modelConfig.Filters.SetParamsByID) > 0 {
 				newSetParamsByID := make(map[string]map[string]any, len(modelConfig.Filters.SetParamsByID))
 				for key, paramMap := range modelConfig.Filters.SetParamsByID {
 					newKey := strings.ReplaceAll(key, macroSlug, macroStr)
 					newValAny, err := substituteMacroInValue(any(paramMap), entry.Name, entry.Value)
 					if err != nil {
 						return Config{}, fmt.Errorf("model %s filters.setParamsByID: %s", modelId, err.Error())
 					}
 					newParamMap, ok := newValAny.(map[string]any)
 					if !ok {
 						return Config{}, fmt.Errorf("model %s filters.setParamsByID: unexpected type after macro substitution", modelId)
 					}
 					newSetParamsByID[newKey] = newParamMap
 				}
 				modelConfig.Filters.SetParamsByID = newSetParamsByID
 			}
 			// Substitute in metadata (type-preserving)
 			if len(modelConfig.Metadata) > 0 {
 				result, err := substituteMacroInValue(modelConfig.Metadata, entry.Name, entry.Value)
@@ -359,6 +377,34 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			}
 		}
 		// Validate SetParamsByID keys and values
 		for key, paramMap := range modelConfig.Filters.SetParamsByID {
 			if matches := macroPatternRegex.FindAllStringSubmatch(key, -1); len(matches) > 0 {
 				return Config{}, fmt.Errorf("unknown macro '${%s}' found in model %s filters.setParamsByID key", matches[0][1], modelId)
 			}
 			if err := validateNestedForUnknownMacros(any(paramMap), fmt.Sprintf("model %s filters.setParamsByID[%s]", modelId, key)); err != nil {
 				return Config{}, err
 			}
 		}
 		// Auto-register setParamsByID keys as aliases (skip the model's own ID)
 		for key := range modelConfig.Filters.SetParamsByID {
 			if key == modelId {
 				continue
 			}
 			if _, exists := config.Models[key]; exists {
 				return Config{}, fmt.Errorf("model %s filters.setParamsByID: key '%s' conflicts with an existing model ID", modelId, key)
 			}
 			if existingModel, exists := config.aliases[key]; exists {
 				if existingModel != modelId {
 					return Config{}, fmt.Errorf("duplicate alias '%s' in model %s filters.setParamsByID, already used by model %s", key, modelId, existingModel)
 				}
 				continue // already registered as explicit alias for this model
 			}
 			config.aliases[key] = modelId
 			modelConfig.Aliases = append(modelConfig.Aliases, key)
 		}
 		if _, err := url.Parse(modelConfig.Proxy); err != nil {
 			return Config{}, fmt.Errorf("model %s: invalid proxy URL: %w", modelId, err)
 		}
@@ -20,6 +20,12 @@ type Filters struct {
 	// SetParams is a dictionary of parameters to set/override in requests
 	// Protected params (like "model") cannot be set
 	SetParams map[string]any `yaml:"setParams"`
 	// SetParamsByID maps requested model IDs to parameters to set/override in requests.
 	// Useful with aliases: a single loaded model can behave differently depending on
 	// which alias the client used. Applied after SetParams, so it can override those values.
 	// Protected params (like "model") cannot be set.
 	SetParamsByID map[string]map[string]any `yaml:"setParamsByID"`
 }
 // SanitizedStripParams returns a sorted list of parameters to strip,
@@ -51,6 +57,33 @@ func (f Filters) SanitizedStripParams() []string {
 	return cleaned
 }
 // SanitizedSetParamsByID looks up the parameter overrides configured for
 // requestedModelID and returns them with every protected param dropped,
 // together with the surviving keys in sorted order so callers can apply
 // them deterministically.
 // Both return values are nil when the ID has no entry, its entry is empty,
 // or every param in the entry is protected.
 func (f Filters) SanitizedSetParamsByID(requestedModelID string) (map[string]any, []string) {
 	// Indexing a nil or empty map simply reports "not found", so no separate
 	// emptiness check on SetParamsByID is needed.
 	overrides, ok := f.SetParamsByID[requestedModelID]
 	if !ok || len(overrides) == 0 {
 		return nil, nil
 	}
 	sanitized := make(map[string]any, len(overrides))
 	sortedKeys := make([]string, 0, len(overrides))
 	for name, val := range overrides {
 		if !slices.Contains(ProtectedParams, name) {
 			sanitized[name] = val
 			sortedKeys = append(sortedKeys, name)
 		}
 	}
 	// Nothing survived the protected-param filter.
 	if len(sortedKeys) == 0 {
 		return nil, nil
 	}
 	sort.Strings(sortedKeys)
 	return sanitized, sortedKeys
 }
 // SanitizedSetParams returns a copy of SetParams with protected params removed
 // and keys sorted for consistent iteration order
 func (f Filters) SanitizedSetParams() (map[string]any, []string) {
@@ -162,6 +162,123 @@ func TestFilters_SanitizedSetParams(t *testing.T) {
 	}
 }
 func TestFilters_SanitizedSetParamsByID(t *testing.T) {
 	tests := []struct {
 		name             string
 		setParamsByID    map[string]map[string]any
 		requestedModelID string
 		wantParams       map[string]any
 		wantKeys         []string
 	}{
 		{
 			name:             "empty SetParamsByID returns nil",
 			setParamsByID:    nil,
 			requestedModelID: "model1",
 			wantParams:       nil,
 			wantKeys:         nil,
 		},
 		{
 			name:             "empty map returns nil",
 			setParamsByID:    map[string]map[string]any{},
 			requestedModelID: "model1",
 			wantParams:       nil,
 			wantKeys:         nil,
 		},
 		{
 			name: "non-matching model ID returns nil",
 			setParamsByID: map[string]map[string]any{
 				"model2": {"temperature": 0.9},
 			},
 			requestedModelID: "model1",
 			wantParams:       nil,
 			wantKeys:         nil,
 		},
 		{
 			name: "matching model ID returns correct params",
 			setParamsByID: map[string]map[string]any{
 				"model1": {"temperature": 0.7, "top_p": 0.9},
 				"model2": {"temperature": 0.5},
 			},
 			requestedModelID: "model1",
 			wantParams: map[string]any{
 				"temperature": 0.7,
 				"top_p":       0.9,
 			},
 			wantKeys: []string{"temperature", "top_p"},
 		},
 		{
 			name: "protected param model is filtered out",
 			setParamsByID: map[string]map[string]any{
 				"model1": {
 					"model":       "should-be-filtered",
 					"temperature": 0.7,
 				},
 			},
 			requestedModelID: "model1",
 			wantParams: map[string]any{
 				"temperature": 0.7,
 			},
 			wantKeys: []string{"temperature"},
 		},
 		{
 			name: "only protected param returns nil",
 			setParamsByID: map[string]map[string]any{
 				"model1": {
 					"model": "should-be-filtered",
 				},
 			},
 			requestedModelID: "model1",
 			wantParams:       nil,
 			wantKeys:         nil,
 		},
 		{
 			name: "keys are sorted",
 			setParamsByID: map[string]map[string]any{
 				"model1": {
 					"z_param": "z",
 					"a_param": "a",
 					"m_param": "m",
 				},
 			},
 			requestedModelID: "model1",
 			wantParams: map[string]any{
 				"z_param": "z",
 				"a_param": "a",
 				"m_param": "m",
 			},
 			wantKeys: []string{"a_param", "m_param", "z_param"},
 		},
 		{
 			name: "alias style key lookup",
 			setParamsByID: map[string]map[string]any{
 				"model1:high": {"reasoning_effort": "high"},
 				"model1:low":  {"reasoning_effort": "low"},
 			},
 			requestedModelID: "model1:high",
 			wantParams: map[string]any{
 				"reasoning_effort": "high",
 			},
 			wantKeys: []string{"reasoning_effort"},
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			f := Filters{SetParamsByID: tt.setParamsByID}
 			gotParams, gotKeys := f.SanitizedSetParamsByID(tt.requestedModelID)
 			if tt.wantParams == nil {
 				assert.Nil(t, gotParams)
 				assert.Nil(t, gotKeys)
 				return
 			}
 			assert.Equal(t, tt.wantKeys, gotKeys)
 			assert.Equal(t, tt.wantParams, gotParams)
 		})
 	}
 }
 func TestProtectedParams(t *testing.T) {
 	// Verify that "model" is protected
 	assert.Contains(t, ProtectedParams, "model")
@@ -73,6 +73,72 @@ models:
 	}
 }
 func TestConfig_SetParamsByIDAutoAlias(t *testing.T) {
 	content := `
 models:
  model1:
    cmd: path/to/cmd --port ${PORT}
    filters:
      setParamsByID:
        "${MODEL_ID}:high":
          reasoning_effort: high
        "${MODEL_ID}:low":
          reasoning_effort: low
 `
 	cfg, err := LoadConfigFromReader(strings.NewReader(content))
 	assert.NoError(t, err)
 	// Keys (other than the model's own ID) should be registered as aliases
 	realName, found := cfg.RealModelName("model1:high")
 	assert.True(t, found, "model1:high should be an auto-registered alias")
 	assert.Equal(t, "model1", realName)
 	realName, found = cfg.RealModelName("model1:low")
 	assert.True(t, found, "model1:low should be an auto-registered alias")
 	assert.Equal(t, "model1", realName)
 	// Auto-aliases should also appear in modelConfig.Aliases
 	aliases := cfg.Models["model1"].Aliases
 	assert.Contains(t, aliases, "model1:high")
 	assert.Contains(t, aliases, "model1:low")
 }
 // TestConfig_SetParamsByIDAutoAliasConflictWithModelID checks that loading a
 // config fails when a setParamsByID key under model1 is the ID of another
 // configured model (model2).
 func TestConfig_SetParamsByIDAutoAliasConflictWithModelID(t *testing.T) {
 	content := `
 models:
  model1:
    cmd: path/to/cmd --port ${PORT}
    filters:
      setParamsByID:
        model2:
          reasoning_effort: high
  model2:
    cmd: path/to/cmd --port ${PORT}
 `
 	_, err := LoadConfigFromReader(strings.NewReader(content))
 	assert.ErrorContains(t, err, "conflicts with an existing model ID")
 }
 // TestConfig_SetParamsByIDAutoAliasConflictWithOtherModel checks that loading a
 // config fails with a "duplicate alias" error when two different models both
 // declare the same setParamsByID key ("shared-alias").
 func TestConfig_SetParamsByIDAutoAliasConflictWithOtherModel(t *testing.T) {
 	content := `
 models:
  model1:
    cmd: path/to/cmd --port ${PORT}
    filters:
      setParamsByID:
        "shared-alias":
          reasoning_effort: high
  model2:
    cmd: path/to/cmd --port ${PORT}
    filters:
      setParamsByID:
        "shared-alias":
          reasoning_effort: low
 `
 	_, err := LoadConfigFromReader(strings.NewReader(content))
 	assert.ErrorContains(t, err, "duplicate alias")
 }
 func TestConfig_ModelFiltersWithSetParams(t *testing.T) {
 	content := `
 models:
@@ -8,6 +8,7 @@ const ConfigFileChangedEventID = 0x03
 const LogDataEventID = 0x04
 const TokenMetricsEventID = 0x05
 const ModelPreloadedEventID = 0x06
 const InFlightRequestsEventID = 0x07
 type ProcessStateChangeEvent struct {
 	ProcessName string
@@ -58,3 +59,11 @@ type ModelPreloadedEvent struct {
 func (e ModelPreloadedEvent) Type() uint32 {
 	return ModelPreloadedEventID
 }
 // InFlightRequestsEvent is the event payload carrying an in-flight request
 // count in Total.
 type InFlightRequestsEvent struct {
 	Total int
 }
 // Type returns InFlightRequestsEventID, the event ID for InFlightRequestsEvent.
 func (e InFlightRequestsEvent) Type() uint32 {
 	return InFlightRequestsEventID
 }
@@ -28,6 +28,40 @@ const (
 type proxyCtxKey string
 // InflightCounter is a mutex-guarded integer counter. Every method takes the
 // lock, so a single instance may be shared between goroutines.
 type InflightCounter struct {
 	mu    sync.Mutex
 	total int
 }
 // newInflightCounter returns a counter whose total starts at zero.
 func newInflightCounter() *InflightCounter {
 	return new(InflightCounter)
 }
 // Current reports the present total without modifying it.
 func (ic *InflightCounter) Current() int {
 	ic.mu.Lock()
 	defer ic.mu.Unlock()
 	return ic.total
 }
 // Increment adds one to the total and returns the updated value.
 func (ic *InflightCounter) Increment() int {
 	ic.mu.Lock()
 	defer ic.mu.Unlock()
 	ic.total++
 	return ic.total
 }
 // Decrement subtracts one from the total and returns the updated value.
 // A total that is not positive is left unchanged, so the counter never
 // drops below zero through this method.
 func (ic *InflightCounter) Decrement() int {
 	ic.mu.Lock()
 	defer ic.mu.Unlock()
 	if ic.total <= 0 {
 		return ic.total
 	}
 	ic.total--
 	return ic.total
 }
 type ProxyManager struct {
 	sync.Mutex
@@ -43,6 +77,8 @@ type ProxyManager struct {
 	processGroups map[string]*ProcessGroup
 	inFlightCounter *InflightCounter
 	// shutdown signaling
 	shutdownCtx    context.Context
 	shutdownCancel context.CancelFunc
@@ -155,6 +191,8 @@ func New(proxyConfig config.Config) *ProxyManager {
 		processGroups: make(map[string]*ProcessGroup),
 		inFlightCounter: newInflightCounter(),
 		shutdownCtx:    shutdownCtx,
 		shutdownCancel: shutdownCancel,
@@ -276,37 +314,37 @@ func (pm *ProxyManager) setupGinEngine() {
 	// Set up routes using the Gin engine
 	// Protected routes use pm.apiKeyAuth() middleware
-	pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/chat/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/responses", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// Support legacy /v1/completions api, see issue #12
-	pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/completions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
-	pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/messages", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// Support anthropic count_tokens API (Also added in the above PR)
-	pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/messages/count_tokens", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// Support embeddings and reranking
-	pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/embeddings", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// llama-server's /reranking endpoint + aliases
-	pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/rerank", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/reranking", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// llama-server's /infill endpoint for code infilling
-	pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/infill", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// llama-server's /completion endpoint
-	pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/completion", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
 	// Support audio/speech endpoint
-	pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/audio/speech", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.proxyGETModelHandler)
+	pm.ginEngine.GET("/v1/audio/voices", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyGETModelHandler)
-	pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler)
+	pm.ginEngine.POST("/v1/audio/transcriptions", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
-	pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/images/generations", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyInferenceHandler)
-	pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.proxyOAIPostFormHandler)
+	pm.ginEngine.POST("/v1/images/edits", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyOAIPostFormHandler)
 	pm.ginEngine.GET("/v1/models", pm.apiKeyAuth(), pm.listModelsHandler)
@@ -325,7 +363,7 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.GET("/upstream", func(c *gin.Context) {
 		c.Redirect(http.StatusFound, "/ui/models")
 	})
-	pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.proxyToUpstream)
+	pm.ginEngine.Any("/upstream/*upstreamPath", pm.apiKeyAuth(), pm.trackInflight(), pm.proxyToUpstream)
 	pm.ginEngine.GET("/unload", pm.apiKeyAuth(), pm.unloadAllModelsHandler)
 	pm.ginEngine.GET("/running", pm.apiKeyAuth(), pm.listRunningProcessesHandler)
 	pm.ginEngine.GET("/health", func(c *gin.Context) {
@@ -389,6 +427,14 @@ func (pm *ProxyManager) setupGinEngine() {
 	gin.DisableConsoleColor()
 }
 // trackInflight returns Gin middleware that counts a request as in flight for
 // as long as the downstream handlers are running. It increments
 // pm.inFlightCounter and emits an InFlightRequestsEvent with the new total
 // before calling c.Next(), then decrements and emits again once the
 // downstream handlers have returned.
 func (pm *ProxyManager) trackInflight() gin.HandlerFunc {
 	return func(c *gin.Context) {
 		event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Increment()})
 		// The decrement must live inside a deferred closure. Arguments of a
 		// deferred call are evaluated when the defer statement executes, so
 		// `defer event.Emit(...Decrement())` would decrement immediately,
 		// before c.Next(), and the request would never be counted while it
 		// is actually being handled.
 		defer func() {
 			event.Emit(InFlightRequestsEvent{Total: pm.inFlightCounter.Decrement()})
 		}()
 		c.Next()
 	}
 }
 // ServeHTTP implements http.Handler interface
 func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	pm.ginEngine.ServeHTTP(w, r)
@@ -674,6 +720,17 @@ func (pm *ProxyManager) proxyInferenceHandler(c *gin.Context) {
 			}
 		}
 		// setParamsByID: set params based on the requested model ID (runs after setParams, can override it)
 		setParamsByIDParams, setParamsByIDKeys := pm.config.Models[modelID].Filters.SanitizedSetParamsByID(requestedModel)
 		for _, key := range setParamsByIDKeys {
 			pm.proxyLogger.Debugf("<%s> setting param by id: %s", requestedModel, key)
 			bodyBytes, err = sjson.SetBytes(bodyBytes, key, setParamsByIDParams[key])
 			if err != nil {
 				pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error setting parameter %s in request", key))
 				return
 			}
 		}
 		pm.proxyLogger.Debugf("ProxyManager using local Process for model: %s", requestedModel)
 		nextHandler = processGroup.ProxyRequest
 	} else if pm.peerProxy != nil && pm.peerProxy.HasPeerModel(requestedModel) {
@@ -20,6 +20,7 @@ type Model struct {
 	State       string   `json:"state"`
 	Unlisted    bool     `json:"unlisted"`
 	PeerID      string   `json:"peerID"`
 	Aliases     []string `json:"aliases,omitempty"`
 }
 func addApiHandlers(pm *ProxyManager) {
@@ -83,6 +84,7 @@ func (pm *ProxyManager) getModelStatus() []Model {
 			Description: pm.config.Models[modelID].Description,
 			State:       state,
 			Unlisted:    pm.config.Models[modelID].Unlisted,
 			Aliases:     pm.config.Models[modelID].Aliases,
 		})
 	}
@@ -107,6 +109,7 @@ const (
 	msgTypeModelStatus messageType = "modelStatus"
 	msgTypeLogData     messageType = "logData"
 	msgTypeMetrics     messageType = "metrics"
 	msgTypeInFlight    messageType = "inflight"
 )
 type messageEnvelope struct {
@@ -166,6 +169,18 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
 		}
 	}
 	sendInFlight := func(total int) {
 		jsonData, err := json.Marshal(gin.H{"total": total})
 		if err == nil {
 			select {
 			case sendBuffer <- messageEnvelope{Type: msgTypeInFlight, Data: string(jsonData)}:
 			case <-ctx.Done():
 				return
 			default:
 			}
 		}
 	}
 	/**
 	 * Send updated models list
 	 */
@@ -193,11 +208,19 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
 		sendMetrics([]TokenMetrics{e.Metrics})
 	})()
 	/**
 	 * Send the in-flight request count shown as "Waiting: N" alongside the token stats.
 	 */
 	defer event.On(func(e InFlightRequestsEvent) {
 		sendInFlight(e.Total)
 	})()
 	// send initial batch of data
 	sendLogData("proxy", pm.proxyLogger.GetHistory())
 	sendLogData("upstream", pm.upstreamLogger.GetHistory())
 	sendModels()
 	sendMetrics(pm.metricsMonitor.getMetrics())
 	sendInFlight(pm.inFlightCounter.Current())
 	for {
 		select {
@@ -1046,6 +1046,61 @@ func TestProxyManager_FiltersStripParams(t *testing.T) {
 	// t.Logf("%v", response)
 }
 func TestProxyManager_FiltersSetParamsByID(t *testing.T) {
 	// no explicit aliases — setParamsByID keys are auto-registered as aliases
 	configStr := strings.Replace(`
 logLevel: error
 models:
  model1:
    cmd: 'SRPATH --port ${PORT} --silent --respond model1'
    proxy: "http://127.0.0.1:${PORT}"
    filters:
      setParams:
        reasoning_effort: medium
      setParamsByID:
        "${MODEL_ID}:high":
          reasoning_effort: high
        "${MODEL_ID}:low":
          reasoning_effort: low
 `, "SRPATH", simpleResponderPath, -1)
 	cfg, err := config.LoadConfigFromReader(strings.NewReader(configStr))
 	if !assert.NoError(t, err, "invalid test configuration") {
 		return
 	}
 	proxy := New(cfg)
 	defer proxy.StopProcesses(StopWaitForInflightRequest)
 	tests := []struct {
 		requestedModel string
 		wantEffort     string
 	}{
 		// setParams applies, no setParamsByID match
 		{requestedModel: "model1", wantEffort: "medium"},
 		// setParamsByID overrides setParams
 		{requestedModel: "model1:high", wantEffort: "high"},
 		{requestedModel: "model1:low", wantEffort: "low"},
 	}
 	for _, tt := range tests {
 		t.Run(tt.requestedModel, func(t *testing.T) {
 			reqBody := fmt.Sprintf(`{"model":%q}`, tt.requestedModel)
 			req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
 			w := CreateTestResponseRecorder()
 			proxy.ServeHTTP(w, req)
 			assert.Equal(t, http.StatusOK, w.Code)
 			var response map[string]interface{}
 			assert.NoError(t, json.Unmarshal(w.Body.Bytes(), &response))
 			requestBody, _ := response["request_body"].(string)
 			gotEffort := gjson.Get(requestBody, "reasoning_effort").String()
 			assert.Equal(t, tt.wantEffort, gotEffort, "reasoning_effort mismatch for model %s", tt.requestedModel)
 		})
 	}
 }
 func TestProxyManager_HealthEndpoint(t *testing.T) {
 	config := config.AddDefaultGroupToConfig(config.Config{
 		HealthCheckTimeout: 15,
@@ -65,10 +65,17 @@
  });
  let preElement: HTMLPreElement;
  let userScrolledUp = $state(false);
-  // Auto scroll to bottom when logs change
+  function handleScroll() {
    if (!preElement) return;
    const { scrollTop, scrollHeight, clientHeight } = preElement;
    userScrolledUp = scrollHeight - scrollTop - clientHeight > 40;
  }
  // Auto scroll to bottom when logs change, unless user has scrolled up
  $effect(() => {
-    if (preElement && filteredLogs) {
+    if (preElement && filteredLogs && !userScrolledUp) {
      preElement.scrollTop = preElement.scrollHeight;
    }
  });
@@ -127,6 +134,6 @@
    {/if}
  </div>
  <div class="rounded-lg bg-background font-mono text-sm flex-1 overflow-hidden">
-    <pre bind:this={preElement} class="{textWrapClass} {fontSizeClass} h-full overflow-auto p-4">{filteredLogs}</pre>
+    <pre bind:this={preElement} onscroll={handleScroll} class="{textWrapClass} {fontSizeClass} h-full overflow-auto p-4">{filteredLogs}</pre>
  </div>
 </div>
@@ -165,6 +165,9 @@
              {#if model.description}
                <p class={model.unlisted ? "text-opacity-70" : ""}><em>{model.description}</em></p>
              {/if}
              {#if model.aliases && model.aliases.length > 0}
                <p class="text-xs text-txtsecondary">Aliases: {model.aliases.join(", ")}</p>
              {/if}
            </td>
            <td class="w-12">
              {#if model.state === "stopped"}
@@ -1,5 +1,5 @@
 <script lang="ts">
-  import { metrics } from "../stores/api";
+  import { inFlightRequests, metrics } from "../stores/api";
  import TokenHistogram from "./TokenHistogram.svelte";
  interface HistogramData {
@@ -15,7 +15,14 @@
  let stats = $derived.by(() => {
    const totalRequests = $metrics.length;
    if (totalRequests === 0) {
-      return { totalRequests: 0, totalInputTokens: 0, totalOutputTokens: 0, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null };
+      return {
        totalRequests: 0,
        totalInputTokens: 0,
        totalOutputTokens: 0,
        inFlightRequests: $inFlightRequests,
        tokenStats: { p99: "0", p95: "0", p50: "0" },
        histogramData: null,
      };
    }
    const totalInputTokens = $metrics.reduce((sum, m) => sum + m.input_tokens, 0);
@@ -24,7 +31,14 @@
    // Calculate token statistics using output_tokens and duration_ms
    const validMetrics = $metrics.filter((m) => m.duration_ms > 0 && m.output_tokens > 0);
    if (validMetrics.length === 0) {
-      return { totalRequests, totalInputTokens, totalOutputTokens, tokenStats: { p99: "0", p95: "0", p50: "0" }, histogramData: null };
+      return {
        totalRequests,
        totalInputTokens,
        totalOutputTokens,
        inFlightRequests: $inFlightRequests,
        tokenStats: { p99: "0", p95: "0", p50: "0" },
        histogramData: null,
      };
    }
    // Calculate tokens/second for each valid metric
@@ -63,6 +77,7 @@
      totalRequests,
      totalInputTokens,
      totalOutputTokens,
      inFlightRequests: $inFlightRequests,
      tokenStats: {
        p99: p99.toFixed(2),
        p95: p95.toFixed(2),
@@ -95,7 +110,12 @@
      <tbody class="bg-surface divide-y divide-card-border-inner">
        <tr class="hover:bg-secondary">
-          <td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">{stats.totalRequests}</td>
+          <td class="px-4 py-4 text-sm font-semibold text-gray-900 dark:text-white">
            <div class="flex flex-col gap-1">
              <span class="text-xs font-medium text-gray-500 dark:text-gray-400">Completed: {nf.format(stats.totalRequests)}</span>
              <span class="text-xs font-medium text-gray-500 dark:text-gray-400">Waiting: {nf.format(stats.inFlightRequests)}</span>
            </div>
          </td>
          <td class="px-4 py-4 text-sm text-gray-700 dark:text-gray-300 border-l border-gray-200 dark:border-white/10">
            <div class="flex items-center gap-2">
@@ -25,6 +25,11 @@
      <optgroup label="Local">
        {#each grouped.local as model (model.id)}
          <option value={model.id}>{model.id}</option>
          {#if model.aliases}
            {#each model.aliases as alias (alias)}
              <option value={alias}>  ↳ {alias}</option>
            {/each}
          {/if}
        {/each}
      </optgroup>
    {/if}
@@ -9,6 +9,7 @@ export interface Model {
  description: string;
  unlisted: boolean;
  peerID: string;
  aliases?: string[];
 }
 export interface Metrics {
@@ -38,8 +39,12 @@ export interface LogData {
  data: string;
 }
 export interface InFlightStats {
  total: number;
 }
 export interface APIEventEnvelope {
-  type: "modelStatus" | "logData" | "metrics";
+  type: "modelStatus" | "logData" | "metrics" | "inflight";
  data: string;
 }
@@ -1,5 +1,5 @@
 import { writable } from "svelte/store";
-import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture } from "../lib/types";
+import type { Model, Metrics, VersionInfo, LogData, APIEventEnvelope, ReqRespCapture, InFlightStats } from "../lib/types";
 import { connectionState } from "./theme";
 const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
@@ -9,6 +9,7 @@ export const models = writable<Model[]>([]);
 export const proxyLogs = writable<string>("");
 export const upstreamLogs = writable<string>("");
 export const metrics = writable<Metrics[]>([]);
 export const inFlightRequests = writable<number>(0);
 export const versionInfo = writable<VersionInfo>({
  build_date: "unknown",
  commit: "unknown",
@@ -29,6 +30,7 @@ export function enableAPIEvents(enabled: boolean): void {
    apiEventSource?.close();
    apiEventSource = null;
    metrics.set([]);
    inFlightRequests.set(0);
    return;
  }
@@ -46,6 +48,7 @@ export function enableAPIEvents(enabled: boolean): void {
      proxyLogs.set("");
      upstreamLogs.set("");
      metrics.set([]);
      inFlightRequests.set(0);
      models.set([]);
      retryCount = 0;
      connectionState.set("connected");
@@ -83,6 +86,11 @@ export function enableAPIEvents(enabled: boolean): void {
            metrics.update((prevMetrics) => [...newMetrics, ...prevMetrics]);
            break;
          }
          case "inflight": {
            const stats = JSON.parse(message.data) as InFlightStats;
            inFlightRequests.set(stats.total ?? 0);
            break;
          }
        }
      } catch (err) {
        console.error(e.data, err);