upstream handler support for model names with forward slash (#298 )

The upstream handler would break on model IDs that contained a forward slash. Model IDs like "aaa/bbb" called at upstream/aaa/bbb would result in an error. This commit adds support for model IDs with a forward slash by iteratively searching the path for a match. Fixes: #229
Fix nginx proxy buffering for streaming endpoints (#295 )
2025-09-13 13:37:03 -07:00 · 2025-09-09 16:07:46 -07:00 · 2025-09-07 21:48:58 -07:00 · 2025-09-06 13:58:02 -07:00 · 2025-09-04 11:57:28 -07:00
10 changed files with 217 additions and 29 deletions
@@ -34,7 +34,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
 - ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
 - ✅ Automatic unloading of models after timeout by setting a `ttl`
 - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
- ✅ Reliable Docker and Podman support with `cmdStart` and `cmdStop`
+- ✅ Reliable Docker and Podman support using `cmd` and `cmdStop` together
 - ✅ Full control over server settings per model
 - ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))

@@ -73,6 +73,30 @@ However, there are many more capabilities that llama-swap supports:

 See the [configuration documentation](https://github.com/mostlygeek/llama-swap/wiki/Configuration) in the wiki all options and examples.

+## Reverse Proxy Configuration (nginx)
+
+If you deploy llama-swap behind nginx, disable response buffering for streaming endpoints. By default, nginx buffers responses which breaks Server‑Sent Events (SSE) and streaming chat completion. ([#236](https://github.com/mostlygeek/llama-swap/issues/236))
+
+Recommended nginx configuration snippets:
+
+```nginx
+# SSE for UI events/logs
+location /api/events {
+    proxy_pass http://your-llama-swap-backend;
+    proxy_buffering off;
+    proxy_cache off;
+}
+
+# Streaming chat completions (stream=true)
+location /v1/chat/completions {
+    proxy_pass http://your-llama-swap-backend;
+    proxy_buffering off;
+    proxy_cache off;
+}
+```
+
+As a safeguard, llama-swap also sets `X-Accel-Buffering: no` on SSE responses. However, explicitly disabling `proxy_buffering` at your reverse proxy is still recommended for reliable streaming behavior.
+
 ## Web UI

 llama-swap includes a real time web interface for monitoring logs and models:
@@ -61,7 +61,6 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		} else {
 			writer.metricsRecorder.processNonStreamingResponse(writer.body)
 		}
-
 	}
 }

@@ -73,6 +72,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 	}

 	// default values
+	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
 	inputTokens := 0

@@ -93,11 +93,16 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 		promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
 		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
 		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
+
+		if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
+			cachedTokens = int(cachedValue.Int())
+		}
 	}

 	rec.metricsMonitor.addMetrics(TokenMetrics{
 		Timestamp:       time.Now(),
 		Model:           rec.realModelName,
+		CachedTokens:    cachedTokens,
 		InputTokens:     inputTokens,
 		OutputTokens:    outputTokens,
 		PromptPerSecond: promptPerSecond,
@@ -13,6 +13,7 @@ type TokenMetrics struct {
 	ID              int       `json:"id"`
 	Timestamp       time.Time `json:"timestamp"`
 	Model           string    `json:"model"`
+	CachedTokens    int       `json:"cache_tokens"`
 	InputTokens     int       `json:"input_tokens"`
 	OutputTokens    int       `json:"output_tokens"`
 	PromptPerSecond float64   `json:"prompt_per_second"`
@@ -61,7 +62,6 @@ func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
 	if len(mp.metrics) > mp.maxMetrics {
 		mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
 	}
-
 	event.Emit(TokenMetricsEvent{Metrics: metric})
 }

@@ -458,6 +458,10 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 			w.Header().Add(k, v)
 		}
 	}
+	// prevent nginx from buffering streaming responses (e.g., SSE)
+	if strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "text/event-stream") {
+		w.Header().Set("X-Accel-Buffering", "no")
+	}
 	w.WriteHeader(resp.StatusCode)

 	// faster than io.Copy when streaming
@@ -227,7 +227,7 @@ func (pm *ProxyManager) setupGinEngine() {
 	pm.ginEngine.GET("/upstream", func(c *gin.Context) {
 		c.Redirect(http.StatusFound, "/ui/models")
 	})
-	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+	pm.ginEngine.Any("/upstream/*upstreamPath", pm.proxyToUpstream)

 	pm.ginEngine.GET("/unload", pm.unloadAllModelsHandler)
 	pm.ginEngine.GET("/running", pm.listRunningProcessesHandler)
@@ -393,24 +393,52 @@ func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 }

 func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
-	requestedModel := c.Param("model_id")
+	upstreamPath := c.Param("upstreamPath")

-	if requestedModel == "" {
+	// split the upstream path by / and search for the model name
+	parts := strings.Split(strings.TrimSpace(upstreamPath), "/")
+	if len(parts) == 0 {
 		pm.sendErrorResponse(c, http.StatusBadRequest, "model id required in path")
 		return
 	}

-	processGroup, realModelName, err := pm.swapProcessGroup(requestedModel)
+	modelFound := false
+	searchModelName := ""
+	var modelName, remainingPath string
+	for i, part := range parts {
+		if parts[i] == "" {
+			continue
+		}
+
+		if searchModelName == "" {
+			searchModelName = part
+		} else {
+			searchModelName = searchModelName + "/" + parts[i]
+		}
+
+		if real, ok := pm.config.RealModelName(searchModelName); ok {
+			modelName = real
+			remainingPath = "/" + strings.Join(parts[i+1:], "/")
+			modelFound = true
+			break
+		}
+	}
+
+	if !modelFound {
+		pm.sendErrorResponse(c, http.StatusBadRequest, "model id required in path")
+		return
+	}
+
+	processGroup, realModelName, err := pm.swapProcessGroup(modelName)
 	if err != nil {
 		pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error swapping process group: %s", err.Error()))
 		return
 	}

 	// rewrite the path
-	c.Request.URL.Path = c.Param("upstreamPath")
+	c.Request.URL.Path = remainingPath
 	processGroup.ProxyRequest(realModelName, c.Writer, c.Request)
 }
-
 func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
@@ -100,6 +100,8 @@ func (pm *ProxyManager) apiSendEvents(c *gin.Context) {
 	c.Header("Cache-Control", "no-cache")
 	c.Header("Connection", "keep-alive")
 	c.Header("X-Content-Type-Options", "nosniff")
+	// prevent nginx from buffering SSE
+	c.Header("X-Accel-Buffering", "no")

 	sendBuffer := make(chan messageEnvelope, 25)
 	ctx, cancel := context.WithCancel(c.Request.Context())
@@ -28,6 +28,8 @@ func (pm *ProxyManager) streamLogsHandler(c *gin.Context) {
 	c.Header("Content-Type", "text/plain")
 	c.Header("Transfer-Encoding", "chunked")
 	c.Header("X-Content-Type-Options", "nosniff")
+	// prevent nginx from buffering streamed logs
+	c.Header("X-Accel-Buffering", "no")

 	logMonitorId := c.Param("logMonitorID")
 	logger, err := pm.getLogger(logMonitorId)
@@ -2,6 +2,7 @@ package proxy

 import (
 	"bytes"
+	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand"
@@ -913,3 +914,67 @@ models:
 	assert.Equal(t, StateReady, proxy.processGroups["preloadTestGroup"].processes["model1"].CurrentState())
 	assert.Equal(t, StateReady, proxy.processGroups["preloadTestGroup"].processes["model2"].CurrentState())
 }
+
+func TestProxyManager_StreamingEndpointsReturnNoBufferingHeader(t *testing.T) {
+	config := AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		Models: map[string]ModelConfig{
+			"model1": getTestSimpleResponderConfig("model1"),
+		},
+		LogLevel: "error",
+	})
+
+	proxy := New(config)
+	defer proxy.StopProcesses(StopWaitForInflightRequest)
+
+	endpoints := []string{
+		"/api/events",
+		"/logs/stream",
+		"/logs/stream/proxy",
+		"/logs/stream/upstream",
+	}
+
+	for _, endpoint := range endpoints {
+		t.Run(endpoint, func(t *testing.T) {
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			req := httptest.NewRequest("GET", endpoint, nil)
+			req = req.WithContext(ctx)
+			rec := httptest.NewRecorder()
+
+			// We don't need the handler to fully complete, just to set the headers
+			// so run it in a goroutine and check the headers after a short delay
+			go proxy.ServeHTTP(rec, req)
+			time.Sleep(10 * time.Millisecond) // give it time to start and write headers
+
+			assert.Equal(t, http.StatusOK, rec.Code)
+			assert.Equal(t, "no", rec.Header().Get("X-Accel-Buffering"))
+		})
+	}
+}
+
+func TestProxyManager_ProxiedStreamingEndpointReturnsNoBufferingHeader(t *testing.T) {
+	config := AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		Models: map[string]ModelConfig{
+			"streaming-model": getTestSimpleResponderConfig("streaming-model"),
+		},
+		LogLevel: "error",
+	})
+
+	proxy := New(config)
+	defer proxy.StopProcesses(StopWaitForInflightRequest)
+
+	// Make a streaming request
+	reqBody := `{"model":"streaming-model"}`
+	// simple-responder will return text/event-stream when stream=true is in the query
+	req := httptest.NewRequest("POST", "/v1/chat/completions?stream=true", bytes.NewBufferString(reqBody))
+	rec := httptest.NewRecorder()
+
+	proxy.ServeHTTP(rec, req)
+
+	assert.Equal(t, http.StatusOK, rec.Code)
+	assert.Equal(t, "no", rec.Header().Get("X-Accel-Buffering"))
+	assert.Contains(t, rec.Header().Get("Content-Type"), "text/event-stream")
+}
@@ -1,4 +1,4 @@
-import { useRef, createContext, useState, useContext, useEffect, useCallback, useMemo, type ReactNode } from "react";
+import { createContext, useState, useContext, useEffect, useCallback, useMemo, type ReactNode } from "react";
 import type { ConnectionState } from "../lib/types";

 type ModelStatus = "ready" | "starting" | "stopping" | "stopped" | "shutdown" | "unknown";
@@ -28,6 +28,7 @@ interface Metrics {
  id: number;
  timestamp: string;
  model: string;
+  cache_tokens: number;
  input_tokens: number;
  output_tokens: number;
  prompt_per_second: number;
@@ -50,12 +51,14 @@ type APIProviderProps = {
  autoStartAPIEvents?: boolean;
 };

+let apiEventSource: EventSource | null = null;
+
 export function APIProvider({ children, autoStartAPIEvents = true }: APIProviderProps) {
  const [proxyLogs, setProxyLogs] = useState("");
  const [upstreamLogs, setUpstreamLogs] = useState("");
  const [metrics, setMetrics] = useState<Metrics[]>([]);
  const [connectionStatus, setConnectionState] = useState<ConnectionState>("disconnected");
-  const apiEventSource = useRef<EventSource | null>(null);
+  //const apiEventSource = useRef<EventSource | null>(null);

  const [models, setModels] = useState<Model[]>([]);

@@ -68,8 +71,8 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider

  const enableAPIEvents = useCallback((enabled: boolean) => {
    if (!enabled) {
-      apiEventSource.current?.close();
-      apiEventSource.current = null;
+      apiEventSource?.close();
+      apiEventSource = null;
      setMetrics([]);
      return;
    }
@@ -78,22 +81,22 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
    const initialDelay = 1000; // 1 second

    const connect = () => {
-      apiEventSource.current = null;
-      const eventSource = new EventSource("/api/events");
+      apiEventSource?.close();
+      apiEventSource = new EventSource("/api/events");
+
      setConnectionState("connecting");

-      eventSource.onopen = () => {
+      apiEventSource.onopen = () => {
        // clear everything out on connect to keep things in sync
        setProxyLogs("");
        setUpstreamLogs("");
        setMetrics([]); // clear metrics on reconnect
        setModels([]); // clear models on reconnect
-        apiEventSource.current = eventSource;
        retryCount = 0;
        setConnectionState("connected");
      };

-      eventSource.onmessage = (e: MessageEvent) => {
+      apiEventSource.onmessage = (e: MessageEvent) => {
        try {
          const message = JSON.parse(e.data) as APIEventEnvelope;
          switch (message.type) {
@@ -136,8 +139,8 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
        }
      };

-      eventSource.onerror = () => {
-        eventSource.close();
+      apiEventSource.onerror = () => {
+        apiEventSource?.close();
        retryCount++;
        const delay = Math.min(initialDelay * Math.pow(2, retryCount - 1), 5000);
        setConnectionState("disconnected");
@@ -1,10 +1,6 @@
 import { useMemo } from "react";
 import { useAPI } from "../contexts/APIProvider";

-const formatTimestamp = (timestamp: string): string => {
-  return new Date(timestamp).toLocaleString();
-};
-
 const formatSpeed = (speed: number): string => {
  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
 };
@@ -13,6 +9,33 @@ const formatDuration = (ms: number): string => {
  return (ms / 1000).toFixed(2) + "s";
 };

+const formatRelativeTime = (timestamp: string): string => {
+  const now = new Date();
+  const date = new Date(timestamp);
+  const diffInSeconds = Math.floor((now.getTime() - date.getTime()) / 1000);
+
+  // Handle future dates by returning "just now"
+  if (diffInSeconds < 5) {
+    return "now";
+  }
+
+  if (diffInSeconds < 60) {
+    return `${diffInSeconds}s ago`;
+  }
+
+  const diffInMinutes = Math.floor(diffInSeconds / 60);
+  if (diffInMinutes < 60) {
+    return `${diffInMinutes}m ago`;
+  }
+
+  const diffInHours = Math.floor(diffInMinutes / 60);
+  if (diffInHours < 24) {
+    return `${diffInHours}h ago`;
+  }
+
+  return "a while ago";
+};
+
 const ActivityPage = () => {
  const { metrics } = useAPI();
  const sortedMetrics = useMemo(() => {
@@ -32,11 +55,16 @@ const ActivityPage = () => {
          <table className="min-w-full divide-y">
            <thead>
              <tr>
-                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">Id</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Timestamp</th>
+                <th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">ID</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Time</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
-                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
+                  Cached <Tooltip content="prompt tokens from cache" />
+                </th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
+                  Prompt <Tooltip content="new prompt tokens processed" />
+                </th>
+                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generated</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
                <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
@@ -46,8 +74,11 @@ const ActivityPage = () => {
              {sortedMetrics.map((metric) => (
                <tr key={`metric_${metric.id}`}>
                  <td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
-                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatTimestamp(metric.timestamp)}</td>
+                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatRelativeTime(metric.timestamp)}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
+                  <td className="px-6 py-4 whitespace-nowrap text-sm">
+                    {metric.cache_tokens > 0 ? metric.cache_tokens.toLocaleString() : "-"}
+                  </td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
                  <td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
@@ -63,4 +94,28 @@ const ActivityPage = () => {
  );
 };

+interface TooltipProps {
+  content: string;
+}
+
+const Tooltip: React.FC<TooltipProps> = ({ content }) => {
+  return (
+    <div className="relative group inline-block">
+      ⓘ
+      <div
+        className="absolute top-full left-1/2 transform -translate-x-1/2 mt-2
+                     px-3 py-2 bg-gray-900 text-white text-sm rounded-md
+                     opacity-0 group-hover:opacity-100 transition-opacity
+                     duration-200 pointer-events-none whitespace-nowrap z-50 normal-case"
+      >
+        {content}
+        <div
+          className="absolute bottom-full left-1/2 transform -translate-x-1/2
+                       border-4 border-transparent border-b-gray-900"
+        ></div>
+      </div>
+    </div>
+  );
+};
+
 export default ActivityPage;
Author	SHA1	Message	Date
Benson Wong	c36986fef6	upstream handler support for model names with forward slash (#298 ) The upstream handler would break on model IDs that contained a forward slash. Model IDs like "aaa/bbb" called at upstream/aaa/bbb would result in an error. This commit adds support for model IDs with a forward slash by iteratively searching the path for a match. Fixes: #229	2025-09-13 13:37:03 -07:00
Artur Podsiadły	558801db1a	Fix nginx proxy buffering for streaming endpoints (#295 ) * Fix nginx proxy buffering for streaming endpoints - Add X-Accel-Buffering: no header to SSE endpoints (/api/events, /logs/stream) - Add X-Accel-Buffering: no header to proxied text/event-stream responses - Add nginx reverse proxy configuration section to README - Add tests for X-Accel-Buffering header on streaming endpoints Fixes #236 * Fix goroutine cleanup in streaming endpoints test Add context cancellation to TestProxyManager_StreamingEndpointsReturnNoBufferingHeader to ensure the goroutine is properly cleaned up when the test completes.	2025-09-09 16:07:46 -07:00
Benson Wong	b21dee27c1	Fix #288 Vite hot module reloading creating multiple SSE connections (#290 ) - move SSE (EventSource) connection to module level - manage EventSource as a singleton, closing open connection before reopening a new one	2025-09-07 21:48:58 -07:00
Benson Wong	f58c8c8ec5	Support llama.cpp's cache_n in timings info (#287 ) Capture prompt cache metrics and surface them on Activities page in UI	2025-09-06 13:58:02 -07:00
Benson Wong	954e2dee73	Remove `cmdStart` from README [skip ci] cmdStart was in the README but it doesn't exist. Fixed the typo. Oops.	2025-09-04 11:57:28 -07:00