Compare commits

..

3 Commits

Author SHA1 Message Date
Benson Wong 57803fd3aa Support llama-server's /infill endpoint (#272)
Add support for llama-server's /infill endpoint and metrics gathering on the Activities page.
2025-08-27 08:36:05 -07:00
Benson Wong c55d0cc842 Add docs for model.concurrencyLimit #263 [skip ci] 2025-08-22 16:08:37 -07:00
Benson Wong 7acbaf4712 Add connection status indicator in UI (#260)
* show connection status as icon in UI title
* make connection status event driven
2025-08-20 13:58:24 -07:00
10 changed files with 154 additions and 121 deletions
+3 -1
View File
@@ -18,9 +18,11 @@ Written in golang, it is very easy to install (single binary with no dependencie
- `v1/completions`
- `v1/chat/completions`
- `v1/embeddings`
- `v1/rerank`, `v1/reranking`, `rerank`
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
- ✅ llama-server (llama.cpp) supported endpoints:
- `v1/rerank`, `v1/reranking`, `/rerank`
- `/infill` - for code infilling
- ✅ llama-swap custom API endpoints
- `/ui` - web UI
- `/log` - remote log monitoring
+9
View File
@@ -129,6 +129,15 @@ models:
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k"
# concurrencyLimit: overrides the allowed number of active parallel requests to a model
# - optional, default: 0
# - useful for limiting the number of active parallel requests a model can process
# - must be set per model
# - any number greater than 0 will override the internal default value of 10
# - any requests that exceeds the limit will receive an HTTP 429 Too Many Requests response
# - recommended to be omitted and the default used
concurrencyLimit: 0
# Unlisted model example:
"qwen-unlisted":
# unlisted: boolean, true or false
+28 -22
View File
@@ -5,12 +5,20 @@ import (
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/tidwall/gjson"
)
type MetricsRecorder struct {
metricsMonitor *MetricsMonitor
realModelName string
// isStreaming bool
startTime time.Time
}
// MetricsMiddleware sets up the MetricsResponseWriter for capturing upstream requests
func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
return func(c *gin.Context) {
@@ -41,49 +49,47 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
metricsRecorder: &MetricsRecorder{
metricsMonitor: pm.metricsMonitor,
realModelName: realModelName,
isStreaming: gjson.GetBytes(bodyBytes, "stream").Bool(),
startTime: time.Now(),
},
}
c.Writer = writer
c.Next()
rec := writer.metricsRecorder
rec.processBody(writer.body)
}
}
// check for streaming response
if strings.Contains(c.Writer.Header().Get("Content-Type"), "text/event-stream") {
writer.metricsRecorder.processStreamingResponse(writer.body)
} else {
writer.metricsRecorder.processNonStreamingResponse(writer.body)
}
type MetricsRecorder struct {
metricsMonitor *MetricsMonitor
realModelName string
isStreaming bool
startTime time.Time
}
// processBody handles response processing after request completes
func (rec *MetricsRecorder) processBody(body []byte) {
if rec.isStreaming {
rec.processStreamingResponse(body)
} else {
rec.processNonStreamingResponse(body)
}
}
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
usage := jsonData.Get("usage")
if !usage.Exists() {
timings := jsonData.Get("timings")
if !usage.Exists() && !timings.Exists() {
return false
}
// default values
outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
outputTokens := 0
inputTokens := 0
// timings data
tokensPerSecond := -1.0
promptPerSecond := -1.0
durationMs := int(time.Since(rec.startTime).Milliseconds())
if usage.Exists() {
outputTokens = int(jsonData.Get("usage.completion_tokens").Int())
inputTokens = int(jsonData.Get("usage.prompt_tokens").Int())
}
// use llama-server's timing data for tok/sec and duration as it is more accurate
if timings := jsonData.Get("timings"); timings.Exists() {
if timings.Exists() {
inputTokens = int(jsonData.Get("timings.prompt_n").Int())
outputTokens = int(jsonData.Get("timings.predicted_n").Int())
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
+8 -2
View File
@@ -191,11 +191,17 @@ func (pm *ProxyManager) setupGinEngine() {
// Support legacy /v1/completions api, see issue #12
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
// Support embeddings
// Support embeddings and reranking
pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
// llama-server's /reranking endpoint + aliases
pm.ginEngine.POST("/reranking", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
// llama-server's /infill endpoint for code infilling
pm.ginEngine.POST("/infill", mm, pm.proxyOAIHandler)
// Support audio/speech endpoint
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
+53 -63
View File
@@ -1,88 +1,78 @@
import { useEffect, useCallback } from "react";
import { BrowserRouter as Router, Routes, Route, Navigate, NavLink } from "react-router-dom";
import { useTheme } from "./contexts/ThemeProvider";
import { APIProvider } from "./contexts/APIProvider";
import { useAPI } from "./contexts/APIProvider";
import LogViewerPage from "./pages/LogViewer";
import ModelPage from "./pages/Models";
import ActivityPage from "./pages/Activity";
import ConnectionStatus from "./components/ConnectionStatus";
import ConnectionStatusIcon from "./components/ConnectionStatus";
import { RiSunFill, RiMoonFill } from "react-icons/ri";
import { usePersistentState } from "./hooks/usePersistentState";
function App() {
const { isNarrow, toggleTheme, isDarkMode } = useTheme();
const [appTitle, setAppTitle] = usePersistentState("app-title", "llama-swap");
const { isNarrow, toggleTheme, isDarkMode, appTitle, setAppTitle, setConnectionState } = useTheme();
const handleTitleChange = useCallback(
(newTitle: string) => {
setAppTitle(newTitle);
document.title = newTitle;
setAppTitle(newTitle.replace(/\n/g, "").trim().substring(0, 64) || "llama-swap");
},
[setAppTitle]
);
const { connectionStatus } = useAPI();
// Synchronize the window.title connections state with the actual connection state
useEffect(() => {
document.title = appTitle; // Set initial title
}, [appTitle]);
setConnectionState(connectionStatus);
}, [connectionStatus]);
return (
<Router basename="/ui/">
<APIProvider>
<div className="flex flex-col h-screen">
<nav className="bg-surface border-b border-border p-2 h-[75px]">
<div className="flex items-center justify-between mx-auto px-4 h-full">
{!isNarrow && (
<h1
contentEditable
suppressContentEditableWarning
className="flex items-center p-0 outline-none hover:bg-gray-100 dark:hover:bg-gray-700 rounded px-1"
onBlur={(e) =>
handleTitleChange(e.currentTarget.textContent?.replace(/\n/g, "").trim() || "llama-swap")
<div className="flex flex-col h-screen">
<nav className="bg-surface border-b border-border p-2 h-[75px]">
<div className="flex items-center justify-between mx-auto px-4 h-full">
{!isNarrow && (
<h1
contentEditable
suppressContentEditableWarning
className="flex items-center p-0 outline-none hover:bg-gray-100 dark:hover:bg-gray-700 rounded px-1"
onBlur={(e) => handleTitleChange(e.currentTarget.textContent || "(set title)")}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.preventDefault();
handleTitleChange(e.currentTarget.textContent || "(set title)");
e.currentTarget.blur();
}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.preventDefault();
const sanitizedText =
e.currentTarget.textContent?.replace(/\n/g, "").trim().substring(0, 25) || "llama-swap";
handleTitleChange(sanitizedText);
e.currentTarget.textContent = sanitizedText;
e.currentTarget.blur();
}
}}
>
{appTitle}
</h1>
)}
<div className="flex items-center space-x-4">
<NavLink to="/" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Logs
</NavLink>
<NavLink to="/models" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Models
</NavLink>
<NavLink to="/activity" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Activity
</NavLink>
<button className="" onClick={toggleTheme}>
{isDarkMode ? <RiMoonFill /> : <RiSunFill />}
</button>
<ConnectionStatus />
</div>
}}
>
{appTitle}
</h1>
)}
<div className="flex items-center space-x-4">
<NavLink to="/" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Logs
</NavLink>
<NavLink to="/models" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Models
</NavLink>
<NavLink to="/activity" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
Activity
</NavLink>
<button className="" onClick={toggleTheme}>
{isDarkMode ? <RiMoonFill /> : <RiSunFill />}
</button>
<ConnectionStatusIcon />
</div>
</nav>
</div>
</nav>
<main className="flex-1 overflow-auto p-4">
<Routes>
<Route path="/" element={<LogViewerPage />} />
<Route path="/models" element={<ModelPage />} />
<Route path="/activity" element={<ActivityPage />} />
<Route path="*" element={<Navigate to="/" replace />} />
</Routes>
</main>
</div>
</APIProvider>
<main className="flex-1 overflow-auto p-4">
<Routes>
<Route path="/" element={<LogViewerPage />} />
<Route path="/models" element={<ModelPage />} />
<Route path="/activity" element={<ActivityPage />} />
<Route path="*" element={<Navigate to="/" replace />} />
</Routes>
</main>
</div>
</Router>
);
}
+7 -17
View File
@@ -1,21 +1,11 @@
import { useAPI } from "../contexts/APIProvider";
import { useEffect, useState, useMemo } from "react";
import { useMemo } from "react";
type ConnectionStatus = "disconnected" | "connecting" | "connected";
const ConnectionStatus = () => {
const { getConnectionStatus } = useAPI();
const [eventStreamStatus, setEventStreamStatus] = useState<ConnectionStatus>("disconnected");
useEffect(() => {
const interval = setInterval(() => {
setEventStreamStatus(getConnectionStatus());
}, 1000);
return () => clearInterval(interval);
});
const ConnectionStatusIcon = () => {
const { connectionStatus } = useAPI();
const eventStatusColor = useMemo(() => {
switch (eventStreamStatus) {
switch (connectionStatus) {
case "connected":
return "bg-green-500";
case "connecting":
@@ -24,13 +14,13 @@ const ConnectionStatus = () => {
default:
return "bg-red-500";
}
}, [eventStreamStatus]);
}, [connectionStatus]);
return (
<div className="flex items-center" title={`event stream: ${eventStreamStatus}`}>
<div className="flex items-center" title={`event stream: ${connectionStatus}`}>
<span className={`inline-block w-3 h-3 rounded-full ${eventStatusColor} mr-2`}></span>
</div>
);
};
export default ConnectionStatus;
export default ConnectionStatusIcon;
+11 -14
View File
@@ -1,4 +1,5 @@
import { useRef, createContext, useState, useContext, useEffect, useCallback, useMemo, type ReactNode } from "react";
import type { ConnectionState } from "../lib/types";
type ModelStatus = "ready" | "starting" | "stopping" | "stopped" | "shutdown" | "unknown";
const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
@@ -20,7 +21,7 @@ interface APIProviderType {
proxyLogs: string;
upstreamLogs: string;
metrics: Metrics[];
getConnectionStatus: () => "connected" | "connecting" | "disconnected";
connectionStatus: ConnectionState;
}
interface Metrics {
@@ -53,6 +54,7 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
const [proxyLogs, setProxyLogs] = useState("");
const [upstreamLogs, setUpstreamLogs] = useState("");
const [metrics, setMetrics] = useState<Metrics[]>([]);
const [connectionStatus, setConnectionState] = useState<ConnectionState>("disconnected");
const apiEventSource = useRef<EventSource | null>(null);
const [models, setModels] = useState<Model[]>([]);
@@ -64,16 +66,6 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
});
}, []);
const getConnectionStatus = useCallback(() => {
if (apiEventSource.current?.readyState === EventSource.OPEN) {
return "connected";
} else if (apiEventSource.current?.readyState === EventSource.CONNECTING) {
return "connecting";
} else {
return "disconnected";
}
}, []);
const enableAPIEvents = useCallback((enabled: boolean) => {
if (!enabled) {
apiEventSource.current?.close();
@@ -86,7 +78,9 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
const initialDelay = 1000; // 1 second
const connect = () => {
apiEventSource.current = null;
const eventSource = new EventSource("/api/events");
setConnectionState("connecting");
eventSource.onopen = () => {
// clear everything out on connect to keep things in sync
@@ -94,6 +88,9 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
setUpstreamLogs("");
setMetrics([]); // clear metrics on reconnect
setModels([]); // clear models on reconnect
apiEventSource.current = eventSource;
retryCount = 0;
setConnectionState("connected");
};
eventSource.onmessage = (e: MessageEvent) => {
@@ -138,14 +135,14 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
console.error(e.data, err);
}
};
eventSource.onerror = () => {
eventSource.close();
retryCount++;
const delay = Math.min(initialDelay * Math.pow(2, retryCount - 1), 5000);
setConnectionState("disconnected");
setTimeout(connect, delay);
};
apiEventSource.current = eventSource;
};
connect();
@@ -213,7 +210,7 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
proxyLogs,
upstreamLogs,
metrics,
getConnectionStatus,
connectionStatus,
}),
[models, listModels, unloadAllModels, loadModel, enableAPIEvents, proxyLogs, upstreamLogs, metrics]
);
+30 -1
View File
@@ -1,5 +1,6 @@
import { createContext, useContext, useEffect, type ReactNode, useMemo, useState } from "react";
import { usePersistentState } from "../hooks/usePersistentState";
import type { ConnectionState } from "../lib/types";
type ScreenWidth = "xs" | "sm" | "md" | "lg" | "xl" | "2xl";
type ThemeContextType = {
@@ -7,6 +8,11 @@ type ThemeContextType = {
screenWidth: ScreenWidth;
isNarrow: boolean;
toggleTheme: () => void;
// for managing the window title and connection state information
appTitle: string;
setAppTitle: (title: string) => void;
setConnectionState: (state: ConnectionState) => void;
};
const ThemeContext = createContext<ThemeContextType | undefined>(undefined);
@@ -16,6 +22,17 @@ type ThemeProviderProps = {
};
export function ThemeProvider({ children }: ThemeProviderProps) {
const [appTitle, setAppTitle] = usePersistentState("app-title", "llama-swap");
const [connectionState, setConnectionState] = useState<ConnectionState>("disconnected");
/**
* Set the document.title with informative information
*/
useEffect(() => {
const connectionIcon = connectionState === "connecting" ? "🟡" : connectionState === "connected" ? "🟢" : "🔴";
document.title = connectionIcon + " " + appTitle; // Set initial title
}, [appTitle, connectionState]);
const [isDarkMode, setIsDarkMode] = usePersistentState<boolean>("theme", false);
const [screenWidth, setScreenWidth] = useState<ScreenWidth>("md"); // Default to md
@@ -55,7 +72,19 @@ export function ThemeProvider({ children }: ThemeProviderProps) {
}, [screenWidth]);
return (
<ThemeContext.Provider value={{ isDarkMode, toggleTheme, screenWidth, isNarrow }}>{children}</ThemeContext.Provider>
<ThemeContext.Provider
value={{
isDarkMode,
toggleTheme,
screenWidth,
isNarrow,
appTitle,
setAppTitle,
setConnectionState,
}}
>
{children}
</ThemeContext.Provider>
);
}
+1
View File
@@ -0,0 +1 @@
export type ConnectionState = "connected" | "connecting" | "disconnected";
+4 -1
View File
@@ -3,11 +3,14 @@ import { createRoot } from "react-dom/client";
import "./index.css";
import App from "./App.tsx";
import { ThemeProvider } from "./contexts/ThemeProvider";
import { APIProvider } from "./contexts/APIProvider";
createRoot(document.getElementById("root")!).render(
<StrictMode>
<ThemeProvider>
<App />
<APIProvider>
<App />
</APIProvider>
</ThemeProvider>
</StrictMode>
);