proxy,ui: add performance monitoring with Prometheus metrics (#743)

Add a comprehensive performance monitoring system that collects CPU, memory, swap, load average, network IO, and GPU stats. Provides both a REST API for the UI and a Prometheus /metrics endpoint.

Backend changes:
- New internal/perf package with configurable interval-based stats collection
- GPU monitoring via LACT (Unix socket) and nvidia-smi fallback on Linux
- Ring buffer (internal/ring) for time-series stat storage
- Prometheus /metrics endpoint with all system and GPU metrics
- Moved LogMonitor to internal/logmon package
- New PerformanceConfig for hot-reloadable monitoring settings
- REST /api/performance endpoint replacing SSE streaming

UI changes:
- New Performance page with real-time charts for CPU, memory, GPU, and network
- Reusable PerformanceChart component
- LLAMA_SWAP_URL environment variable support
- Improved capture dialog display

Other:
- Example Grafana dashboard for Prometheus metrics
- monitor-test standalone binary
- Config schema and example updates

fixes #596
This commit is contained in:
Benson Wong
2026-05-09 13:29:22 -07:00
committed by GitHub
parent e261745c66
commit 7e3e94a08a
49 changed files with 4322 additions and 273 deletions
+13
View File
@@ -9,6 +9,7 @@ import (
"runtime"
"sort"
"strings"
"time"
"github.com/billziss-gh/golib/shlex"
"gopkg.in/yaml.v3"
@@ -124,6 +125,7 @@ type Config struct {
LogToStdout string `yaml:"logToStdout"`
MetricsMaxInMemory int `yaml:"metricsMaxInMemory"`
CaptureBuffer int `yaml:"captureBuffer"`
Performance PerformanceConfig `yaml:"performance"`
GlobalTTL int `yaml:"globalTTL"`
Models map[string]ModelConfig `yaml:"models"` /* key is model ID */
Profiles map[string][]string `yaml:"profiles"`
@@ -220,6 +222,17 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
config.HealthCheckTimeout = 15
}
// Apply defaults for performance config when section is missing
if !config.Performance.Enable && config.Performance.Every == 0 && config.Performance.MaxAge == 0 && config.Performance.GC == 0 {
config.Performance.Enable = true
config.Performance.Every = 15 * time.Second
config.Performance.MaxAge = 1 * time.Hour
config.Performance.GC = 5 * time.Minute
}
if err = config.Performance.Validate(); err != nil {
return Config{}, fmt.Errorf("performance: %w", err)
}
if config.StartPort < 1 {
return Config{}, fmt.Errorf("startPort must be greater than 1")
}
+7
View File
@@ -7,6 +7,7 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
@@ -229,6 +230,12 @@ groups:
HealthCheckTimeout: 15,
MetricsMaxInMemory: 1000,
CaptureBuffer: 5,
Performance: PerformanceConfig{
Enable: true,
Every: 15 * time.Second,
MaxAge: 1 * time.Hour,
GC: 5 * time.Minute,
},
Profiles: map[string][]string{
"test": {"model1", "model2"},
},
+7
View File
@@ -7,6 +7,7 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
@@ -218,6 +219,12 @@ groups:
HealthCheckTimeout: 15,
MetricsMaxInMemory: 1000,
CaptureBuffer: 5,
Performance: PerformanceConfig{
Enable: true,
Every: 15 * time.Second,
MaxAge: 1 * time.Hour,
GC: 5 * time.Minute,
},
Profiles: map[string][]string{
"test": {"model1", "model2"},
},
+45
View File
@@ -0,0 +1,45 @@
package config
import (
"fmt"
"time"
)
// PerformanceConfig is the "performance" section of the configuration and
// controls system performance monitoring.
//
// The zero value is not a usable configuration: Validate rejects it because
// Every is below one second.
type PerformanceConfig struct {
	// Enable is the on/off switch, read from the "enable" key.
	Enable bool `yaml:"enable"`
	// Every is read from the "every" key (presumably the sampling interval —
	// confirm against the monitor). Validate requires at least one second.
	Every time.Duration `yaml:"every"`
	// MaxAge is read from the "maxAge" key (presumably how long samples are
	// kept — confirm against the monitor). Validate requires a positive value.
	MaxAge time.Duration `yaml:"maxAge"`
	// GC is read from the "gc" key (presumably the cleanup interval — confirm
	// against the monitor). Validate requires a positive value.
	GC time.Duration `yaml:"gc"`
}

// UnmarshalYAML decodes the section on top of the built-in defaults
// (enabled, every 15s, maxAge 1h, gc 5m), so keys that are missing from the
// YAML keep their default value. On a decode error the receiver is left
// untouched and the error is returned as is.
func (p *PerformanceConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// plain has the same fields as PerformanceConfig but none of its methods,
	// so decoding into it does not call UnmarshalYAML again.
	type plain PerformanceConfig

	decoded := plain{
		Enable: true,
		Every:  15 * time.Second,
		MaxAge: time.Hour,
		GC:     5 * time.Minute,
	}
	if err := unmarshal(&decoded); err != nil {
		return err
	}

	*p = PerformanceConfig(decoded)
	return nil
}

// Validate reports the first invalid field, checking Every, then MaxAge, then
// GC. It returns nil when all three durations are acceptable. The checks run
// whether or not Enable is set.
func (p *PerformanceConfig) Validate() error {
	switch {
	case p.Every < time.Second:
		return fmt.Errorf("every must be at least 1s, got %v", p.Every)
	case p.MaxAge <= 0:
		return fmt.Errorf("maxAge must be greater than 0, got %v", p.MaxAge)
	case p.GC <= 0:
		return fmt.Errorf("gc must be greater than 0, got %v", p.GC)
	}
	return nil
}
+140
View File
@@ -0,0 +1,140 @@
package config
import (
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// TestPerformanceConfig_Defaults checks that a config without a performance
// section is loaded with monitoring enabled and the default durations.
func TestPerformanceConfig_Defaults(t *testing.T) {
	const yamlDoc = `
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
	assert.NoError(t, err)

	// When performance section is missing, defaults should be applied
	want := PerformanceConfig{
		Enable: true,
		Every:  15 * time.Second,
		MaxAge: time.Hour,
		GC:     5 * time.Minute,
	}
	assert.Equal(t, want, cfg.Performance)
}
// TestPerformanceConfig_CustomValues checks that every key given in the
// performance section is taken over as written.
func TestPerformanceConfig_CustomValues(t *testing.T) {
	const yamlDoc = `
performance:
enable: true
every: 30s
maxAge: 12h
gc: 10m
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
	assert.NoError(t, err)

	want := PerformanceConfig{
		Enable: true,
		Every:  30 * time.Second,
		MaxAge: 12 * time.Hour,
		GC:     10 * time.Minute,
	}
	assert.Equal(t, want, cfg.Performance)
}
// TestPerformanceConfig_Disabled checks that "enable: false" is honoured
// while the durations that were not given still fall back to their defaults.
func TestPerformanceConfig_Disabled(t *testing.T) {
	const yamlDoc = `
performance:
enable: false
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
	assert.NoError(t, err)

	// Duration defaults should still apply
	want := PerformanceConfig{
		Enable: false,
		Every:  15 * time.Second,
		MaxAge: time.Hour,
		GC:     5 * time.Minute,
	}
	assert.Equal(t, want, cfg.Performance)
}
// TestPerformanceConfig_PartialValues checks that keys present in the
// performance section override the defaults and missing keys keep them.
func TestPerformanceConfig_PartialValues(t *testing.T) {
	const yamlDoc = `
performance:
every: 10s
maxAge: 6h
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
	assert.NoError(t, err)

	want := PerformanceConfig{
		Enable: true, // enable should default to true
		Every:  10 * time.Second,
		MaxAge: 6 * time.Hour,
		GC:     5 * time.Minute, // gc should use default
	}
	assert.Equal(t, want, cfg.Performance)
}
// TestPerformanceConfig_InvalidEvery checks that an "every" value below one
// second makes loading the config fail with a descriptive error.
func TestPerformanceConfig_InvalidEvery(t *testing.T) {
	content := `
performance:
every: 500ms
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	_, err := LoadConfigFromReader(strings.NewReader(content))
	// assert.Error records a failure but does not stop the test, so the
	// message is only inspected when an error was really returned; calling
	// err.Error() on a nil error would panic instead of failing cleanly.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "every must be at least 1s")
	}
}
// TestPerformanceConfig_InvalidMaxAge checks that a zero "maxAge" makes
// loading the config fail with a descriptive error.
func TestPerformanceConfig_InvalidMaxAge(t *testing.T) {
	content := `
performance:
maxAge: 0s
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	_, err := LoadConfigFromReader(strings.NewReader(content))
	// assert.Error records a failure but does not stop the test, so the
	// message is only inspected when an error was really returned; calling
	// err.Error() on a nil error would panic instead of failing cleanly.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "maxAge must be greater than 0")
	}
}
// TestPerformanceConfig_InvalidGC checks that a zero "gc" makes loading the
// config fail with a descriptive error.
func TestPerformanceConfig_InvalidGC(t *testing.T) {
	content := `
performance:
gc: 0s
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	_, err := LoadConfigFromReader(strings.NewReader(content))
	// assert.Error records a failure but does not stop the test, so the
	// message is only inspected when an error was really returned; calling
	// err.Error() on a nil error would panic instead of failing cleanly.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "gc must be greater than 0")
	}
}
// TestPerformanceConfig_ComplexDurations checks that durations written with
// more than one unit (such as "1m30s") are parsed to the expected value.
func TestPerformanceConfig_ComplexDurations(t *testing.T) {
	const yamlDoc = `
performance:
every: 1m30s
maxAge: 2h10m
gc: 1m
models:
model1:
cmd: path/to/cmd --port ${PORT}
`
	cfg, err := LoadConfigFromReader(strings.NewReader(yamlDoc))
	assert.NoError(t, err)

	perf := cfg.Performance
	checks := []struct{ want, got time.Duration }{
		{90 * time.Second, perf.Every},
		{2*time.Hour + 10*time.Minute, perf.MaxAge},
		{time.Minute, perf.GC},
	}
	for _, c := range checks {
		assert.Equal(t, c.want, c.got)
	}
}
-9
View File
@@ -5,7 +5,6 @@ package proxy
const ProcessStateChangeEventID = 0x01
const ChatCompletionStatsEventID = 0x02
const ConfigFileChangedEventID = 0x03
const LogDataEventID = 0x04
const ActivityLogEventID = 0x05
const ModelPreloadedEventID = 0x06
const InFlightRequestsEventID = 0x07
@@ -43,14 +42,6 @@ func (e ConfigFileChangedEvent) Type() uint32 {
return ConfigFileChangedEventID
}
type LogDataEvent struct {
Data []byte
}
func (e LogDataEvent) Type() uint32 {
return LogDataEventID
}
type ModelPreloadedEvent struct {
ModelName string
Success bool
+6 -5
View File
@@ -15,6 +15,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
"github.com/stretchr/testify/require"
"github.com/tidwall/gjson"
@@ -24,7 +25,7 @@ import (
var (
nextTestPort int = 12000
portMutex sync.Mutex
testLogger = NewLogMonitorWriter(os.Stdout)
testLogger = logmon.NewWriter(os.Stdout)
simpleResponderPath = getSimpleResponderPath()
)
@@ -40,13 +41,13 @@ func TestMain(m *testing.M) {
switch os.Getenv("LOG_LEVEL") {
case "debug":
testLogger.SetLogLevel(LevelDebug)
testLogger.SetLogLevel(logmon.LevelDebug)
case "warn":
testLogger.SetLogLevel(LevelWarn)
testLogger.SetLogLevel(logmon.LevelWarn)
case "info":
testLogger.SetLogLevel(LevelInfo)
testLogger.SetLogLevel(logmon.LevelInfo)
default:
testLogger.SetLogLevel(LevelWarn)
testLogger.SetLogLevel(logmon.LevelWarn)
}
m.Run()
-269
View File
@@ -1,269 +0,0 @@
package proxy
import (
"context"
"fmt"
"io"
"os"
"sync"
"time"
"github.com/mostlygeek/llama-swap/event"
)
// circularBuffer is a byte ring with a fixed capacity. Once it is full, every
// newly written byte replaces the oldest one. Write never allocates;
// GetHistory allocates one slice of the current size.
type circularBuffer struct {
	data []byte // backing storage; its length is the capacity
	head int    // index at which the next byte will be written
	size int    // number of valid bytes currently held (at most len(data))
}

// newCircularBuffer returns an empty buffer that can hold capacity bytes.
func newCircularBuffer(capacity int) *circularBuffer {
	return &circularBuffer{data: make([]byte, capacity)}
}

// Write appends p, dropping the oldest bytes as needed to make room. The
// bytes are copied, so the caller may reuse p afterwards.
func (cb *circularBuffer) Write(p []byte) {
	n := len(p)
	if n == 0 {
		return
	}
	capacity := len(cb.data)

	// Input at least as large as the buffer: only its tail survives, and it
	// fills the buffer completely starting at index 0.
	if n >= capacity {
		copy(cb.data, p[n-capacity:])
		cb.head, cb.size = 0, capacity
		return
	}

	// Fill from head towards the end of the storage; copy reports how many
	// bytes fitted, and whatever is left wraps around to the front.
	fitted := copy(cb.data[cb.head:], p)
	if fitted < n {
		copy(cb.data, p[fitted:])
	}
	cb.head = (cb.head + n) % capacity

	cb.size += n
	if cb.size > capacity {
		cb.size = capacity
	}
}

// GetHistory returns the buffered bytes ordered from oldest to newest. The
// result is a fresh slice that does not alias the internal storage; it is nil
// when the buffer is empty.
func (cb *circularBuffer) GetHistory() []byte {
	if cb.size == 0 {
		return nil
	}

	capacity := len(cb.data)
	// Index of the oldest byte: size positions behind head, modulo capacity.
	oldest := (cb.head - cb.size + capacity) % capacity

	out := make([]byte, cb.size)
	// First the stretch from oldest up to the end of the storage (or up to
	// size bytes, whichever is shorter), then the wrapped remainder, if any,
	// from the front of the storage.
	copied := copy(out, cb.data[oldest:])
	copy(out[copied:], cb.data)
	return out
}
// LogLevel is the severity of a log message. A higher value means a more
// severe message.
type LogLevel int

// Severity levels, ordered from least to most severe.
const (
	LevelDebug LogLevel = iota
	LevelInfo
	LevelWarn
	LevelError
)

// LogBufferSize is the capacity, in bytes, of the history buffer that a
// LogMonitor allocates (100 KiB).
const LogBufferSize = 100 * 1024
// LogMonitor is an io.Writer that forwards everything written to it to an
// underlying writer, keeps the most recent LogBufferSize bytes as history and
// publishes each write to the subscribers registered with OnLogData. It also
// provides leveled logging helpers (Debug, Info, Warn, Error and their
// formatted variants).
type LogMonitor struct {
	eventbus *event.Dispatcher // carries LogDataEvent to OnLogData subscribers
	mu       sync.RWMutex      // held by SetPrefix, SetLogLevel and SetLogTimeFormat
	buffer   *circularBuffer   // history; nil until the first Write and again after Clear
	bufferMu sync.RWMutex      // guards buffer

	// typically this can be os.Stdout
	stdout io.Writer

	// logging levels
	level  LogLevel // messages below this level are dropped by the logging helpers
	prefix string   // when non-empty, shown as "[prefix] " in formatted messages

	// timestamps
	timeFormat string // time.Format layout; empty disables timestamps
}
// NewLogMonitor returns a LogMonitor that forwards its output to os.Stdout.
func NewLogMonitor() *LogMonitor {
	return NewLogMonitorWriter(os.Stdout)
}
// NewLogMonitorWriter returns a LogMonitor that forwards its output to
// stdout. The monitor starts at LevelInfo with no prefix and no timestamps.
func NewLogMonitorWriter(stdout io.Writer) *LogMonitor {
	// buffer, prefix and timeFormat are deliberately left at their zero
	// values; the history buffer is lazily initialized on the first Write.
	monitor := &LogMonitor{
		eventbus: event.NewDispatcherConfig(1000),
		stdout:   stdout,
		level:    LevelInfo,
	}
	return monitor
}
// Write implements io.Writer. It forwards p to the underlying writer, then
// records it in the history buffer and publishes a copy to subscribers.
//
// When the underlying writer fails, its byte count and error are returned
// and p is neither recorded nor published. An empty p is a no-op.
func (w *LogMonitor) Write(p []byte) (n int, err error) {
	if len(p) == 0 {
		return 0, nil
	}

	if n, err = w.stdout.Write(p); err != nil {
		return n, err
	}

	w.bufferMu.Lock()
	if w.buffer == nil {
		// First write, or first write after Clear.
		w.buffer = newCircularBuffer(LogBufferSize)
	}
	w.buffer.Write(p)
	w.bufferMu.Unlock()

	// Subscribers receive their own copy so that the caller remains free to
	// reuse p once Write has returned.
	w.broadcast(append([]byte(nil), p...))
	return n, nil
}
// GetHistory returns a copy of the buffered log output, oldest bytes first.
// It returns nil when nothing has been written since creation or since the
// last Clear.
func (w *LogMonitor) GetHistory() []byte {
	w.bufferMu.RLock()
	defer w.bufferMu.RUnlock()

	if buf := w.buffer; buf != nil {
		return buf.GetHistory()
	}
	return nil
}
// Clear drops the history buffer so that its memory can be garbage
// collected. A new buffer is allocated lazily by the next Write.
func (w *LogMonitor) Clear() {
	w.bufferMu.Lock()
	defer w.bufferMu.Unlock()

	w.buffer = nil
}
// OnLogData subscribes callback to the data published by Write. callback is
// given the Data of each LogDataEvent. The returned function is the one
// returned by event.Subscribe (presumably it removes the subscription —
// confirm against the event package).
func (w *LogMonitor) OnLogData(callback func(data []byte)) context.CancelFunc {
	return event.Subscribe(w.eventbus, func(e LogDataEvent) {
		callback(e.Data)
	})
}
// broadcast publishes msg on the monitor's event bus as a LogDataEvent. msg
// is handed over as is, without copying.
func (w *LogMonitor) broadcast(msg []byte) {
	event.Publish(w.eventbus, LogDataEvent{Data: msg})
}
// SetPrefix sets the text that formatted log messages show in square
// brackets before the level. An empty prefix removes it.
func (w *LogMonitor) SetPrefix(prefix string) {
	w.mu.Lock()
	w.prefix = prefix
	w.mu.Unlock()
}
// SetLogLevel sets the minimum level that the logging helpers emit; messages
// below it are dropped.
func (w *LogMonitor) SetLogLevel(level LogLevel) {
	w.mu.Lock()
	w.level = level
	w.mu.Unlock()
}
// SetLogTimeFormat sets the time.Format layout used for the timestamp at the
// start of formatted log messages. An empty layout disables timestamps.
func (w *LogMonitor) SetLogTimeFormat(timeFormat string) {
	w.mu.Lock()
	w.timeFormat = timeFormat
	w.mu.Unlock()
}
// formatMessage renders msg as one log line of the form
// "<timestamp> [<prefix>] [<level>] <msg>\n". The timestamp and the prefix
// are each left out when the corresponding setting is empty.
//
// prefix and timeFormat are written under w.mu by SetPrefix and
// SetLogTimeFormat, so both are snapshotted under the read lock here;
// reading them unlocked is a data race when a setter runs while another
// goroutine is logging. The lock is released before any formatting is done.
func (w *LogMonitor) formatMessage(level string, msg string) []byte {
	w.mu.RLock()
	prefixText, timeFormat := w.prefix, w.timeFormat
	w.mu.RUnlock()

	prefix := ""
	if prefixText != "" {
		prefix = fmt.Sprintf("[%s] ", prefixText)
	}
	timestamp := ""
	if timeFormat != "" {
		timestamp = fmt.Sprintf("%s ", time.Now().Format(timeFormat))
	}
	return []byte(fmt.Sprintf("%s%s[%s] %s\n", timestamp, prefix, level, msg))
}
// log formats msg and writes it through Write, unless level is below the
// monitor's configured minimum level.
//
// The minimum level is written under w.mu by SetLogLevel, so it is read
// under the read lock; an unlocked read is a data race when SetLogLevel runs
// while another goroutine is logging. The lock is released before Write so
// that it is never held while the underlying writer or subscribers run.
func (w *LogMonitor) log(level LogLevel, msg string) {
	w.mu.RLock()
	threshold := w.level
	w.mu.RUnlock()

	if level < threshold {
		return
	}
	// Logging is best-effort: there is no caller to report a failed write
	// to, so the result is deliberately discarded.
	_, _ = w.Write(w.formatMessage(level.String(), msg))
}
// Debug logs msg at LevelDebug.
func (w *LogMonitor) Debug(msg string) {
	w.log(LevelDebug, msg)
}

// Info logs msg at LevelInfo.
func (w *LogMonitor) Info(msg string) {
	w.log(LevelInfo, msg)
}

// Warn logs msg at LevelWarn.
func (w *LogMonitor) Warn(msg string) {
	w.log(LevelWarn, msg)
}

// Error logs msg at LevelError.
func (w *LogMonitor) Error(msg string) {
	w.log(LevelError, msg)
}
// Debugf formats its arguments with fmt.Sprintf and logs the result at
// LevelDebug. The message is formatted even when the level is filtered out.
func (w *LogMonitor) Debugf(format string, args ...interface{}) {
	w.log(LevelDebug, fmt.Sprintf(format, args...))
}

// Infof formats its arguments with fmt.Sprintf and logs the result at
// LevelInfo.
func (w *LogMonitor) Infof(format string, args ...interface{}) {
	w.log(LevelInfo, fmt.Sprintf(format, args...))
}

// Warnf formats its arguments with fmt.Sprintf and logs the result at
// LevelWarn.
func (w *LogMonitor) Warnf(format string, args ...interface{}) {
	w.log(LevelWarn, fmt.Sprintf(format, args...))
}

// Errorf formats its arguments with fmt.Sprintf and logs the result at
// LevelError.
func (w *LogMonitor) Errorf(format string, args ...interface{}) {
	w.log(LevelError, fmt.Sprintf(format, args...))
}
// String returns the upper-case name of the level ("DEBUG", "INFO", "WARN"
// or "ERROR"), or "UNKNOWN" for any value that is not one of the declared
// levels.
func (l LogLevel) String() string {
	names := [...]string{
		LevelDebug: "DEBUG",
		LevelInfo:  "INFO",
		LevelWarn:  "WARN",
		LevelError: "ERROR",
	}
	if l < 0 || int(l) >= len(names) {
		return "UNKNOWN"
	}
	return names[l]
}
-316
View File
@@ -1,316 +0,0 @@
package proxy
import (
"bytes"
"io"
"strings"
"sync"
"testing"
"time"
)
// TestLogMonitor checks that three writes end up, in order, both in the
// monitor's history and in the data delivered to two independent
// subscribers.
func TestLogMonitor(t *testing.T) {
	logMonitor := NewLogMonitorWriter(io.Discard)

	// A WaitGroup is used to wait for all the expected writes to complete
	var wg sync.WaitGroup

	client1Messages := make([]byte, 0)
	client2Messages := make([]byte, 0)

	// Each subscription is cancelled when the test returns: OnLogData is
	// called right away and the function it returns is what gets deferred.
	defer logMonitor.OnLogData(func(data []byte) {
		client1Messages = append(client1Messages, data...)
		wg.Done()
	})()

	defer logMonitor.OnLogData(func(data []byte) {
		client2Messages = append(client2Messages, data...)
		wg.Done()
	})()

	// The counter is raised before the first write so that no callback can
	// call Done ahead of the matching Add.
	wg.Add(6) // 2 x 3 writes

	logMonitor.Write([]byte("1"))
	logMonitor.Write([]byte("2"))
	logMonitor.Write([]byte("3"))

	// wait for all writes to complete
	wg.Wait()

	// Check the buffer
	expectedHistory := "123"
	history := string(logMonitor.GetHistory())

	if history != expectedHistory {
		t.Errorf("Expected history: %s, got: %s", expectedHistory, history)
	}

	c1Data := string(client1Messages)
	if c1Data != expectedHistory {
		t.Errorf("Client1 expected %s, got: %s", expectedHistory, c1Data)
	}

	c2Data := string(client2Messages)
	if c2Data != expectedHistory {
		t.Errorf("Client2 expected %s, got: %s", expectedHistory, c2Data)
	}
}
// TestWrite_ImmutableBuffer checks that Write reports the full length and
// stores its own copy of the data, so that changing the caller's slice
// afterwards does not change the history.
func TestWrite_ImmutableBuffer(t *testing.T) {
	monitor := NewLogMonitorWriter(io.Discard)

	payload := []byte("Hello, World!")
	wantLen := len(payload)

	written, err := monitor.Write(payload)
	if err != nil {
		t.Fatalf("Write failed: %v", err)
	}
	if written != wantLen {
		t.Errorf("Expected %d bytes written but got %d", wantLen, written)
	}

	// Mutating the caller's slice must not reach the buffered copy.
	payload[0] = 'B'

	expected := []byte("Hello, World!")
	if history := monitor.GetHistory(); !bytes.Equal(history, expected) {
		t.Errorf("Expected history to be %q, got %q", expected, history)
	}
}
// TestWrite_LogTimeFormat checks that, with a time format set, a logged
// message starts with a timestamp that parses in that format.
func TestWrite_LogTimeFormat(t *testing.T) {
	monitor := NewLogMonitorWriter(io.Discard)
	monitor.timeFormat = time.RFC3339 // enable timestamps

	monitor.Info("Hello, World!")

	// The timestamp is the first whitespace-separated field of the line.
	fields := strings.Fields(string(monitor.GetHistory()))
	if len(fields) == 0 {
		t.Fatalf("Cannot extract string from history")
	}

	if _, err := time.Parse(time.RFC3339, fields[0]); err != nil {
		t.Fatalf("Cannot find timestamp: %v", err)
	}
}
// TestCircularBuffer_WrapAround writes to a 10-byte buffer step by step and
// checks the history after each write, covering filling up, overwriting the
// oldest data and a single write larger than the capacity.
func TestCircularBuffer_WrapAround(t *testing.T) {
	// Create a small buffer to test wrap-around
	cb := newCircularBuffer(10)

	steps := []struct {
		write string
		want  string
	}{
		{"hello", "hello"},                 // 5 bytes
		{"world", "helloworld"},            // 5 more bytes - buffer now full
		{"12345", "world12345"},            // overwrites "hello"
		{"abcdefghijklmnop", "ghijklmnop"}, // 16 bytes, only last 10 kept
	}
	for _, step := range steps {
		cb.Write([]byte(step.write))
		if got := string(cb.GetHistory()); got != step.want {
			t.Errorf("Expected '%s', got %q", step.want, got)
		}
	}
}
// TestCircularBuffer_BoundaryConditions covers an empty buffer, one write of
// exactly the capacity, and two writes that together reach the capacity.
func TestCircularBuffer_BoundaryConditions(t *testing.T) {
	const full = "1234567890"

	// Test empty buffer
	empty := newCircularBuffer(10)
	if got := empty.GetHistory(); got != nil {
		t.Errorf("Expected nil for empty buffer, got %q", got)
	}

	// Test exact capacity (same buffer as above)
	empty.Write([]byte("1234567890"))
	if got := string(empty.GetHistory()); got != full {
		t.Errorf("Expected '1234567890', got %q", got)
	}

	// Test write exactly at capacity boundary
	split := newCircularBuffer(10)
	for _, part := range []string{"12345", "67890"} {
		split.Write([]byte(part))
	}
	if got := string(split.GetHistory()); got != full {
		t.Errorf("Expected '1234567890', got %q", got)
	}
}
// TestLogMonitor_LazyInit checks that the history buffer does not exist
// before the first Write and is created by it.
func TestLogMonitor_LazyInit(t *testing.T) {
	monitor := NewLogMonitorWriter(io.Discard)

	// Before any write there is no buffer and therefore no history.
	if monitor.buffer != nil {
		t.Error("Expected buffer to be nil before first write")
	}
	if history := monitor.GetHistory(); history != nil {
		t.Errorf("Expected nil history before first write, got %q", history)
	}

	// The first write allocates the buffer and records the data.
	monitor.Write([]byte("test"))

	if monitor.buffer == nil {
		t.Error("Expected buffer to be initialized after write")
	}
	if history := string(monitor.GetHistory()); history != "test" {
		t.Errorf("Expected 'test', got %q", history)
	}
}
// TestLogMonitor_Clear checks that Clear releases the history buffer and
// that GetHistory returns nil afterwards.
func TestLogMonitor_Clear(t *testing.T) {
	monitor := NewLogMonitorWriter(io.Discard)

	// Put something in the history first.
	monitor.Write([]byte("hello"))
	if history := string(monitor.GetHistory()); history != "hello" {
		t.Errorf("Expected 'hello', got %q", history)
	}

	monitor.Clear()

	// Both the buffer and the visible history must be gone.
	if monitor.buffer != nil {
		t.Error("Expected buffer to be nil after Clear")
	}
	if history := monitor.GetHistory(); history != nil {
		t.Errorf("Expected nil history after Clear, got %q", history)
	}
}
// TestLogMonitor_ClearAndReuse checks that a monitor keeps working after
// Clear and that only data written after the Clear shows up in the history.
func TestLogMonitor_ClearAndReuse(t *testing.T) {
	monitor := NewLogMonitorWriter(io.Discard)

	monitor.Write([]byte("first"))
	monitor.Clear()
	monitor.Write([]byte("second"))

	history := string(monitor.GetHistory())
	if history != "second" {
		t.Errorf("Expected 'second' after clear and reuse, got %q", history)
	}
}
// BenchmarkLogMonitorWrite measures LogMonitor.Write for small, medium and
// large messages, Write with five subscribers attached, and GetHistory on a
// monitor that has received 1000 medium writes.
func BenchmarkLogMonitorWrite(b *testing.B) {
	// Test data of varying sizes
	smallMsg := []byte("small message\n")
	mediumMsg := []byte(strings.Repeat("medium message content ", 10) + "\n")
	largeMsg := []byte(strings.Repeat("large message content for benchmarking ", 100) + "\n")

	writeCases := []struct {
		name string
		msg  []byte
	}{
		{"SmallWrite", smallMsg},
		{"MediumWrite", mediumMsg},
		{"LargeWrite", largeMsg},
	}
	for _, tc := range writeCases {
		msg := tc.msg
		b.Run(tc.name, func(b *testing.B) {
			lm := NewLogMonitorWriter(io.Discard)
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				lm.Write(msg)
			}
		})
	}

	b.Run("WithSubscribers", func(b *testing.B) {
		lm := NewLogMonitorWriter(io.Discard)

		// Add some subscribers. The function passed to b.Run is invoked
		// several times with growing b.N, so every subscription is cancelled
		// when the invocation returns instead of being left behind.
		for i := 0; i < 5; i++ {
			cancel := lm.OnLogData(func(data []byte) {})
			defer cancel()
		}

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			lm.Write(mediumMsg)
		}
	})

	b.Run("GetHistory", func(b *testing.B) {
		lm := NewLogMonitorWriter(io.Discard)

		// Pre-populate with data
		for i := 0; i < 1000; i++ {
			lm.Write(mediumMsg)
		}

		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			lm.GetHistory()
		}
	})
}
/*
Benchmark Results - MBP M1 Pro
Before (ring.Ring):
| Benchmark | ns/op | bytes/op | allocs/op |
|---------------------------------|------------|----------|-----------|
| SmallWrite (14B) | 43 ns | 40 B | 2 |
| MediumWrite (241B) | 76 ns | 264 B | 2 |
| LargeWrite (4KB) | 504 ns | 4,120 B | 2 |
| WithSubscribers (5 subs) | 355 ns | 264 B | 2 |
| GetHistory (after 1000 writes) | 145,000 ns | 1.2 MB | 22 |
After (circularBuffer 10KB):
| Benchmark | ns/op | bytes/op | allocs/op |
|---------------------------------|------------|----------|-----------|
| SmallWrite (14B) | 26 ns | 16 B | 1 |
| MediumWrite (241B) | 67 ns | 240 B | 1 |
| LargeWrite (4KB) | 774 ns | 4,096 B | 1 |
| WithSubscribers (5 subs) | 325 ns | 240 B | 1 |
| GetHistory (after 1000 writes) | 1,042 ns | 10,240 B | 1 |
After (circularBuffer 100KB):
| Benchmark | ns/op | bytes/op | allocs/op |
|---------------------------------|------------|-----------|-----------|
| SmallWrite (14B) | 26 ns | 16 B | 1 |
| MediumWrite (241B) | 66 ns | 240 B | 1 |
| LargeWrite (4KB) | 753 ns | 4,096 B | 1 |
| WithSubscribers (5 subs) | 309 ns | 240 B | 1 |
| GetHistory (after 1000 writes) | 7,788 ns | 106,496 B | 1 |
Summary:
- GetHistory: 139x faster (10KB), 18x faster (100KB)
- Allocations: reduced from 2 to 1 per write, and from 22 to 1 for GetHistory
- Small/medium writes: ~1.1-1.6x faster; large (4KB) writes: ~1.5x slower (504 ns -> 753-774 ns)
*/
+5 -4
View File
@@ -7,6 +7,7 @@ import (
"sort"
"sync"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
)
@@ -145,8 +146,8 @@ type Matrix struct {
solver *MatrixSolver
processes map[string]*Process // all processes keyed by real model name
config config.Config
proxyLogger *LogMonitor
upstreamLogger *LogMonitor
proxyLogger *logmon.Monitor
upstreamLogger *logmon.Monitor
// inflight tracks ProxyRequest calls that have released m.Lock but may
// not yet have incremented Process.inFlightRequests. A concurrent
@@ -165,10 +166,10 @@ type Matrix struct {
// NewMatrix creates a Matrix from config. It creates a Process for every
// model defined in the config (any model can run alone even if not in a set).
func NewMatrix(cfg config.Config, proxyLogger, upstreamLogger *LogMonitor) *Matrix {
func NewMatrix(cfg config.Config, proxyLogger, upstreamLogger *logmon.Monitor) *Matrix {
processes := make(map[string]*Process)
for modelID, modelConfig := range cfg.Models {
processLogger := NewLogMonitorWriter(upstreamLogger)
processLogger := logmon.NewWriter(upstreamLogger)
process := NewProcess(modelID, cfg.HealthCheckTimeout, modelConfig, processLogger, proxyLogger)
processes[modelID] = process
}
+27 -26
View File
@@ -16,6 +16,8 @@ import (
"github.com/gin-gonic/gin"
"github.com/klauspost/compress/zstd"
"github.com/mostlygeek/llama-swap/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/ring"
"github.com/mostlygeek/llama-swap/proxy/cache"
"github.com/tidwall/gjson"
)
@@ -113,11 +115,10 @@ func (e ActivityLogEvent) Type() uint32 {
// metricsMonitor parses llama-server output for token statistics
type metricsMonitor struct {
mu sync.RWMutex
metrics []ActivityLogEntry
maxMetrics int
nextID int
logger *LogMonitor
mu sync.RWMutex
metrics ring.Buffer[ActivityLogEntry]
nextID int
logger *logmon.Monitor
// capture fields
enableCaptures bool
@@ -126,10 +127,10 @@ type metricsMonitor struct {
// newMetricsMonitor creates a new metricsMonitor. captureBufferMB is the
// capture buffer size in megabytes; 0 disables captures.
func newMetricsMonitor(logger *LogMonitor, maxMetrics int, captureBufferMB int) *metricsMonitor {
func newMetricsMonitor(logger *logmon.Monitor, maxMetrics int, captureBufferMB int) *metricsMonitor {
mm := &metricsMonitor{
logger: logger,
maxMetrics: maxMetrics,
metrics: ring.NewBuffer[ActivityLogEntry](maxMetrics),
enableCaptures: captureBufferMB > 0,
}
if captureBufferMB > 0 {
@@ -146,10 +147,7 @@ func (mp *metricsMonitor) queueMetrics(metric ActivityLogEntry) int {
metric.ID = mp.nextID
mp.nextID++
mp.metrics = append(mp.metrics, metric)
if len(mp.metrics) > mp.maxMetrics {
mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
}
mp.metrics.Push(metric)
return metric.ID
}
@@ -213,30 +211,36 @@ func (mp *metricsMonitor) getCaptureByID(id int) *ReqRespCapture {
return capture
}
// getMetrics returns a copy of the current metrics
// getMetrics returns a copy of the current metrics with HasCapture resolved from cache.
func (mp *metricsMonitor) getMetrics() []ActivityLogEntry {
mp.mu.RLock()
defer mp.mu.RUnlock()
result := make([]ActivityLogEntry, len(mp.metrics))
copy(result, mp.metrics)
result := mp.metrics.Slice()
if result == nil {
return []ActivityLogEntry{}
}
if mp.captureCache != nil {
for i := range result {
result[i].HasCapture = mp.captureCache.Has(result[i].ID)
}
}
return result
}
// getMetricsJSON returns metrics as JSON
// getMetricsJSON returns metrics as JSON with HasCapture resolved from cache.
func (mp *metricsMonitor) getMetricsJSON() ([]byte, error) {
mp.mu.RLock()
defer mp.mu.RUnlock()
if mp.captureCache == nil {
return json.Marshal(mp.metrics)
result := mp.metrics.Slice()
if result == nil {
return json.Marshal([]ActivityLogEntry{})
}
// Make a copy with up-to-date has_capture from cache
result := make([]ActivityLogEntry, len(mp.metrics))
for i, m := range mp.metrics {
m.HasCapture = mp.captureCache.Has(m.ID)
result[i] = m
if mp.captureCache != nil {
for i := range result {
result[i].HasCapture = mp.captureCache.Has(result[i].ID)
}
}
return json.Marshal(result)
}
@@ -412,9 +416,6 @@ func (mp *metricsMonitor) wrapHandler(
capture.ID = metricID
if mp.addCapture(*capture) {
tm.HasCapture = true
mp.mu.Lock()
mp.metrics[len(mp.metrics)-1].HasCapture = true
mp.mu.Unlock()
}
}
+2 -1
View File
@@ -10,6 +10,7 @@ import (
"strings"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
)
@@ -24,7 +25,7 @@ type PeerProxy struct {
proxyMap map[string]*peerProxyMember
}
func NewPeerProxy(peers config.PeerDictionaryConfig, proxyLogger *LogMonitor) (*PeerProxy, error) {
func NewPeerProxy(peers config.PeerDictionaryConfig, proxyLogger *logmon.Monitor) (*PeerProxy, error) {
proxyMap := make(map[string]*peerProxyMember)
// Sort peer IDs for consistent iteration order
+6 -5
View File
@@ -18,6 +18,7 @@ import (
"time"
"github.com/mostlygeek/llama-swap/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
)
@@ -53,8 +54,8 @@ type Process struct {
// closed when command exits
cmdWaitChan chan struct{}
processLogger *LogMonitor
proxyLogger *LogMonitor
processLogger *logmon.Monitor
proxyLogger *logmon.Monitor
healthCheckTimeout int
healthCheckLoopInterval time.Duration
@@ -84,7 +85,7 @@ type Process struct {
failedStartCount int
}
func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, processLogger *logmon.Monitor, proxyLogger *logmon.Monitor) *Process {
concurrentLimit := 10
if config.ConcurrencyLimit > 0 {
concurrentLimit = config.ConcurrencyLimit
@@ -149,7 +150,7 @@ func NewProcess(ID string, healthCheckTimeout int, config config.ModelConfig, pr
}
// LogMonitor returns the log monitor associated with the process.
func (p *Process) LogMonitor() *LogMonitor {
func (p *Process) LogMonitor() *logmon.Monitor {
return p.processLogger
}
@@ -726,7 +727,7 @@ func (p *Process) cmdStopUpstreamProcess() error {
}
// Logger returns the logger for this process.
func (p *Process) Logger() *LogMonitor {
func (p *Process) Logger() *logmon.Monitor {
return p.processLogger
}
+5 -4
View File
@@ -11,20 +11,21 @@ import (
"testing"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
"github.com/stretchr/testify/assert"
)
var (
debugLogger = NewLogMonitorWriter(os.Stdout)
debugLogger = logmon.NewWriter(os.Stdout)
)
func init() {
// flip to help with debugging tests
if false {
debugLogger.SetLogLevel(LevelDebug)
debugLogger.SetLogLevel(logmon.LevelDebug)
} else {
debugLogger.SetLogLevel(LevelError)
debugLogger.SetLogLevel(logmon.LevelError)
}
}
@@ -585,7 +586,7 @@ func TestProcess_CustomTimeouts(t *testing.T) {
},
}
debugLogger := NewLogMonitorWriter(io.Discard)
debugLogger := logmon.NewWriter(io.Discard)
process := NewProcess("test-model", 30, modelConfig, debugLogger, debugLogger)
// Verify the process was created successfully
+5 -4
View File
@@ -6,6 +6,7 @@ import (
"slices"
"sync"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/proxy/config"
)
@@ -18,8 +19,8 @@ type ProcessGroup struct {
exclusive bool
persistent bool
proxyLogger *LogMonitor
upstreamLogger *LogMonitor
proxyLogger *logmon.Monitor
upstreamLogger *logmon.Monitor
// map of current processes
processes map[string]*Process
@@ -42,7 +43,7 @@ type ProcessGroup struct {
testDelayFastPath func()
}
func NewProcessGroup(id string, config config.Config, proxyLogger *LogMonitor, upstreamLogger *LogMonitor) *ProcessGroup {
func NewProcessGroup(id string, config config.Config, proxyLogger *logmon.Monitor, upstreamLogger *logmon.Monitor) *ProcessGroup {
groupConfig, ok := config.Groups[id]
if !ok {
panic("Unable to find configuration for group id: " + id)
@@ -62,7 +63,7 @@ func NewProcessGroup(id string, config config.Config, proxyLogger *LogMonitor, u
// Create a Process for each member in the group
for _, modelID := range groupConfig.Members {
modelConfig, modelID, _ := pg.config.FindConfig(modelID)
processLogger := NewLogMonitorWriter(upstreamLogger)
processLogger := logmon.NewWriter(upstreamLogger)
process := NewProcess(modelID, pg.config.HealthCheckTimeout, modelConfig, processLogger, pg.proxyLogger)
pg.processes[modelID] = process
}
+46 -34
View File
@@ -17,6 +17,8 @@ import (
"github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/perf"
"github.com/mostlygeek/llama-swap/proxy/config"
"github.com/tidwall/gjson"
"github.com/tidwall/sjson"
@@ -69,11 +71,12 @@ type ProxyManager struct {
ginEngine *gin.Engine
// logging
proxyLogger *LogMonitor
upstreamLogger *LogMonitor
muxLogger *LogMonitor
proxyLogger *logmon.Monitor
upstreamLogger *logmon.Monitor
muxLogger *logmon.Monitor
metricsMonitor *metricsMonitor
perfMonitor *perf.Monitor
processGroups map[string]*ProcessGroup
@@ -98,27 +101,27 @@ type ProxyManager struct {
func New(proxyConfig config.Config) *ProxyManager {
// set up loggers
var muxLogger, upstreamLogger, proxyLogger *LogMonitor
var muxLogger, upstreamLogger, proxyLogger *logmon.Monitor
switch proxyConfig.LogToStdout {
case config.LogToStdoutNone:
muxLogger = NewLogMonitorWriter(io.Discard)
upstreamLogger = NewLogMonitorWriter(io.Discard)
proxyLogger = NewLogMonitorWriter(io.Discard)
muxLogger = logmon.NewWriter(io.Discard)
upstreamLogger = logmon.NewWriter(io.Discard)
proxyLogger = logmon.NewWriter(io.Discard)
case config.LogToStdoutBoth:
muxLogger = NewLogMonitorWriter(os.Stdout)
upstreamLogger = NewLogMonitorWriter(muxLogger)
proxyLogger = NewLogMonitorWriter(muxLogger)
muxLogger = logmon.NewWriter(os.Stdout)
upstreamLogger = logmon.NewWriter(muxLogger)
proxyLogger = logmon.NewWriter(muxLogger)
case config.LogToStdoutUpstream:
muxLogger = NewLogMonitorWriter(os.Stdout)
upstreamLogger = NewLogMonitorWriter(muxLogger)
proxyLogger = NewLogMonitorWriter(io.Discard)
muxLogger = logmon.NewWriter(os.Stdout)
upstreamLogger = logmon.NewWriter(muxLogger)
proxyLogger = logmon.NewWriter(io.Discard)
default:
// same as config.LogToStdoutProxy
// helpful because some old tests create a config.Config directly and it
// may not have LogToStdout set explicitly
muxLogger = NewLogMonitorWriter(os.Stdout)
upstreamLogger = NewLogMonitorWriter(io.Discard)
proxyLogger = NewLogMonitorWriter(muxLogger)
muxLogger = logmon.NewWriter(os.Stdout)
upstreamLogger = logmon.NewWriter(io.Discard)
proxyLogger = logmon.NewWriter(muxLogger)
}
if proxyConfig.LogRequests {
@@ -127,20 +130,20 @@ func New(proxyConfig config.Config) *ProxyManager {
switch strings.ToLower(strings.TrimSpace(proxyConfig.LogLevel)) {
case "debug":
proxyLogger.SetLogLevel(LevelDebug)
upstreamLogger.SetLogLevel(LevelDebug)
proxyLogger.SetLogLevel(logmon.LevelDebug)
upstreamLogger.SetLogLevel(logmon.LevelDebug)
case "info":
proxyLogger.SetLogLevel(LevelInfo)
upstreamLogger.SetLogLevel(LevelInfo)
proxyLogger.SetLogLevel(logmon.LevelInfo)
upstreamLogger.SetLogLevel(logmon.LevelInfo)
case "warn":
proxyLogger.SetLogLevel(LevelWarn)
upstreamLogger.SetLogLevel(LevelWarn)
proxyLogger.SetLogLevel(logmon.LevelWarn)
upstreamLogger.SetLogLevel(logmon.LevelWarn)
case "error":
proxyLogger.SetLogLevel(LevelError)
upstreamLogger.SetLogLevel(LevelError)
proxyLogger.SetLogLevel(logmon.LevelError)
upstreamLogger.SetLogLevel(logmon.LevelError)
default:
proxyLogger.SetLogLevel(LevelInfo)
upstreamLogger.SetLogLevel(LevelInfo)
proxyLogger.SetLogLevel(logmon.LevelInfo)
upstreamLogger.SetLogLevel(logmon.LevelInfo)
}
// see: https://go.dev/src/time/format.go
@@ -271,13 +274,17 @@ func (pm *ProxyManager) setupGinEngine() {
pm.ginEngine.Use(func(c *gin.Context) {
// don't log the Wake on LAN proxy health check, the performance API, or Prometheus scrapes
if c.Request.URL.Path == "/wol-health" {
c.Next()
return
for _, prefix := range []string{
"/wol-health",
"/api/performance",
"/metrics",
} {
if strings.HasPrefix(c.Request.URL.Path, prefix) {
c.Next()
return
}
}
// Start timer
start := time.Now()
// capture these because /upstream/:model rewrites them in c.Next()
@@ -285,12 +292,9 @@ func (pm *ProxyManager) setupGinEngine() {
method := c.Request.Method
path := c.Request.URL.Path
// Process request
c.Next()
// Stop timer
duration := time.Since(start)
statusCode := c.Writer.Status()
bodySize := c.Writer.Size()
@@ -439,6 +443,8 @@ func (pm *ProxyManager) setupGinEngine() {
c.String(http.StatusOK, "OK")
})
pm.ginEngine.GET("/metrics", pm.prometheusMetricsHandler)
// see cmd/wol-proxy/wol-proxy.go, not logged
pm.ginEngine.GET("/wol-health", func(c *gin.Context) {
c.String(http.StatusOK, "OK")
@@ -1218,3 +1224,9 @@ func (pm *ProxyManager) SetVersion(buildDate string, commit string, version stri
pm.commit = commit
pm.version = version
}
// SetPerfMonitor stores m as the performance monitor of the proxy manager.
// The field is written while holding the manager's lock. The performance
// handlers answer 503 Service Unavailable while the stored monitor is nil.
func (pm *ProxyManager) SetPerfMonitor(m *perf.Monitor) {
	pm.Lock()
	pm.perfMonitor = m
	pm.Unlock()
}
+54 -1
View File
@@ -8,9 +8,11 @@ import (
"sort"
"strconv"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/event"
"github.com/mostlygeek/llama-swap/internal/perf"
)
type Model struct {
@@ -32,6 +34,7 @@ func addApiHandlers(pm *ProxyManager) {
apiGroup.POST("/models/unload/*model", pm.apiUnloadSingleModelHandler)
apiGroup.GET("/events", pm.apiSendEvents)
apiGroup.GET("/metrics", pm.apiGetMetrics)
apiGroup.GET("/performance", pm.apiGetPerformance)
apiGroup.GET("/version", pm.apiGetVersion)
apiGroup.GET("/captures/:id", pm.apiGetCapture)
}
@@ -247,6 +250,56 @@ func (pm *ProxyManager) apiGetMetrics(c *gin.Context) {
c.Data(http.StatusOK, "application/json", jsonData)
}
// prometheusMetricsHandler delegates the request to the HTTP handler returned
// by the performance monitor's MetricsHandler method. When no monitor is
// installed it replies with 503 Service Unavailable and a single comment line.
func (pm *ProxyManager) prometheusMetricsHandler(c *gin.Context) {
	// SetPerfMonitor writes this field under the manager's lock, so read it
	// under the same lock. Taking one snapshot also guarantees that the nil
	// check and the call below operate on the same value.
	pm.Lock()
	monitor := pm.perfMonitor
	pm.Unlock()

	if monitor == nil {
		c.String(http.StatusServiceUnavailable, "# performance monitor not available\n")
		return
	}

	monitor.MetricsHandler().ServeHTTP(c.Writer, c.Request)
}
// apiGetPerformance returns the system and GPU samples currently held by the
// performance monitor as a JSON object with the keys "sys_stats" and
// "gpu_stats".
//
// The optional "after" query parameter must be an RFC3339 timestamp. When it
// is present, only samples whose Timestamp is strictly after it are returned.
//
// Responses:
//   - 503 Service Unavailable when no performance monitor is installed
//   - 400 Bad Request when "after" cannot be parsed
//   - 200 OK with the (possibly empty) sample lists otherwise
func (pm *ProxyManager) apiGetPerformance(c *gin.Context) {
	// SetPerfMonitor writes this field under the manager's lock, so read it
	// under the same lock and work with a single snapshot from here on.
	pm.Lock()
	monitor := pm.perfMonitor
	pm.Unlock()

	if monitor == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "performance monitor not available"})
		return
	}

	// Validate the query before asking the monitor for data so that a
	// malformed request is rejected without taking a snapshot.
	var after time.Time
	if afterStr := c.Query("after"); afterStr != "" {
		ts, err := time.Parse(time.RFC3339, afterStr)
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "invalid 'after' timestamp, use RFC3339 format"})
			return
		}
		after = ts
	}

	sysStats, gpuStats := monitor.Current()

	if !after.IsZero() {
		// Filter into fresh slices; the slices returned by Current are left
		// untouched.
		filtered := make([]perf.SysStat, 0, len(sysStats))
		for _, s := range sysStats {
			if s.Timestamp.After(after) {
				filtered = append(filtered, s)
			}
		}
		sysStats = filtered

		filteredGpu := make([]perf.GpuStat, 0, len(gpuStats))
		for _, g := range gpuStats {
			if g.Timestamp.After(after) {
				filteredGpu = append(filteredGpu, g)
			}
		}
		gpuStats = filteredGpu
	}

	// A nil slice is encoded as JSON null. Normalize to empty slices so the
	// response always carries arrays, matching the filtered path above.
	if sysStats == nil {
		sysStats = []perf.SysStat{}
	}
	if gpuStats == nil {
		gpuStats = []perf.GpuStat{}
	}

	c.JSON(http.StatusOK, gin.H{
		"sys_stats": sysStats,
		"gpu_stats": gpuStats,
	})
}
func (pm *ProxyManager) apiUnloadSingleModelHandler(c *gin.Context) {
requestedModel := strings.TrimPrefix(c.Param("model"), "/")
realModelName, found := pm.config.RealModelName(requestedModel)
@@ -291,7 +344,7 @@ func (pm *ProxyManager) apiGetCapture(c *gin.Context) {
}
capture := pm.metricsMonitor.getCaptureByID(id)
if capture == nil {
if capture == nil || (capture.ReqPath == "" && capture.ReqHeaders == nil && capture.ReqBody == nil && capture.RespHeaders == nil && capture.RespBody == nil) {
c.JSON(http.StatusNotFound, gin.H{"error": "capture not found"})
return
}
+2 -1
View File
@@ -7,6 +7,7 @@ import (
"strings"
"github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/internal/logmon"
)
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
@@ -89,7 +90,7 @@ func (pm *ProxyManager) streamLogsHandler(c *gin.Context) {
}
// getLogger searches for the appropriate logger based on the logMonitorId
func (pm *ProxyManager) getLogger(logMonitorId string) (*LogMonitor, error) {
func (pm *ProxyManager) getLogger(logMonitorId string) (*logmon.Monitor, error) {
switch logMonitorId {
case "":
// maintain the default