proxy,ui: add performance monitoring with Prometheus metrics (#743)
Add a comprehensive performance monitoring system that collects CPU, memory, swap, load average, network IO, and GPU stats. Provides both a REST API for the UI and a Prometheus /metrics endpoint. Backend changes: - New internal/perf package with configurable interval-based stats collection - GPU monitoring via LACT (Unix socket) and nvidia-smi fallback on Linux - Ring buffer (internal/ring) for time-series stat storage - Prometheus /metrics endpoint with all system and GPU metrics - Moved LogMonitor to internal/logmon package - New PerformanceConfig for hot-reloadable monitoring settings - REST /api/performance endpoint replacing SSE streaming UI changes: - New Performance page with real-time charts for CPU, memory, GPU, and network - Reusable PerformanceChart component - LLAMA_SWAP_URL environment variable support - Improved capture dialog display Other: - Example Grafana dashboard for Prometheus metrics - monitor-test standalone binary - Config schema and example updates fixes #596
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/billziss-gh/golib/shlex"
|
||||
"gopkg.in/yaml.v3"
|
||||
@@ -124,6 +125,7 @@ type Config struct {
|
||||
LogToStdout string `yaml:"logToStdout"`
|
||||
MetricsMaxInMemory int `yaml:"metricsMaxInMemory"`
|
||||
CaptureBuffer int `yaml:"captureBuffer"`
|
||||
Performance PerformanceConfig `yaml:"performance"`
|
||||
GlobalTTL int `yaml:"globalTTL"`
|
||||
Models map[string]ModelConfig `yaml:"models"` /* key is model ID */
|
||||
Profiles map[string][]string `yaml:"profiles"`
|
||||
@@ -220,6 +222,17 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
config.HealthCheckTimeout = 15
|
||||
}
|
||||
|
||||
// Apply defaults for performance config when section is missing
|
||||
if !config.Performance.Enable && config.Performance.Every == 0 && config.Performance.MaxAge == 0 && config.Performance.GC == 0 {
|
||||
config.Performance.Enable = true
|
||||
config.Performance.Every = 15 * time.Second
|
||||
config.Performance.MaxAge = 1 * time.Hour
|
||||
config.Performance.GC = 5 * time.Minute
|
||||
}
|
||||
if err = config.Performance.Validate(); err != nil {
|
||||
return Config{}, fmt.Errorf("performance: %w", err)
|
||||
}
|
||||
|
||||
if config.StartPort < 1 {
|
||||
return Config{}, fmt.Errorf("startPort must be greater than 1")
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@@ -229,6 +230,12 @@ groups:
|
||||
HealthCheckTimeout: 15,
|
||||
MetricsMaxInMemory: 1000,
|
||||
CaptureBuffer: 5,
|
||||
Performance: PerformanceConfig{
|
||||
Enable: true,
|
||||
Every: 15 * time.Second,
|
||||
MaxAge: 1 * time.Hour,
|
||||
GC: 5 * time.Minute,
|
||||
},
|
||||
Profiles: map[string][]string{
|
||||
"test": {"model1", "model2"},
|
||||
},
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@@ -218,6 +219,12 @@ groups:
|
||||
HealthCheckTimeout: 15,
|
||||
MetricsMaxInMemory: 1000,
|
||||
CaptureBuffer: 5,
|
||||
Performance: PerformanceConfig{
|
||||
Enable: true,
|
||||
Every: 15 * time.Second,
|
||||
MaxAge: 1 * time.Hour,
|
||||
GC: 5 * time.Minute,
|
||||
},
|
||||
Profiles: map[string][]string{
|
||||
"test": {"model1", "model2"},
|
||||
},
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PerformanceConfig holds configuration for system performance monitoring
|
||||
type PerformanceConfig struct {
|
||||
Enable bool `yaml:"enable"`
|
||||
Every time.Duration `yaml:"every"`
|
||||
MaxAge time.Duration `yaml:"maxAge"`
|
||||
GC time.Duration `yaml:"gc"`
|
||||
}
|
||||
|
||||
func (p *PerformanceConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type rawPerformanceConfig PerformanceConfig
|
||||
defaults := rawPerformanceConfig{
|
||||
Enable: true,
|
||||
Every: 15 * time.Second,
|
||||
MaxAge: 1 * time.Hour,
|
||||
GC: 5 * time.Minute,
|
||||
}
|
||||
|
||||
if err := unmarshal(&defaults); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
*p = PerformanceConfig(defaults)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Validate checks the PerformanceConfig values and returns an error if invalid
|
||||
func (p *PerformanceConfig) Validate() error {
|
||||
if p.Every < time.Second {
|
||||
return fmt.Errorf("every must be at least 1s, got %v", p.Every)
|
||||
}
|
||||
if p.MaxAge <= 0 {
|
||||
return fmt.Errorf("maxAge must be greater than 0, got %v", p.MaxAge)
|
||||
}
|
||||
if p.GC <= 0 {
|
||||
return fmt.Errorf("gc must be greater than 0, got %v", p.GC)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestPerformanceConfig_Defaults(t *testing.T) {
|
||||
content := `
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
|
||||
// When performance section is missing, defaults should be applied
|
||||
assert.True(t, config.Performance.Enable)
|
||||
assert.Equal(t, 15*time.Second, config.Performance.Every)
|
||||
assert.Equal(t, 1*time.Hour, config.Performance.MaxAge)
|
||||
assert.Equal(t, 5*time.Minute, config.Performance.GC)
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_CustomValues(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
enable: true
|
||||
every: 30s
|
||||
maxAge: 12h
|
||||
gc: 10m
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.True(t, config.Performance.Enable)
|
||||
assert.Equal(t, 30*time.Second, config.Performance.Every)
|
||||
assert.Equal(t, 12*time.Hour, config.Performance.MaxAge)
|
||||
assert.Equal(t, 10*time.Minute, config.Performance.GC)
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_Disabled(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
enable: false
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.False(t, config.Performance.Enable)
|
||||
// Duration defaults should still apply
|
||||
assert.Equal(t, 15*time.Second, config.Performance.Every)
|
||||
assert.Equal(t, 1*time.Hour, config.Performance.MaxAge)
|
||||
assert.Equal(t, 5*time.Minute, config.Performance.GC)
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_PartialValues(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
every: 10s
|
||||
maxAge: 6h
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
|
||||
// enable should default to true
|
||||
assert.True(t, config.Performance.Enable)
|
||||
assert.Equal(t, 10*time.Second, config.Performance.Every)
|
||||
assert.Equal(t, 6*time.Hour, config.Performance.MaxAge)
|
||||
// gc should use default
|
||||
assert.Equal(t, 5*time.Minute, config.Performance.GC)
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_InvalidEvery(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
every: 500ms
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "every must be at least 1s")
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_InvalidMaxAge(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
maxAge: 0s
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "maxAge must be greater than 0")
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_InvalidGC(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
gc: 0s
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
_, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "gc must be greater than 0")
|
||||
}
|
||||
|
||||
func TestPerformanceConfig_ComplexDurations(t *testing.T) {
|
||||
content := `
|
||||
performance:
|
||||
every: 1m30s
|
||||
maxAge: 2h10m
|
||||
gc: 1m
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --port ${PORT}
|
||||
`
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, 90*time.Second, config.Performance.Every)
|
||||
assert.Equal(t, (2*time.Hour)+(10*time.Minute), config.Performance.MaxAge)
|
||||
assert.Equal(t, 1*time.Minute, config.Performance.GC)
|
||||
}
|
||||
Reference in New Issue
Block a user