proxy,ui: add performance monitoring with Prometheus metrics (#743)
Add a comprehensive performance monitoring system that collects CPU, memory, swap, load average, network IO, and GPU stats. Provides both a REST API for the UI and a Prometheus /metrics endpoint. Backend changes: - New internal/perf package with configurable interval-based stats collection - GPU monitoring via LACT (Unix socket) and nvidia-smi fallback on Linux - Ring buffer (internal/ring) for time-series stat storage - Prometheus /metrics endpoint with all system and GPU metrics - Moved LogMonitor to internal/logmon package - New PerformanceConfig for hot-reloadable monitoring settings - REST /api/performance endpoint replacing SSE streaming UI changes: - New Performance page with real-time charts for CPU, memory, GPU, and network - Reusable PerformanceChart component - LLAMA_SWAP_URL environment variable support - Improved capture dialog display Other: - Example Grafana dashboard for Prometheus metrics - monitor-test standalone binary - Config schema and example updates fixes #596
This commit is contained in:
@@ -0,0 +1,251 @@
|
||||
package logmon
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/event"
|
||||
)
|
||||
|
||||
const DataEventID = 0x04
|
||||
|
||||
type DataEvent struct {
|
||||
Data []byte
|
||||
}
|
||||
|
||||
func (e DataEvent) Type() uint32 {
|
||||
return DataEventID
|
||||
}
|
||||
|
||||
// circularBuffer is a fixed-size circular byte buffer that overwrites
|
||||
// oldest data when full. It provides O(1) writes and O(n) reads.
|
||||
type circularBuffer struct {
|
||||
data []byte
|
||||
head int
|
||||
size int
|
||||
}
|
||||
|
||||
func newCircularBuffer(capacity int) *circularBuffer {
|
||||
return &circularBuffer{
|
||||
data: make([]byte, capacity),
|
||||
head: 0,
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *circularBuffer) Write(p []byte) {
|
||||
if len(p) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
cap := len(cb.data)
|
||||
|
||||
if len(p) >= cap {
|
||||
copy(cb.data, p[len(p)-cap:])
|
||||
cb.head = 0
|
||||
cb.size = cap
|
||||
return
|
||||
}
|
||||
|
||||
firstPart := cap - cb.head
|
||||
if firstPart >= len(p) {
|
||||
copy(cb.data[cb.head:], p)
|
||||
cb.head = (cb.head + len(p)) % cap
|
||||
} else {
|
||||
copy(cb.data[cb.head:], p[:firstPart])
|
||||
copy(cb.data[:len(p)-firstPart], p[firstPart:])
|
||||
cb.head = len(p) - firstPart
|
||||
}
|
||||
|
||||
cb.size += len(p)
|
||||
if cb.size > cap {
|
||||
cb.size = cap
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *circularBuffer) GetHistory() []byte {
|
||||
if cb.size == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
result := make([]byte, cb.size)
|
||||
cap := len(cb.data)
|
||||
|
||||
start := (cb.head - cb.size + cap) % cap
|
||||
|
||||
if start+cb.size <= cap {
|
||||
copy(result, cb.data[start:start+cb.size])
|
||||
} else {
|
||||
firstPart := cap - start
|
||||
copy(result[:firstPart], cb.data[start:])
|
||||
copy(result[firstPart:], cb.data[:cb.size-firstPart])
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
type Level int
|
||||
|
||||
const (
|
||||
LevelDebug Level = iota
|
||||
LevelInfo
|
||||
LevelWarn
|
||||
LevelError
|
||||
|
||||
BufferSize = 100 * 1024
|
||||
)
|
||||
|
||||
type Monitor struct {
|
||||
eventbus *event.Dispatcher
|
||||
mu sync.RWMutex
|
||||
buffer *circularBuffer
|
||||
bufferMu sync.RWMutex
|
||||
|
||||
stdout io.Writer
|
||||
|
||||
level Level
|
||||
prefix string
|
||||
timeFormat string
|
||||
}
|
||||
|
||||
func New() *Monitor {
|
||||
return NewWriter(os.Stdout)
|
||||
}
|
||||
|
||||
func NewWriter(stdout io.Writer) *Monitor {
|
||||
return &Monitor{
|
||||
eventbus: event.NewDispatcherConfig(1000),
|
||||
buffer: nil,
|
||||
stdout: stdout,
|
||||
level: LevelInfo,
|
||||
prefix: "",
|
||||
timeFormat: "",
|
||||
}
|
||||
}
|
||||
|
||||
func (w *Monitor) Write(p []byte) (n int, err error) {
|
||||
if len(p) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
n, err = w.stdout.Write(p)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
w.bufferMu.Lock()
|
||||
if w.buffer == nil {
|
||||
w.buffer = newCircularBuffer(BufferSize)
|
||||
}
|
||||
w.buffer.Write(p)
|
||||
w.bufferMu.Unlock()
|
||||
|
||||
bufferCopy := make([]byte, len(p))
|
||||
copy(bufferCopy, p)
|
||||
w.broadcast(bufferCopy)
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (w *Monitor) GetHistory() []byte {
|
||||
w.bufferMu.RLock()
|
||||
defer w.bufferMu.RUnlock()
|
||||
if w.buffer == nil {
|
||||
return nil
|
||||
}
|
||||
return w.buffer.GetHistory()
|
||||
}
|
||||
|
||||
// Clear releases the buffer memory, making it eligible for GC.
|
||||
// The buffer will be lazily re-allocated on the next Write.
|
||||
func (w *Monitor) Clear() {
|
||||
w.bufferMu.Lock()
|
||||
w.buffer = nil
|
||||
w.bufferMu.Unlock()
|
||||
}
|
||||
|
||||
func (w *Monitor) OnLogData(callback func(data []byte)) context.CancelFunc {
|
||||
return event.Subscribe(w.eventbus, func(e DataEvent) {
|
||||
callback(e.Data)
|
||||
})
|
||||
}
|
||||
|
||||
func (w *Monitor) broadcast(msg []byte) {
|
||||
event.Publish(w.eventbus, DataEvent{Data: msg})
|
||||
}
|
||||
|
||||
func (w *Monitor) SetPrefix(prefix string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.prefix = prefix
|
||||
}
|
||||
|
||||
func (w *Monitor) SetLogLevel(level Level) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.level = level
|
||||
}
|
||||
|
||||
func (w *Monitor) SetLogTimeFormat(timeFormat string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.timeFormat = timeFormat
|
||||
}
|
||||
|
||||
func (w *Monitor) formatMessage(level string, msg string) []byte {
|
||||
prefix := ""
|
||||
if w.prefix != "" {
|
||||
prefix = fmt.Sprintf("[%s] ", w.prefix)
|
||||
}
|
||||
timestamp := ""
|
||||
if w.timeFormat != "" {
|
||||
timestamp = fmt.Sprintf("%s ", time.Now().Format(w.timeFormat))
|
||||
}
|
||||
return fmt.Appendf(nil, "%s%s[%s] %s\n", timestamp, prefix, level, msg)
|
||||
}
|
||||
|
||||
func (w *Monitor) log(level Level, msg string) {
|
||||
if level < w.level {
|
||||
return
|
||||
}
|
||||
w.Write(w.formatMessage(level.String(), msg))
|
||||
}
|
||||
|
||||
func (w *Monitor) Debug(msg string) { w.log(LevelDebug, msg) }
|
||||
func (w *Monitor) Info(msg string) { w.log(LevelInfo, msg) }
|
||||
func (w *Monitor) Warn(msg string) { w.log(LevelWarn, msg) }
|
||||
func (w *Monitor) Error(msg string) { w.log(LevelError, msg) }
|
||||
|
||||
func (w *Monitor) Debugf(format string, args ...any) {
|
||||
w.log(LevelDebug, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func (w *Monitor) Infof(format string, args ...any) {
|
||||
w.log(LevelInfo, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func (w *Monitor) Warnf(format string, args ...any) {
|
||||
w.log(LevelWarn, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func (w *Monitor) Errorf(format string, args ...any) {
|
||||
w.log(LevelError, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func (l Level) String() string {
|
||||
switch l {
|
||||
case LevelDebug:
|
||||
return "DEBUG"
|
||||
case LevelInfo:
|
||||
return "INFO"
|
||||
case LevelWarn:
|
||||
return "WARN"
|
||||
case LevelError:
|
||||
return "ERROR"
|
||||
default:
|
||||
return "UNKNOWN"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,250 @@
|
||||
package logmon
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLogMonitor(t *testing.T) {
|
||||
logMonitor := NewWriter(io.Discard)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
client1Messages := make([]byte, 0)
|
||||
client2Messages := make([]byte, 0)
|
||||
|
||||
defer logMonitor.OnLogData(func(data []byte) {
|
||||
client1Messages = append(client1Messages, data...)
|
||||
wg.Done()
|
||||
})()
|
||||
|
||||
defer logMonitor.OnLogData(func(data []byte) {
|
||||
client2Messages = append(client2Messages, data...)
|
||||
wg.Done()
|
||||
})()
|
||||
|
||||
wg.Add(6) // 2 x 3 writes
|
||||
|
||||
logMonitor.Write([]byte("1"))
|
||||
logMonitor.Write([]byte("2"))
|
||||
logMonitor.Write([]byte("3"))
|
||||
|
||||
wg.Wait()
|
||||
|
||||
expectedHistory := "123"
|
||||
history := string(logMonitor.GetHistory())
|
||||
|
||||
if history != expectedHistory {
|
||||
t.Errorf("Expected history: %s, got: %s", expectedHistory, history)
|
||||
}
|
||||
|
||||
c1Data := string(client1Messages)
|
||||
if c1Data != expectedHistory {
|
||||
t.Errorf("Client1 expected %s, got: %s", expectedHistory, c1Data)
|
||||
}
|
||||
|
||||
c2Data := string(client2Messages)
|
||||
if c2Data != expectedHistory {
|
||||
t.Errorf("Client2 expected %s, got: %s", expectedHistory, c2Data)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWrite_ImmutableBuffer(t *testing.T) {
|
||||
lm := NewWriter(io.Discard)
|
||||
|
||||
msg := []byte("Hello, World!")
|
||||
lenmsg := len(msg)
|
||||
|
||||
n, err := lm.Write(msg)
|
||||
if err != nil {
|
||||
t.Fatalf("Write failed: %v", err)
|
||||
}
|
||||
|
||||
if n != lenmsg {
|
||||
t.Errorf("Expected %d bytes written but got %d", lenmsg, n)
|
||||
}
|
||||
|
||||
msg[0] = 'B'
|
||||
|
||||
history := lm.GetHistory()
|
||||
|
||||
expected := []byte("Hello, World!")
|
||||
if !bytes.Equal(history, expected) {
|
||||
t.Errorf("Expected history to be %q, got %q", expected, history)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWrite_LogTimeFormat(t *testing.T) {
|
||||
lm := NewWriter(io.Discard)
|
||||
|
||||
lm.timeFormat = time.RFC3339
|
||||
|
||||
lm.Info("Hello, World!")
|
||||
|
||||
history := lm.GetHistory()
|
||||
|
||||
timestamp := ""
|
||||
fields := strings.Fields(string(history))
|
||||
if len(fields) > 0 {
|
||||
timestamp = fields[0]
|
||||
} else {
|
||||
t.Fatalf("Cannot extract string from history")
|
||||
}
|
||||
|
||||
_, err := time.Parse(time.RFC3339, timestamp)
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot find timestamp: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircularBuffer_WrapAround(t *testing.T) {
|
||||
cb := newCircularBuffer(10)
|
||||
|
||||
cb.Write([]byte("hello"))
|
||||
if got := string(cb.GetHistory()); got != "hello" {
|
||||
t.Errorf("Expected 'hello', got %q", got)
|
||||
}
|
||||
|
||||
cb.Write([]byte("world"))
|
||||
if got := string(cb.GetHistory()); got != "helloworld" {
|
||||
t.Errorf("Expected 'helloworld', got %q", got)
|
||||
}
|
||||
|
||||
cb.Write([]byte("12345"))
|
||||
if got := string(cb.GetHistory()); got != "world12345" {
|
||||
t.Errorf("Expected 'world12345', got %q", got)
|
||||
}
|
||||
|
||||
cb.Write([]byte("abcdefghijklmnop"))
|
||||
if got := string(cb.GetHistory()); got != "ghijklmnop" {
|
||||
t.Errorf("Expected 'ghijklmnop', got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircularBuffer_BoundaryConditions(t *testing.T) {
|
||||
cb := newCircularBuffer(10)
|
||||
if got := cb.GetHistory(); got != nil {
|
||||
t.Errorf("Expected nil for empty buffer, got %q", got)
|
||||
}
|
||||
|
||||
cb.Write([]byte("1234567890"))
|
||||
if got := string(cb.GetHistory()); got != "1234567890" {
|
||||
t.Errorf("Expected '1234567890', got %q", got)
|
||||
}
|
||||
|
||||
cb = newCircularBuffer(10)
|
||||
cb.Write([]byte("12345"))
|
||||
cb.Write([]byte("67890"))
|
||||
if got := string(cb.GetHistory()); got != "1234567890" {
|
||||
t.Errorf("Expected '1234567890', got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogMonitor_LazyInit(t *testing.T) {
|
||||
lm := NewWriter(io.Discard)
|
||||
|
||||
if lm.buffer != nil {
|
||||
t.Error("Expected buffer to be nil before first write")
|
||||
}
|
||||
|
||||
if got := lm.GetHistory(); got != nil {
|
||||
t.Errorf("Expected nil history before first write, got %q", got)
|
||||
}
|
||||
|
||||
lm.Write([]byte("test"))
|
||||
|
||||
if lm.buffer == nil {
|
||||
t.Error("Expected buffer to be initialized after write")
|
||||
}
|
||||
|
||||
if got := string(lm.GetHistory()); got != "test" {
|
||||
t.Errorf("Expected 'test', got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogMonitor_Clear(t *testing.T) {
|
||||
lm := NewWriter(io.Discard)
|
||||
|
||||
lm.Write([]byte("hello"))
|
||||
if got := string(lm.GetHistory()); got != "hello" {
|
||||
t.Errorf("Expected 'hello', got %q", got)
|
||||
}
|
||||
|
||||
lm.Clear()
|
||||
|
||||
if lm.buffer != nil {
|
||||
t.Error("Expected buffer to be nil after Clear")
|
||||
}
|
||||
|
||||
if got := lm.GetHistory(); got != nil {
|
||||
t.Errorf("Expected nil history after Clear, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogMonitor_ClearAndReuse(t *testing.T) {
|
||||
lm := NewWriter(io.Discard)
|
||||
|
||||
lm.Write([]byte("first"))
|
||||
lm.Clear()
|
||||
lm.Write([]byte("second"))
|
||||
|
||||
if got := string(lm.GetHistory()); got != "second" {
|
||||
t.Errorf("Expected 'second' after clear and reuse, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkLogMonitorWrite(b *testing.B) {
|
||||
smallMsg := []byte("small message\n")
|
||||
mediumMsg := []byte(strings.Repeat("medium message content ", 10) + "\n")
|
||||
largeMsg := []byte(strings.Repeat("large message content for benchmarking ", 100) + "\n")
|
||||
|
||||
b.Run("SmallWrite", func(b *testing.B) {
|
||||
lm := NewWriter(io.Discard)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
lm.Write(smallMsg)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("MediumWrite", func(b *testing.B) {
|
||||
lm := NewWriter(io.Discard)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
lm.Write(mediumMsg)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("LargeWrite", func(b *testing.B) {
|
||||
lm := NewWriter(io.Discard)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
lm.Write(largeMsg)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("WithSubscribers", func(b *testing.B) {
|
||||
lm := NewWriter(io.Discard)
|
||||
for i := 0; i < 5; i++ {
|
||||
lm.OnLogData(func(data []byte) {})
|
||||
}
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
lm.Write(mediumMsg)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("GetHistory", func(b *testing.B) {
|
||||
lm := NewWriter(io.Discard)
|
||||
for i := 0; i < 1000; i++ {
|
||||
lm.Write(mediumMsg)
|
||||
}
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
lm.GetHistory()
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,210 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/internal/ring"
|
||||
"github.com/mostlygeek/llama-swap/proxy/config"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrNotImplemented = errors.New("Not Implemented")
|
||||
ErrNoGpuTool = errors.New("no GPU monitoring tool available")
|
||||
)
|
||||
|
||||
type Monitor struct {
|
||||
mutex sync.RWMutex
|
||||
log *logmon.Monitor
|
||||
conf config.PerformanceConfig
|
||||
sysRing ring.Buffer[SysStat]
|
||||
gpuRing ring.Buffer[[]GpuStat]
|
||||
|
||||
stopCtx context.Context
|
||||
stopCancel context.CancelFunc
|
||||
|
||||
sysListeners map[chan SysStat]struct{}
|
||||
gpuListeners map[chan []GpuStat]struct{}
|
||||
}
|
||||
|
||||
func ringCapacity(c config.PerformanceConfig) int {
|
||||
n := int(c.MaxAge / c.Every)
|
||||
if n < 1 {
|
||||
n = 1
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func New(c config.PerformanceConfig, logger *logmon.Monitor) (*Monitor, error) {
|
||||
|
||||
if c.Every < 100*time.Millisecond {
|
||||
c.Every = 100 * time.Millisecond
|
||||
}
|
||||
if c.GC < 1*time.Second {
|
||||
c.GC = 1 * time.Second
|
||||
}
|
||||
if c.MaxAge < 1*time.Minute {
|
||||
c.MaxAge = 1 * time.Minute
|
||||
}
|
||||
|
||||
if logger == nil {
|
||||
return nil, errors.New("logger is required")
|
||||
}
|
||||
|
||||
capacity := ringCapacity(c)
|
||||
return &Monitor{
|
||||
conf: c,
|
||||
log: logger,
|
||||
sysRing: ring.NewBuffer[SysStat](capacity),
|
||||
gpuRing: ring.NewBuffer[[]GpuStat](capacity),
|
||||
sysListeners: make(map[chan SysStat]struct{}),
|
||||
gpuListeners: make(map[chan []GpuStat]struct{}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *Monitor) Stop() {
|
||||
m.mutex.Lock()
|
||||
defer m.mutex.Unlock()
|
||||
if m.stopCancel == nil {
|
||||
return
|
||||
}
|
||||
m.stopCancel()
|
||||
m.stopCancel = nil
|
||||
}
|
||||
|
||||
// UpdateConfig updates the monitor configuration and restarts if changed.
|
||||
func (m *Monitor) UpdateConfig(newConf config.PerformanceConfig) {
|
||||
m.mutex.RLock()
|
||||
changed := m.conf != newConf
|
||||
m.mutex.RUnlock()
|
||||
|
||||
if !changed {
|
||||
return
|
||||
}
|
||||
|
||||
m.Stop()
|
||||
m.mutex.Lock()
|
||||
m.conf = newConf
|
||||
capacity := ringCapacity(newConf)
|
||||
m.sysRing = ring.NewBuffer[SysStat](capacity)
|
||||
m.gpuRing = ring.NewBuffer[[]GpuStat](capacity)
|
||||
m.mutex.Unlock()
|
||||
m.Start()
|
||||
}
|
||||
|
||||
// Subscribe returns channels to listen to system and GPU stats.
|
||||
func (m *Monitor) Subscribe() (chan SysStat, chan []GpuStat, func()) {
|
||||
m.mutex.Lock()
|
||||
defer m.mutex.Unlock()
|
||||
sysChan := make(chan SysStat, 1)
|
||||
gpuChan := make(chan []GpuStat, 1)
|
||||
|
||||
m.sysListeners[sysChan] = struct{}{}
|
||||
m.gpuListeners[gpuChan] = struct{}{}
|
||||
|
||||
unsub := func() {
|
||||
m.mutex.Lock()
|
||||
defer m.mutex.Unlock()
|
||||
delete(m.sysListeners, sysChan)
|
||||
delete(m.gpuListeners, gpuChan)
|
||||
}
|
||||
|
||||
return sysChan, gpuChan, unsub
|
||||
}
|
||||
|
||||
func (m *Monitor) Start() {
|
||||
m.mutex.Lock()
|
||||
defer m.mutex.Unlock()
|
||||
if m.stopCancel != nil {
|
||||
return
|
||||
}
|
||||
|
||||
m.stopCtx, m.stopCancel = context.WithCancel(context.Background())
|
||||
|
||||
go func() {
|
||||
tick := time.NewTicker(m.conf.Every)
|
||||
defer tick.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-m.stopCtx.Done():
|
||||
return
|
||||
case <-tick.C:
|
||||
s, err := ReadSysStats()
|
||||
if err != nil {
|
||||
if err != ErrNotImplemented {
|
||||
m.log.Errorf("failed to read sys stats: %s", err.Error())
|
||||
}
|
||||
continue
|
||||
}
|
||||
m.mutex.Lock()
|
||||
m.sysRing.Push(s)
|
||||
for l := range m.sysListeners {
|
||||
select {
|
||||
case l <- s:
|
||||
default:
|
||||
}
|
||||
}
|
||||
m.mutex.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
gpuCh, err := getGpuStats(m.stopCtx, m.conf.Every, m.log)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotImplemented) || errors.Is(err, ErrNoGpuTool) {
|
||||
m.log.Infof("GPU monitoring not available: %s", err.Error())
|
||||
} else {
|
||||
m.log.Errorf("failed to initialize GPU monitoring: %s", err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-m.stopCtx.Done():
|
||||
return
|
||||
case g, ok := <-gpuCh:
|
||||
if !ok {
|
||||
m.log.Errorf("failed reading from gpuCh - stopping read goroutine")
|
||||
return
|
||||
}
|
||||
m.mutex.Lock()
|
||||
m.gpuRing.Push(g)
|
||||
for l := range m.gpuListeners {
|
||||
select {
|
||||
case l <- g:
|
||||
default:
|
||||
}
|
||||
}
|
||||
m.mutex.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Current returns a copy of the current log of system and GPU stats.
|
||||
func (m *Monitor) Current() ([]SysStat, []GpuStat) {
|
||||
m.mutex.RLock()
|
||||
defer m.mutex.RUnlock()
|
||||
|
||||
sysStats := m.sysRing.Slice()
|
||||
|
||||
snapshots := m.gpuRing.Slice()
|
||||
var gpuStats []GpuStat
|
||||
for _, snapshot := range snapshots {
|
||||
gpuStats = append(gpuStats, snapshot...)
|
||||
}
|
||||
return sysStats, gpuStats
|
||||
}
|
||||
|
||||
func ReadSysStats() (SysStat, error) {
|
||||
return readSysStats()
|
||||
}
|
||||
|
||||
func GetGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
return getGpuStats(ctx, every, logger)
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/load"
|
||||
"github.com/shirou/gopsutil/v4/mem"
|
||||
)
|
||||
|
||||
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
return nil, ErrNotImplemented
|
||||
}
|
||||
|
||||
func readSysStats() (SysStat, error) {
|
||||
cpuPcts, err := cpu.Percent(0, true)
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
vmStat, err := mem.VirtualMemory()
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
const toMB = 1024 * 1024
|
||||
|
||||
var swapTotalMB, swapUsedMB int
|
||||
if swapStat, err := mem.SwapMemory(); err == nil {
|
||||
swapTotalMB = int(swapStat.Total / toMB)
|
||||
swapUsedMB = int(swapStat.Used / toMB)
|
||||
}
|
||||
|
||||
var loadAvg1, loadAvg5, loadAvg15 float64
|
||||
if loadStat, err := load.Avg(); err == nil {
|
||||
loadAvg1 = loadStat.Load1
|
||||
loadAvg5 = loadStat.Load5
|
||||
loadAvg15 = loadStat.Load15
|
||||
}
|
||||
|
||||
return SysStat{
|
||||
Timestamp: time.Now(),
|
||||
CpuUtilPerCore: cpuPcts,
|
||||
MemTotalMB: int(vmStat.Total / toMB),
|
||||
MemUsedMB: int(vmStat.Used / toMB),
|
||||
MemFreeMB: int(vmStat.Free / toMB),
|
||||
SwapTotalMB: swapTotalMB,
|
||||
SwapUsedMB: swapUsedMB,
|
||||
LoadAvg1: loadAvg1,
|
||||
LoadAvg5: loadAvg5,
|
||||
LoadAvg15: loadAvg15,
|
||||
}, nil
|
||||
}
|
||||
@@ -0,0 +1,238 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"io"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/mostlygeek/llama-swap/proxy/config"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newTestLogger() *logmon.Monitor {
|
||||
return logmon.NewWriter(io.Discard)
|
||||
}
|
||||
|
||||
func TestNew_DefaultConfig(t *testing.T) {
|
||||
logger := newTestLogger()
|
||||
|
||||
m, err := New(config.PerformanceConfig{}, logger)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, m)
|
||||
|
||||
assert.Equal(t, 100*time.Millisecond, m.conf.Every)
|
||||
assert.Equal(t, 1*time.Second, m.conf.GC)
|
||||
assert.Equal(t, 1*time.Minute, m.conf.MaxAge)
|
||||
}
|
||||
|
||||
func TestNew_CustomConfig(t *testing.T) {
|
||||
logger := newTestLogger()
|
||||
|
||||
cfg := config.PerformanceConfig{
|
||||
Enable: true,
|
||||
Every: 500 * time.Millisecond,
|
||||
GC: 5 * time.Second,
|
||||
MaxAge: 10 * time.Minute,
|
||||
}
|
||||
|
||||
m, err := New(cfg, logger)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, 500*time.Millisecond, m.conf.Every)
|
||||
assert.Equal(t, 5*time.Second, m.conf.GC)
|
||||
assert.Equal(t, 10*time.Minute, m.conf.MaxAge)
|
||||
}
|
||||
|
||||
func TestNew_NilLogger(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, nil)
|
||||
assert.Error(t, err)
|
||||
assert.Nil(t, m)
|
||||
}
|
||||
|
||||
func TestNew_BelowMinimumConfig(t *testing.T) {
|
||||
logger := newTestLogger()
|
||||
|
||||
cfg := config.PerformanceConfig{
|
||||
Enable: true,
|
||||
Every: 1 * time.Millisecond,
|
||||
GC: 100 * time.Millisecond,
|
||||
MaxAge: 1 * time.Second,
|
||||
}
|
||||
|
||||
m, err := New(cfg, logger)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, 100*time.Millisecond, m.conf.Every)
|
||||
assert.Equal(t, 1*time.Second, m.conf.GC)
|
||||
assert.Equal(t, 1*time.Minute, m.conf.MaxAge)
|
||||
}
|
||||
|
||||
func TestSubscribe_ReturnsChannels(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
sysCh, gpuCh, unsub := m.Subscribe()
|
||||
defer unsub()
|
||||
|
||||
assert.NotNil(t, sysCh)
|
||||
assert.NotNil(t, gpuCh)
|
||||
assert.NotNil(t, unsub)
|
||||
}
|
||||
|
||||
func TestSubscribe_UnsubscribeRemovesListeners(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
_, _, unsub := m.Subscribe()
|
||||
|
||||
m.mutex.RLock()
|
||||
assert.Len(t, m.sysListeners, 1)
|
||||
assert.Len(t, m.gpuListeners, 1)
|
||||
m.mutex.RUnlock()
|
||||
|
||||
unsub()
|
||||
|
||||
m.mutex.RLock()
|
||||
assert.Len(t, m.sysListeners, 0)
|
||||
assert.Len(t, m.gpuListeners, 0)
|
||||
m.mutex.RUnlock()
|
||||
}
|
||||
|
||||
func TestSubscribe_MultipleSubscriptions(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
sysCh1, gpuCh1, unsub1 := m.Subscribe()
|
||||
sysCh2, gpuCh2, unsub2 := m.Subscribe()
|
||||
defer unsub1()
|
||||
defer unsub2()
|
||||
|
||||
assert.NotEqual(t, sysCh1, sysCh2)
|
||||
assert.NotEqual(t, gpuCh1, gpuCh2)
|
||||
|
||||
m.mutex.RLock()
|
||||
assert.Len(t, m.sysListeners, 2)
|
||||
assert.Len(t, m.gpuListeners, 2)
|
||||
m.mutex.RUnlock()
|
||||
}
|
||||
|
||||
func TestCurrent_EmptyByDefault(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
sysStats, gpuStats := m.Current()
|
||||
assert.Empty(t, sysStats)
|
||||
assert.Empty(t, gpuStats)
|
||||
}
|
||||
|
||||
func TestCurrent_ReturnsCopies(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
now := time.Now()
|
||||
m.sysRing.Push(SysStat{Timestamp: now, MemTotalMB: 1024})
|
||||
m.gpuRing.Push([]GpuStat{{Timestamp: now, ID: 0, Name: "gpu0"}})
|
||||
|
||||
sysStats, gpuStats := m.Current()
|
||||
|
||||
assert.Len(t, sysStats, 1)
|
||||
assert.Len(t, gpuStats, 1)
|
||||
assert.Equal(t, 1024, sysStats[0].MemTotalMB)
|
||||
assert.Equal(t, "gpu0", gpuStats[0].Name)
|
||||
|
||||
// modifying the returned slice should not affect the original
|
||||
sysStats[0].MemTotalMB = 999
|
||||
original, _ := m.Current()
|
||||
assert.Equal(t, 1024, original[0].MemTotalMB)
|
||||
}
|
||||
|
||||
func TestStart_CollectsSysStats(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping slow test")
|
||||
}
|
||||
|
||||
m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
m.Start()
|
||||
|
||||
time.Sleep(350 * time.Millisecond)
|
||||
m.Stop()
|
||||
|
||||
sysStats, _ := m.Current()
|
||||
assert.NotEmpty(t, sysStats, "expected sys stats to be collected")
|
||||
}
|
||||
|
||||
func TestStart_StopStopsGoroutines(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping slow test")
|
||||
}
|
||||
|
||||
m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
m.Start()
|
||||
if m.stopCancel == nil {
|
||||
t.Error("stopCancel should not be nil after Start()")
|
||||
}
|
||||
|
||||
m.Stop()
|
||||
if m.stopCancel != nil {
|
||||
t.Error("stopCancel should be nil after Stop()")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStart_SubscriberReceivesStats(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping slow test")
|
||||
}
|
||||
|
||||
m, err := New(config.PerformanceConfig{Every: 100 * time.Millisecond}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
sysCh, _, unsub := m.Subscribe()
|
||||
defer unsub()
|
||||
|
||||
m.Start()
|
||||
defer m.Stop()
|
||||
|
||||
select {
|
||||
case s := <-sysCh:
|
||||
assert.False(t, s.Timestamp.IsZero())
|
||||
assert.NotEmpty(t, s.CpuUtilPerCore)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
t.Fatal("timed out waiting for sys stats")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadSysStats(t *testing.T) {
|
||||
s, err := ReadSysStats()
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.False(t, s.Timestamp.IsZero())
|
||||
assert.NotEmpty(t, s.CpuUtilPerCore)
|
||||
assert.Greater(t, s.MemTotalMB, 0)
|
||||
}
|
||||
|
||||
func TestCurrent_ConcurrentAccess(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
m.sysRing.Push(SysStat{Timestamp: time.Now(), MemTotalMB: 1024})
|
||||
m.gpuRing.Push([]GpuStat{{Timestamp: time.Now(), ID: 0}})
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < 10; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
sys, gpu := m.Current()
|
||||
assert.Len(t, sys, 1)
|
||||
assert.Len(t, gpu, 1)
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
@@ -0,0 +1,461 @@
|
||||
//go:build unix && !darwin
|
||||
|
||||
package perf
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/load"
|
||||
"github.com/shirou/gopsutil/v4/mem"
|
||||
psnet "github.com/shirou/gopsutil/v4/net"
|
||||
)
|
||||
|
||||
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
if ch, err := tryLACT(ctx, every, logger); err == nil {
|
||||
logger.Info("using LACT for GPU monitoring")
|
||||
return ch, nil
|
||||
} else {
|
||||
logger.Debugf("LACT: %s", err.Error())
|
||||
}
|
||||
|
||||
if ch, err := tryNvidiaSmi(ctx, every, logger); err == nil {
|
||||
logger.Info("using nvidia-smi for GPU monitoring")
|
||||
return ch, nil
|
||||
} else {
|
||||
logger.Debugf("nvidia-smi: %s", err.Error())
|
||||
}
|
||||
|
||||
if ch, err := trySysfs(ctx, every, logger); err == nil {
|
||||
logger.Info("using sysfs for GPU monitoring")
|
||||
return ch, nil
|
||||
} else {
|
||||
logger.Debugf("sysfs: %s", err.Error())
|
||||
}
|
||||
|
||||
return nil, ErrNoGpuTool
|
||||
}
|
||||
|
||||
func tryLACT(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
socketPath := lactSocketPath()
|
||||
if socketPath == "" {
|
||||
return nil, ErrNoGpuTool
|
||||
}
|
||||
|
||||
conn, err := net.DialTimeout("unix", socketPath, 2*time.Second)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot connect to LACT socket: %w", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
conn.SetDeadline(time.Now().Add(5 * time.Second))
|
||||
|
||||
devices, err := lactListDevices(conn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("LACT ListDevices failed: %w", err)
|
||||
}
|
||||
|
||||
if len(devices) == 0 {
|
||||
return nil, fmt.Errorf("LACT returned no devices")
|
||||
}
|
||||
|
||||
ch := make(chan []GpuStat, 1)
|
||||
|
||||
go func() {
|
||||
defer close(ch)
|
||||
ticker := time.NewTicker(every)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
socketPath := lactSocketPath()
|
||||
if socketPath == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
conn, err := net.DialTimeout("unix", socketPath, 2*time.Second)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
conn.SetDeadline(time.Now().Add(5 * time.Second))
|
||||
|
||||
devices, err := lactListDevices(conn)
|
||||
if err != nil {
|
||||
conn.Close()
|
||||
continue
|
||||
}
|
||||
|
||||
stats := make([]GpuStat, 0, len(devices))
|
||||
for i, d := range devices {
|
||||
stat, err := lactGetDeviceStats(conn, d.ID, d.Name, i)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
stats = append(stats, stat)
|
||||
}
|
||||
conn.Close()
|
||||
|
||||
if len(stats) > 0 {
|
||||
select {
|
||||
case ch <- stats:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return ch, nil
|
||||
}
|
||||
|
||||
func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
if _, err := exec.LookPath("nvidia-smi"); err != nil {
|
||||
return nil, ErrNoGpuTool
|
||||
}
|
||||
|
||||
sec := int(every.Seconds())
|
||||
if sec < 1 {
|
||||
sec = 1
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "nvidia-smi",
|
||||
"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
|
||||
"--format=csv,noheader,nounits",
|
||||
"-loop", fmt.Sprintf("%d", sec),
|
||||
)
|
||||
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err)
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi start failed: %w", err)
|
||||
}
|
||||
|
||||
ch := make(chan []GpuStat, 1)
|
||||
|
||||
go func() {
|
||||
defer close(ch)
|
||||
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
stat := parseNvidiaSmiLine(line)
|
||||
if stat != nil {
|
||||
select {
|
||||
case ch <- []GpuStat{*stat}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
cmd.Wait()
|
||||
}()
|
||||
|
||||
return ch, nil
|
||||
}
|
||||
|
||||
func parseNvidiaSmiLine(line string) *GpuStat {
|
||||
fields := strings.Split(line, ", ")
|
||||
if len(fields) < 9 {
|
||||
return nil
|
||||
}
|
||||
|
||||
id, _ := strconv.Atoi(strings.TrimSpace(fields[0]))
|
||||
name := strings.TrimSpace(fields[1])
|
||||
uuid := strings.TrimSpace(fields[2])
|
||||
tempC, _ := strconv.Atoi(strings.TrimSpace(fields[3]))
|
||||
gpuUtil, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64)
|
||||
memUsed, _ := strconv.Atoi(strings.TrimSpace(fields[5]))
|
||||
memTotal, _ := strconv.Atoi(strings.TrimSpace(fields[6]))
|
||||
fanSpeed, _ := strconv.ParseFloat(strings.TrimSpace(fields[7]), 64)
|
||||
powerDraw, _ := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64)
|
||||
|
||||
var memUtil float64
|
||||
if memTotal > 0 {
|
||||
memUtil = float64(memUsed) / float64(memTotal) * 100
|
||||
}
|
||||
|
||||
return &GpuStat{
|
||||
Timestamp: time.Now(),
|
||||
ID: id,
|
||||
Name: name,
|
||||
UUID: uuid,
|
||||
TempC: tempC,
|
||||
GpuUtilPct: gpuUtil,
|
||||
MemUtilPct: memUtil,
|
||||
MemUsedMB: memUsed,
|
||||
MemTotalMB: memTotal,
|
||||
FanSpeedPct: fanSpeed,
|
||||
PowerDrawW: powerDraw,
|
||||
}
|
||||
}
|
||||
|
||||
func trySysfs(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
return nil, ErrNotImplemented
|
||||
}
|
||||
|
||||
func lactSocketPath() string {
|
||||
if p := os.Getenv("LACT_DAEMON_SOCKET_PATH"); p != "" {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
return p
|
||||
}
|
||||
}
|
||||
|
||||
rootPath := "/run/lactd.sock"
|
||||
if _, err := os.Stat(rootPath); err == nil {
|
||||
return rootPath
|
||||
}
|
||||
|
||||
u, err := user.Current()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
userPath := filepath.Join("/run/user", u.Uid, "lactd.sock")
|
||||
if _, err := os.Stat(userPath); err == nil {
|
||||
return userPath
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
type lactRequest struct {
|
||||
Command string `json:"command"`
|
||||
Args interface{} `json:"args,omitempty"`
|
||||
}
|
||||
|
||||
type lactResponse struct {
|
||||
Status string `json:"status"`
|
||||
Data json.RawMessage `json:"data"`
|
||||
}
|
||||
|
||||
type lactDeviceEntry struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type lactDeviceStats struct {
|
||||
Fan struct {
|
||||
PwmCurrent *uint8 `json:"pwm_current"`
|
||||
} `json:"fan"`
|
||||
Vram struct {
|
||||
Total *uint64 `json:"total"`
|
||||
Used *uint64 `json:"used"`
|
||||
} `json:"vram"`
|
||||
Power struct {
|
||||
Average *float64 `json:"average"`
|
||||
Current *float64 `json:"current"`
|
||||
} `json:"power"`
|
||||
Temps map[string]lactTempEntry `json:"temps"`
|
||||
BusyPercent *uint8 `json:"busy_percent"`
|
||||
}
|
||||
|
||||
type lactTempEntry struct {
|
||||
Current *float64 `json:"current"`
|
||||
}
|
||||
|
||||
func lactSendRequest(conn net.Conn, req lactRequest) (json.RawMessage, error) {
|
||||
data, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data = append(data, '\n')
|
||||
|
||||
if _, err := conn.Write(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader := bufio.NewReader(conn)
|
||||
line, err := reader.ReadBytes('\n')
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var resp lactResponse
|
||||
if err := json.Unmarshal(line, &resp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if resp.Status != "ok" {
|
||||
return nil, fmt.Errorf("LACT error: %s", string(resp.Data))
|
||||
}
|
||||
|
||||
return resp.Data, nil
|
||||
}
|
||||
|
||||
func lactListDevices(conn net.Conn) ([]lactDeviceEntry, error) {
|
||||
data, err := lactSendRequest(conn, lactRequest{Command: "list_devices"})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var devices []lactDeviceEntry
|
||||
if err := json.Unmarshal(data, &devices); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
func lactGetDeviceStats(conn net.Conn, id string, name string, index int) (GpuStat, error) {
|
||||
data, err := lactSendRequest(conn, lactRequest{
|
||||
Command: "device_stats",
|
||||
Args: struct {
|
||||
ID string `json:"id"`
|
||||
}{ID: id},
|
||||
})
|
||||
if err != nil {
|
||||
return GpuStat{}, err
|
||||
}
|
||||
|
||||
var stats lactDeviceStats
|
||||
if err := json.Unmarshal(data, &stats); err != nil {
|
||||
return GpuStat{}, err
|
||||
}
|
||||
|
||||
var memUsedMB, memTotalMB int
|
||||
if stats.Vram.Used != nil {
|
||||
memUsedMB = int(*stats.Vram.Used / 1024 / 1024)
|
||||
}
|
||||
if stats.Vram.Total != nil {
|
||||
memTotalMB = int(*stats.Vram.Total / 1024 / 1024)
|
||||
}
|
||||
|
||||
var memUtil float64
|
||||
if memTotalMB > 0 {
|
||||
memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
|
||||
}
|
||||
|
||||
var gpuUtil float64
|
||||
if stats.BusyPercent != nil {
|
||||
gpuUtil = float64(*stats.BusyPercent)
|
||||
}
|
||||
|
||||
var fanSpeed float64
|
||||
if stats.Fan.PwmCurrent != nil {
|
||||
fanSpeed = float64(*stats.Fan.PwmCurrent) / 255.0 * 100.0
|
||||
}
|
||||
|
||||
var powerDraw float64
|
||||
if stats.Power.Average != nil && *stats.Power.Average > 0 {
|
||||
powerDraw = *stats.Power.Average
|
||||
} else if stats.Power.Current != nil {
|
||||
powerDraw = *stats.Power.Current
|
||||
}
|
||||
|
||||
var tempC int
|
||||
if t, ok := stats.Temps["edge"]; ok && t.Current != nil {
|
||||
tempC = int(*t.Current)
|
||||
} else if t, ok := stats.Temps["junction"]; ok && t.Current != nil {
|
||||
tempC = int(*t.Current)
|
||||
} else {
|
||||
for _, t := range stats.Temps {
|
||||
if t.Current != nil {
|
||||
tempC = int(*t.Current)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var vramTempC int
|
||||
// nvidia uses "VRAM", amd "mem"
|
||||
for _, key := range []string{"mem", "VRAM"} {
|
||||
if t, ok := stats.Temps[key]; ok && t.Current != nil && *t.Current > 0 {
|
||||
vramTempC = int(*t.Current)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return GpuStat{
|
||||
Timestamp: time.Now(),
|
||||
ID: index,
|
||||
Name: name,
|
||||
UUID: id,
|
||||
TempC: tempC,
|
||||
VramTempC: vramTempC,
|
||||
GpuUtilPct: gpuUtil,
|
||||
MemUtilPct: memUtil,
|
||||
MemUsedMB: memUsedMB,
|
||||
MemTotalMB: memTotalMB,
|
||||
FanSpeedPct: fanSpeed,
|
||||
PowerDrawW: powerDraw,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func readSysfs() ([]GpuStat, error) {
|
||||
return nil, ErrNotImplemented
|
||||
}
|
||||
|
||||
func readSysStats() (SysStat, error) {
|
||||
cpuPcts, err := cpu.Percent(0, true)
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
vmStat, err := mem.VirtualMemory()
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
const toMB = 1024 * 1024
|
||||
|
||||
var swapTotalMB, swapUsedMB int
|
||||
if swapStat, err := mem.SwapMemory(); err == nil {
|
||||
swapTotalMB = int(swapStat.Total / toMB)
|
||||
swapUsedMB = int(swapStat.Used / toMB)
|
||||
}
|
||||
|
||||
var loadAvg1, loadAvg5, loadAvg15 float64
|
||||
if loadStat, err := load.Avg(); err == nil {
|
||||
loadAvg1 = loadStat.Load1
|
||||
loadAvg5 = loadStat.Load5
|
||||
loadAvg15 = loadStat.Load15
|
||||
}
|
||||
|
||||
netIO := make([]NetIOStat, 0)
|
||||
if ioCounters, err := psnet.IOCounters(true); err == nil {
|
||||
for _, ioc := range ioCounters {
|
||||
if ioc.Name == "lo" {
|
||||
continue
|
||||
}
|
||||
netIO = append(netIO, NetIOStat{
|
||||
Name: ioc.Name,
|
||||
BytesRecv: ioc.BytesRecv,
|
||||
BytesSent: ioc.BytesSent,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return SysStat{
|
||||
Timestamp: time.Now(),
|
||||
CpuUtilPerCore: cpuPcts,
|
||||
MemTotalMB: int(vmStat.Total / toMB),
|
||||
MemUsedMB: int(vmStat.Used / toMB),
|
||||
MemFreeMB: int(vmStat.Free / toMB),
|
||||
SwapTotalMB: swapTotalMB,
|
||||
SwapUsedMB: swapUsedMB,
|
||||
LoadAvg1: loadAvg1,
|
||||
LoadAvg5: loadAvg5,
|
||||
LoadAvg15: loadAvg15,
|
||||
NetIO: netIO,
|
||||
}, nil
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/mem"
|
||||
"github.com/shirou/gopsutil/v4/net"
|
||||
)
|
||||
|
||||
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
return nil, ErrNotImplemented
|
||||
}
|
||||
|
||||
func readSysStats() (SysStat, error) {
|
||||
cpuPcts, err := cpu.Percent(0, true)
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
vmStat, err := mem.VirtualMemory()
|
||||
if err != nil {
|
||||
return SysStat{}, err
|
||||
}
|
||||
|
||||
const toMB = 1024 * 1024
|
||||
|
||||
netIO := make([]NetIOStat, 0)
|
||||
if ioCounters, err := net.IOCounters(true); err == nil {
|
||||
for _, ioc := range ioCounters {
|
||||
netIO = append(netIO, NetIOStat{
|
||||
Name: ioc.Name,
|
||||
BytesRecv: ioc.BytesRecv,
|
||||
BytesSent: ioc.BytesSent,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return SysStat{
|
||||
Timestamp: time.Now(),
|
||||
CpuUtilPerCore: cpuPcts,
|
||||
MemTotalMB: int(vmStat.Total / toMB),
|
||||
MemUsedMB: int(vmStat.Used / toMB),
|
||||
MemFreeMB: int(vmStat.Free / toMB),
|
||||
NetIO: netIO,
|
||||
}, nil
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const mbToBytes = int64(1024 * 1024)
|
||||
|
||||
// MetricsHandler returns an http.HandlerFunc serving Prometheus text format metrics
|
||||
// with the most recent system and GPU stats.
|
||||
func (m *Monitor) MetricsHandler() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
sysStats, gpuStats := m.Current()
|
||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
|
||||
if len(sysStats) > 0 {
|
||||
writeSysMetrics(w, sysStats[len(sysStats)-1])
|
||||
}
|
||||
|
||||
if len(gpuStats) > 0 {
|
||||
writeGpuMetrics(w, latestPerGPU(gpuStats))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func writeSysMetrics(w http.ResponseWriter, s SysStat) {
|
||||
fmt.Fprintf(w, "# HELP llamaswap_cpu_util_percent CPU utilization per core (0-100)\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_cpu_util_percent gauge\n")
|
||||
for i, pct := range s.CpuUtilPerCore {
|
||||
fmt.Fprintf(w, "llamaswap_cpu_util_percent{core=\"%d\"} %g\n", i, pct)
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_memory_total_bytes Total memory in bytes\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_memory_total_bytes gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_memory_total_bytes %d\n", int64(s.MemTotalMB)*mbToBytes)
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_memory_used_bytes Used memory in bytes\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_memory_used_bytes gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_memory_used_bytes %d\n", int64(s.MemUsedMB)*mbToBytes)
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_memory_free_bytes Free memory in bytes\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_memory_free_bytes gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_memory_free_bytes %d\n", int64(s.MemFreeMB)*mbToBytes)
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_swap_total_bytes Total swap in bytes\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_swap_total_bytes gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_swap_total_bytes %d\n", int64(s.SwapTotalMB)*mbToBytes)
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_swap_used_bytes Used swap in bytes\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_swap_used_bytes gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_swap_used_bytes %d\n", int64(s.SwapUsedMB)*mbToBytes)
|
||||
|
||||
fmt.Fprintf(w, "# HELP llamaswap_load_average Load average\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_load_average gauge\n")
|
||||
fmt.Fprintf(w, "llamaswap_load_average{interval=\"1m\"} %g\n", s.LoadAvg1)
|
||||
fmt.Fprintf(w, "llamaswap_load_average{interval=\"5m\"} %g\n", s.LoadAvg5)
|
||||
fmt.Fprintf(w, "llamaswap_load_average{interval=\"15m\"} %g\n", s.LoadAvg15)
|
||||
|
||||
if len(s.NetIO) > 0 {
|
||||
fmt.Fprintf(w, "# HELP llamaswap_network_bytes_total Total network bytes transferred\n")
|
||||
fmt.Fprintf(w, "# TYPE llamaswap_network_bytes_total counter\n")
|
||||
for _, io := range s.NetIO {
|
||||
iface := sanitizeLabel(io.Name)
|
||||
fmt.Fprintf(w, "llamaswap_network_bytes_total{interface=\"%s\",direction=\"recv\"} %d\n", iface, io.BytesRecv)
|
||||
fmt.Fprintf(w, "llamaswap_network_bytes_total{interface=\"%s\",direction=\"sent\"} %d\n", iface, io.BytesSent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func writeGpuMetrics(w http.ResponseWriter, gpus []GpuStat) {
|
||||
if len(gpus) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
type gpuMetric struct {
|
||||
help string
|
||||
name string
|
||||
value func(GpuStat) float64
|
||||
}
|
||||
|
||||
metrics := []gpuMetric{
|
||||
{"GPU temperature in Celsius", "llamaswap_gpu_temperature_celsius", func(g GpuStat) float64 { return float64(g.TempC) }},
|
||||
{"GPU VRAM temperature in Celsius", "llamaswap_gpu_vram_temperature_celsius", func(g GpuStat) float64 { return float64(g.VramTempC) }},
|
||||
{"GPU utilization percent (0-100)", "llamaswap_gpu_util_percent", func(g GpuStat) float64 { return g.GpuUtilPct }},
|
||||
{"GPU memory utilization percent (0-100)", "llamaswap_gpu_memory_util_percent", func(g GpuStat) float64 { return g.MemUtilPct }},
|
||||
{"GPU memory used in bytes", "llamaswap_gpu_memory_used_bytes", func(g GpuStat) float64 { return float64(g.MemUsedMB) * float64(mbToBytes) }},
|
||||
{"GPU memory total in bytes", "llamaswap_gpu_memory_total_bytes", func(g GpuStat) float64 { return float64(g.MemTotalMB) * float64(mbToBytes) }},
|
||||
{"GPU fan speed percent (0-100)", "llamaswap_gpu_fan_speed_percent", func(g GpuStat) float64 { return g.FanSpeedPct }},
|
||||
{"GPU power draw in watts", "llamaswap_gpu_power_draw_watts", func(g GpuStat) float64 { return g.PowerDrawW }},
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
fmt.Fprintf(w, "# HELP %s %s\n", m.name, m.help)
|
||||
fmt.Fprintf(w, "# TYPE %s gauge\n", m.name)
|
||||
for _, g := range gpus {
|
||||
if g.UUID != "" {
|
||||
fmt.Fprintf(w, "%s{id=\"%d\",name=\"%s\",uuid=\"%s\"} %g\n",
|
||||
m.name, g.ID, sanitizeLabel(g.Name), sanitizeLabel(g.UUID), m.value(g))
|
||||
} else {
|
||||
fmt.Fprintf(w, "%s{id=\"%d\",name=\"%s\"} %g\n",
|
||||
m.name, g.ID, sanitizeLabel(g.Name), m.value(g))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// latestPerGPU returns the most recent GpuStat for each GPU ID, sorted by ID.
|
||||
func latestPerGPU(stats []GpuStat) []GpuStat {
|
||||
latest := make(map[int]GpuStat)
|
||||
for _, g := range stats {
|
||||
if prev, ok := latest[g.ID]; !ok || g.Timestamp.After(prev.Timestamp) {
|
||||
latest[g.ID] = g
|
||||
}
|
||||
}
|
||||
result := make([]GpuStat, 0, len(latest))
|
||||
for _, g := range latest {
|
||||
result = append(result, g)
|
||||
}
|
||||
sort.Slice(result, func(i, j int) bool { return result[i].ID < result[j].ID })
|
||||
return result
|
||||
}
|
||||
|
||||
// sanitizeLabel escapes characters that are invalid in Prometheus label values.
|
||||
func sanitizeLabel(s string) string {
|
||||
return strings.NewReplacer(`"`, `\"`, `\`, `\\`, "\n", `\n`).Replace(s)
|
||||
}
|
||||
@@ -0,0 +1,248 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/proxy/config"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestSanitizeLabel(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"normal", "normal"},
|
||||
{"", ""},
|
||||
{`with"quote`, `with\"quote`},
|
||||
{`with\backslash`, `with\\backslash`},
|
||||
{"with\nnewline", `with\nnewline`},
|
||||
{`"both\n"`, `\"both\\n\"`},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
assert.Equal(t, tc.want, sanitizeLabel(tc.input), "input: %q", tc.input)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLatestPerGPU_Empty(t *testing.T) {
|
||||
result := latestPerGPU(nil)
|
||||
assert.Empty(t, result)
|
||||
}
|
||||
|
||||
func TestLatestPerGPU_Single(t *testing.T) {
|
||||
now := time.Now()
|
||||
stats := []GpuStat{{ID: 0, Name: "gpu0", Timestamp: now}}
|
||||
result := latestPerGPU(stats)
|
||||
require.Len(t, result, 1)
|
||||
assert.Equal(t, "gpu0", result[0].Name)
|
||||
}
|
||||
|
||||
func TestLatestPerGPU_PicksLatest(t *testing.T) {
|
||||
earlier := time.Now().Add(-time.Second)
|
||||
later := time.Now()
|
||||
stats := []GpuStat{
|
||||
{ID: 0, Name: "old", TempC: 50, Timestamp: earlier},
|
||||
{ID: 0, Name: "new", TempC: 70, Timestamp: later},
|
||||
}
|
||||
result := latestPerGPU(stats)
|
||||
require.Len(t, result, 1)
|
||||
assert.Equal(t, "new", result[0].Name)
|
||||
assert.Equal(t, 70, result[0].TempC)
|
||||
}
|
||||
|
||||
func TestLatestPerGPU_MultipleGPUsSortedByID(t *testing.T) {
|
||||
now := time.Now()
|
||||
stats := []GpuStat{
|
||||
{ID: 2, Name: "gpu2", Timestamp: now},
|
||||
{ID: 0, Name: "gpu0", Timestamp: now},
|
||||
{ID: 1, Name: "gpu1", Timestamp: now},
|
||||
}
|
||||
result := latestPerGPU(stats)
|
||||
require.Len(t, result, 3)
|
||||
assert.Equal(t, 0, result[0].ID)
|
||||
assert.Equal(t, 1, result[1].ID)
|
||||
assert.Equal(t, 2, result[2].ID)
|
||||
}
|
||||
|
||||
func TestWriteSysMetrics(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
s := SysStat{
|
||||
CpuUtilPerCore: []float64{10.5, 20.0},
|
||||
MemTotalMB: 8192,
|
||||
MemUsedMB: 4096,
|
||||
MemFreeMB: 4096,
|
||||
SwapTotalMB: 2048,
|
||||
SwapUsedMB: 512,
|
||||
LoadAvg1: 1.5,
|
||||
LoadAvg5: 1.2,
|
||||
LoadAvg15: 0.9,
|
||||
NetIO: []NetIOStat{
|
||||
{Name: "eth0", BytesRecv: 1000, BytesSent: 2000},
|
||||
},
|
||||
}
|
||||
|
||||
writeSysMetrics(rec, s)
|
||||
body := rec.Body.String()
|
||||
|
||||
assert.Contains(t, body, `llamaswap_cpu_util_percent{core="0"} 10.5`)
|
||||
assert.Contains(t, body, `llamaswap_cpu_util_percent{core="1"} 20`)
|
||||
assert.Contains(t, body, "llamaswap_memory_total_bytes 8589934592")
|
||||
assert.Contains(t, body, "llamaswap_memory_used_bytes 4294967296")
|
||||
assert.Contains(t, body, "llamaswap_memory_free_bytes 4294967296")
|
||||
assert.Contains(t, body, "llamaswap_swap_total_bytes 2147483648")
|
||||
assert.Contains(t, body, "llamaswap_swap_used_bytes 536870912")
|
||||
assert.Contains(t, body, `llamaswap_load_average{interval="1m"} 1.5`)
|
||||
assert.Contains(t, body, `llamaswap_load_average{interval="5m"} 1.2`)
|
||||
assert.Contains(t, body, `llamaswap_load_average{interval="15m"} 0.9`)
|
||||
assert.Contains(t, body, `llamaswap_network_bytes_total{interface="eth0",direction="recv"} 1000`)
|
||||
assert.Contains(t, body, `llamaswap_network_bytes_total{interface="eth0",direction="sent"} 2000`)
|
||||
}
|
||||
|
||||
func TestWriteSysMetrics_NoNetIO(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
writeSysMetrics(rec, SysStat{CpuUtilPerCore: []float64{5.0}})
|
||||
body := rec.Body.String()
|
||||
assert.NotContains(t, body, "llamaswap_network_bytes_total")
|
||||
}
|
||||
|
||||
func TestWriteGpuMetrics_Empty(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
writeGpuMetrics(rec, nil)
|
||||
assert.Empty(t, rec.Body.String())
|
||||
}
|
||||
|
||||
func TestWriteGpuMetrics(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
gpus := []GpuStat{
|
||||
{
|
||||
ID: 0,
|
||||
Name: "NVIDIA RTX 4090",
|
||||
UUID: "GPU-1234",
|
||||
TempC: 75,
|
||||
GpuUtilPct: 85.5,
|
||||
MemUtilPct: 60.0,
|
||||
MemUsedMB: 8192,
|
||||
MemTotalMB: 24576,
|
||||
FanSpeedPct: 55.0,
|
||||
PowerDrawW: 300.5,
|
||||
},
|
||||
}
|
||||
|
||||
writeGpuMetrics(rec, gpus)
|
||||
body := rec.Body.String()
|
||||
|
||||
assert.Contains(t, body, `llamaswap_gpu_temperature_celsius{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 75`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_vram_temperature_celsius{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 0`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_util_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 85.5`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_memory_util_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 60`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_memory_used_bytes{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"}`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_memory_total_bytes{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"}`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_fan_speed_percent{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 55`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_power_draw_watts{id="0",name="NVIDIA RTX 4090",uuid="GPU-1234"} 300.5`)
|
||||
}
|
||||
|
||||
func TestWriteGpuMetrics_VramTemp(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
gpus := []GpuStat{
|
||||
{ID: 0, Name: "AMD RX 7900", UUID: "GPU-5678", TempC: 70, VramTempC: 85},
|
||||
}
|
||||
writeGpuMetrics(rec, gpus)
|
||||
body := rec.Body.String()
|
||||
assert.Contains(t, body, `llamaswap_gpu_temperature_celsius{id="0",name="AMD RX 7900",uuid="GPU-5678"} 70`)
|
||||
assert.Contains(t, body, `llamaswap_gpu_vram_temperature_celsius{id="0",name="AMD RX 7900",uuid="GPU-5678"} 85`)
|
||||
}
|
||||
|
||||
func TestWriteGpuMetrics_EmptyUUID(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
gpus := []GpuStat{{ID: 3, Name: "AMD RX 7900", UUID: ""}}
|
||||
writeGpuMetrics(rec, gpus)
|
||||
body := rec.Body.String()
|
||||
assert.NotContains(t, body, "uuid=")
|
||||
assert.Contains(t, body, `name="AMD RX 7900"`)
|
||||
}
|
||||
|
||||
func TestWriteGpuMetrics_LabelSanitization(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
gpus := []GpuStat{
|
||||
{ID: 0, Name: `GPU "special"`, UUID: "uuid\nline"},
|
||||
}
|
||||
writeGpuMetrics(rec, gpus)
|
||||
body := rec.Body.String()
|
||||
assert.Contains(t, body, `name="GPU \"special\""`)
|
||||
assert.Contains(t, body, `uuid="uuid\nline"`)
|
||||
}
|
||||
|
||||
func TestMetricsHandler_ContentType(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
m.MetricsHandler()(rec, req)
|
||||
|
||||
assert.Equal(t, "text/plain; version=0.0.4; charset=utf-8", rec.Header().Get("Content-Type"))
|
||||
}
|
||||
|
||||
func TestMetricsHandler_EmptyStats(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
m.MetricsHandler()(rec, req)
|
||||
|
||||
assert.Equal(t, http.StatusOK, rec.Code)
|
||||
assert.Empty(t, strings.TrimSpace(rec.Body.String()))
|
||||
}
|
||||
|
||||
func TestMetricsHandler_WithSysStats(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
m.sysRing.Push(SysStat{Timestamp: time.Now(), CpuUtilPerCore: []float64{25.0}, MemTotalMB: 4096, MemUsedMB: 2048, MemFreeMB: 2048})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
m.MetricsHandler()(rec, req)
|
||||
|
||||
body := rec.Body.String()
|
||||
assert.Contains(t, body, "llamaswap_cpu_util_percent")
|
||||
assert.Contains(t, body, "llamaswap_memory_total_bytes")
|
||||
}
|
||||
|
||||
func TestMetricsHandler_UsesLatestSysStat(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
now := time.Now()
|
||||
m.sysRing.Push(SysStat{Timestamp: now.Add(-time.Second), MemTotalMB: 1000})
|
||||
m.sysRing.Push(SysStat{Timestamp: now, MemTotalMB: 8192})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
m.MetricsHandler()(rec, req)
|
||||
|
||||
body := rec.Body.String()
|
||||
// 8192 MB = 8589934592 bytes
|
||||
assert.Contains(t, body, "llamaswap_memory_total_bytes 8589934592")
|
||||
}
|
||||
|
||||
func TestMetricsHandler_WithGpuStats(t *testing.T) {
|
||||
m, err := New(config.PerformanceConfig{}, newTestLogger())
|
||||
require.NoError(t, err)
|
||||
|
||||
m.gpuRing.Push([]GpuStat{{ID: 0, Name: "TestGPU", UUID: "uuid-0", TempC: 65, Timestamp: time.Now()}})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
m.MetricsHandler()(rec, req)
|
||||
|
||||
body := rec.Body.String()
|
||||
assert.Contains(t, body, "llamaswap_gpu_temperature_celsius")
|
||||
assert.Contains(t, body, `name="TestGPU"`)
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package perf
|
||||
|
||||
import "time"
|
||||
|
||||
type GpuStat struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
UUID string `json:"uuid"`
|
||||
TempC int `json:"temp_c"`
|
||||
VramTempC int `json:"vram_temp_c"`
|
||||
GpuUtilPct float64 `json:"gpu_util_pct"`
|
||||
MemUtilPct float64 `json:"mem_util_pct"`
|
||||
MemUsedMB int `json:"mem_used_mb"`
|
||||
MemTotalMB int `json:"mem_total_mb"`
|
||||
FanSpeedPct float64 `json:"fan_speed_pct"`
|
||||
PowerDrawW float64 `json:"power_draw_w"`
|
||||
}
|
||||
|
||||
type NetIOStat struct {
|
||||
Name string `json:"name"`
|
||||
BytesRecv uint64 `json:"bytes_recv"`
|
||||
BytesSent uint64 `json:"bytes_sent"`
|
||||
}
|
||||
|
||||
type SysStat struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
|
||||
CpuUtilPerCore []float64 `json:"cpu_util_per_core"`
|
||||
MemTotalMB int `json:"mem_total_mb"`
|
||||
MemUsedMB int `json:"mem_used_mb"`
|
||||
MemFreeMB int `json:"mem_free_mb"`
|
||||
SwapTotalMB int `json:"swap_total_mb"`
|
||||
SwapUsedMB int `json:"swap_used_mb"`
|
||||
LoadAvg1 float64 `json:"load_avg_1"`
|
||||
LoadAvg5 float64 `json:"load_avg_5"`
|
||||
LoadAvg15 float64 `json:"load_avg_15"`
|
||||
NetIO []NetIOStat `json:"net_io"`
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
package ring
|
||||
|
||||
type Buffer[T any] struct {
|
||||
buf []T
|
||||
head int
|
||||
size int
|
||||
}
|
||||
|
||||
func NewBuffer[T any](capacity int) Buffer[T] {
|
||||
if capacity < 1 {
|
||||
capacity = 1
|
||||
}
|
||||
return Buffer[T]{buf: make([]T, capacity)}
|
||||
}
|
||||
|
||||
// Push adds v, overwriting the oldest entry when the buffer is full.
|
||||
func (r *Buffer[T]) Push(v T) {
|
||||
cap := len(r.buf)
|
||||
if r.size < cap {
|
||||
r.buf[(r.head+r.size)%cap] = v
|
||||
r.size++
|
||||
} else {
|
||||
r.buf[r.head] = v
|
||||
r.head = (r.head + 1) % cap
|
||||
}
|
||||
}
|
||||
|
||||
// Slice returns all entries in insertion order as a new slice.
|
||||
func (r *Buffer[T]) Slice() []T {
|
||||
if r.size == 0 {
|
||||
return nil
|
||||
}
|
||||
cap := len(r.buf)
|
||||
result := make([]T, r.size)
|
||||
for i := 0; i < r.size; i++ {
|
||||
result[i] = r.buf[(r.head+i)%cap]
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
package ring
|
||||
|
||||
import "testing"
|
||||
|
||||
const benchCap = 600 // matches default MaxAge/Every (1min / 100ms)
|
||||
|
||||
func BenchmarkBuffer_PushNoWrap(b *testing.B) {
|
||||
for b.Loop() {
|
||||
buf := NewBuffer[int](b.N + 1)
|
||||
for i := range b.N {
|
||||
buf.Push(i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBuffer_PushWrap(b *testing.B) {
|
||||
buf := NewBuffer[int](benchCap)
|
||||
b.ResetTimer()
|
||||
for i := range b.N {
|
||||
buf.Push(i)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBuffer_Slice(b *testing.B) {
|
||||
buf := NewBuffer[int](benchCap)
|
||||
for i := range benchCap {
|
||||
buf.Push(i)
|
||||
}
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = buf.Slice()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBuffer_PushAndSlice(b *testing.B) {
|
||||
buf := NewBuffer[int](benchCap)
|
||||
b.ResetTimer()
|
||||
for i := range b.N {
|
||||
buf.Push(i)
|
||||
if i%benchCap == 0 {
|
||||
_ = buf.Slice()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
package ring
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestBuffer_EmptySliceIsNil(t *testing.T) {
|
||||
b := NewBuffer[int](4)
|
||||
assert.Nil(t, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_PushBelowCapacity(t *testing.T) {
|
||||
b := NewBuffer[int](4)
|
||||
b.Push(1)
|
||||
b.Push(2)
|
||||
assert.Equal(t, []int{1, 2}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_PushAtCapacity(t *testing.T) {
|
||||
b := NewBuffer[int](3)
|
||||
b.Push(1)
|
||||
b.Push(2)
|
||||
b.Push(3)
|
||||
assert.Equal(t, []int{1, 2, 3}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_PushOverCapacityEvictsOldest(t *testing.T) {
|
||||
b := NewBuffer[int](3)
|
||||
b.Push(1)
|
||||
b.Push(2)
|
||||
b.Push(3)
|
||||
b.Push(4)
|
||||
assert.Equal(t, []int{2, 3, 4}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_CapacityOne(t *testing.T) {
|
||||
b := NewBuffer[int](1)
|
||||
b.Push(1)
|
||||
b.Push(2)
|
||||
assert.Equal(t, []int{2}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_ZeroCapacityDefaultsToOne(t *testing.T) {
|
||||
b := NewBuffer[int](0)
|
||||
b.Push(42)
|
||||
assert.Equal(t, []int{42}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_SliceReturnsCopy(t *testing.T) {
|
||||
b := NewBuffer[int](4)
|
||||
b.Push(10)
|
||||
s := b.Slice()
|
||||
s[0] = 99
|
||||
assert.Equal(t, []int{10}, b.Slice())
|
||||
}
|
||||
|
||||
func TestBuffer_InsertionOrderPreservedAfterWrap(t *testing.T) {
|
||||
b := NewBuffer[int](4)
|
||||
for i := 1; i <= 8; i++ {
|
||||
b.Push(i)
|
||||
}
|
||||
assert.Equal(t, []int{5, 6, 7, 8}, b.Slice())
|
||||
}
|
||||
Reference in New Issue
Block a user