Files
llama-swap/internal/perf/monitor_unix.go
T
David Soušek 3e3646f9f9 perf: ignore LACT devices reporting zero VRAM (#753)
Ignore LACT devices that report zero total VRAM.

Some virtual GPUs on headless VMs report `MemTotalMB == 0` through LACT,
which makes them appear in performance monitoring despite not providing
useful memory data. Skip those entries so only usable GPU devices are
reported.

This makes performance monitoring cleaner on headless VMs with virtual
GPUs that report zero VRAM.

Co-authored-by: David Soušek <david.sousek@intelogy.co.uk>
2026-05-13 10:03:54 -07:00

465 lines
10 KiB
Go

//go:build unix && !darwin
package perf
import (
"bufio"
"context"
"encoding/json"
"fmt"
"net"
"os"
"os/exec"
"os/user"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/load"
"github.com/shirou/gopsutil/v4/mem"
psnet "github.com/shirou/gopsutil/v4/net"
)
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
if ch, err := tryLACT(ctx, every, logger); err == nil {
logger.Info("using LACT for GPU monitoring")
return ch, nil
} else {
logger.Debugf("LACT: %s", err.Error())
}
if ch, err := tryNvidiaSmi(ctx, every, logger); err == nil {
logger.Info("using nvidia-smi for GPU monitoring")
return ch, nil
} else {
logger.Debugf("nvidia-smi: %s", err.Error())
}
if ch, err := trySysfs(ctx, every, logger); err == nil {
logger.Info("using sysfs for GPU monitoring")
return ch, nil
} else {
logger.Debugf("sysfs: %s", err.Error())
}
return nil, ErrNoGpuTool
}
func tryLACT(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
socketPath := lactSocketPath()
if socketPath == "" {
return nil, ErrNoGpuTool
}
conn, err := net.DialTimeout("unix", socketPath, 2*time.Second)
if err != nil {
return nil, fmt.Errorf("cannot connect to LACT socket: %w", err)
}
defer conn.Close()
conn.SetDeadline(time.Now().Add(5 * time.Second))
devices, err := lactListDevices(conn)
if err != nil {
return nil, fmt.Errorf("LACT ListDevices failed: %w", err)
}
if len(devices) == 0 {
return nil, fmt.Errorf("LACT returned no devices")
}
ch := make(chan []GpuStat, 1)
go func() {
defer close(ch)
ticker := time.NewTicker(every)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
socketPath := lactSocketPath()
if socketPath == "" {
continue
}
conn, err := net.DialTimeout("unix", socketPath, 2*time.Second)
if err != nil {
continue
}
conn.SetDeadline(time.Now().Add(5 * time.Second))
devices, err := lactListDevices(conn)
if err != nil {
conn.Close()
continue
}
stats := make([]GpuStat, 0, len(devices))
for i, d := range devices {
stat, err := lactGetDeviceStats(conn, d.ID, d.Name, i)
if err != nil {
continue
}
if stat.MemTotalMB == 0 {
continue
}
stats = append(stats, stat)
}
conn.Close()
if len(stats) > 0 {
select {
case ch <- stats:
default:
}
}
}
}
}()
return ch, nil
}
func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
if _, err := exec.LookPath("nvidia-smi"); err != nil {
return nil, ErrNoGpuTool
}
sec := int(every.Seconds())
if sec < 1 {
sec = 1
}
cmd := exec.CommandContext(ctx, "nvidia-smi",
"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
"--format=csv,noheader,nounits",
"-loop", fmt.Sprintf("%d", sec),
)
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err)
}
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("nvidia-smi start failed: %w", err)
}
ch := make(chan []GpuStat, 1)
go func() {
defer close(ch)
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" {
continue
}
stat := parseNvidiaSmiLine(line)
if stat != nil {
select {
case ch <- []GpuStat{*stat}:
default:
}
}
}
cmd.Wait()
}()
return ch, nil
}
func parseNvidiaSmiLine(line string) *GpuStat {
fields := strings.Split(line, ", ")
if len(fields) < 9 {
return nil
}
id, _ := strconv.Atoi(strings.TrimSpace(fields[0]))
name := strings.TrimSpace(fields[1])
uuid := strings.TrimSpace(fields[2])
tempC, _ := strconv.Atoi(strings.TrimSpace(fields[3]))
gpuUtil, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64)
memUsed, _ := strconv.Atoi(strings.TrimSpace(fields[5]))
memTotal, _ := strconv.Atoi(strings.TrimSpace(fields[6]))
fanSpeed, _ := strconv.ParseFloat(strings.TrimSpace(fields[7]), 64)
powerDraw, _ := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64)
var memUtil float64
if memTotal > 0 {
memUtil = float64(memUsed) / float64(memTotal) * 100
}
return &GpuStat{
Timestamp: time.Now(),
ID: id,
Name: name,
UUID: uuid,
TempC: tempC,
GpuUtilPct: gpuUtil,
MemUtilPct: memUtil,
MemUsedMB: memUsed,
MemTotalMB: memTotal,
FanSpeedPct: fanSpeed,
PowerDrawW: powerDraw,
}
}
func trySysfs(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
return nil, ErrNotImplemented
}
func lactSocketPath() string {
if p := os.Getenv("LACT_DAEMON_SOCKET_PATH"); p != "" {
if _, err := os.Stat(p); err == nil {
return p
}
}
rootPath := "/run/lactd.sock"
if _, err := os.Stat(rootPath); err == nil {
return rootPath
}
u, err := user.Current()
if err != nil {
return ""
}
userPath := filepath.Join("/run/user", u.Uid, "lactd.sock")
if _, err := os.Stat(userPath); err == nil {
return userPath
}
return ""
}
type lactRequest struct {
Command string `json:"command"`
Args interface{} `json:"args,omitempty"`
}
type lactResponse struct {
Status string `json:"status"`
Data json.RawMessage `json:"data"`
}
type lactDeviceEntry struct {
ID string `json:"id"`
Name string `json:"name"`
}
type lactDeviceStats struct {
Fan struct {
PwmCurrent *uint8 `json:"pwm_current"`
} `json:"fan"`
Vram struct {
Total *uint64 `json:"total"`
Used *uint64 `json:"used"`
} `json:"vram"`
Power struct {
Average *float64 `json:"average"`
Current *float64 `json:"current"`
} `json:"power"`
Temps map[string]lactTempEntry `json:"temps"`
BusyPercent *uint8 `json:"busy_percent"`
}
type lactTempEntry struct {
Current *float64 `json:"current"`
}
func lactSendRequest(conn net.Conn, req lactRequest) (json.RawMessage, error) {
data, err := json.Marshal(req)
if err != nil {
return nil, err
}
data = append(data, '\n')
if _, err := conn.Write(data); err != nil {
return nil, err
}
reader := bufio.NewReader(conn)
line, err := reader.ReadBytes('\n')
if err != nil {
return nil, err
}
var resp lactResponse
if err := json.Unmarshal(line, &resp); err != nil {
return nil, err
}
if resp.Status != "ok" {
return nil, fmt.Errorf("LACT error: %s", string(resp.Data))
}
return resp.Data, nil
}
func lactListDevices(conn net.Conn) ([]lactDeviceEntry, error) {
data, err := lactSendRequest(conn, lactRequest{Command: "list_devices"})
if err != nil {
return nil, err
}
var devices []lactDeviceEntry
if err := json.Unmarshal(data, &devices); err != nil {
return nil, err
}
return devices, nil
}
func lactGetDeviceStats(conn net.Conn, id string, name string, index int) (GpuStat, error) {
data, err := lactSendRequest(conn, lactRequest{
Command: "device_stats",
Args: struct {
ID string `json:"id"`
}{ID: id},
})
if err != nil {
return GpuStat{}, err
}
var stats lactDeviceStats
if err := json.Unmarshal(data, &stats); err != nil {
return GpuStat{}, err
}
var memUsedMB, memTotalMB int
if stats.Vram.Used != nil {
memUsedMB = int(*stats.Vram.Used / 1024 / 1024)
}
if stats.Vram.Total != nil {
memTotalMB = int(*stats.Vram.Total / 1024 / 1024)
}
var memUtil float64
if memTotalMB > 0 {
memUtil = float64(memUsedMB) / float64(memTotalMB) * 100
}
var gpuUtil float64
if stats.BusyPercent != nil {
gpuUtil = float64(*stats.BusyPercent)
}
var fanSpeed float64
if stats.Fan.PwmCurrent != nil {
fanSpeed = float64(*stats.Fan.PwmCurrent) / 255.0 * 100.0
}
var powerDraw float64
if stats.Power.Average != nil && *stats.Power.Average > 0 {
powerDraw = *stats.Power.Average
} else if stats.Power.Current != nil {
powerDraw = *stats.Power.Current
}
var tempC int
if t, ok := stats.Temps["edge"]; ok && t.Current != nil {
tempC = int(*t.Current)
} else if t, ok := stats.Temps["junction"]; ok && t.Current != nil {
tempC = int(*t.Current)
} else {
for _, t := range stats.Temps {
if t.Current != nil {
tempC = int(*t.Current)
break
}
}
}
var vramTempC int
// nvidia uses "VRAM", amd "mem"
for _, key := range []string{"mem", "VRAM"} {
if t, ok := stats.Temps[key]; ok && t.Current != nil && *t.Current > 0 {
vramTempC = int(*t.Current)
break
}
}
return GpuStat{
Timestamp: time.Now(),
ID: index,
Name: name,
UUID: id,
TempC: tempC,
VramTempC: vramTempC,
GpuUtilPct: gpuUtil,
MemUtilPct: memUtil,
MemUsedMB: memUsedMB,
MemTotalMB: memTotalMB,
FanSpeedPct: fanSpeed,
PowerDrawW: powerDraw,
}, nil
}
func readSysfs() ([]GpuStat, error) {
return nil, ErrNotImplemented
}
func readSysStats() (SysStat, error) {
cpuPcts, err := cpu.Percent(0, true)
if err != nil {
return SysStat{}, err
}
vmStat, err := mem.VirtualMemory()
if err != nil {
return SysStat{}, err
}
const toMB = 1024 * 1024
var swapTotalMB, swapUsedMB int
if swapStat, err := mem.SwapMemory(); err == nil {
swapTotalMB = int(swapStat.Total / toMB)
swapUsedMB = int(swapStat.Used / toMB)
}
var loadAvg1, loadAvg5, loadAvg15 float64
if loadStat, err := load.Avg(); err == nil {
loadAvg1 = loadStat.Load1
loadAvg5 = loadStat.Load5
loadAvg15 = loadStat.Load15
}
netIO := make([]NetIOStat, 0)
if ioCounters, err := psnet.IOCounters(true); err == nil {
for _, ioc := range ioCounters {
if ioc.Name == "lo" {
continue
}
netIO = append(netIO, NetIOStat{
Name: ioc.Name,
BytesRecv: ioc.BytesRecv,
BytesSent: ioc.BytesSent,
})
}
}
return SysStat{
Timestamp: time.Now(),
CpuUtilPerCore: cpuPcts,
MemTotalMB: int(vmStat.Total / toMB),
MemUsedMB: int(vmStat.Used / toMB),
MemFreeMB: int(vmStat.Free / toMB),
SwapTotalMB: swapTotalMB,
SwapUsedMB: swapUsedMB,
LoadAvg1: loadAvg1,
LoadAvg5: loadAvg5,
LoadAvg15: loadAvg15,
NetIO: netIO,
}, nil
}