0ab214d1c8
Add GPU monitoring support for AMD and Intel GPUs on Windows using D3DKMT (DirectX) and PDH performance counters. - Add PDH-based GPU utilization via \GPU Engine(*)\Utilization Percentage counter, summing all engine types per adapter (3D, Compute, Copy, Video). - Add D3DKMT bindings for adapter enumeration, memory segments, and adapter perf data. - Use PDH as primary utilization source (works on all vendors), with D3DKMT RunningTime as fallback for systems without PDH counters. - Prefer nvidia-smi when available, fall back to D3DKMT + PDH for AMD/Intel. - Backend priority: nvidia-smi -> D3DKMT + PDH -> ErrNoGpuTool. Verified on AMD 7900XTX GPU with llama.cpp Vulkan & ROCm backends: GPU utilization correctly shows ~99% during inference, ~0-2% when idle. --- LLM disclosure: GLM 5.1 & Kimi K2.6 have been used extensively during exploration and coding, to the point that the LLMs wrote over 3/4 of the code, and I have done additional verification myself. As such, it should be considered experimental. Additional verification is needed. I have tested it on my 7900XTX system with Windows 11, and it works correctly, but as I only have this one rig, I cannot verify it everywhere.
122 lines
2.7 KiB
Go
122 lines
2.7 KiB
Go
package perf
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"fmt"
|
|
"os/exec"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/mostlygeek/llama-swap/internal/logmon"
|
|
"github.com/shirou/gopsutil/v4/cpu"
|
|
"github.com/shirou/gopsutil/v4/mem"
|
|
"github.com/shirou/gopsutil/v4/net"
|
|
)
|
|
|
|
func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
|
if ch, err := tryNvidiaSmiWindows(ctx, every, logger); err == nil {
|
|
logger.Info("using nvidia-smi for GPU monitoring")
|
|
return ch, nil
|
|
} else {
|
|
logger.Debugf("nvidia-smi: %s", err.Error())
|
|
}
|
|
|
|
if ch, err := tryD3DKMT(ctx, every, logger); err == nil {
|
|
logger.Info("using D3DKMT for GPU monitoring")
|
|
return ch, nil
|
|
} else {
|
|
logger.Debugf("D3DKMT: %s", err.Error())
|
|
}
|
|
|
|
return nil, ErrNoGpuTool
|
|
}
|
|
|
|
// tryNvidiaSmiWindows starts nvidia-smi in loop mode on Windows and returns
|
|
// a channel receiving GPU stat snapshots. Returns ErrNoGpuTool if nvidia-smi
|
|
// is not available.
|
|
func tryNvidiaSmiWindows(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
|
if _, err := exec.LookPath("nvidia-smi"); err != nil {
|
|
return nil, ErrNoGpuTool
|
|
}
|
|
|
|
sec := int(every.Seconds())
|
|
if sec < 1 {
|
|
sec = 1
|
|
}
|
|
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi",
|
|
"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
|
|
"--format=csv,noheader,nounits",
|
|
"--loop", fmt.Sprintf("%d", sec),
|
|
)
|
|
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err)
|
|
}
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
return nil, fmt.Errorf("nvidia-smi start failed: %w", err)
|
|
}
|
|
|
|
ch := make(chan []GpuStat, 1)
|
|
|
|
go func() {
|
|
defer close(ch)
|
|
|
|
scanner := bufio.NewScanner(stdout)
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
stat := ParseNvidiaSmiLine(line)
|
|
if stat != nil {
|
|
select {
|
|
case ch <- []GpuStat{*stat}:
|
|
default:
|
|
}
|
|
}
|
|
}
|
|
cmd.Wait()
|
|
}()
|
|
|
|
return ch, nil
|
|
}
|
|
|
|
func readSysStats() (SysStat, error) {
|
|
cpuPcts, err := cpu.Percent(0, true)
|
|
if err != nil {
|
|
return SysStat{}, err
|
|
}
|
|
|
|
vmStat, err := mem.VirtualMemory()
|
|
if err != nil {
|
|
return SysStat{}, err
|
|
}
|
|
|
|
const toMB = 1024 * 1024
|
|
|
|
netIO := make([]NetIOStat, 0)
|
|
if ioCounters, err := net.IOCounters(true); err == nil {
|
|
for _, ioc := range ioCounters {
|
|
netIO = append(netIO, NetIOStat{
|
|
Name: ioc.Name,
|
|
BytesRecv: ioc.BytesRecv,
|
|
BytesSent: ioc.BytesSent,
|
|
})
|
|
}
|
|
}
|
|
|
|
return SysStat{
|
|
Timestamp: time.Now(),
|
|
CpuUtilPerCore: cpuPcts,
|
|
MemTotalMB: int(vmStat.Total / toMB),
|
|
MemUsedMB: int(vmStat.Used / toMB),
|
|
MemFreeMB: int(vmStat.Free / toMB),
|
|
NetIO: netIO,
|
|
}, nil
|
|
}
|