diff --git a/internal/perf/gpu_parse.go b/internal/perf/gpu_parse.go new file mode 100644 index 00000000..011ef421 --- /dev/null +++ b/internal/perf/gpu_parse.go @@ -0,0 +1,45 @@ +package perf + +import ( + "strconv" + "strings" + "time" +) + +// ParseNvidiaSmiLine parses a single line from nvidia-smi CSV output. +// Format: index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw +func ParseNvidiaSmiLine(line string) *GpuStat { + fields := strings.Split(line, ",") + if len(fields) < 9 { + return nil + } + + id, _ := strconv.Atoi(strings.TrimSpace(fields[0])) + name := strings.TrimSpace(fields[1]) + uuid := strings.TrimSpace(fields[2]) + tempC, _ := strconv.Atoi(strings.TrimSpace(fields[3])) + gpuUtil, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64) + memUsed, _ := strconv.Atoi(strings.TrimSpace(fields[5])) + memTotal, _ := strconv.Atoi(strings.TrimSpace(fields[6])) + fanSpeed, _ := strconv.ParseFloat(strings.TrimSpace(fields[7]), 64) + powerDraw, _ := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64) + + var memUtil float64 + if memTotal > 0 { + memUtil = float64(memUsed) / float64(memTotal) * 100 + } + + return &GpuStat{ + Timestamp: time.Now(), + ID: id, + Name: name, + UUID: uuid, + TempC: tempC, + GpuUtilPct: gpuUtil, + MemUtilPct: memUtil, + MemUsedMB: memUsed, + MemTotalMB: memTotal, + FanSpeedPct: fanSpeed, + PowerDrawW: powerDraw, + } +} diff --git a/internal/perf/monitor_test.go b/internal/perf/monitor_test.go index e8aa267e..6ae53637 100644 --- a/internal/perf/monitor_test.go +++ b/internal/perf/monitor_test.go @@ -224,3 +224,43 @@ func TestCurrent_ConcurrentAccess(t *testing.T) { } wg.Wait() } + +func TestParseNvidiaSmiLine_ValidLine(t *testing.T) { + line := "0, NVIDIA GeForce RTX 3080, GPU-12345678-1234-1234-1234-123456789abc, 65, 80, 8192, 10240, 75, 250" + + stat := ParseNvidiaSmiLine(line) + require.NotNil(t, stat) + + assert.Equal(t, 0, stat.ID) + assert.Equal(t, "NVIDIA GeForce RTX 3080", stat.Name) + assert.Equal(t, "GPU-12345678-1234-1234-1234-123456789abc", stat.UUID) + assert.Equal(t, 65, stat.TempC) + assert.Equal(t, 80.0, stat.GpuUtilPct) + assert.Equal(t, 8192, stat.MemUsedMB) + assert.Equal(t, 10240, stat.MemTotalMB) + assert.Equal(t, 75.0, stat.FanSpeedPct) + assert.Equal(t, 250.0, stat.PowerDrawW) + assert.InDelta(t, 80.0, stat.MemUtilPct, 0.01) +} + +func TestParseNvidiaSmiLine_ShortLine(t *testing.T) { + line := "0, NVIDIA GPU, GPU-123" + + stat := ParseNvidiaSmiLine(line) + assert.Nil(t, stat) +} + +func TestParseNvidiaSmiLine_MissingFields(t *testing.T) { + line := "0, NVIDIA GPU, GPU-123, 65, 80, 8192, 10240, 75" + + stat := ParseNvidiaSmiLine(line) + assert.Nil(t, stat) +} + +func TestParseNvidiaSmiLine_ZeroMemoryTotal(t *testing.T) { + line := "0, NVIDIA GPU, GPU-123, 65, 80, 0, 0, 75, 250" + + stat := ParseNvidiaSmiLine(line) + require.NotNil(t, stat) + assert.Equal(t, 0.0, stat.MemUtilPct) +} diff --git a/internal/perf/monitor_unix.go b/internal/perf/monitor_unix.go index 84ba5e06..3cebe053 100644 --- a/internal/perf/monitor_unix.go +++ b/internal/perf/monitor_unix.go @@ -170,7 +170,7 @@ func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monit continue } - stat := parseNvidiaSmiLine(line) + stat := ParseNvidiaSmiLine(line) if stat != nil { select { case ch <- []GpuStat{*stat}: @@ -184,42 +184,6 @@ func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monit return ch, nil } -func parseNvidiaSmiLine(line string) *GpuStat { - fields := strings.Split(line, ", ") - if len(fields) < 9 { - return nil - } - - id, _ := strconv.Atoi(strings.TrimSpace(fields[0])) - name := strings.TrimSpace(fields[1]) - uuid := strings.TrimSpace(fields[2]) - tempC, _ := strconv.Atoi(strings.TrimSpace(fields[3])) - gpuUtil, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64) - memUsed, _ := strconv.Atoi(strings.TrimSpace(fields[5])) - memTotal, _ := strconv.Atoi(strings.TrimSpace(fields[6])) - fanSpeed, _ := strconv.ParseFloat(strings.TrimSpace(fields[7]), 64) - powerDraw, _ := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64) - - var memUtil float64 - if memTotal > 0 { - memUtil = float64(memUsed) / float64(memTotal) * 100 - } - - return &GpuStat{ - Timestamp: time.Now(), - ID: id, - Name: name, - UUID: uuid, - TempC: tempC, - GpuUtilPct: gpuUtil, - MemUtilPct: memUtil, - MemUsedMB: memUsed, - MemTotalMB: memTotal, - FanSpeedPct: fanSpeed, - PowerDrawW: powerDraw, - } -} - func tryRocmSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) { if _, err := exec.LookPath("rocm-smi"); err != nil { return nil, ErrNoGpuTool diff --git a/internal/perf/monitor_windows.go b/internal/perf/monitor_windows.go index acf9f986..656eb880 100644 --- a/internal/perf/monitor_windows.go +++ b/internal/perf/monitor_windows.go @@ -1,7 +1,11 @@ package perf import ( + "bufio" "context" + "fmt" + "os/exec" + "strings" "time" "github.com/mostlygeek/llama-swap/internal/logmon" @@ -11,7 +15,68 @@ import ( ) func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) { - return nil, ErrNotImplemented + if ch, err := tryNvidiaSmiWindows(ctx, every, logger); err == nil { + logger.Info("using nvidia-smi for GPU monitoring") + return ch, nil + } else { + logger.Debugf("nvidia-smi: %s", err.Error()) + } + + return nil, ErrNoGpuTool +} + +// tryNvidiaSmiWindows starts nvidia-smi in loop mode on Windows and returns +// a channel receiving GPU stat snapshots. Returns ErrNoGpuTool if nvidia-smi +// is not available. +func tryNvidiaSmiWindows(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) { + if _, err := exec.LookPath("nvidia-smi"); err != nil { + return nil, ErrNoGpuTool + } + + sec := int(every.Seconds()) + if sec < 1 { + sec = 1 + } + + cmd := exec.CommandContext(ctx, "nvidia-smi", + "--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw", + "--format=csv,noheader,nounits", + "--loop", fmt.Sprintf("%d", sec), + ) + + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, fmt.Errorf("nvidia-smi stdout pipe failed: %w", err) + } + + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("nvidia-smi start failed: %w", err) + } + + ch := make(chan []GpuStat, 1) + + go func() { + defer close(ch) + + scanner := bufio.NewScanner(stdout) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + stat := ParseNvidiaSmiLine(line) + if stat != nil { + select { + case ch <- []GpuStat{*stat}: + default: + } + } + } + cmd.Wait() + }() + + return ch, nil } func readSysStats() (SysStat, error) {