Compare commits

...

3 Commits

Author SHA1 Message Date
knguyen298 79dc87f881 Add ROCm stats via rocm-smi (#767)
Add ROCm GPU stats support using `rocm-smi`.
2026-05-17 07:58:26 -07:00
krzychdre b2fcc2daa1 ui-svelte: fix cached tokens total counting -1 sentinel (#760)
The backend uses cache_tokens=-1 as a sentinel for endpoints that don't
report cache stats (embeddings, vLLM). The activity table correctly
renders these as "-", but the totals widget summed the sentinels
directly, so each such request subtracted 1 from the displayed total.

- clamp cache_tokens with Math.max(0, ...) when reducing
2026-05-15 14:42:44 -07:00
cdwaage 6a9c4efc8f fix: use --loop instead of -loop for nvidia-smi (driver 540+ compat) (#759) 2026-05-15 13:20:29 -07:00
2 changed files with 116 additions and 2 deletions
+115 -1
View File
@@ -38,6 +38,13 @@ func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monito
logger.Debugf("nvidia-smi: %s", err.Error())
}
if ch, err := tryRocmSmi(ctx, every, logger); err == nil {
logger.Info("using rocm-smi for GPU monitoring")
return ch, nil
} else {
logger.Debugf("rocm-smi: %s", err.Error())
}
if ch, err := trySysfs(ctx, every, logger); err == nil {
logger.Info("using sysfs for GPU monitoring")
return ch, nil
@@ -139,7 +146,7 @@ func tryNvidiaSmi(ctx context.Context, every time.Duration, logger *logmon.Monit
cmd := exec.CommandContext(ctx, "nvidia-smi",
"--query-gpu=index,name,uuid,temperature.gpu,utilization.gpu,memory.used,memory.total,fan.speed,power.draw",
"--format=csv,noheader,nounits",
"-loop", fmt.Sprintf("%d", sec),
"--loop", fmt.Sprintf("%d", sec),
)
stdout, err := cmd.StdoutPipe()
@@ -213,6 +220,113 @@ func parseNvidiaSmiLine(line string) *GpuStat {
}
}
// tryRocmSmi starts a background poller that periodically runs `rocm-smi`
// and publishes the parsed per-GPU stats on the returned channel.
//
// It returns ErrNoGpuTool when `rocm-smi` cannot be found on PATH. Poll
// intervals shorter than one second are raised to one second, and every
// invocation of the tool is bounded by a 5 second timeout.
//
// The returned channel has a buffer of one and is closed when ctx is done.
// Sends are non-blocking: if the previous sample has not been consumed yet,
// the new sample is dropped rather than stalling the poller. Polls that
// yield no parseable rows publish nothing.
func tryRocmSmi(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
	if _, err := exec.LookPath("rocm-smi"); err != nil {
		return nil, ErrNoGpuTool
	}
	if every < time.Second {
		every = time.Second
	}

	const pollTimeout = 5 * time.Second

	ch := make(chan []GpuStat, 1)
	go func() {
		defer close(ch)

		ticker := time.NewTicker(every)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			pollCtx, cancel := context.WithTimeout(ctx, pollTimeout)
			cmd := exec.CommandContext(pollCtx, "rocm-smi", "-i", "-P", "-t", "-f", "-u", "--showmemuse", "--showmeminfo", "vram", "--showproductname", "--csv")
			out, err := cmd.Output()
			// Capture the timeout state before cancel() overwrites pollCtx.Err().
			timedOut := pollCtx.Err() == context.DeadlineExceeded
			cancel()
			if err != nil {
				switch {
				case timedOut:
					logger.Debug("rocm-smi timed out")
				case ctx.Err() != nil:
					// Parent context cancelled: the command was killed on
					// purpose, and the next select iteration will return.
				default:
					// Surface non-zero exits and exec failures instead of
					// silently skipping the sample.
					logger.Debugf("rocm-smi: %s", err.Error())
				}
				continue
			}

			var stats []GpuStat
			scanner := bufio.NewScanner(strings.NewReader(string(out)))
			for scanner.Scan() {
				line := strings.TrimSpace(scanner.Text())
				// Skip blank lines and the CSV header row.
				if line == "" || strings.HasPrefix(line, "device,") {
					continue
				}
				if stat := parseRocmSmiLine(line); stat != nil {
					stats = append(stats, *stat)
				}
			}
			if err := scanner.Err(); err != nil {
				logger.Debugf("rocm-smi: %s", err.Error())
			}

			if len(stats) == 0 {
				continue
			}
			select {
			case ch <- stats:
			default:
				// Consumer has not drained the previous sample; drop this one.
			}
		}
	}()
	return ch, nil
}
// parseRocmSmiLine converts one comma-separated data row into a GpuStat.
//
// It returns nil when the row has fewer than 20 columns or when the first
// column, after stripping a "card" prefix, is not an integer. Numeric columns
// that fail to parse are left at whatever strconv returned (zero for
// malformed text) rather than rejecting the row.
//
// NOTE(review): the column positions are fixed and presumably match the CSV
// layout produced by the rocm-smi invocation in tryRocmSmi; verify against
// the installed rocm-smi version. Fields are split on bare commas, so a
// quoted value containing a comma would shift the columns.
func parseRocmSmiLine(line string) *GpuStat {
	const (
		minColumns  = 20
		bytesPerMiB = 1024 * 1024 // memory columns are presumably in bytes; confirm
	)

	cols := strings.Split(line, ",")
	if len(cols) < minColumns {
		return nil
	}

	// Small accessors so each column read below is a single expression.
	text := func(i int) string {
		return strings.TrimSpace(cols[i])
	}
	float := func(i int) float64 {
		v, _ := strconv.ParseFloat(text(i), 64)
		return v
	}
	unsigned := func(i int) uint64 {
		v, _ := strconv.ParseUint(text(i), 10, 64)
		return v
	}

	device := text(0)
	index, err := strconv.Atoi(strings.TrimPrefix(device, "card"))
	if err != nil {
		return nil
	}

	// Prefer column 19 for the display name, then column 1, and fall back to
	// the bare device identifier when both are empty or "N/A".
	label := device
	series, product := text(19), text(1)
	switch {
	case series != "" && series != "N/A":
		label = series + " " + device
	case product != "" && product != "N/A":
		label = product + " " + device
	}

	return &GpuStat{
		Timestamp:   time.Now(),
		ID:          index,
		Name:        label,
		UUID:        text(5),
		TempC:       int(float(6)),
		VramTempC:   int(float(8)),
		GpuUtilPct:  float(13),
		MemUtilPct:  float(14),
		MemUsedMB:   int(unsigned(18) / bytesPerMiB),
		MemTotalMB:  int(unsigned(17) / bytesPerMiB),
		FanSpeedPct: float(10),
		PowerDrawW:  float(12),
	}
}
// trySysfs is a placeholder for a sysfs-backed GPU stats source. It ignores
// its arguments and always returns a nil channel together with
// ErrNotImplemented.
func trySysfs(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
	return nil, ErrNotImplemented
}
@@ -11,7 +11,7 @@
const totalRequests = $metrics.length;
const totalInputTokens = $metrics.reduce((sum, m) => sum + m.tokens.input_tokens, 0);
const totalOutputTokens = $metrics.reduce((sum, m) => sum + m.tokens.output_tokens, 0);
const totalCacheTokens = $metrics.reduce((sum, m) => sum + m.tokens.cache_tokens, 0);
const totalCacheTokens = $metrics.reduce((sum, m) => sum + Math.max(0, m.tokens.cache_tokens), 0);
const promptPerSecond = $metrics.filter((m) => m.tokens.prompt_per_second > 0).map((m) => m.tokens.prompt_per_second);