perf: add vendor-agnostic GPU monitoring for Windows (experimental) (#779)
Add GPU monitoring support for AMD and Intel GPUs on Windows using D3DKMT (DirectX) and PDH performance counters. - Add PDH-based GPU utilization via \GPU Engine(*)\Utilization Percentage counter, summing all engine types per adapter (3D, Compute, Copy, Video). - Add D3DKMT bindings for adapter enumeration, memory segments, and adapter perf data. - Use PDH as primary utilization source (works on all vendors), with D3DKMT RunningTime as fallback for systems without PDH counters. - Prefer nvidia-smi when available, fall back to D3DKMT + PDH for AMD/Intel. - Backend priority: nvidia-smi -> D3DKMT + PDH -> ErrNoGpuTool. Verified on AMD 7900XTX GPU with llama.cpp Vulkan & ROCm backend: GPU utilization correctly shows ~99% during inference, ~0-2% when idle. --- LLM disclosure: GLM 5.1 & Kimi K2.6 have been used extensively during exploration and coding to the point that the LLM's wrote over 3/4 of the code, and I have done additional verification myself. As such, it should be considered experimental. Additional verification is needed. I have tested it on my 7900XTX system with Windows 11, and it works correctly, but as I only have this one rig, I cannot verify it everywhere.
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
package perf
|
||||
|
||||
type LUID struct {
|
||||
LowPart uint32
|
||||
HighPart int32
|
||||
}
|
||||
|
||||
const maxEnumAdapters = 16
|
||||
|
||||
type D3DKMT_ENUMADAPTERS2 struct {
|
||||
NumAdapters uint32
|
||||
pAdapters uintptr
|
||||
}
|
||||
|
||||
type D3DKMT_ADAPTERINFO struct {
|
||||
hAdapter uint32
|
||||
AdapterLuid LUID
|
||||
NumOfSources uint32
|
||||
bPresentMoveRegionsPreferred int32
|
||||
}
|
||||
|
||||
type D3DKMT_OPENADAPTERFROMLUID struct {
|
||||
AdapterLuid LUID
|
||||
hAdapter uint32
|
||||
}
|
||||
|
||||
type D3DKMT_CLOSEADAPTER struct {
|
||||
hAdapter uint32
|
||||
}
|
||||
|
||||
type KMTQUERYADAPTERINFOTYPE int32
|
||||
|
||||
const (
|
||||
KMTQAITYPE_UMDRIVERPRIVATE KMTQUERYADAPTERINFOTYPE = 0
|
||||
KMTQAITYPE_ADAPTERREGISTRYINFO KMTQUERYADAPTERINFOTYPE = 8
|
||||
KMTQAITYPE_DRIVERVERSION KMTQUERYADAPTERINFOTYPE = 13
|
||||
KMTQAITYPE_PHYSICALADAPTERDEVICEIDS KMTQUERYADAPTERINFOTYPE = 31
|
||||
KMTQAITYPE_NODEPERFDATA KMTQUERYADAPTERINFOTYPE = 61
|
||||
KMTQAITYPE_ADAPTERPERFDATA KMTQUERYADAPTERINFOTYPE = 62
|
||||
KMTQAITYPE_ADAPTERPERFDATA_CAPS KMTQUERYADAPTERINFOTYPE = 63
|
||||
)
|
||||
|
||||
type D3DKMT_QUERYADAPTERINFO struct {
|
||||
hAdapter uint32
|
||||
Type KMTQUERYADAPTERINFOTYPE
|
||||
pPrivateDriverData uintptr
|
||||
PrivateDriverDataSize uint32
|
||||
}
|
||||
|
||||
type D3DKMT_ADAPTER_PERFDATA struct {
|
||||
PhysicalAdapterIndex uint32
|
||||
MemoryFrequency uint64
|
||||
MaxMemoryFrequency uint64
|
||||
MaxMemoryFrequencyOC uint64
|
||||
MemoryBandwidth uint64
|
||||
PCIEBandwidth uint64
|
||||
FanRPM uint32
|
||||
Power uint32
|
||||
Temperature uint32
|
||||
PowerStateOverride byte
|
||||
}
|
||||
|
||||
type D3DKMT_QUERYSTATISTICS_TYPE int32
|
||||
|
||||
const (
|
||||
D3DKMT_QUERYSTATISTICS_ADAPTER D3DKMT_QUERYSTATISTICS_TYPE = 0
|
||||
D3DKMT_QUERYSTATISTICS_PROCESS D3DKMT_QUERYSTATISTICS_TYPE = 1
|
||||
D3DKMT_QUERYSTATISTICS_PROCESS_ADAPTER D3DKMT_QUERYSTATISTICS_TYPE = 2
|
||||
D3DKMT_QUERYSTATISTICS_SEGMENT D3DKMT_QUERYSTATISTICS_TYPE = 3
|
||||
D3DKMT_QUERYSTATISTICS_PROCESS_SEGMENT D3DKMT_QUERYSTATISTICS_TYPE = 4
|
||||
D3DKMT_QUERYSTATISTICS_NODE D3DKMT_QUERYSTATISTICS_TYPE = 5
|
||||
D3DKMT_QUERYSTATISTICS_PROCESS_NODE D3DKMT_QUERYSTATISTICS_TYPE = 6
|
||||
D3DKMT_QUERYSTATISTICS_VIDPNSOURCE D3DKMT_QUERYSTATISTICS_TYPE = 7
|
||||
D3DKMT_QUERYSTATISTICS_PROCESS_VIDPNSOURCE D3DKMT_QUERYSTATISTICS_TYPE = 8
|
||||
)
|
||||
|
||||
type D3DKMT_ADAPTER_PERFDATACAPS struct {
|
||||
PhysicalAdapterIndex uint32
|
||||
MaxMemoryBandwidth uint64
|
||||
MaxPCIEBandwidth uint64
|
||||
MaxFanRPM uint32
|
||||
TemperatureMax uint32
|
||||
TemperatureWarning uint32
|
||||
}
|
||||
|
||||
type D3DKMT_QUERYSTATISTICS_QUERY_SEGMENT struct {
|
||||
SegmentId uint32
|
||||
}
|
||||
|
||||
type D3DKMT_QUERYSTATISTICS_QUERY_NODE struct {
|
||||
NodeId uint32
|
||||
}
|
||||
@@ -0,0 +1,529 @@
|
||||
//go:build windows
|
||||
|
||||
package perf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mostlygeek/llama-swap/internal/logmon"
|
||||
"golang.org/x/sys/windows"
|
||||
)
|
||||
|
||||
var (
|
||||
d3dkmDLL *windows.LazyDLL
|
||||
procEnumAdapters2 *windows.LazyProc
|
||||
procOpenAdapterFromLuid *windows.LazyProc
|
||||
procCloseAdapter *windows.LazyProc
|
||||
procQueryAdapterInfo *windows.LazyProc
|
||||
procQueryStatistics *windows.LazyProc
|
||||
d3dkmtInitOnce sync.Once
|
||||
d3dkmtInitErr error
|
||||
)
|
||||
|
||||
// initD3DKMT lazily loads gdi32.dll and resolves D3DKMT function pointers.
|
||||
// Safe for concurrent use via sync.Once.
|
||||
func initD3DKMT() error {
|
||||
d3dkmtInitOnce.Do(func() {
|
||||
d3dkmDLL = windows.NewLazySystemDLL("gdi32.dll")
|
||||
|
||||
procEnumAdapters2 = d3dkmDLL.NewProc("D3DKMTEnumAdapters2")
|
||||
procOpenAdapterFromLuid = d3dkmDLL.NewProc("D3DKMTOpenAdapterFromLuid")
|
||||
procCloseAdapter = d3dkmDLL.NewProc("D3DKMTCloseAdapter")
|
||||
procQueryAdapterInfo = d3dkmDLL.NewProc("D3DKMTQueryAdapterInfo")
|
||||
procQueryStatistics = d3dkmDLL.NewProc("D3DKMTQueryStatistics")
|
||||
|
||||
for name, p := range map[string]*windows.LazyProc{
|
||||
"D3DKMTEnumAdapters2": procEnumAdapters2,
|
||||
"D3DKMTOpenAdapterFromLuid": procOpenAdapterFromLuid,
|
||||
"D3DKMTCloseAdapter": procCloseAdapter,
|
||||
"D3DKMTQueryAdapterInfo": procQueryAdapterInfo,
|
||||
"D3DKMTQueryStatistics": procQueryStatistics,
|
||||
} {
|
||||
if err := p.Find(); err != nil {
|
||||
d3dkmtInitErr = fmt.Errorf("D3DKMT %s not found: %w", name, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
return d3dkmtInitErr
|
||||
}
|
||||
|
||||
// ntstatusCall invokes a D3DKMT function and returns a non-nil error if the
|
||||
// NTSTATUS result is not STATUS_SUCCESS (0).
|
||||
func ntstatusCall(proc *windows.LazyProc, arg unsafe.Pointer) error {
|
||||
ret, _, _ := proc.Call(uintptr(arg))
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("NTSTATUS 0x%08x", uint32(ret))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// d3dkmEnumerateAdapters enumerates all available graphics adapters via
|
||||
// D3DKMTEnumAdapters2.
|
||||
func d3dkmEnumerateAdapters() ([]D3DKMT_ADAPTERINFO, error) {
|
||||
var adapters [maxEnumAdapters]D3DKMT_ADAPTERINFO
|
||||
enum := D3DKMT_ENUMADAPTERS2{
|
||||
NumAdapters: maxEnumAdapters,
|
||||
pAdapters: uintptr(unsafe.Pointer(&adapters[0])),
|
||||
}
|
||||
if err := ntstatusCall(procEnumAdapters2, unsafe.Pointer(&enum)); err != nil {
|
||||
return nil, fmt.Errorf("EnumAdapters2: %w", err)
|
||||
}
|
||||
if enum.NumAdapters == 0 {
|
||||
return nil, fmt.Errorf("no adapters found")
|
||||
}
|
||||
result := make([]D3DKMT_ADAPTERINFO, enum.NumAdapters)
|
||||
for i := uint32(0); i < enum.NumAdapters; i++ {
|
||||
result[i] = adapters[i]
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// d3dkmOpenAdapter opens a D3DKMT adapter handle for the given LUID.
|
||||
func d3dkmOpenAdapter(luid LUID) (uint32, error) {
|
||||
req := D3DKMT_OPENADAPTERFROMLUID{
|
||||
AdapterLuid: luid,
|
||||
}
|
||||
if err := ntstatusCall(procOpenAdapterFromLuid, unsafe.Pointer(&req)); err != nil {
|
||||
return 0, fmt.Errorf("OpenAdapterFromLuid: %w", err)
|
||||
}
|
||||
return req.hAdapter, nil
|
||||
}
|
||||
|
||||
// d3dkmCloseAdapter closes a previously opened D3DKMT adapter handle.
|
||||
func d3dkmCloseAdapter(hAdapter uint32) error {
|
||||
req := D3DKMT_CLOSEADAPTER{hAdapter: hAdapter}
|
||||
return ntstatusCall(procCloseAdapter, unsafe.Pointer(&req))
|
||||
}
|
||||
|
||||
// d3dkmGetAdapterPerfData queries per-adapter performance data (temperature,
|
||||
// fan RPM, power, bandwidth) via KMTQAITYPE_ADAPTERPERFDATA.
|
||||
func d3dkmGetAdapterPerfData(hAdapter uint32) (*D3DKMT_ADAPTER_PERFDATA, error) {
|
||||
var data D3DKMT_ADAPTER_PERFDATA
|
||||
req := D3DKMT_QUERYADAPTERINFO{
|
||||
hAdapter: hAdapter,
|
||||
Type: KMTQAITYPE_ADAPTERPERFDATA,
|
||||
pPrivateDriverData: uintptr(unsafe.Pointer(&data)),
|
||||
PrivateDriverDataSize: uint32(unsafe.Sizeof(data)),
|
||||
}
|
||||
if err := ntstatusCall(procQueryAdapterInfo, unsafe.Pointer(&req)); err != nil {
|
||||
return nil, fmt.Errorf("QueryAdapterInfo(ADAPTERPERFDATA): %w", err)
|
||||
}
|
||||
return &data, nil
|
||||
}
|
||||
|
||||
// d3dkmGetAdapterPerfDataCaps queries static adapter performance capabilities
|
||||
// (max fan RPM, temperature limits, max bandwidth) via KMTQAITYPE_ADAPTERPERFDATA_CAPS.
|
||||
func d3dkmGetAdapterPerfDataCaps(hAdapter uint32) (*D3DKMT_ADAPTER_PERFDATACAPS, error) {
|
||||
var data D3DKMT_ADAPTER_PERFDATACAPS
|
||||
req := D3DKMT_QUERYADAPTERINFO{
|
||||
hAdapter: hAdapter,
|
||||
Type: KMTQAITYPE_ADAPTERPERFDATA_CAPS,
|
||||
pPrivateDriverData: uintptr(unsafe.Pointer(&data)),
|
||||
PrivateDriverDataSize: uint32(unsafe.Sizeof(data)),
|
||||
}
|
||||
if err := ntstatusCall(procQueryAdapterInfo, unsafe.Pointer(&req)); err != nil {
|
||||
return nil, fmt.Errorf("QueryAdapterInfo(ADAPTERPERFDATACAPS): %w", err)
|
||||
}
|
||||
return &data, nil
|
||||
}
|
||||
|
||||
type queryStatsBuffer struct {
|
||||
Type int32 // offset 0
|
||||
AdapterLuid LUID // offset 4
|
||||
hProcess uintptr // offset 16
|
||||
// _result mirrors the D3DKMT_QUERYSTATISTICS_RESULT union.
|
||||
// sizeof(D3DKMT_QUERYSTATISTICS) == 0x328 (808 bytes) on x64.
|
||||
//
|
||||
// The C struct layout (x64):
|
||||
// offset 0: Type (int32, 4 bytes)
|
||||
// offset 4: AdapterLuid (LUID, 8 bytes)
|
||||
// offset 12: 4 bytes padding (for 8-byte alignment of hProcess)
|
||||
// offset 16: hProcess (HANDLE, 8 bytes)
|
||||
// offset 24: QueryResult (union, 780 bytes — largest member is AdapterInformation)
|
||||
// offset 804: anonymous input union (QueryNode.NodeId / QuerySegment.SegmentId, 4 bytes)
|
||||
//
|
||||
// Previous bug: _result was [776]byte, placing QueryId at offset 800 instead of 804.
|
||||
// The kernel read NodeId/SegmentId from offset 804 (always zero from _pad),
|
||||
// causing all NODE and SEGMENT queries to use index 0 regardless of the value
|
||||
// passed in QueryId. This produced alternating behavior where only GPU util OR
|
||||
// memory util appeared to work, depending on which test variant happened to put
|
||||
// non-zero data near offset 804 in the result buffer.
|
||||
_result [780]byte // offset 24, size 780 — places QueryId at offset 804
|
||||
QueryId int32 // offset 804 — matches C anonymous union for NodeId/SegmentId
|
||||
}
|
||||
|
||||
func init() {
|
||||
var buf queryStatsBuffer
|
||||
if unsafe.Sizeof(buf) != 808 {
|
||||
panic(fmt.Sprintf("queryStatsBuffer size %d != expected 808 (sizeof D3DKMT_QUERYSTATISTICS on x64)", unsafe.Sizeof(buf)))
|
||||
}
|
||||
if unsafe.Offsetof(buf.QueryId) != 804 {
|
||||
panic(fmt.Sprintf("queryStatsBuffer.QueryId offset %d != expected 804 (C anonymous union offset)", unsafe.Offsetof(buf.QueryId)))
|
||||
}
|
||||
|
||||
var perfData D3DKMT_ADAPTER_PERFDATA
|
||||
if unsafe.Sizeof(perfData) != 64 {
|
||||
panic(fmt.Sprintf("D3DKMT_ADAPTER_PERFDATA size %d != expected 64 on x64", unsafe.Sizeof(perfData)))
|
||||
}
|
||||
|
||||
var caps D3DKMT_ADAPTER_PERFDATACAPS
|
||||
if unsafe.Sizeof(caps) != 40 {
|
||||
panic(fmt.Sprintf("D3DKMT_ADAPTER_PERFDATACAPS size %d != expected 40 on x64", unsafe.Sizeof(caps)))
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
qsoffsetNbSegments = 0
|
||||
qsoffsetNodeCount = 4
|
||||
qsoffsetCommitLimit = 0
|
||||
qsoffsetBytesCommitted = 8
|
||||
qsoffsetBytesResident = 16
|
||||
qsoffsetRunningTime = 0
|
||||
qsoffsetSystemRunningTime = 272
|
||||
)
|
||||
|
||||
// d3dkmQueryAdapterStats returns the number of memory segments and compute
|
||||
// nodes for the adapter identified by luid.
|
||||
func d3dkmQueryAdapterStats(luid LUID) (nbSegments uint32, nodeCount uint32, err error) {
|
||||
buf := queryStatsBuffer{
|
||||
Type: int32(D3DKMT_QUERYSTATISTICS_ADAPTER),
|
||||
AdapterLuid: luid,
|
||||
}
|
||||
if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
|
||||
return 0, 0, fmt.Errorf("QueryStatistics(ADAPTER): %w", err)
|
||||
}
|
||||
nbSegments = binary.LittleEndian.Uint32(buf._result[qsoffsetNbSegments : qsoffsetNbSegments+4])
|
||||
nodeCount = binary.LittleEndian.Uint32(buf._result[qsoffsetNodeCount : qsoffsetNodeCount+4])
|
||||
return nbSegments, nodeCount, nil
|
||||
}
|
||||
|
||||
// d3dkmQuerySegmentStats returns the commit limit (total) and resident
|
||||
// (used) bytes for the given memory segment of an adapter.
|
||||
func d3dkmQuerySegmentStats(luid LUID, segmentID uint32) (commitLimit uint64, bytesResident uint64, err error) {
|
||||
buf := queryStatsBuffer{
|
||||
Type: int32(D3DKMT_QUERYSTATISTICS_SEGMENT),
|
||||
AdapterLuid: luid,
|
||||
QueryId: int32(segmentID),
|
||||
}
|
||||
if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
|
||||
return 0, 0, fmt.Errorf("QueryStatistics(SEGMENT %d): %w", segmentID, err)
|
||||
}
|
||||
commitLimit = binary.LittleEndian.Uint64(buf._result[qsoffsetCommitLimit : qsoffsetCommitLimit+8])
|
||||
bytesResident = binary.LittleEndian.Uint64(buf._result[qsoffsetBytesResident : qsoffsetBytesResident+8])
|
||||
if bytesResident == 0 {
|
||||
bytesResident = binary.LittleEndian.Uint64(buf._result[qsoffsetBytesCommitted : qsoffsetBytesCommitted+8])
|
||||
}
|
||||
return commitLimit, bytesResident, nil
|
||||
}
|
||||
|
||||
// d3dkmQueryNodeStats returns the global and system running time counters
|
||||
// (in 100ns units) for the given compute node of an adapter.
|
||||
func d3dkmQueryNodeStats(luid LUID, nodeID uint32) (runningTime uint64, systemRunningTime uint64, err error) {
|
||||
buf := queryStatsBuffer{
|
||||
Type: int32(D3DKMT_QUERYSTATISTICS_NODE),
|
||||
AdapterLuid: luid,
|
||||
QueryId: int32(nodeID),
|
||||
}
|
||||
if err := ntstatusCall(procQueryStatistics, unsafe.Pointer(&buf)); err != nil {
|
||||
return 0, 0, fmt.Errorf("QueryStatistics(NODE %d): %w", nodeID, err)
|
||||
}
|
||||
runningTime = binary.LittleEndian.Uint64(buf._result[qsoffsetRunningTime : qsoffsetRunningTime+8])
|
||||
systemRunningTime = binary.LittleEndian.Uint64(buf._result[qsoffsetSystemRunningTime : qsoffsetSystemRunningTime+8])
|
||||
return runningTime, systemRunningTime, nil
|
||||
}
|
||||
|
||||
type nodeRunningTimes struct {
|
||||
Global uint64
|
||||
System uint64
|
||||
}
|
||||
|
||||
// d3dkmtNodeUtil computes GPU node utilization as a percentage from running
|
||||
// time deltas. Returns -1 if counters went backwards (wrap/reset), 0 if idle.
|
||||
func d3dkmtNodeUtil(prevRT, curRT nodeRunningTimes, elapsed100ns int64) float64 {
|
||||
if curRT.Global < prevRT.Global || curRT.System < prevRT.System {
|
||||
return -1
|
||||
}
|
||||
gd := curRT.Global - prevRT.Global
|
||||
sd := curRT.System - prevRT.System
|
||||
|
||||
if gd > 0 && sd > 0 {
|
||||
util := float64(gd) / float64(sd)
|
||||
if util > 1.0 {
|
||||
util = 1.0
|
||||
}
|
||||
return util * 100.0
|
||||
} else if gd > 0 && elapsed100ns > 0 {
|
||||
util := float64(gd) / float64(elapsed100ns) * 100.0
|
||||
if util > 100.0 {
|
||||
util = 100.0
|
||||
}
|
||||
return util
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// d3dkmtFanPct returns fan speed as a percentage of maxFanRPM, clamped to
|
||||
// 100%. Returns 0 if maxFanRPM is unavailable or fan is not spinning.
|
||||
func d3dkmtFanPct(fanRPM, maxFanRPM uint32) float64 {
|
||||
if maxFanRPM > 0 && fanRPM > 0 {
|
||||
pct := float64(fanRPM) / float64(maxFanRPM) * 100.0
|
||||
if pct > 100.0 {
|
||||
pct = 100.0
|
||||
}
|
||||
return pct
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// d3dkmtPowerW converts power from deci-watts (as reported by D3DKMT) to
|
||||
// watts. Returns 0 if the power value is zero.
|
||||
func d3dkmtPowerW(power uint32) float64 {
|
||||
if power > 0 {
|
||||
return float64(power) / 10.0
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// d3dkmtTempC converts temperature from deci-Celsius (as reported by D3DKMT)
|
||||
// to degrees Celsius.
|
||||
func d3dkmtTempC(tempDeciC uint32) int {
|
||||
return int(tempDeciC / 10)
|
||||
}
|
||||
|
||||
type d3dkmtAdapterState struct {
|
||||
luid LUID
|
||||
hAdapter uint32
|
||||
nbSegments uint32
|
||||
nodeCount uint32
|
||||
maxFanRPM uint32
|
||||
prevNodeRT map[uint32]nodeRunningTimes
|
||||
prevTime time.Time
|
||||
}
|
||||
|
||||
// tryD3DKMT attempts to start GPU monitoring using D3DKMT and optional PDH
|
||||
// counters. It returns a channel of GpuStat snapshots or an error if no
|
||||
// usable adapters are found.
|
||||
func tryD3DKMT(ctx context.Context, every time.Duration, logger *logmon.Monitor) (chan []GpuStat, error) {
|
||||
if err := initD3DKMT(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
adapterInfos, err := d3dkmEnumerateAdapters()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
type adapterMeta struct {
|
||||
luid LUID
|
||||
nbSegments uint32
|
||||
nodeCount uint32
|
||||
maxFanRPM uint32
|
||||
}
|
||||
|
||||
var metaList []adapterMeta
|
||||
|
||||
for i, ai := range adapterInfos {
|
||||
hAdapter, err := d3dkmOpenAdapter(ai.AdapterLuid)
|
||||
if err != nil {
|
||||
logger.Debugf("adapter %d: open failed: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
nbSegments, nodeCount, err := d3dkmQueryAdapterStats(ai.AdapterLuid)
|
||||
if err != nil {
|
||||
logger.Debugf("adapter %d: query stats failed: %s", i, err.Error())
|
||||
d3dkmCloseAdapter(hAdapter)
|
||||
continue
|
||||
}
|
||||
|
||||
caps, err := d3dkmGetAdapterPerfDataCaps(hAdapter)
|
||||
if err != nil {
|
||||
logger.Debugf("adapter %d: perf caps failed: %s", i, err.Error())
|
||||
}
|
||||
|
||||
d3dkmCloseAdapter(hAdapter)
|
||||
|
||||
var maxFanRPM uint32
|
||||
if caps != nil {
|
||||
maxFanRPM = caps.MaxFanRPM
|
||||
}
|
||||
|
||||
metaList = append(metaList, adapterMeta{
|
||||
luid: ai.AdapterLuid,
|
||||
nbSegments: nbSegments,
|
||||
nodeCount: nodeCount,
|
||||
maxFanRPM: maxFanRPM,
|
||||
})
|
||||
logger.Debugf("adapter %d: segments=%d nodes=%d fan_max=%d luid=%d:%d", i, nbSegments, nodeCount, maxFanRPM, ai.AdapterLuid.HighPart, ai.AdapterLuid.LowPart)
|
||||
}
|
||||
|
||||
if len(metaList) == 0 {
|
||||
return nil, fmt.Errorf("no usable D3DKMT adapters found")
|
||||
}
|
||||
|
||||
pdhUtil, pdhErr := initPdhGpuUtil()
|
||||
if pdhErr != nil {
|
||||
logger.Debugf("PDH GPU utilization not available: %s", pdhErr.Error())
|
||||
} else {
|
||||
logger.Info("using PDH performance counters for GPU utilization")
|
||||
}
|
||||
|
||||
ch := make(chan []GpuStat, 1)
|
||||
|
||||
go func() {
|
||||
defer close(ch)
|
||||
if pdhUtil != nil {
|
||||
defer pdhUtil.close()
|
||||
}
|
||||
|
||||
var adapters []d3dkmtAdapterState
|
||||
for _, m := range metaList {
|
||||
hAdapter, err := d3dkmOpenAdapter(m.luid)
|
||||
if err != nil {
|
||||
logger.Debugf("reopen adapter failed: %s", err.Error())
|
||||
continue
|
||||
}
|
||||
adapters = append(adapters, d3dkmtAdapterState{
|
||||
luid: m.luid,
|
||||
hAdapter: hAdapter,
|
||||
nbSegments: m.nbSegments,
|
||||
nodeCount: m.nodeCount,
|
||||
maxFanRPM: m.maxFanRPM,
|
||||
prevNodeRT: make(map[uint32]nodeRunningTimes),
|
||||
})
|
||||
}
|
||||
|
||||
if len(adapters) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
defer func() {
|
||||
for _, a := range adapters {
|
||||
d3dkmCloseAdapter(a.hAdapter)
|
||||
}
|
||||
}()
|
||||
|
||||
for i := range adapters {
|
||||
a := &adapters[i]
|
||||
for node := uint32(0); node < a.nodeCount; node++ {
|
||||
globalRT, systemRT, err := d3dkmQueryNodeStats(a.luid, node)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
|
||||
}
|
||||
a.prevTime = time.Now()
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(every)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
stats := make([]GpuStat, 0, len(adapters))
|
||||
now := time.Now()
|
||||
|
||||
var pdhUtilMap map[LUID]float64
|
||||
if pdhUtil != nil {
|
||||
pdhUtilMap = pdhUtil.collect()
|
||||
}
|
||||
|
||||
for i := range adapters {
|
||||
a := &adapters[i]
|
||||
|
||||
perfData, err := d3dkmGetAdapterPerfData(a.hAdapter)
|
||||
if err != nil {
|
||||
logger.Debugf("adapter %d perfdata: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
var memUsedMB, memTotalMB int
|
||||
for seg := uint32(0); seg < a.nbSegments; seg++ {
|
||||
limit, resident, err := d3dkmQuerySegmentStats(a.luid, seg)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
memUsedMB += int(resident / (1024 * 1024))
|
||||
memTotalMB += int(limit / (1024 * 1024))
|
||||
}
|
||||
|
||||
var gpuUtil float64
|
||||
pdhGaveValue := false
|
||||
if pdhUtilMap != nil {
|
||||
if util, ok := pdhUtilMap[a.luid]; ok {
|
||||
gpuUtil = util
|
||||
pdhGaveValue = true
|
||||
}
|
||||
}
|
||||
|
||||
if !pdhGaveValue && a.nodeCount > 0 {
|
||||
elapsedNs := now.Sub(a.prevTime).Nanoseconds()
|
||||
elapsed100ns := elapsedNs / 100
|
||||
|
||||
for node := uint32(0); node < a.nodeCount; node++ {
|
||||
globalRT, systemRT, err := d3dkmQueryNodeStats(a.luid, node)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if prevRT, ok := a.prevNodeRT[node]; ok {
|
||||
if globalRT < prevRT.Global || systemRT < prevRT.System {
|
||||
a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
|
||||
continue
|
||||
}
|
||||
nodeUtil := d3dkmtNodeUtil(prevRT, nodeRunningTimes{Global: globalRT, System: systemRT}, elapsed100ns)
|
||||
if nodeUtil > gpuUtil {
|
||||
gpuUtil = nodeUtil
|
||||
}
|
||||
}
|
||||
a.prevNodeRT[node] = nodeRunningTimes{Global: globalRT, System: systemRT}
|
||||
}
|
||||
|
||||
a.prevTime = now
|
||||
}
|
||||
|
||||
tempC := d3dkmtTempC(perfData.Temperature)
|
||||
|
||||
fanSpeedPct := d3dkmtFanPct(perfData.FanRPM, a.maxFanRPM)
|
||||
powerDrawW := d3dkmtPowerW(perfData.Power)
|
||||
|
||||
var memUtilPct float64
|
||||
if memTotalMB > 0 {
|
||||
memUtilPct = float64(memUsedMB) / float64(memTotalMB) * 100.0
|
||||
}
|
||||
|
||||
stats = append(stats, GpuStat{
|
||||
Timestamp: now,
|
||||
ID: i,
|
||||
Name: fmt.Sprintf("GPU %d", i),
|
||||
TempC: tempC,
|
||||
GpuUtilPct: gpuUtil,
|
||||
MemUtilPct: memUtilPct,
|
||||
MemUsedMB: memUsedMB,
|
||||
MemTotalMB: memTotalMB,
|
||||
FanSpeedPct: fanSpeedPct,
|
||||
PowerDrawW: powerDrawW,
|
||||
})
|
||||
}
|
||||
|
||||
if len(stats) > 0 {
|
||||
select {
|
||||
case ch <- stats:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return ch, nil
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
//go:build windows
|
||||
|
||||
package perf
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestD3dkmtNodeUtil_FullLoad(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 5000, System: 14000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, 100.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_PartialUtil(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 3000, System: 14000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, 50.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_Identical(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 10000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 20000, System: 20000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, 100.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_CounterWrap(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 9000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, -1.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_SystemWrap(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 1000, System: 9000}
|
||||
cur := nodeRunningTimes{Global: 5000, System: 1000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, -1.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_ZeroDelta(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 100000)
|
||||
assert.Equal(t, 0.0, got)
|
||||
}
|
||||
|
||||
func TestD3dkmtNodeUtil_ElapsedFallback(t *testing.T) {
|
||||
prev := nodeRunningTimes{Global: 1000, System: 10000}
|
||||
cur := nodeRunningTimes{Global: 6000, System: 10000}
|
||||
got := d3dkmtNodeUtil(prev, cur, 50000)
|
||||
assert.InDelta(t, 10.0, got, 0.01)
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_Normal(t *testing.T) {
|
||||
assert.Equal(t, 50.0, d3dkmtFanPct(1500, 3000))
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_MaxFan(t *testing.T) {
|
||||
assert.Equal(t, 100.0, d3dkmtFanPct(3000, 3000))
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_OverMaxClamped(t *testing.T) {
|
||||
assert.Equal(t, 100.0, d3dkmtFanPct(4000, 3000))
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_ZeroMaxFan(t *testing.T) {
|
||||
assert.Equal(t, 0.0, d3dkmtFanPct(1500, 0))
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_ZeroFanRPM(t *testing.T) {
|
||||
assert.Equal(t, 0.0, d3dkmtFanPct(0, 3000))
|
||||
}
|
||||
|
||||
func TestD3dkmtFanPct_BothZero(t *testing.T) {
|
||||
assert.Equal(t, 0.0, d3dkmtFanPct(0, 0))
|
||||
}
|
||||
|
||||
func TestD3dkmtPowerW(t *testing.T) {
|
||||
assert.Equal(t, 250.0, d3dkmtPowerW(2500))
|
||||
}
|
||||
|
||||
func TestD3dkmtPowerW_Zero(t *testing.T) {
|
||||
assert.Equal(t, 0.0, d3dkmtPowerW(0))
|
||||
}
|
||||
|
||||
func TestD3dkmtTempC(t *testing.T) {
|
||||
assert.Equal(t, 65, d3dkmtTempC(650))
|
||||
}
|
||||
|
||||
func TestD3dkmtTempC_Zero(t *testing.T) {
|
||||
assert.Equal(t, 0, d3dkmtTempC(0))
|
||||
}
|
||||
@@ -22,6 +22,13 @@ func getGpuStats(ctx context.Context, every time.Duration, logger *logmon.Monito
|
||||
logger.Debugf("nvidia-smi: %s", err.Error())
|
||||
}
|
||||
|
||||
if ch, err := tryD3DKMT(ctx, every, logger); err == nil {
|
||||
logger.Info("using D3DKMT for GPU monitoring")
|
||||
return ch, nil
|
||||
} else {
|
||||
logger.Debugf("D3DKMT: %s", err.Error())
|
||||
}
|
||||
|
||||
return nil, ErrNoGpuTool
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
//go:build windows
|
||||
|
||||
package perf
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/windows"
|
||||
)
|
||||
|
||||
var (
|
||||
pdhDLL = windows.NewLazySystemDLL("pdh.dll")
|
||||
procPdhOpenQuery = pdhDLL.NewProc("PdhOpenQueryW")
|
||||
procPdhAddEnglishCounter = pdhDLL.NewProc("PdhAddEnglishCounterW")
|
||||
procPdhCollectQueryData = pdhDLL.NewProc("PdhCollectQueryData")
|
||||
procPdhGetFormattedCounterArray = pdhDLL.NewProc("PdhGetFormattedCounterArrayW")
|
||||
procPdhCloseQuery = pdhDLL.NewProc("PdhCloseQuery")
|
||||
)
|
||||
|
||||
const (
|
||||
pdhFmtDouble = 0x00000200
|
||||
pdhMoreData = 0x800007D2
|
||||
pdhNoData = 0x800007D5
|
||||
)
|
||||
|
||||
type pdhCounterValue struct {
|
||||
CStatus uint32
|
||||
DblVal float64
|
||||
}
|
||||
|
||||
type pdhCounterValueItem struct {
|
||||
SzName *uint16
|
||||
FmtValue pdhCounterValue
|
||||
}
|
||||
|
||||
func init() {
|
||||
var item pdhCounterValueItem
|
||||
if unsafe.Sizeof(item) != 24 {
|
||||
panic(fmt.Sprintf("pdhCounterValueItem size %d != expected 24 on x64", unsafe.Sizeof(item)))
|
||||
}
|
||||
}
|
||||
|
||||
type pdhGpuUtil struct {
|
||||
query uintptr
|
||||
counter uintptr
|
||||
}
|
||||
|
||||
// initPdhGpuUtil creates a PDH query for the GPU Engine utilization counter.
|
||||
// Returns nil with an error if PDH or the counter is unavailable.
|
||||
func initPdhGpuUtil() (*pdhGpuUtil, error) {
|
||||
var query uintptr
|
||||
if ret, _, _ := procPdhOpenQuery.Call(0, 0, uintptr(unsafe.Pointer(&query))); ret != 0 {
|
||||
return nil, fmt.Errorf("PdhOpenQuery: 0x%x", ret)
|
||||
}
|
||||
|
||||
path, _ := windows.UTF16PtrFromString(`\GPU Engine(*)\Utilization Percentage`)
|
||||
var counter uintptr
|
||||
if ret, _, _ := procPdhAddEnglishCounter.Call(
|
||||
query, uintptr(unsafe.Pointer(path)), 0, uintptr(unsafe.Pointer(&counter)),
|
||||
); ret != 0 {
|
||||
procPdhCloseQuery.Call(query)
|
||||
return nil, fmt.Errorf("PdhAddEnglishCounter(GPU Engine): 0x%x", ret)
|
||||
}
|
||||
|
||||
procPdhCollectQueryData.Call(query)
|
||||
|
||||
return &pdhGpuUtil{query: query, counter: counter}, nil
|
||||
}
|
||||
|
||||
// close releases the PDH query handle.
|
||||
func (p *pdhGpuUtil) close() {
|
||||
if p.query != 0 {
|
||||
procPdhCloseQuery.Call(p.query)
|
||||
p.query = 0
|
||||
}
|
||||
}
|
||||
|
||||
// collect reads the PDH counter and returns a map of adapter LUID to
|
||||
// aggregated GPU utilization percentage, summed across all engine instances
|
||||
// per adapter and clamped to 100%.
|
||||
func (p *pdhGpuUtil) collect() map[LUID]float64 {
|
||||
ret, _, _ := procPdhCollectQueryData.Call(p.query)
|
||||
if ret != 0 && ret != pdhNoData {
|
||||
return nil
|
||||
}
|
||||
|
||||
var bufSize uint32
|
||||
var itemCount uint32
|
||||
ret, _, _ = procPdhGetFormattedCounterArray.Call(
|
||||
p.counter, pdhFmtDouble,
|
||||
uintptr(unsafe.Pointer(&bufSize)),
|
||||
uintptr(unsafe.Pointer(&itemCount)),
|
||||
0,
|
||||
)
|
||||
if ret != pdhMoreData || itemCount == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
buf := make([]byte, bufSize)
|
||||
ret, _, _ = procPdhGetFormattedCounterArray.Call(
|
||||
p.counter, pdhFmtDouble,
|
||||
uintptr(unsafe.Pointer(&bufSize)),
|
||||
uintptr(unsafe.Pointer(&itemCount)),
|
||||
uintptr(unsafe.Pointer(&buf[0])),
|
||||
)
|
||||
if ret != 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
itemSize := uint32(unsafe.Sizeof(pdhCounterValueItem{}))
|
||||
result := make(map[LUID]float64)
|
||||
|
||||
for i := uint32(0); i < itemCount; i++ {
|
||||
item := (*pdhCounterValueItem)(unsafe.Pointer(&buf[i*itemSize]))
|
||||
if item.FmtValue.CStatus != 0 {
|
||||
continue
|
||||
}
|
||||
luid, ok := parsePdhLuid(windows.UTF16PtrToString(item.SzName))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
result[luid] += item.FmtValue.DblVal
|
||||
}
|
||||
|
||||
for luid := range result {
|
||||
if result[luid] > 100.0 {
|
||||
result[luid] = 100.0
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// parsePdhLuid extracts the adapter LUID (high and low parts) from a PDH
|
||||
// GPU Engine instance name (e.g. "pid_1234_luid_0x00000000_0x000148BF_phys_0_eng_2_engtype_Compute").
|
||||
func parsePdhLuid(name string) (LUID, bool) {
|
||||
idx := strings.Index(name, "luid_0x")
|
||||
if idx < 0 {
|
||||
return LUID{}, false
|
||||
}
|
||||
rest := name[idx+7:]
|
||||
parts := strings.SplitN(rest, "_", 4)
|
||||
if len(parts) < 3 {
|
||||
return LUID{}, false
|
||||
}
|
||||
hp, err := strconv.ParseUint(parts[0], 16, 32)
|
||||
if err != nil {
|
||||
return LUID{}, false
|
||||
}
|
||||
lpStr := strings.TrimPrefix(parts[1], "0x")
|
||||
lp, err := strconv.ParseUint(lpStr, 16, 32)
|
||||
if err != nil {
|
||||
return LUID{}, false
|
||||
}
|
||||
return LUID{LowPart: uint32(lp), HighPart: int32(hp)}, true
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
//go:build windows
|
||||
|
||||
package perf
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestParsePdhLuid_Valid(t *testing.T) {
|
||||
name := `pid_25312_luid_0x00000000_0x000148BF_phys_0_eng_2_engtype_Compute`
|
||||
got, ok := parsePdhLuid(name)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, uint32(0x000148BF), got.LowPart)
|
||||
assert.Equal(t, int32(0x00000000), got.HighPart)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_ValidNvidia(t *testing.T) {
|
||||
name := `pid_1388_luid_0x00000000_0x00011372_phys_0_eng_8_engtype_Compute_1`
|
||||
got, ok := parsePdhLuid(name)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, uint32(0x00011372), got.LowPart)
|
||||
assert.Equal(t, int32(0x00000000), got.HighPart)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_NonZeroHighPart(t *testing.T) {
|
||||
name := `pid_1234_luid_0x00000001_0x0000C85A_phys_0_eng_5_engtype_Copy`
|
||||
got, ok := parsePdhLuid(name)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, uint32(0x0000C85A), got.LowPart)
|
||||
assert.Equal(t, int32(0x00000001), got.HighPart)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_InvalidNoLuid(t *testing.T) {
|
||||
_, ok := parsePdhLuid("invalid_string_without_luid")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_InvalidEmpty(t *testing.T) {
|
||||
_, ok := parsePdhLuid("")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_InvalidHex(t *testing.T) {
|
||||
_, ok := parsePdhLuid("pid_1234_luid_0xZZZZ_0xGGGG_phys_0")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
|
||||
func TestParsePdhLuid_ShortAfterLuid(t *testing.T) {
|
||||
_, ok := parsePdhLuid("pid_1234_luid_0x00000000")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
Reference in New Issue
Block a user