From b37dd9c615e74f47a6b69ae02927a8b41507f8d6 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Fri, 14 Jun 2024 06:42:09 +0000 Subject: [PATCH 01/10] Adding GFD node labels to specify GPU mode Signed-off-by: Vishesh Tanksale --- docs/gpu-feature-discovery/README.md | 1 + internal/lm/nvml.go | 38 ++++++++++++++++++++++++++++ internal/resource/cuda-device.go | 2 ++ internal/resource/device_mock.go | 2 ++ internal/resource/nvml-device.go | 38 ++++++++++++++++++++++++++++ internal/resource/nvml-mig-device.go | 2 ++ internal/resource/sysfs-device.go | 4 +++ internal/resource/types.go | 1 + tests/expected-output-mig-mixed.txt | 1 + tests/expected-output-mig-none.txt | 1 + tests/expected-output-mig-single.txt | 1 + tests/expected-output.txt | 1 + 12 files changed, 92 insertions(+) diff --git a/docs/gpu-feature-discovery/README.md b/docs/gpu-feature-discovery/README.md index 965740420..e29cf3316 100644 --- a/docs/gpu-feature-discovery/README.md +++ b/docs/gpu-feature-discovery/README.md @@ -209,6 +209,7 @@ their meaning: | nvidia.com/gpu.machine | String | Machine type | DGX-1 | | nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 | | nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 | +| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU | compute | Depending on the MIG strategy used, the following set of labels may also be available (or override the default values for some of the labels listed above): diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index ed5004666..bda94b42f 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -71,12 +71,18 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e return nil, fmt.Errorf("error creating resource labeler: %v", err) } + gpuModeLabeler, err := newGPUModeLabeler(manager) + if err != nil { + return nil, fmt.Errorf("error creating resource labeler: %v", err) + } + l := Merge( machineTypeLabeler, versionLabeler, migCapabilityLabeler, sharingLabeler, resourceLabeler, + gpuModeLabeler, ) return l, nil @@ -193,3 +199,35 @@ func isMPSCapable(manager resource.Manager) (bool, error) { } return true, nil } + +func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { + + devices, err := manager.GetDevices() + if err != nil { + return nil, err + } + if len(devices) == 0 { + // no devices, return empty labels + return empty{}, nil + } + + gpuMode := "" + // loop through all devices to check if all of them are on same gpu mode + for _, d := range devices { + val, err := d.GetDisplayMode() + if err != nil { + return nil, err + } + if gpuMode != "" && val != gpuMode { + gpuMode = "unknown" + break + } else { + gpuMode = val + } + } + + labels := Labels{ + "nvidia.com/gpu.mode": gpuMode, + } + return labels, nil +} diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index b6bafc249..0ed42da49 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -96,3 +96,5 @@ func (d *cudaDevice) IsMigCapable() (bool, error) { func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } + +func (d *cudaDevice) GetDisplayMode() (string, error) { return "unknown", nil } diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index 2f48f89e0..53b65bb90 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -324,3 +324,5 @@ func (mock *DeviceMock) IsMigEnabledCalls() []struct { mock.lockIsMigEnabled.RUnlock() return calls } + +func (mock *DeviceMock) GetDisplayMode() (string, error) { return "unknown", nil } diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index 58501e09d..1ceb3d4e6 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -18,6 +18,10 @@ package resource import ( "fmt" + "os" + "strings" + + "k8s.io/klog/v2" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvml/pkg/nvml" @@ -86,3 +90,37 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { } return info.Total / (1024 * 1024), nil } + +func (d nvmlDevice) GetDisplayMode() (string, error) { + info, retVal := d.Device.GetPciInfo() + if retVal != nvml.SUCCESS { + return "", retVal + } + var bytes []byte + for _, char := range info.BusId { + if char == 0 { + break + } + bytes = append(bytes, byte(char)) + } + pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) + return resolvePCIAddressToMode(pciID) +} + +func resolvePCIAddressToMode(addr string) (string, error) { + class, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/class", addr)) + if err != nil { + klog.Errorf("Error getting gpu class: %v", err) + return "unknown", fmt.Errorf("Error getting gpu class: %v", err) + } + classStr := string(class) + classStr = strings.TrimSpace(classStr) + switch classStr { + case "0x030000": + return "display", nil + case "0x030200": + return "compute", nil + default: + return "unknown", nil + } +} diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index 599c27ce2..c8db4dfd6 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -132,3 +132,5 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { return 0, fmt.Errorf("unsupported attribute type %v", t) } } + +func (d nvmlMigDevice) GetDisplayMode() (string, error) { return "unknown", nil } diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index 9cce3b1fb..d0f27c3a6 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -64,3 +64,7 @@ func (d vfioDevice) IsMigEnabled() (bool, error) { func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } + +func (d vfioDevice) GetDisplayMode() (string, error) { + return resolvePCIAddressToMode(d.nvidiaPCIDevice.Address) +} diff --git a/internal/resource/types.go b/internal/resource/types.go index 4ef05a957..6ddfb8edd 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -39,4 +39,5 @@ type Device interface { GetTotalMemoryMB() (uint64, error) GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) + GetDisplayMode() (string, error) } diff --git a/tests/expected-output-mig-mixed.txt b/tests/expected-output-mig-mixed.txt index a84b82968..e90a8daf1 100644 --- a/tests/expected-output-mig-mixed.txt +++ b/tests/expected-output-mig-mixed.txt @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing] nvidia\.com\/gpu\.product=[A-Za-z_-]+ nvidia\.com\/gpu\.memory=[0-9]+ nvidia\.com\/gpu\.family=[a-z]+ +nvidia\.com\/gpu\.mode=unknown nvidia\.com\/mig\.capable=[true|false] nvidia\.com\/gpu\.compute\.major=[0-9]+ nvidia\.com\/gpu\.compute\.minor=[0-9]+ diff --git a/tests/expected-output-mig-none.txt b/tests/expected-output-mig-none.txt index d388c02d2..dc9ca7d2b 100644 --- a/tests/expected-output-mig-none.txt +++ b/tests/expected-output-mig-none.txt @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing] nvidia\.com\/gpu\.product=[A-Za-z_-]+ nvidia\.com\/gpu\.memory=[0-9]+ nvidia\.com\/gpu\.family=[a-z]+ +nvidia\.com\/gpu\.mode=unknown nvidia\.com\/mig\.capable=[true|false] nvidia\.com\/gpu\.compute\.major=[0-9]+ nvidia\.com\/gpu\.compute\.minor=[0-9]+ diff --git a/tests/expected-output-mig-single.txt b/tests/expected-output-mig-single.txt index 055d7cc99..10d2bb37c 100644 --- a/tests/expected-output-mig-single.txt +++ b/tests/expected-output-mig-single.txt @@ -30,4 +30,5 @@ nvidia\.com\/gpu\.engines\.jpeg=[0-9]+ nvidia\.com\/gpu\.engines\.ofa=[0-9]+ nvidia\.com\/gpu\.slices\.gi=[0-9]+ nvidia\.com\/gpu\.slices\.ci=[0-9]+ +nvidia\.com\/gpu\.mode=unknown nvidia\.com\/mps\.capable=[true|false] diff --git a/tests/expected-output.txt b/tests/expected-output.txt index d388c02d2..0f2468d77 100644 --- a/tests/expected-output.txt +++ b/tests/expected-output.txt @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing] nvidia\.com\/gpu\.product=[A-Za-z_-]+ nvidia\.com\/gpu\.memory=[0-9]+ nvidia\.com\/gpu\.family=[a-z]+ +nvidia\.com\/gpu\.mode=[unknown|compute|graphics] nvidia\.com\/mig\.capable=[true|false] nvidia\.com\/gpu\.compute\.major=[0-9]+ nvidia\.com\/gpu\.compute\.minor=[0-9]+ From 03f72f90a1de72a9f48f9e188b1e4734a27e98e8 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Mon, 17 Jun 2024 10:26:07 +0000 Subject: [PATCH 02/10] Using class of the device to add the GPU mode label Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 18 ++++++--- internal/resource/cuda-device.go | 4 +- internal/resource/device_mock.go | 39 ++++++++++++++++++- internal/resource/nvml-device.go | 17 ++------ internal/resource/nvml-mig-device.go | 16 +++++++- internal/resource/sysfs-device.go | 5 ++- internal/resource/testing/resource-testing.go | 1 + internal/resource/types.go | 2 +- tests/expected-output-mig-mixed.txt | 2 +- tests/expected-output-mig-none.txt | 2 +- tests/expected-output-mig-single.txt | 2 +- 11 files changed, 80 insertions(+), 28 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index bda94b42f..be24287d9 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -211,21 +211,29 @@ func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { return empty{}, nil } - gpuMode := "" + class := "" // loop through all devices to check if all of them are on same gpu mode for _, d := range devices { - val, err := d.GetDisplayMode() + val, err := d.GetClass() if err != nil { return nil, err } - if gpuMode != "" && val != gpuMode { - gpuMode = "unknown" + if class != "" && val != class { break } else { - gpuMode = val + class = val } } + gpuMode := "" + switch class { + case "0x030000": + gpuMode = "graphics" + case "0x030200": + gpuMode = "compute" + default: + gpuMode = "unknown" + } labels := Labels{ "nvidia.com/gpu.mode": gpuMode, } diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index 0ed42da49..142b03ef3 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -97,4 +97,6 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } -func (d *cudaDevice) GetDisplayMode() (string, error) { return "unknown", nil } +func (d *cudaDevice) GetClass() (string, error) { + return "unknown", nil +} diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index 53b65bb90..27cb8f897 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -20,6 +20,9 @@ var _ Device = &DeviceMock{} // GetAttributesFunc: func() (map[string]interface{}, error) { // panic("mock out the GetAttributes method") // }, +// GetClassFunc: func() (string, error) { +// panic("mock out the GetClass method") +// }, // GetCudaComputeCapabilityFunc: func() (int, int, error) { // panic("mock out the GetCudaComputeCapability method") // }, @@ -51,6 +54,9 @@ type DeviceMock struct { // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (map[string]interface{}, error) + // GetClassFunc mocks the GetClass method. + GetClassFunc func() (string, error) + // GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method. GetCudaComputeCapabilityFunc func() (int, int, error) @@ -77,6 +83,9 @@ type DeviceMock struct { // GetAttributes holds details about calls to the GetAttributes method. GetAttributes []struct { } + // GetClass holds details about calls to the GetClass method. + GetClass []struct { + } // GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method. GetCudaComputeCapability []struct { } @@ -100,6 +109,7 @@ type DeviceMock struct { } } lockGetAttributes sync.RWMutex + lockGetClass sync.RWMutex lockGetCudaComputeCapability sync.RWMutex lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex lockGetMigDevices sync.RWMutex @@ -136,6 +146,33 @@ func (mock *DeviceMock) GetAttributesCalls() []struct { return calls } +// GetClass calls GetClassFunc. +func (mock *DeviceMock) GetClass() (string, error) { + if mock.GetClassFunc == nil { + panic("DeviceMock.GetClassFunc: method is nil but Device.GetClass was just called") + } + callInfo := struct { + }{} + mock.lockGetClass.Lock() + mock.calls.GetClass = append(mock.calls.GetClass, callInfo) + mock.lockGetClass.Unlock() + return mock.GetClassFunc() +} + +// GetClassCalls gets all the calls that were made to GetClass. +// Check the length with: +// +// len(mockedDevice.GetClassCalls()) +func (mock *DeviceMock) GetClassCalls() []struct { +} { + var calls []struct { + } + mock.lockGetClass.RLock() + calls = mock.calls.GetClass + mock.lockGetClass.RUnlock() + return calls +} + // GetCudaComputeCapability calls GetCudaComputeCapabilityFunc. func (mock *DeviceMock) GetCudaComputeCapability() (int, int, error) { if mock.GetCudaComputeCapabilityFunc == nil { @@ -324,5 +361,3 @@ func (mock *DeviceMock) IsMigEnabledCalls() []struct { mock.lockIsMigEnabled.RUnlock() return calls } - -func (mock *DeviceMock) GetDisplayMode() (string, error) { return "unknown", nil } diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index 1ceb3d4e6..c3a1ff5e5 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -91,7 +91,7 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { return info.Total / (1024 * 1024), nil } -func (d nvmlDevice) GetDisplayMode() (string, error) { +func (d nvmlDevice) GetClass() (string, error) { info, retVal := d.Device.GetPciInfo() if retVal != nvml.SUCCESS { return "", retVal @@ -104,23 +104,14 @@ func (d nvmlDevice) GetDisplayMode() (string, error) { bytes = append(bytes, byte(char)) } pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) - return resolvePCIAddressToMode(pciID) + return resolvePCIAddressToClass(pciID) } -func resolvePCIAddressToMode(addr string) (string, error) { +func resolvePCIAddressToClass(addr string) (string, error) { class, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/class", addr)) if err != nil { klog.Errorf("Error getting gpu class: %v", err) return "unknown", fmt.Errorf("Error getting gpu class: %v", err) } - classStr := string(class) - classStr = strings.TrimSpace(classStr) - switch classStr { - case "0x030000": - return "display", nil - case "0x030200": - return "compute", nil - default: - return "unknown", nil - } + return strings.TrimSpace(string(class)), nil } diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index c8db4dfd6..d45cb3829 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -133,4 +133,18 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { } } -func (d nvmlMigDevice) GetDisplayMode() (string, error) { return "unknown", nil } +func (d nvmlMigDevice) GetClass() (string, error) { + info, retVal := d.MigDevice.GetPciInfo() + if retVal != nvml.SUCCESS { + return "", retVal + } + var bytes []byte + for _, char := range info.BusId { + if char == 0 { + break + } + bytes = append(bytes, byte(char)) + } + pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) + return resolvePCIAddressToClass(pciID) +} diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index d0f27c3a6..a4090f4fa 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -65,6 +65,7 @@ func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } -func (d vfioDevice) GetDisplayMode() (string, error) { - return resolvePCIAddressToMode(d.nvidiaPCIDevice.Address) +func (d vfioDevice) GetClass() (string, error) { + class := fmt.Sprintf("0x0%x", d.nvidiaPCIDevice.Class) + return class, nil } diff --git a/internal/resource/testing/resource-testing.go b/internal/resource/testing/resource-testing.go index a19916cb7..b012d5314 100644 --- a/internal/resource/testing/resource-testing.go +++ b/internal/resource/testing/resource-testing.go @@ -51,6 +51,7 @@ func NewDeviceMock(migEnabled bool) *DeviceMock { IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, + GetClassFunc: func() (string, error) { return "0x030000", nil }, }} return &d } diff --git a/internal/resource/types.go b/internal/resource/types.go index 6ddfb8edd..5e89e7972 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -39,5 +39,5 @@ type Device interface { GetTotalMemoryMB() (uint64, error) GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) - GetDisplayMode() (string, error) + GetClass() (string, error) } diff --git a/tests/expected-output-mig-mixed.txt b/tests/expected-output-mig-mixed.txt index e90a8daf1..f80392607 100644 --- a/tests/expected-output-mig-mixed.txt +++ b/tests/expected-output-mig-mixed.txt @@ -18,7 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing] nvidia\.com\/gpu\.product=[A-Za-z_-]+ nvidia\.com\/gpu\.memory=[0-9]+ nvidia\.com\/gpu\.family=[a-z]+ -nvidia\.com\/gpu\.mode=unknown +nvidia\.com\/gpu\.mode=[unknown|compute|graphics] nvidia\.com\/mig\.capable=[true|false] nvidia\.com\/gpu\.compute\.major=[0-9]+ nvidia\.com\/gpu\.compute\.minor=[0-9]+ diff --git a/tests/expected-output-mig-none.txt b/tests/expected-output-mig-none.txt index dc9ca7d2b..0f2468d77 100644 --- a/tests/expected-output-mig-none.txt +++ b/tests/expected-output-mig-none.txt @@ -18,7 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing] nvidia\.com\/gpu\.product=[A-Za-z_-]+ nvidia\.com\/gpu\.memory=[0-9]+ nvidia\.com\/gpu\.family=[a-z]+ -nvidia\.com\/gpu\.mode=unknown +nvidia\.com\/gpu\.mode=[unknown|compute|graphics] nvidia\.com\/mig\.capable=[true|false] nvidia\.com\/gpu\.compute\.major=[0-9]+ nvidia\.com\/gpu\.compute\.minor=[0-9]+ diff --git a/tests/expected-output-mig-single.txt b/tests/expected-output-mig-single.txt index 10d2bb37c..eaf3bfd3c 100644 --- a/tests/expected-output-mig-single.txt +++ b/tests/expected-output-mig-single.txt @@ -30,5 +30,5 @@ nvidia\.com\/gpu\.engines\.jpeg=[0-9]+ nvidia\.com\/gpu\.engines\.ofa=[0-9]+ nvidia\.com\/gpu\.slices\.gi=[0-9]+ nvidia\.com\/gpu\.slices\.ci=[0-9]+ -nvidia\.com\/gpu\.mode=unknown +nvidia\.com\/gpu\.mode=[unknown|compute|graphics] nvidia\.com\/mps\.capable=[true|false] From ec8a1ba5ad03699f5a69b0b8978bc819c5e2bf1b Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 18 Jun 2024 06:58:04 +0000 Subject: [PATCH 03/10] Updating GPU mode labeler Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 59 ++++++++++++++++++---------- internal/resource/nvml-device.go | 15 ++----- internal/resource/nvml-mig-device.go | 7 +++- internal/resource/sysfs-device.go | 2 +- 4 files changed, 49 insertions(+), 34 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index be24287d9..96ce7d615 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -201,7 +201,6 @@ func isMPSCapable(manager resource.Manager) (bool, error) { } func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { - devices, err := manager.GetDevices() if err != nil { return nil, err @@ -210,32 +209,50 @@ func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { // no devices, return empty labels return empty{}, nil } + classes, err := getDeviceClasses(devices) + if err != nil { + return nil, err + } + gpuMode := getModeForClasses(classes) + labels := Labels{ + "nvidia.com/gpu.mode": gpuMode, + } + return labels, nil +} - class := "" - // loop through all devices to check if all of them are on same gpu mode - for _, d := range devices { - val, err := d.GetClass() - if err != nil { - return nil, err - } - if class != "" && val != class { - break - } else { - class = val +func getModeForClasses(classes []string) string { + if len(classes) == 0 { + return "unknown" + } + for _, class := range classes { + if class != classes[0] { + return "unknown" } } - - gpuMode := "" - switch class { + switch classes[0] { case "0x030000": - gpuMode = "graphics" + return "graphics" case "0x030200": - gpuMode = "compute" + return "compute" default: - gpuMode = "unknown" + return "unknown" } - labels := Labels{ - "nvidia.com/gpu.mode": gpuMode, +} + +func getDeviceClasses(devices []resource.Device) ([]string, error) { + seenClasses := make(map[string]bool) + for _, d := range devices { + class, err := d.GetClass() + if err != nil { + return nil, err + } + class = strings.TrimSpace(class) + seenClasses[class] = true } - return labels, nil + + var classes []string + for class := range seenClasses { + classes = append(classes, class) + } + return classes, nil } diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index c3a1ff5e5..776937a6c 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -18,12 +18,10 @@ package resource import ( "fmt" - "os" "strings" - "k8s.io/klog/v2" - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvpci" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -104,14 +102,9 @@ func (d nvmlDevice) GetClass() (string, error) { bytes = append(bytes, byte(char)) } pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) - return resolvePCIAddressToClass(pciID) -} - -func resolvePCIAddressToClass(addr string) (string, error) { - class, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/class", addr)) + nvDevice, err := nvpci.New().GetGPUByPciBusID(pciID) if err != nil { - klog.Errorf("Error getting gpu class: %v", err) - return "unknown", fmt.Errorf("Error getting gpu class: %v", err) + return "", err } - return strings.TrimSpace(string(class)), nil + return fmt.Sprintf("%#06x", nvDevice.Class), nil } diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index d45cb3829..4cad926c9 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -21,6 +21,7 @@ import ( "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvpci" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -146,5 +147,9 @@ func (d nvmlMigDevice) GetClass() (string, error) { bytes = append(bytes, byte(char)) } pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) - return resolvePCIAddressToClass(pciID) + nvDevice, err := nvpci.New().GetGPUByPciBusID(pciID) + if err != nil { + return "", err + } + return fmt.Sprintf("%#06x", nvDevice.Class), nil } diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index a4090f4fa..5f54dcf96 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -66,6 +66,6 @@ func (d vfioDevice) IsMigCapable() (bool, error) { } func (d vfioDevice) GetClass() (string, error) { - class := fmt.Sprintf("0x0%x", d.nvidiaPCIDevice.Class) + class := fmt.Sprintf("%#06x\n", d.nvidiaPCIDevice.Class) return class, nil } From 3991e1631908c9863d86c2d35e580ba983fe4bb1 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 18 Jun 2024 09:31:58 +0000 Subject: [PATCH 04/10] Updating go-nvml version Signed-off-by: Vishesh Tanksale --- go.mod | 4 +- go.sum | 8 +- internal/resource/nvml-device.go | 17 +- .../go-nvlib/pkg/nvlib/device/device.go | 25 + .../NVIDIA/go-nvlib/pkg/nvpci/mock.go | 130 +- .../NVIDIA/go-nvlib/pkg/nvpci/nvpci.go | 121 +- .../NVIDIA/go-nvml/pkg/nvml/const.go | 206 +- .../NVIDIA/go-nvml/pkg/nvml/device.go | 356 +- .../github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go | 39 +- .../NVIDIA/go-nvml/pkg/nvml/nvml.go | 855 ++-- .../github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h | 3590 ++++++++++++----- .../NVIDIA/go-nvml/pkg/nvml/system.go | 57 + .../NVIDIA/go-nvml/pkg/nvml/types_gen.go | 333 +- .../go-nvml/pkg/nvml/zz_generated.api.go | 108 +- vendor/modules.txt | 4 +- 15 files changed, 4384 insertions(+), 1469 deletions(-) diff --git a/go.mod b/go.mod index ca0097a8b..e859bfd65 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,8 @@ go 1.22.2 require ( github.com/NVIDIA/go-gpuallocator v0.5.0 - github.com/NVIDIA/go-nvlib v0.5.0 - github.com/NVIDIA/go-nvml v0.12.0-6 + github.com/NVIDIA/go-nvlib v0.6.0 + github.com/NVIDIA/go-nvml v0.12.4-0 github.com/NVIDIA/nvidia-container-toolkit v1.15.1-0.20240528113255-e4b46a09a77e github.com/fsnotify/fsnotify v1.7.0 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 8622dc44a..e19044c40 100644 --- a/go.sum +++ b/go.sum @@ -23,10 +23,10 @@ github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7 github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= github.com/NVIDIA/go-gpuallocator v0.5.0 h1:166ICvPv2dU9oZ2J3kJ4y3XdbGCi6LhXgFZJtrqeu3A= github.com/NVIDIA/go-gpuallocator v0.5.0/go.mod h1:zos5bTIN01hpQioOyu9oRKglrznImMQvm0bZllMmckw= -github.com/NVIDIA/go-nvlib v0.5.0 h1:951KGrfr+p3cs89alO9z/ZxPPWKxwht9tx9rxiADoLI= -github.com/NVIDIA/go-nvlib v0.5.0/go.mod h1:87z49ULPr4GWPSGfSIp3taU4XENRYN/enIg88MzcL4k= -github.com/NVIDIA/go-nvml v0.12.0-6 h1:FJYc2KrpvX+VOC/8QQvMiQMmZ/nPMRpdJO/Ik4xfcr0= -github.com/NVIDIA/go-nvml v0.12.0-6/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= +github.com/NVIDIA/go-nvlib v0.6.0 h1:zAMBzCYT9xeyRQo0tb7HJbStkzajD6e5joyaQqJ2OGU= +github.com/NVIDIA/go-nvlib v0.6.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY= +github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= +github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/NVIDIA/nvidia-container-toolkit v1.15.1-0.20240528113255-e4b46a09a77e h1:1xxK51xOXiqqziqMdgEMWVmjfvRu1b265LgQQZBqRaE= github.com/NVIDIA/nvidia-container-toolkit v1.15.1-0.20240528113255-e4b46a09a77e/go.mod h1:Xzovqe8g9W8DiJe93gBD8FPjfduDpT+nfKIpc5s8IPE= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs= diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index 776937a6c..830bbcbc2 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -18,7 +18,6 @@ package resource import ( "fmt" - "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvlib/pkg/nvpci" @@ -90,19 +89,11 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { } func (d nvmlDevice) GetClass() (string, error) { - info, retVal := d.Device.GetPciInfo() - if retVal != nvml.SUCCESS { - return "", retVal - } - var bytes []byte - for _, char := range info.BusId { - if char == 0 { - break - } - bytes = append(bytes, byte(char)) + pciBusID, err := d.GetPCIBusID() + if err != nil { + return "", err } - pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) - nvDevice, err := nvpci.New().GetGPUByPciBusID(pciID) + nvDevice, err := nvpci.New().GetGPUByPciBusID(pciBusID) if err != nil { return "", err } diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go index 5e1510ca6..5b21fc13d 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go @@ -18,6 +18,7 @@ package device import ( "fmt" + "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -30,6 +31,7 @@ type Device interface { GetCudaComputeCapabilityAsString() (string, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) + GetPCIBusID() (string, error) IsMigCapable() (bool, error) IsMigEnabled() (bool, error) VisitMigDevices(func(j int, m MigDevice) error) error @@ -140,6 +142,29 @@ func (d *device) GetBrandAsString() (string, error) { return "", fmt.Errorf("error interpreting device brand as string: %v", brand) } +// GetPCIBusID returns the string representation of the bus ID. +func (d *device) GetPCIBusID() (string, error) { + info, ret := d.GetPciInfo() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting PCI info: %w", ret) + } + + var bytes []byte + for _, b := range info.BusId { + if byte(b) == '\x00' { + break + } + bytes = append(bytes, byte(b)) + } + id := strings.ToLower(string(bytes)) + + if id != "0000" { + id = strings.TrimPrefix(id, "0000") + } + + return id, nil +} + // GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string. func (d *device) GetCudaComputeCapabilityAsString() (string, error) { major, minor, ret := d.GetCudaComputeCapability() diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go index 7c1b69ddc..9b3d6e2aa 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go @@ -20,6 +20,8 @@ import ( "fmt" "os" "path/filepath" + "regexp" + "strconv" "github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes" ) @@ -55,64 +57,114 @@ func (m *MockNvpci) Cleanup() { os.RemoveAll(m.pciDevicesRoot) } +func validatePCIAddress(addr string) error { + r := regexp.MustCompile(`0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]`) + if !r.Match([]byte(addr)) { + return fmt.Errorf(`invalid PCI address should match 0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]: %s`, addr) + } + + return nil +} + // AddMockA100 Create an A100 like GPU mock device. -func (m *MockNvpci) AddMockA100(address string, numaNode int) error { +func (m *MockNvpci) AddMockA100(address string, numaNode int, sriov *SriovInfo) error { + err := validatePCIAddress(address) + if err != nil { + return err + } + deviceDir := filepath.Join(m.pciDevicesRoot, address) - err := os.MkdirAll(deviceDir, 0755) + err = os.MkdirAll(deviceDir, 0755) if err != nil { return err } - vendor, err := os.Create(filepath.Join(deviceDir, "vendor")) + err = createNVIDIAgpuFiles(deviceDir) if err != nil { return err } - _, err = vendor.WriteString(fmt.Sprintf("0x%x", PCINvidiaVendorID)) + + iommuGroup := 20 + _, err = os.Create(filepath.Join(deviceDir, strconv.Itoa(iommuGroup))) + if err != nil { + return err + } + err = os.Symlink(filepath.Join(deviceDir, strconv.Itoa(iommuGroup)), filepath.Join(deviceDir, "iommu_group")) if err != nil { return err } - class, err := os.Create(filepath.Join(deviceDir, "class")) + numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) if err != nil { return err } - _, err = class.WriteString(fmt.Sprintf("0x%x", PCI3dControllerClass)) + _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) if err != nil { return err } - device, err := os.Create(filepath.Join(deviceDir, "device")) + if sriov != nil && sriov.PhysicalFunction != nil { + totalVFs, err := os.Create(filepath.Join(deviceDir, "sriov_totalvfs")) + if err != nil { + return err + } + _, err = fmt.Fprintf(totalVFs, "%d", sriov.PhysicalFunction.TotalVFs) + if err != nil { + return err + } + + numVFs, err := os.Create(filepath.Join(deviceDir, "sriov_numvfs")) + if err != nil { + return err + } + _, err = fmt.Fprintf(numVFs, "%d", sriov.PhysicalFunction.NumVFs) + if err != nil { + return err + } + for i := 1; i <= int(sriov.PhysicalFunction.NumVFs); i++ { + err = m.createVf(address, i, iommuGroup, numaNode) + if err != nil { + return err + } + } + } + + return nil +} + +func createNVIDIAgpuFiles(deviceDir string) error { + vendor, err := os.Create(filepath.Join(deviceDir, "vendor")) if err != nil { return err } - _, err = device.WriteString("0x20bf") + _, err = vendor.WriteString(fmt.Sprintf("0x%x", PCINvidiaVendorID)) if err != nil { return err } - _, err = os.Create(filepath.Join(deviceDir, "nvidia")) + class, err := os.Create(filepath.Join(deviceDir, "class")) if err != nil { return err } - err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver")) + _, err = class.WriteString(fmt.Sprintf("0x%x", PCI3dControllerClass)) if err != nil { return err } - _, err = os.Create(filepath.Join(deviceDir, "20")) + device, err := os.Create(filepath.Join(deviceDir, "device")) if err != nil { return err } - err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group")) + _, err = device.WriteString("0x20bf") if err != nil { return err } - numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) + _, err = os.Create(filepath.Join(deviceDir, "nvidia")) if err != nil { return err } - _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) + err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver")) if err != nil { return err } @@ -156,3 +208,53 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { return nil } + +func (m *MockNvpci) createVf(pfAddress string, id, iommu_group, numaNode int) error { + functionID := pfAddress[len(pfAddress)-1] + // we are verifying the last character of pfAddress is integer. + functionNumber, err := strconv.Atoi(string(functionID)) + if err != nil { + return fmt.Errorf("can't conver physical function pci address function number %s to integer: %v", string(functionID), err) + } + + vfFunctionNumber := functionNumber + id + vfAddress := pfAddress[:len(pfAddress)-1] + strconv.Itoa(vfFunctionNumber) + + deviceDir := filepath.Join(m.pciDevicesRoot, vfAddress) + err = os.MkdirAll(deviceDir, 0755) + if err != nil { + return err + } + + err = createNVIDIAgpuFiles(deviceDir) + if err != nil { + return err + } + + vfIommuGroup := strconv.Itoa(iommu_group + id) + + _, err = os.Create(filepath.Join(deviceDir, vfIommuGroup)) + if err != nil { + return err + } + err = os.Symlink(filepath.Join(deviceDir, vfIommuGroup), filepath.Join(deviceDir, "iommu_group")) + if err != nil { + return err + } + + numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) + if err != nil { + return err + } + _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) + if err != nil { + return err + } + + err = os.Symlink(filepath.Join(m.pciDevicesRoot, pfAddress), filepath.Join(deviceDir, "physfn")) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go index 6d83a5776..6ff197b15 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go @@ -76,6 +76,32 @@ type nvpci struct { var _ Interface = (*nvpci)(nil) var _ ResourceInterface = (*MemoryResources)(nil) +// SriovInfo indicates whether device is VF/PF for SRIOV capable devices. +// Only one should be set at any given time. +type SriovInfo struct { + PhysicalFunction *SriovPhysicalFunction + VirtualFunction *SriovVirtualFunction +} + +// SriovPhysicalFunction stores info about SRIOV physical function. +type SriovPhysicalFunction struct { + TotalVFs uint64 + NumVFs uint64 +} + +// SriovVirtualFunction keeps data about SRIOV virtual function. +type SriovVirtualFunction struct { + PhysicalFunction *NvidiaPCIDevice +} + +func (s *SriovInfo) IsPF() bool { + return s != nil && s.PhysicalFunction != nil +} + +func (s *SriovInfo) IsVF() bool { + return s != nil && s.VirtualFunction != nil +} + // NvidiaPCIDevice represents a PCI device for an NVIDIA product. type NvidiaPCIDevice struct { Path string @@ -90,7 +116,7 @@ type NvidiaPCIDevice struct { NumaNode int Config *ConfigSpace Resources MemoryResources - IsVF bool + SriovInfo SriovInfo } // IsVGAController if class == 0x300. @@ -178,9 +204,11 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { } var nvdevices []*NvidiaPCIDevice + // Cache devices for each GetAllDevices invocation to speed things up. + cache := make(map[string]*NvidiaPCIDevice) for _, deviceDir := range deviceDirs { deviceAddress := deviceDir.Name() - nvdevice, err := p.GetGPUByPciBusID(deviceAddress) + nvdevice, err := p.getGPUByPciBusID(deviceAddress, cache) if err != nil { return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceAddress, err) } @@ -206,6 +234,16 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { // GetGPUByPciBusID constructs an NvidiaPCIDevice for the specified address (PCI Bus ID). func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { + // Pass nil as to force reading device information from sysfs. + return p.getGPUByPciBusID(address, nil) +} + +func (p *nvpci) getGPUByPciBusID(address string, cache map[string]*NvidiaPCIDevice) (*NvidiaPCIDevice, error) { + if cache != nil { + if pciDevice, exists := cache[address]; exists { + return pciDevice, nil + } + } devicePath := filepath.Join(p.pciDevicesRoot, address) vendor, err := os.ReadFile(path.Join(devicePath, "vendor")) @@ -265,16 +303,6 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err) } - // device is a virtual function (VF) if "physfn" symlink exists. - var isVF bool - _, err = filepath.EvalSymlinks(path.Join(devicePath, "physfn")) - if err == nil { - isVF = true - } - if err != nil && !os.IsNotExist(err) { - return nil, fmt.Errorf("unable to resolve %s: %v", path.Join(devicePath, "physfn"), err) - } - numa, err := os.ReadFile(path.Join(devicePath, "numa_node")) if err != nil { return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) @@ -328,6 +356,28 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { className = UnknownClassString } + var sriovInfo SriovInfo + // Device is a virtual function (VF) if "physfn" symlink exists. + physFnAddress, err := filepath.EvalSymlinks(path.Join(devicePath, "physfn")) + if err == nil { + physFn, err := p.getGPUByPciBusID(filepath.Base(physFnAddress), cache) + if err != nil { + return nil, fmt.Errorf("unable to detect physfn for %s: %v", address, err) + } + sriovInfo = SriovInfo{ + VirtualFunction: &SriovVirtualFunction{ + PhysicalFunction: physFn, + }, + } + } else if os.IsNotExist(err) { + sriovInfo, err = p.getSriovInfoForPhysicalFunction(devicePath) + if err != nil { + return nil, fmt.Errorf("unable to read SRIOV physical function details for %s: %v", devicePath, err) + } + } else { + return nil, fmt.Errorf("unable to read %s: %v", path.Join(devicePath, "physfn"), err) + } + nvdevice := &NvidiaPCIDevice{ Path: devicePath, Address: address, @@ -339,9 +389,14 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { NumaNode: int(numaNode), Config: config, Resources: resources, - IsVF: isVF, DeviceName: deviceName, ClassName: className, + SriovInfo: sriovInfo, + } + + // Cache physical functions only as VF can't be a root device. + if cache != nil && sriovInfo.IsPF() { + cache[address] = nvdevice } return nvdevice, nil @@ -407,7 +462,7 @@ func (p *nvpci) GetGPUs() ([]*NvidiaPCIDevice, error) { var filtered []*NvidiaPCIDevice for _, d := range devices { - if d.IsGPU() && !d.IsVF { + if d.IsGPU() && !d.SriovInfo.IsVF() { filtered = append(filtered, d) } } @@ -428,3 +483,41 @@ func (p *nvpci) GetGPUByIndex(i int) (*NvidiaPCIDevice, error) { return gpus[i], nil } + +func (p *nvpci) getSriovInfoForPhysicalFunction(devicePath string) (sriovInfo SriovInfo, err error) { + totalVfsPath := filepath.Join(devicePath, "sriov_totalvfs") + numVfsPath := filepath.Join(devicePath, "sriov_numvfs") + + // No file for sriov_totalvfs exists? Not an SRIOV device, return nil + _, err = os.Stat(totalVfsPath) + if err != nil && os.IsNotExist(err) { + return sriovInfo, nil + } + sriovTotalVfs, err := os.ReadFile(totalVfsPath) + if err != nil { + return sriovInfo, fmt.Errorf("unable to read sriov_totalvfs: %v", err) + } + totalVfsStr := strings.TrimSpace(string(sriovTotalVfs)) + totalVfsInt, err := strconv.ParseUint(totalVfsStr, 10, 16) + if err != nil { + return sriovInfo, fmt.Errorf("unable to convert sriov_totalvfs to uint64: %v", err) + } + + sriovNumVfs, err := os.ReadFile(numVfsPath) + if err != nil { + return sriovInfo, fmt.Errorf("unable to read sriov_numvfs for: %v", err) + } + numVfsStr := strings.TrimSpace(string(sriovNumVfs)) + numVfsInt, err := strconv.ParseUint(numVfsStr, 10, 16) + if err != nil { + return sriovInfo, fmt.Errorf("unable to convert sriov_numvfs to uint64: %v", err) + } + + sriovInfo = SriovInfo{ + PhysicalFunction: &SriovPhysicalFunction{ + TotalVFs: totalVfsInt, + NumVFs: numVfsInt, + }, + } + return sriovInfo, nil +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go index f4cecfbc3..1ccb5016b 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go @@ -31,9 +31,9 @@ const ( // NO_UNVERSIONED_FUNC_DEFS as defined in go-nvml/:24 NO_UNVERSIONED_FUNC_DEFS = 1 // API_VERSION as defined in nvml/nvml.h - API_VERSION = 11 + API_VERSION = 12 // API_VERSION_STR as defined in nvml/nvml.h - API_VERSION_STR = "11" + API_VERSION_STR = "12" // VALUE_NOT_AVAILABLE as defined in nvml/nvml.h VALUE_NOT_AVAILABLE = -1 // DEVICE_PCI_BUS_ID_BUFFER_SIZE as defined in nvml/nvml.h @@ -78,6 +78,8 @@ const ( VGPU_NAME_BUFFER_SIZE = 64 // GRID_LICENSE_FEATURE_MAX_COUNT as defined in nvml/nvml.h GRID_LICENSE_FEATURE_MAX_COUNT = 3 + // INVALID_VGPU_PLACEMENT_ID as defined in nvml/nvml.h + INVALID_VGPU_PLACEMENT_ID = 65535 // VGPU_SCHEDULER_POLICY_UNKNOWN as defined in nvml/nvml.h VGPU_SCHEDULER_POLICY_UNKNOWN = 0 // VGPU_SCHEDULER_POLICY_BEST_EFFORT as defined in nvml/nvml.h @@ -90,6 +92,12 @@ const ( SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 // SCHEDULER_SW_MAX_LOG_ENTRIES as defined in nvml/nvml.h SCHEDULER_SW_MAX_LOG_ENTRIES = 200 + // VGPU_SCHEDULER_ARR_DEFAULT as defined in nvml/nvml.h + VGPU_SCHEDULER_ARR_DEFAULT = 0 + // VGPU_SCHEDULER_ARR_DISABLE as defined in nvml/nvml.h + VGPU_SCHEDULER_ARR_DISABLE = 1 + // VGPU_SCHEDULER_ARR_ENABLE as defined in nvml/nvml.h + VGPU_SCHEDULER_ARR_ENABLE = 2 // GRID_LICENSE_STATE_UNKNOWN as defined in nvml/nvml.h GRID_LICENSE_STATE_UNKNOWN = 0 // GRID_LICENSE_STATE_UNINITIALIZED as defined in nvml/nvml.h @@ -140,6 +148,8 @@ const ( POWER_SOURCE_AC = 0 // POWER_SOURCE_BATTERY as defined in nvml/nvml.h POWER_SOURCE_BATTERY = 1 + // POWER_SOURCE_UNDERSIZED as defined in nvml/nvml.h + POWER_SOURCE_UNDERSIZED = 2 // PCIE_LINK_MAX_SPEED_INVALID as defined in nvml/nvml.h PCIE_LINK_MAX_SPEED_INVALID = 0 // PCIE_LINK_MAX_SPEED_2500MBPS as defined in nvml/nvml.h @@ -498,8 +508,64 @@ const ( FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 // FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER as defined in nvml/nvml.h FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 + // FI_DEV_C2C_LINK_COUNT as defined in nvml/nvml.h + FI_DEV_C2C_LINK_COUNT = 170 + // FI_DEV_C2C_LINK_GET_STATUS as defined in nvml/nvml.h + FI_DEV_C2C_LINK_GET_STATUS = 171 + // FI_DEV_C2C_LINK_GET_MAX_BW as defined in nvml/nvml.h + FI_DEV_C2C_LINK_GET_MAX_BW = 172 + // FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS = 173 + // FI_DEV_PCIE_COUNT_NAKS_RECEIVED as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_NAKS_RECEIVED = 174 + // FI_DEV_PCIE_COUNT_RECEIVER_ERROR as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_RECEIVER_ERROR = 175 + // FI_DEV_PCIE_COUNT_BAD_TLP as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_BAD_TLP = 176 + // FI_DEV_PCIE_COUNT_NAKS_SENT as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_NAKS_SENT = 177 + // FI_DEV_PCIE_COUNT_BAD_DLLP as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_BAD_DLLP = 178 + // FI_DEV_PCIE_COUNT_NON_FATAL_ERROR as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_NON_FATAL_ERROR = 179 + // FI_DEV_PCIE_COUNT_FATAL_ERROR as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_FATAL_ERROR = 180 + // FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ = 181 + // FI_DEV_PCIE_COUNT_LCRC_ERROR as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_LCRC_ERROR = 182 + // FI_DEV_PCIE_COUNT_LANE_ERROR as defined in nvml/nvml.h + FI_DEV_PCIE_COUNT_LANE_ERROR = 183 + // FI_DEV_IS_RESETLESS_MIG_SUPPORTED as defined in nvml/nvml.h + FI_DEV_IS_RESETLESS_MIG_SUPPORTED = 184 + // FI_DEV_POWER_AVERAGE as defined in nvml/nvml.h + FI_DEV_POWER_AVERAGE = 185 + // FI_DEV_POWER_INSTANT as defined in nvml/nvml.h + FI_DEV_POWER_INSTANT = 186 + // FI_DEV_POWER_MIN_LIMIT as defined in nvml/nvml.h + FI_DEV_POWER_MIN_LIMIT = 187 + // FI_DEV_POWER_MAX_LIMIT as defined in nvml/nvml.h + FI_DEV_POWER_MAX_LIMIT = 188 + // FI_DEV_POWER_DEFAULT_LIMIT as defined in nvml/nvml.h + FI_DEV_POWER_DEFAULT_LIMIT = 189 + // FI_DEV_POWER_CURRENT_LIMIT as defined in nvml/nvml.h + FI_DEV_POWER_CURRENT_LIMIT = 190 + // FI_DEV_ENERGY as defined in nvml/nvml.h + FI_DEV_ENERGY = 191 + // FI_DEV_POWER_REQUESTED_LIMIT as defined in nvml/nvml.h + FI_DEV_POWER_REQUESTED_LIMIT = 192 + // FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT as defined in nvml/nvml.h + FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT = 193 + // FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT as defined in nvml/nvml.h + FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT = 194 + // FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT as defined in nvml/nvml.h + FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT = 195 + // FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT as defined in nvml/nvml.h + FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT = 196 + // FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE as defined in nvml/nvml.h + FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE = 199 // FI_MAX as defined in nvml/nvml.h - FI_MAX = 170 + FI_MAX = 200 // EventTypeSingleBitEccError as defined in nvml/nvml.h EventTypeSingleBitEccError = 1 // EventTypeDoubleBitEccError as defined in nvml/nvml.h @@ -518,24 +584,40 @@ const ( EventTypeNone = 0 // EventTypeAll as defined in nvml/nvml.h EventTypeAll = 415 - // ClocksThrottleReasonGpuIdle as defined in nvml/nvml.h - ClocksThrottleReasonGpuIdle = 1 - // ClocksThrottleReasonApplicationsClocksSetting as defined in nvml/nvml.h - ClocksThrottleReasonApplicationsClocksSetting = 2 + // ClocksEventReasonGpuIdle as defined in nvml/nvml.h + ClocksEventReasonGpuIdle = 1 + // ClocksEventReasonApplicationsClocksSetting as defined in nvml/nvml.h + ClocksEventReasonApplicationsClocksSetting = 2 // ClocksThrottleReasonUserDefinedClocks as defined in nvml/nvml.h ClocksThrottleReasonUserDefinedClocks = 2 - // ClocksThrottleReasonSwPowerCap as defined in nvml/nvml.h - ClocksThrottleReasonSwPowerCap = 4 + // ClocksEventReasonSwPowerCap as defined in nvml/nvml.h + ClocksEventReasonSwPowerCap = 4 // ClocksThrottleReasonHwSlowdown as defined in nvml/nvml.h ClocksThrottleReasonHwSlowdown = 8 - // ClocksThrottleReasonSyncBoost as defined in nvml/nvml.h - ClocksThrottleReasonSyncBoost = 16 - // ClocksThrottleReasonSwThermalSlowdown as defined in nvml/nvml.h - ClocksThrottleReasonSwThermalSlowdown = 32 + // ClocksEventReasonSyncBoost as defined in nvml/nvml.h + ClocksEventReasonSyncBoost = 16 + // ClocksEventReasonSwThermalSlowdown as defined in nvml/nvml.h + ClocksEventReasonSwThermalSlowdown = 32 // ClocksThrottleReasonHwThermalSlowdown as defined in nvml/nvml.h ClocksThrottleReasonHwThermalSlowdown = 64 // ClocksThrottleReasonHwPowerBrakeSlowdown as defined in nvml/nvml.h ClocksThrottleReasonHwPowerBrakeSlowdown = 128 + // ClocksEventReasonDisplayClockSetting as defined in nvml/nvml.h + ClocksEventReasonDisplayClockSetting = 256 + // ClocksEventReasonNone as defined in nvml/nvml.h + ClocksEventReasonNone = 0 + // ClocksEventReasonAll as defined in nvml/nvml.h + ClocksEventReasonAll = 511 + // ClocksThrottleReasonGpuIdle as defined in nvml/nvml.h + ClocksThrottleReasonGpuIdle = 1 + // ClocksThrottleReasonApplicationsClocksSetting as defined in nvml/nvml.h + ClocksThrottleReasonApplicationsClocksSetting = 2 + // ClocksThrottleReasonSyncBoost as defined in nvml/nvml.h + ClocksThrottleReasonSyncBoost = 16 + // ClocksThrottleReasonSwPowerCap as defined in nvml/nvml.h + ClocksThrottleReasonSwPowerCap = 4 + // ClocksThrottleReasonSwThermalSlowdown as defined in nvml/nvml.h + ClocksThrottleReasonSwThermalSlowdown = 32 // ClocksThrottleReasonDisplayClockSetting as defined in nvml/nvml.h ClocksThrottleReasonDisplayClockSetting = 256 // ClocksThrottleReasonNone as defined in nvml/nvml.h @@ -552,6 +634,56 @@ const ( NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE = 8 // NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT as defined in nvml/nvml.h NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT = 16 + // CC_SYSTEM_CPU_CAPS_NONE as defined in nvml/nvml.h + CC_SYSTEM_CPU_CAPS_NONE = 0 + // CC_SYSTEM_CPU_CAPS_AMD_SEV as defined in nvml/nvml.h + CC_SYSTEM_CPU_CAPS_AMD_SEV = 1 + // CC_SYSTEM_CPU_CAPS_INTEL_TDX as defined in nvml/nvml.h + CC_SYSTEM_CPU_CAPS_INTEL_TDX = 2 + // CC_SYSTEM_GPUS_CC_NOT_CAPABLE as defined in nvml/nvml.h + CC_SYSTEM_GPUS_CC_NOT_CAPABLE = 0 + // CC_SYSTEM_GPUS_CC_CAPABLE as defined in nvml/nvml.h + CC_SYSTEM_GPUS_CC_CAPABLE = 1 + // CC_SYSTEM_DEVTOOLS_MODE_OFF as defined in nvml/nvml.h + CC_SYSTEM_DEVTOOLS_MODE_OFF = 0 + // CC_SYSTEM_DEVTOOLS_MODE_ON as defined in nvml/nvml.h + CC_SYSTEM_DEVTOOLS_MODE_ON = 1 + // CC_SYSTEM_ENVIRONMENT_UNAVAILABLE as defined in nvml/nvml.h + CC_SYSTEM_ENVIRONMENT_UNAVAILABLE = 0 + // CC_SYSTEM_ENVIRONMENT_SIM as defined in nvml/nvml.h + CC_SYSTEM_ENVIRONMENT_SIM = 1 + // CC_SYSTEM_ENVIRONMENT_PROD as defined in nvml/nvml.h + CC_SYSTEM_ENVIRONMENT_PROD = 2 + // CC_SYSTEM_FEATURE_DISABLED as defined in nvml/nvml.h + CC_SYSTEM_FEATURE_DISABLED = 0 + // CC_SYSTEM_FEATURE_ENABLED as defined in nvml/nvml.h + CC_SYSTEM_FEATURE_ENABLED = 1 + // CC_SYSTEM_MULTIGPU_NONE as defined in nvml/nvml.h + CC_SYSTEM_MULTIGPU_NONE = 0 + // CC_SYSTEM_MULTIGPU_PROTECTED_PCIE as defined in nvml/nvml.h + CC_SYSTEM_MULTIGPU_PROTECTED_PCIE = 1 + // CC_ACCEPTING_CLIENT_REQUESTS_FALSE as defined in nvml/nvml.h + CC_ACCEPTING_CLIENT_REQUESTS_FALSE = 0 + // CC_ACCEPTING_CLIENT_REQUESTS_TRUE as defined in nvml/nvml.h + CC_ACCEPTING_CLIENT_REQUESTS_TRUE = 1 + // GPU_CERT_CHAIN_SIZE as defined in nvml/nvml.h + GPU_CERT_CHAIN_SIZE = 4096 + // GPU_ATTESTATION_CERT_CHAIN_SIZE as defined in nvml/nvml.h + GPU_ATTESTATION_CERT_CHAIN_SIZE = 5120 + // CC_GPU_CEC_NONCE_SIZE as defined in nvml/nvml.h + CC_GPU_CEC_NONCE_SIZE = 32 + // CC_GPU_ATTESTATION_REPORT_SIZE as defined in nvml/nvml.h + CC_GPU_ATTESTATION_REPORT_SIZE = 8192 + // CC_GPU_CEC_ATTESTATION_REPORT_SIZE as defined in nvml/nvml.h + CC_GPU_CEC_ATTESTATION_REPORT_SIZE = 4096 + // CC_CEC_ATTESTATION_REPORT_NOT_PRESENT as defined in nvml/nvml.h + CC_CEC_ATTESTATION_REPORT_NOT_PRESENT = 0 + // CC_CEC_ATTESTATION_REPORT_PRESENT as defined in nvml/nvml.h + CC_CEC_ATTESTATION_REPORT_PRESENT = 1 + // CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN as defined in nvml/nvml.h + CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN = 50 + // CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX as defined in nvml/nvml.h + CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX = 75 // GPU_FABRIC_UUID_LEN as defined in nvml/nvml.h GPU_FABRIC_UUID_LEN = 16 // GPU_FABRIC_STATE_NOT_SUPPORTED as defined in nvml/nvml.h @@ -562,6 +694,22 @@ const ( GPU_FABRIC_STATE_IN_PROGRESS = 2 // GPU_FABRIC_STATE_COMPLETED as defined in nvml/nvml.h GPU_FABRIC_STATE_COMPLETED = 3 + // GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED = 0 + // GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE = 1 + // GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE = 2 + // GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW = 0 + // GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW as defined in nvml/nvml.h + GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW = 17 + // POWER_SCOPE_GPU as defined in nvml/nvml.h + POWER_SCOPE_GPU = 0 + // POWER_SCOPE_MODULE as defined in nvml/nvml.h + POWER_SCOPE_MODULE = 1 + // POWER_SCOPE_MEMORY as defined in nvml/nvml.h + POWER_SCOPE_MEMORY = 2 // INIT_FLAG_NO_GPUS as defined in nvml/nvml.h INIT_FLAG_NO_GPUS = 1 // INIT_FLAG_NO_ATTACH as defined in nvml/nvml.h @@ -616,6 +764,8 @@ const ( GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 9 // GPU_INSTANCE_PROFILE_COUNT as defined in nvml/nvml.h GPU_INSTANCE_PROFILE_COUNT = 10 + // GPU_INTSTANCE_PROFILE_CAPS_P2P as defined in nvml/nvml.h + GPU_INTSTANCE_PROFILE_CAPS_P2P = 1 // COMPUTE_INSTANCE_PROFILE_1_SLICE as defined in nvml/nvml.h COMPUTE_INSTANCE_PROFILE_1_SLICE = 0 // COMPUTE_INSTANCE_PROFILE_2_SLICE as defined in nvml/nvml.h @@ -642,10 +792,6 @@ const ( GPM_METRICS_GET_VERSION = 1 // GPM_SUPPORT_VERSION as defined in nvml/nvml.h GPM_SUPPORT_VERSION = 1 - // COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE as defined in nvml/nvml.h - COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE = 0 - // COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE as defined in nvml/nvml.h - COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE = 1 // NVLINK_POWER_STATE_HIGH_SPEED as defined in nvml/nvml.h NVLINK_POWER_STATE_HIGH_SPEED = 0 // NVLINK_POWER_STATE_LOW as defined in nvml/nvml.h @@ -753,6 +899,7 @@ type GpuP2PStatus int32 const ( P2P_STATUS_OK GpuP2PStatus = iota P2P_STATUS_CHIPSET_NOT_SUPPORED GpuP2PStatus = 1 + P2P_STATUS_CHIPSET_NOT_SUPPORTED GpuP2PStatus = 1 P2P_STATUS_GPU_NOT_SUPPORTED GpuP2PStatus = 2 P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED GpuP2PStatus = 3 P2P_STATUS_DISABLED_BY_REGKEY GpuP2PStatus = 4 @@ -769,6 +916,7 @@ const ( P2P_CAPS_INDEX_WRITE GpuP2PCapsIndex = 1 P2P_CAPS_INDEX_NVLINK GpuP2PCapsIndex = 2 P2P_CAPS_INDEX_ATOMICS GpuP2PCapsIndex = 3 + P2P_CAPS_INDEX_PCI GpuP2PCapsIndex = 4 P2P_CAPS_INDEX_PROP GpuP2PCapsIndex = 4 P2P_CAPS_INDEX_UNKNOWN GpuP2PCapsIndex = 5 ) @@ -785,7 +933,10 @@ const ( DEC_UTILIZATION_SAMPLES SamplingType = 4 PROCESSOR_CLK_SAMPLES SamplingType = 5 MEMORY_CLK_SAMPLES SamplingType = 6 - SAMPLINGTYPE_COUNT SamplingType = 7 + MODULE_POWER_SAMPLES SamplingType = 7 + JPG_UTILIZATION_SAMPLES SamplingType = 8 + OFA_UTILIZATION_SAMPLES SamplingType = 9 + SAMPLINGTYPE_COUNT SamplingType = 10 ) // PcieUtilCounter as declared in nvml/nvml.h @@ -808,7 +959,8 @@ const ( VALUE_TYPE_UNSIGNED_LONG ValueType = 2 VALUE_TYPE_UNSIGNED_LONG_LONG ValueType = 3 VALUE_TYPE_SIGNED_LONG_LONG ValueType = 4 - VALUE_TYPE_COUNT ValueType = 5 + VALUE_TYPE_SIGNED_INT ValueType = 5 + VALUE_TYPE_COUNT ValueType = 6 ) // PerfPolicyType as declared in nvml/nvml.h @@ -1028,6 +1180,9 @@ const ( ERROR_FREQ_NOT_SUPPORTED Return = 24 ERROR_ARGUMENT_VERSION_MISMATCH Return = 25 ERROR_DEPRECATED Return = 26 + ERROR_NOT_READY Return = 27 + ERROR_GPU_NOT_FOUND Return = 28 + ERROR_INVALID_STATE Return = 29 ERROR_UNKNOWN Return = 999 ) @@ -1137,7 +1292,12 @@ const ( DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU DeviceVgpuCapability = iota DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES DeviceVgpuCapability = 1 DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES DeviceVgpuCapability = 2 - DEVICE_VGPU_CAP_COUNT DeviceVgpuCapability = 3 + DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW DeviceVgpuCapability = 3 + DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW DeviceVgpuCapability = 4 + DEVICE_VGPU_CAP_DEVICE_STREAMING DeviceVgpuCapability = 5 + DEVICE_VGPU_CAP_MINI_QUARTER_GPU DeviceVgpuCapability = 6 + DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU DeviceVgpuCapability = 7 + DEVICE_VGPU_CAP_COUNT DeviceVgpuCapability = 8 ) // GpuUtilizationDomainId as declared in nvml/nvml.h @@ -1174,8 +1334,10 @@ type EncoderType int32 // EncoderType enumeration from nvml/nvml.h const ( - ENCODER_QUERY_H264 EncoderType = iota - ENCODER_QUERY_HEVC EncoderType = 1 + ENCODER_QUERY_H264 EncoderType = iota + ENCODER_QUERY_HEVC EncoderType = 1 + ENCODER_QUERY_AV1 EncoderType = 2 + ENCODER_QUERY_UNKNOWN EncoderType = 255 ) // FBCSessionType as declared in nvml/nvml.h diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go index 7604d39fa..ac778e5ab 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go @@ -2085,28 +2085,28 @@ func (device nvmlDevice) GetGpuInstanceProfileInfo(profile int) (GpuInstanceProf } // nvml.DeviceGetGpuInstanceProfileInfoV() -type GpuInstanceProfileInfoV struct { +type GpuInstanceProfileInfoHandler struct { device nvmlDevice profile int } -func (infoV GpuInstanceProfileInfoV) V1() (GpuInstanceProfileInfo, Return) { - return DeviceGetGpuInstanceProfileInfo(infoV.device, infoV.profile) +func (handler GpuInstanceProfileInfoHandler) V1() (GpuInstanceProfileInfo, Return) { + return DeviceGetGpuInstanceProfileInfo(handler.device, handler.profile) } -func (infoV GpuInstanceProfileInfoV) V2() (GpuInstanceProfileInfo_v2, Return) { +func (handler GpuInstanceProfileInfoHandler) V2() (GpuInstanceProfileInfo_v2, Return) { var info GpuInstanceProfileInfo_v2 info.Version = STRUCT_VERSION(info, 2) - ret := nvmlDeviceGetGpuInstanceProfileInfoV(infoV.device, uint32(infoV.profile), &info) + ret := nvmlDeviceGetGpuInstanceProfileInfoV(handler.device, uint32(handler.profile), &info) return info, ret } -func (l *library) DeviceGetGpuInstanceProfileInfoV(device Device, profile int) GpuInstanceProfileInfoV { +func (l *library) DeviceGetGpuInstanceProfileInfoV(device Device, profile int) GpuInstanceProfileInfoHandler { return device.GetGpuInstanceProfileInfoV(profile) } -func (device nvmlDevice) GetGpuInstanceProfileInfoV(profile int) GpuInstanceProfileInfoV { - return GpuInstanceProfileInfoV{device, profile} +func (device nvmlDevice) GetGpuInstanceProfileInfoV(profile int) GpuInstanceProfileInfoHandler { + return GpuInstanceProfileInfoHandler{device, profile} } // nvml.DeviceGetGpuInstancePossiblePlacements() @@ -2231,29 +2231,29 @@ func (gpuInstance nvmlGpuInstance) GetComputeInstanceProfileInfo(profile int, en } // nvml.GpuInstanceGetComputeInstanceProfileInfoV() -type ComputeInstanceProfileInfoV struct { +type ComputeInstanceProfileInfoHandler struct { gpuInstance nvmlGpuInstance profile int engProfile int } -func (infoV ComputeInstanceProfileInfoV) V1() (ComputeInstanceProfileInfo, Return) { - return GpuInstanceGetComputeInstanceProfileInfo(infoV.gpuInstance, infoV.profile, infoV.engProfile) +func (handler ComputeInstanceProfileInfoHandler) V1() (ComputeInstanceProfileInfo, Return) { + return GpuInstanceGetComputeInstanceProfileInfo(handler.gpuInstance, handler.profile, handler.engProfile) } -func (infoV ComputeInstanceProfileInfoV) V2() (ComputeInstanceProfileInfo_v2, Return) { +func (handler ComputeInstanceProfileInfoHandler) V2() (ComputeInstanceProfileInfo_v2, Return) { var info ComputeInstanceProfileInfo_v2 info.Version = STRUCT_VERSION(info, 2) - ret := nvmlGpuInstanceGetComputeInstanceProfileInfoV(infoV.gpuInstance, uint32(infoV.profile), uint32(infoV.engProfile), &info) + ret := nvmlGpuInstanceGetComputeInstanceProfileInfoV(handler.gpuInstance, uint32(handler.profile), uint32(handler.engProfile), &info) return info, ret } -func (l *library) GpuInstanceGetComputeInstanceProfileInfoV(gpuInstance GpuInstance, profile int, engProfile int) ComputeInstanceProfileInfoV { +func (l *library) GpuInstanceGetComputeInstanceProfileInfoV(gpuInstance GpuInstance, profile int, engProfile int) ComputeInstanceProfileInfoHandler { return gpuInstance.GetComputeInstanceProfileInfoV(profile, engProfile) } -func (gpuInstance nvmlGpuInstance) GetComputeInstanceProfileInfoV(profile int, engProfile int) ComputeInstanceProfileInfoV { - return ComputeInstanceProfileInfoV{gpuInstance, profile, engProfile} +func (gpuInstance nvmlGpuInstance) GetComputeInstanceProfileInfoV(profile int, engProfile int) ComputeInstanceProfileInfoHandler { + return ComputeInstanceProfileInfoHandler{gpuInstance, profile, engProfile} } // nvml.GpuInstanceGetComputeInstanceRemainingCapacity() @@ -2737,31 +2737,321 @@ func (device nvmlDevice) GetGpuFabricInfo() (GpuFabricInfo, Return) { return gpuFabricInfo, ret } -// nvml.DeviceCcuGetStreamState() -func (l *library) DeviceCcuGetStreamState(device Device) (int, Return) { - return device.CcuGetStreamState() +// nvml.DeviceSetNvLinkDeviceLowPowerThreshold() +func (l *library) DeviceSetNvLinkDeviceLowPowerThreshold(device Device, info *NvLinkPowerThres) Return { + return device.SetNvLinkDeviceLowPowerThreshold(info) } -func (device nvmlDevice) CcuGetStreamState() (int, Return) { - var state uint32 - ret := nvmlDeviceCcuGetStreamState(device, &state) - return int(state), ret +func (device nvmlDevice) SetNvLinkDeviceLowPowerThreshold(info *NvLinkPowerThres) Return { + return nvmlDeviceSetNvLinkDeviceLowPowerThreshold(device, info) } -// nvml.DeviceCcuSetStreamState() -func (l *library) DeviceCcuSetStreamState(device Device, state int) Return { - return device.CcuSetStreamState(state) +// nvml.DeviceGetModuleId() +func (l *library) DeviceGetModuleId(device Device) (int, Return) { + return device.GetModuleId() } -func (device nvmlDevice) CcuSetStreamState(state int) Return { - return nvmlDeviceCcuSetStreamState(device, uint32(state)) +func (device nvmlDevice) GetModuleId() (int, Return) { + var moduleID uint32 + ret := nvmlDeviceGetModuleId(device, &moduleID) + return int(moduleID), ret } -// nvml.DeviceSetNvLinkDeviceLowPowerThreshold() -func (l *library) DeviceSetNvLinkDeviceLowPowerThreshold(device Device, info *NvLinkPowerThres) Return { - return device.SetNvLinkDeviceLowPowerThreshold(info) +// nvml.DeviceGetCurrentClocksEventReasons() +func (l *library) DeviceGetCurrentClocksEventReasons(device Device) (uint64, Return) { + return device.GetCurrentClocksEventReasons() } -func (device nvmlDevice) SetNvLinkDeviceLowPowerThreshold(info *NvLinkPowerThres) Return { - return nvmlDeviceSetNvLinkDeviceLowPowerThreshold(device, info) +func (device nvmlDevice) GetCurrentClocksEventReasons() (uint64, Return) { + var clocksEventReasons uint64 + ret := nvmlDeviceGetCurrentClocksEventReasons(device, &clocksEventReasons) + return clocksEventReasons, ret +} + +// nvml.DeviceGetSupportedClocksEventReasons() +func (l *library) DeviceGetSupportedClocksEventReasons(device Device) (uint64, Return) { + return device.GetSupportedClocksEventReasons() +} + +func (device nvmlDevice) GetSupportedClocksEventReasons() (uint64, Return) { + var supportedClocksEventReasons uint64 + ret := nvmlDeviceGetSupportedClocksEventReasons(device, &supportedClocksEventReasons) + return supportedClocksEventReasons, ret +} + +// nvml.DeviceGetJpgUtilization() +func (l *library) DeviceGetJpgUtilization(device Device) (uint32, uint32, Return) { + return device.GetJpgUtilization() +} + +func (device nvmlDevice) GetJpgUtilization() (uint32, uint32, Return) { + var utilization, samplingPeriodUs uint32 + ret := nvmlDeviceGetJpgUtilization(device, &utilization, &samplingPeriodUs) + return utilization, samplingPeriodUs, ret +} + +// nvml.DeviceGetOfaUtilization() +func (l *library) DeviceGetOfaUtilization(device Device) (uint32, uint32, Return) { + return device.GetOfaUtilization() +} + +func (device nvmlDevice) GetOfaUtilization() (uint32, uint32, Return) { + var utilization, samplingPeriodUs uint32 + ret := nvmlDeviceGetOfaUtilization(device, &utilization, &samplingPeriodUs) + return utilization, samplingPeriodUs, ret +} + +// nvml.DeviceGetRunningProcessDetailList() +func (l *library) DeviceGetRunningProcessDetailList(device Device) (ProcessDetailList, Return) { + return device.GetRunningProcessDetailList() +} + +func (device nvmlDevice) GetRunningProcessDetailList() (ProcessDetailList, Return) { + var plist ProcessDetailList + ret := nvmlDeviceGetRunningProcessDetailList(device, &plist) + return plist, ret +} + +// nvml.DeviceGetConfComputeMemSizeInfo() +func (l *library) DeviceGetConfComputeMemSizeInfo(device Device) (ConfComputeMemSizeInfo, Return) { + return device.GetConfComputeMemSizeInfo() +} + +func (device nvmlDevice) GetConfComputeMemSizeInfo() (ConfComputeMemSizeInfo, Return) { + var memInfo ConfComputeMemSizeInfo + ret := nvmlDeviceGetConfComputeMemSizeInfo(device, &memInfo) + return memInfo, ret +} + +// nvml.DeviceGetConfComputeProtectedMemoryUsage() +func (l *library) DeviceGetConfComputeProtectedMemoryUsage(device Device) (Memory, Return) { + return device.GetConfComputeProtectedMemoryUsage() +} + +func (device nvmlDevice) GetConfComputeProtectedMemoryUsage() (Memory, Return) { + var memory Memory + ret := nvmlDeviceGetConfComputeProtectedMemoryUsage(device, &memory) + return memory, ret +} + +// nvml.DeviceGetConfComputeGpuCertificate() +func (l *library) DeviceGetConfComputeGpuCertificate(device Device) (ConfComputeGpuCertificate, Return) { + return device.GetConfComputeGpuCertificate() +} + +func (device nvmlDevice) GetConfComputeGpuCertificate() (ConfComputeGpuCertificate, Return) { + var gpuCert ConfComputeGpuCertificate + ret := nvmlDeviceGetConfComputeGpuCertificate(device, &gpuCert) + return gpuCert, ret +} + +// nvml.DeviceGetConfComputeGpuAttestationReport() +func (l *library) DeviceGetConfComputeGpuAttestationReport(device Device) (ConfComputeGpuAttestationReport, Return) { + return device.GetConfComputeGpuAttestationReport() +} + +func (device nvmlDevice) GetConfComputeGpuAttestationReport() (ConfComputeGpuAttestationReport, Return) { + var gpuAtstReport ConfComputeGpuAttestationReport + ret := nvmlDeviceGetConfComputeGpuAttestationReport(device, &gpuAtstReport) + return gpuAtstReport, ret +} + +// nvml.DeviceSetConfComputeUnprotectedMemSize() +func (l *library) DeviceSetConfComputeUnprotectedMemSize(device Device, sizeKiB uint64) Return { + return device.SetConfComputeUnprotectedMemSize(sizeKiB) +} + +func (device nvmlDevice) SetConfComputeUnprotectedMemSize(sizeKiB uint64) Return { + return nvmlDeviceSetConfComputeUnprotectedMemSize(device, sizeKiB) +} + +// nvml.DeviceSetPowerManagementLimit_v2() +func (l *library) DeviceSetPowerManagementLimit_v2(device Device, powerValue *PowerValue_v2) Return { + return device.SetPowerManagementLimit_v2(powerValue) +} + +func (device nvmlDevice) SetPowerManagementLimit_v2(powerValue *PowerValue_v2) Return { + return nvmlDeviceSetPowerManagementLimit_v2(device, powerValue) +} + +// nvml.DeviceGetC2cModeInfoV() +type C2cModeInfoHandler struct { + device nvmlDevice +} + +func (handler C2cModeInfoHandler) V1() (C2cModeInfo_v1, Return) { + var c2cModeInfo C2cModeInfo_v1 + ret := nvmlDeviceGetC2cModeInfoV(handler.device, &c2cModeInfo) + return c2cModeInfo, ret +} + +func (l *library) DeviceGetC2cModeInfoV(device Device) C2cModeInfoHandler { + return device.GetC2cModeInfoV() +} + +func (device nvmlDevice) GetC2cModeInfoV() C2cModeInfoHandler { + return C2cModeInfoHandler{device} +} + +// nvml.DeviceGetLastBBXFlushTime() +func (l *library) DeviceGetLastBBXFlushTime(device Device) (uint64, uint, Return) { + return device.GetLastBBXFlushTime() +} + +func (device nvmlDevice) GetLastBBXFlushTime() (uint64, uint, Return) { + var timestamp uint64 + var durationUs uint + ret := nvmlDeviceGetLastBBXFlushTime(device, ×tamp, &durationUs) + return timestamp, durationUs, ret +} + +// nvml.DeviceGetNumaNodeId() +func (l *library) DeviceGetNumaNodeId(device Device) (int, Return) { + return device.GetNumaNodeId() +} + +func (device nvmlDevice) GetNumaNodeId() (int, Return) { + var node uint32 + ret := nvmlDeviceGetNumaNodeId(device, &node) + return int(node), ret +} + +// nvml.DeviceGetPciInfoExt() +func (l *library) DeviceGetPciInfoExt(device Device) (PciInfoExt, Return) { + return device.GetPciInfoExt() +} + +func (device nvmlDevice) GetPciInfoExt() (PciInfoExt, Return) { + var pciInfo PciInfoExt + ret := nvmlDeviceGetPciInfoExt(device, &pciInfo) + return pciInfo, ret +} + +// nvml.DeviceGetGpuFabricInfoV() +type GpuFabricInfoHandler struct { + device nvmlDevice +} + +func (handler GpuFabricInfoHandler) V1() (GpuFabricInfo, Return) { + return handler.device.GetGpuFabricInfo() +} + +func (handler GpuFabricInfoHandler) V2() (GpuFabricInfo_v2, Return) { + var info GpuFabricInfoV + info.Version = STRUCT_VERSION(info, 2) + ret := nvmlDeviceGetGpuFabricInfoV(handler.device, &info) + return GpuFabricInfo_v2(info), ret +} + +func (l *library) DeviceGetGpuFabricInfoV(device Device) GpuFabricInfoHandler { + return device.GetGpuFabricInfoV() +} + +func (device nvmlDevice) GetGpuFabricInfoV() GpuFabricInfoHandler { + return GpuFabricInfoHandler{device} +} + +// nvml.DeviceGetProcessesUtilizationInfo() +func (l *library) DeviceGetProcessesUtilizationInfo(device Device) (ProcessesUtilizationInfo, Return) { + return device.GetProcessesUtilizationInfo() +} + +func (device nvmlDevice) GetProcessesUtilizationInfo() (ProcessesUtilizationInfo, Return) { + var processesUtilInfo ProcessesUtilizationInfo + ret := nvmlDeviceGetProcessesUtilizationInfo(device, &processesUtilInfo) + return processesUtilInfo, ret +} + +// nvml.DeviceGetVgpuHeterogeneousMode() +func (l *library) DeviceGetVgpuHeterogeneousMode(device Device) (VgpuHeterogeneousMode, Return) { + return device.GetVgpuHeterogeneousMode() +} + +func (device nvmlDevice) GetVgpuHeterogeneousMode() (VgpuHeterogeneousMode, Return) { + var heterogeneousMode VgpuHeterogeneousMode + ret := nvmlDeviceGetVgpuHeterogeneousMode(device, &heterogeneousMode) + return heterogeneousMode, ret +} + +// nvml.DeviceSetVgpuHeterogeneousMode() +func (l *library) DeviceSetVgpuHeterogeneousMode(device Device, heterogeneousMode VgpuHeterogeneousMode) Return { + return device.SetVgpuHeterogeneousMode(heterogeneousMode) +} + +func (device nvmlDevice) SetVgpuHeterogeneousMode(heterogeneousMode VgpuHeterogeneousMode) Return { + ret := nvmlDeviceSetVgpuHeterogeneousMode(device, &heterogeneousMode) + return ret +} + +// nvml.DeviceGetVgpuTypeSupportedPlacements() +func (l *library) DeviceGetVgpuTypeSupportedPlacements(device Device, vgpuTypeId VgpuTypeId) (VgpuPlacementList, Return) { + return device.GetVgpuTypeSupportedPlacements(vgpuTypeId) +} + +func (device nvmlDevice) GetVgpuTypeSupportedPlacements(vgpuTypeId VgpuTypeId) (VgpuPlacementList, Return) { + return vgpuTypeId.GetSupportedPlacements(device) +} + +func (vgpuTypeId nvmlVgpuTypeId) GetSupportedPlacements(device Device) (VgpuPlacementList, Return) { + var placementList VgpuPlacementList + ret := nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDeviceHandle(device), vgpuTypeId, &placementList) + return placementList, ret +} + +// nvml.DeviceGetVgpuTypeCreatablePlacements() +func (l *library) DeviceGetVgpuTypeCreatablePlacements(device Device, vgpuTypeId VgpuTypeId) (VgpuPlacementList, Return) { + return device.GetVgpuTypeCreatablePlacements(vgpuTypeId) +} + +func (device nvmlDevice) GetVgpuTypeCreatablePlacements(vgpuTypeId VgpuTypeId) (VgpuPlacementList, Return) { + return vgpuTypeId.GetCreatablePlacements(device) +} + +func (vgpuTypeId nvmlVgpuTypeId) GetCreatablePlacements(device Device) (VgpuPlacementList, Return) { + var placementList VgpuPlacementList + ret := nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDeviceHandle(device), vgpuTypeId, &placementList) + return placementList, ret +} + +// nvml.DeviceSetVgpuCapabilities() +func (l *library) DeviceSetVgpuCapabilities(device Device, capability DeviceVgpuCapability, state EnableState) Return { + return device.SetVgpuCapabilities(capability, state) +} + +func (device nvmlDevice) SetVgpuCapabilities(capability DeviceVgpuCapability, state EnableState) Return { + ret := nvmlDeviceSetVgpuCapabilities(device, capability, state) + return ret +} + +// nvml.DeviceGetVgpuInstancesUtilizationInfo() +func (l *library) DeviceGetVgpuInstancesUtilizationInfo(device Device) (VgpuInstancesUtilizationInfo, Return) { + return device.GetVgpuInstancesUtilizationInfo() +} + +func (device nvmlDevice) GetVgpuInstancesUtilizationInfo() (VgpuInstancesUtilizationInfo, Return) { + var vgpuUtilInfo VgpuInstancesUtilizationInfo + ret := nvmlDeviceGetVgpuInstancesUtilizationInfo(device, &vgpuUtilInfo) + return vgpuUtilInfo, ret +} + +// nvml.DeviceGetVgpuProcessesUtilizationInfo() +func (l *library) DeviceGetVgpuProcessesUtilizationInfo(device Device) (VgpuProcessesUtilizationInfo, Return) { + return device.GetVgpuProcessesUtilizationInfo() +} + +func (device nvmlDevice) GetVgpuProcessesUtilizationInfo() (VgpuProcessesUtilizationInfo, Return) { + var vgpuProcUtilInfo VgpuProcessesUtilizationInfo + ret := nvmlDeviceGetVgpuProcessesUtilizationInfo(device, &vgpuProcUtilInfo) + return vgpuProcUtilInfo, ret +} + +// nvml.DeviceGetSramEccErrorStatus() +func (l *library) DeviceGetSramEccErrorStatus(device Device) (EccSramErrorStatus, Return) { + return device.GetSramEccErrorStatus() +} + +func (device nvmlDevice) GetSramEccErrorStatus() (EccSramErrorStatus, Return) { + var status EccSramErrorStatus + ret := nvmlDeviceGetSramEccErrorStatus(device, &status) + return status, ret } diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go index acdb2e0ca..7f8995cc7 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go @@ -51,20 +51,31 @@ func (g *nvmlGpmMetricsGetType) convert() *GpmMetricsGetType { // nvml.GpmMetricsGet() type GpmMetricsGetVType struct { - metricsGet *nvmlGpmMetricsGetType + metricsGet *GpmMetricsGetType } func (l *library) GpmMetricsGetV(metricsGet *GpmMetricsGetType) GpmMetricsGetVType { - return GpmMetricsGetVType{metricsGet.convert()} + return GpmMetricsGetVType{metricsGet} } + +// nvmlGpmMetricsGetStub is a stub function that can be overridden for testing. +var nvmlGpmMetricsGetStub = nvmlGpmMetricsGet + func (metricsGetV GpmMetricsGetVType) V1() Return { metricsGetV.metricsGet.Version = 1 - return nvmlGpmMetricsGet(metricsGetV.metricsGet) + return gpmMetricsGet(metricsGetV.metricsGet) } func (l *library) GpmMetricsGet(metricsGet *GpmMetricsGetType) Return { metricsGet.Version = GPM_METRICS_GET_VERSION - return nvmlGpmMetricsGet(metricsGet.convert()) + return gpmMetricsGet(metricsGet) +} + +func gpmMetricsGet(metricsGet *GpmMetricsGetType) Return { + nvmlMetricsGet := metricsGet.convert() + ret := nvmlGpmMetricsGetStub(nvmlMetricsGet) + *metricsGet = *nvmlMetricsGet.convert() + return ret } // nvml.GpmSampleFree() @@ -139,3 +150,23 @@ func (device nvmlDevice) GpmMigSampleGet(gpuInstanceId int, gpmSample GpmSample) func (gpmSample nvmlGpmSample) MigGet(device Device, gpuInstanceId int) Return { return nvmlGpmMigSampleGet(nvmlDeviceHandle(device), uint32(gpuInstanceId), gpmSample) } + +// nvml.GpmQueryIfStreamingEnabled() +func (l *library) GpmQueryIfStreamingEnabled(device Device) (uint32, Return) { + return device.GpmQueryIfStreamingEnabled() +} + +func (device nvmlDevice) GpmQueryIfStreamingEnabled() (uint32, Return) { + var state uint32 + ret := nvmlGpmQueryIfStreamingEnabled(device, &state) + return state, ret +} + +// nvml.GpmSetStreamingEnabled() +func (l *library) GpmSetStreamingEnabled(device Device, state uint32) Return { + return device.GpmSetStreamingEnabled(state) +} + +func (device nvmlDevice) GpmSetStreamingEnabled(state uint32) Return { + return nvmlGpmSetStreamingEnabled(device, state) +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go index 65fcffac5..6ba290c5f 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go @@ -102,6 +102,25 @@ func nvmlSystemGetProcessName(Pid uint32, Name *byte, Length uint32) Return { return __v } +// nvmlSystemGetHicVersion function as declared in nvml/nvml.h +func nvmlSystemGetHicVersion(HwbcCount *uint32, HwbcEntries *HwbcEntry) Return { + cHwbcCount, _ := (*C.uint)(unsafe.Pointer(HwbcCount)), cgoAllocsUnknown + cHwbcEntries, _ := (*C.nvmlHwbcEntry_t)(unsafe.Pointer(HwbcEntries)), cgoAllocsUnknown + __ret := C.nvmlSystemGetHicVersion(cHwbcCount, cHwbcEntries) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetTopologyGpuSet function as declared in nvml/nvml.h +func nvmlSystemGetTopologyGpuSet(CpuNumber uint32, Count *uint32, DeviceArray *nvmlDevice) Return { + cCpuNumber, _ := (C.uint)(CpuNumber), cgoAllocsUnknown + cCount, _ := (*C.uint)(unsafe.Pointer(Count)), cgoAllocsUnknown + cDeviceArray, _ := (*C.nvmlDevice_t)(unsafe.Pointer(DeviceArray)), cgoAllocsUnknown + __ret := C.nvmlSystemGetTopologyGpuSet(cCpuNumber, cCount, cDeviceArray) + __v := (Return)(__ret) + return __v +} + // nvmlUnitGetCount function as declared in nvml/nvml.h func nvmlUnitGetCount(UnitCount *uint32) Return { cUnitCount, _ := (*C.uint)(unsafe.Pointer(UnitCount)), cgoAllocsUnknown @@ -175,15 +194,6 @@ func nvmlUnitGetDevices(nvmlUnit nvmlUnit, DeviceCount *uint32, Devices *nvmlDev return __v } -// nvmlSystemGetHicVersion function as declared in nvml/nvml.h -func nvmlSystemGetHicVersion(HwbcCount *uint32, HwbcEntries *HwbcEntry) Return { - cHwbcCount, _ := (*C.uint)(unsafe.Pointer(HwbcCount)), cgoAllocsUnknown - cHwbcEntries, _ := (*C.nvmlHwbcEntry_t)(unsafe.Pointer(HwbcEntries)), cgoAllocsUnknown - __ret := C.nvmlSystemGetHicVersion(cHwbcCount, cHwbcEntries) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetCount_v2 function as declared in nvml/nvml.h func nvmlDeviceGetCount_v2(DeviceCount *uint32) Return { cDeviceCount, _ := (*C.uint)(unsafe.Pointer(DeviceCount)), cgoAllocsUnknown @@ -275,6 +285,24 @@ func nvmlDeviceGetSerial(nvmlDevice nvmlDevice, Serial *byte, Length uint32) Ret return __v } +// nvmlDeviceGetModuleId function as declared in nvml/nvml.h +func nvmlDeviceGetModuleId(nvmlDevice nvmlDevice, ModuleId *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cModuleId, _ := (*C.uint)(unsafe.Pointer(ModuleId)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetModuleId(cnvmlDevice, cModuleId) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetC2cModeInfoV function as declared in nvml/nvml.h +func nvmlDeviceGetC2cModeInfoV(nvmlDevice nvmlDevice, C2cModeInfo *C2cModeInfo_v1) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cC2cModeInfo, _ := (*C.nvmlC2cModeInfo_v1_t)(unsafe.Pointer(C2cModeInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetC2cModeInfoV(cnvmlDevice, cC2cModeInfo) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetMemoryAffinity function as declared in nvml/nvml.h func nvmlDeviceGetMemoryAffinity(nvmlDevice nvmlDevice, NodeSetSize uint32, NodeSet *uint, Scope AffinityScope) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -323,6 +351,15 @@ func nvmlDeviceClearCpuAffinity(nvmlDevice nvmlDevice) Return { return __v } +// nvmlDeviceGetNumaNodeId function as declared in nvml/nvml.h +func nvmlDeviceGetNumaNodeId(nvmlDevice nvmlDevice, Node *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cNode, _ := (*C.uint)(unsafe.Pointer(Node)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetNumaNodeId(cnvmlDevice, cNode) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetTopologyCommonAncestor function as declared in nvml/nvml.h func nvmlDeviceGetTopologyCommonAncestor(Device1 nvmlDevice, Device2 nvmlDevice, PathInfo *GpuTopologyLevel) Return { cDevice1, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown @@ -344,16 +381,6 @@ func nvmlDeviceGetTopologyNearestGpus(nvmlDevice nvmlDevice, Level GpuTopologyLe return __v } -// nvmlSystemGetTopologyGpuSet function as declared in nvml/nvml.h -func nvmlSystemGetTopologyGpuSet(CpuNumber uint32, Count *uint32, DeviceArray *nvmlDevice) Return { - cCpuNumber, _ := (C.uint)(CpuNumber), cgoAllocsUnknown - cCount, _ := (*C.uint)(unsafe.Pointer(Count)), cgoAllocsUnknown - cDeviceArray, _ := (*C.nvmlDevice_t)(unsafe.Pointer(DeviceArray)), cgoAllocsUnknown - __ret := C.nvmlSystemGetTopologyGpuSet(cCpuNumber, cCount, cDeviceArray) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetP2PStatus function as declared in nvml/nvml.h func nvmlDeviceGetP2PStatus(Device1 nvmlDevice, Device2 nvmlDevice, P2pIndex GpuP2PCapsIndex, P2pStatus *GpuP2PStatus) Return { cDevice1, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown @@ -375,16 +402,6 @@ func nvmlDeviceGetUUID(nvmlDevice nvmlDevice, Uuid *byte, Length uint32) Return return __v } -// nvmlVgpuInstanceGetMdevUUID function as declared in nvml/nvml.h -func nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance nvmlVgpuInstance, MdevUuid *byte, Size uint32) Return { - cnvmlVgpuInstance, _ := (C.nvmlVgpuInstance_t)(nvmlVgpuInstance), cgoAllocsUnknown - cMdevUuid, _ := (*C.char)(unsafe.Pointer(MdevUuid)), cgoAllocsUnknown - cSize, _ := (C.uint)(Size), cgoAllocsUnknown - __ret := C.nvmlVgpuInstanceGetMdevUUID(cnvmlVgpuInstance, cMdevUuid, cSize) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetMinorNumber function as declared in nvml/nvml.h func nvmlDeviceGetMinorNumber(nvmlDevice nvmlDevice, MinorNumber *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -442,6 +459,16 @@ func nvmlDeviceValidateInforom(nvmlDevice nvmlDevice) Return { return __v } +// nvmlDeviceGetLastBBXFlushTime function as declared in nvml/nvml.h +func nvmlDeviceGetLastBBXFlushTime(nvmlDevice nvmlDevice, Timestamp *uint64, DurationUs *uint) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cTimestamp, _ := (*C.ulonglong)(unsafe.Pointer(Timestamp)), cgoAllocsUnknown + cDurationUs, _ := (*C.ulong)(unsafe.Pointer(DurationUs)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetLastBBXFlushTime(cnvmlDevice, cTimestamp, cDurationUs) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetDisplayMode function as declared in nvml/nvml.h func nvmlDeviceGetDisplayMode(nvmlDevice nvmlDevice, Display *EnableState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -469,6 +496,15 @@ func nvmlDeviceGetPersistenceMode(nvmlDevice nvmlDevice, Mode *EnableState) Retu return __v } +// nvmlDeviceGetPciInfoExt function as declared in nvml/nvml.h +func nvmlDeviceGetPciInfoExt(nvmlDevice nvmlDevice, Pci *PciInfoExt) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPci, _ := (*C.nvmlPciInfoExt_t)(unsafe.Pointer(Pci)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetPciInfoExt(cnvmlDevice, cPci) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetPciInfo_v3 function as declared in nvml/nvml.h func nvmlDeviceGetPciInfo_v3(nvmlDevice nvmlDevice, Pci *PciInfo) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -562,6 +598,15 @@ func nvmlDeviceGetMaxClockInfo(nvmlDevice nvmlDevice, _type ClockType, Clock *ui return __v } +// nvmlDeviceGetGpcClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetGpcClkVfOffset(nvmlDevice nvmlDevice, Offset *int32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpcClkVfOffset(cnvmlDevice, cOffset) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetApplicationsClock function as declared in nvml/nvml.h func nvmlDeviceGetApplicationsClock(nvmlDevice nvmlDevice, ClockType ClockType, ClockMHz *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -582,14 +627,6 @@ func nvmlDeviceGetDefaultApplicationsClock(nvmlDevice nvmlDevice, ClockType Cloc return __v } -// nvmlDeviceResetApplicationsClocks function as declared in nvml/nvml.h -func nvmlDeviceResetApplicationsClocks(nvmlDevice nvmlDevice) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - __ret := C.nvmlDeviceResetApplicationsClocks(cnvmlDevice) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetClock function as declared in nvml/nvml.h func nvmlDeviceGetClock(nvmlDevice nvmlDevice, ClockType ClockType, ClockId ClockId, ClockMHz *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -642,25 +679,6 @@ func nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice nvmlDevice, IsEnabled *Ena return __v } -// nvmlDeviceSetAutoBoostedClocksEnabled function as declared in nvml/nvml.h -func nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice nvmlDevice, Enabled EnableState) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cEnabled, _ := (C.nvmlEnableState_t)(Enabled), cgoAllocsUnknown - __ret := C.nvmlDeviceSetAutoBoostedClocksEnabled(cnvmlDevice, cEnabled) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceSetDefaultAutoBoostedClocksEnabled function as declared in nvml/nvml.h -func nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice nvmlDevice, Enabled EnableState, Flags uint32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cEnabled, _ := (C.nvmlEnableState_t)(Enabled), cgoAllocsUnknown - cFlags, _ := (C.uint)(Flags), cgoAllocsUnknown - __ret := C.nvmlDeviceSetDefaultAutoBoostedClocksEnabled(cnvmlDevice, cEnabled, cFlags) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetFanSpeed function as declared in nvml/nvml.h func nvmlDeviceGetFanSpeed(nvmlDevice nvmlDevice, Speed *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -690,15 +708,6 @@ func nvmlDeviceGetTargetFanSpeed(nvmlDevice nvmlDevice, Fan uint32, TargetSpeed return __v } -// nvmlDeviceSetDefaultFanSpeed_v2 function as declared in nvml/nvml.h -func nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice nvmlDevice, Fan uint32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cFan, _ := (C.uint)(Fan), cgoAllocsUnknown - __ret := C.nvmlDeviceSetDefaultFanSpeed_v2(cnvmlDevice, cFan) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetMinMaxFanSpeed function as declared in nvml/nvml.h func nvmlDeviceGetMinMaxFanSpeed(nvmlDevice nvmlDevice, MinSpeed *uint32, MaxSpeed *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -719,16 +728,6 @@ func nvmlDeviceGetFanControlPolicy_v2(nvmlDevice nvmlDevice, Fan uint32, Policy return __v } -// nvmlDeviceSetFanControlPolicy function as declared in nvml/nvml.h -func nvmlDeviceSetFanControlPolicy(nvmlDevice nvmlDevice, Fan uint32, Policy FanControlPolicy) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cFan, _ := (C.uint)(Fan), cgoAllocsUnknown - cPolicy, _ := (C.nvmlFanControlPolicy_t)(Policy), cgoAllocsUnknown - __ret := C.nvmlDeviceSetFanControlPolicy(cnvmlDevice, cFan, cPolicy) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetNumFans function as declared in nvml/nvml.h func nvmlDeviceGetNumFans(nvmlDevice nvmlDevice, NumFans *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -758,16 +757,6 @@ func nvmlDeviceGetTemperatureThreshold(nvmlDevice nvmlDevice, ThresholdType Temp return __v } -// nvmlDeviceSetTemperatureThreshold function as declared in nvml/nvml.h -func nvmlDeviceSetTemperatureThreshold(nvmlDevice nvmlDevice, ThresholdType TemperatureThresholds, Temp *int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cThresholdType, _ := (C.nvmlTemperatureThresholds_t)(ThresholdType), cgoAllocsUnknown - cTemp, _ := (*C.int)(unsafe.Pointer(Temp)), cgoAllocsUnknown - __ret := C.nvmlDeviceSetTemperatureThreshold(cnvmlDevice, cThresholdType, cTemp) - __v := (Return)(__ret) - return __v -} - // nvmlDeviceGetThermalSettings function as declared in nvml/nvml.h func nvmlDeviceGetThermalSettings(nvmlDevice nvmlDevice, SensorIndex uint32, PThermalSettings *GpuThermalSettings) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -787,6 +776,15 @@ func nvmlDeviceGetPerformanceState(nvmlDevice nvmlDevice, PState *Pstates) Retur return __v } +// nvmlDeviceGetCurrentClocksEventReasons function as declared in nvml/nvml.h +func nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice nvmlDevice, ClocksEventReasons *uint64) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cClocksEventReasons, _ := (*C.ulonglong)(unsafe.Pointer(ClocksEventReasons)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetCurrentClocksEventReasons(cnvmlDevice, cClocksEventReasons) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetCurrentClocksThrottleReasons function as declared in nvml/nvml.h func nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice nvmlDevice, ClocksThrottleReasons *uint64) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -796,6 +794,15 @@ func nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice nvmlDevice, ClocksThro return __v } +// nvmlDeviceGetSupportedClocksEventReasons function as declared in nvml/nvml.h +func nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice nvmlDevice, SupportedClocksEventReasons *uint64) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cSupportedClocksEventReasons, _ := (*C.ulonglong)(unsafe.Pointer(SupportedClocksEventReasons)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSupportedClocksEventReasons(cnvmlDevice, cSupportedClocksEventReasons) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetSupportedClocksThrottleReasons function as declared in nvml/nvml.h func nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice nvmlDevice, SupportedClocksThrottleReasons *uint64) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -814,6 +821,66 @@ func nvmlDeviceGetPowerState(nvmlDevice nvmlDevice, PState *Pstates) Return { return __v } +// nvmlDeviceGetDynamicPstatesInfo function as declared in nvml/nvml.h +func nvmlDeviceGetDynamicPstatesInfo(nvmlDevice nvmlDevice, PDynamicPstatesInfo *GpuDynamicPstatesInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPDynamicPstatesInfo, _ := (*C.nvmlGpuDynamicPstatesInfo_t)(unsafe.Pointer(PDynamicPstatesInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetDynamicPstatesInfo(cnvmlDevice, cPDynamicPstatesInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMemClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetMemClkVfOffset(nvmlDevice nvmlDevice, Offset *int32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMemClkVfOffset(cnvmlDevice, cOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMinMaxClockOfPState function as declared in nvml/nvml.h +func nvmlDeviceGetMinMaxClockOfPState(nvmlDevice nvmlDevice, _type ClockType, Pstate Pstates, MinClockMHz *uint32, MaxClockMHz *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + c_type, _ := (C.nvmlClockType_t)(_type), cgoAllocsUnknown + cPstate, _ := (C.nvmlPstates_t)(Pstate), cgoAllocsUnknown + cMinClockMHz, _ := (*C.uint)(unsafe.Pointer(MinClockMHz)), cgoAllocsUnknown + cMaxClockMHz, _ := (*C.uint)(unsafe.Pointer(MaxClockMHz)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMinMaxClockOfPState(cnvmlDevice, c_type, cPstate, cMinClockMHz, cMaxClockMHz) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetSupportedPerformanceStates function as declared in nvml/nvml.h +func nvmlDeviceGetSupportedPerformanceStates(nvmlDevice nvmlDevice, Pstates *Pstates, Size uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPstates, _ := (*C.nvmlPstates_t)(unsafe.Pointer(Pstates)), cgoAllocsUnknown + cSize, _ := (C.uint)(Size), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSupportedPerformanceStates(cnvmlDevice, cPstates, cSize) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpcClkMinMaxVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice nvmlDevice, MinOffset *int32, MaxOffset *int32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown + cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpcClkMinMaxVfOffset(cnvmlDevice, cMinOffset, cMaxOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMemClkMinMaxVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice nvmlDevice, MinOffset *int32, MaxOffset *int32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown + cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMemClkMinMaxVfOffset(cnvmlDevice, cMinOffset, cMaxOffset) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetPowerManagementMode function as declared in nvml/nvml.h func nvmlDeviceGetPowerManagementMode(nvmlDevice nvmlDevice, Mode *EnableState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -1056,6 +1123,26 @@ func nvmlDeviceGetDecoderUtilization(nvmlDevice nvmlDevice, Utilization *uint32, return __v } +// nvmlDeviceGetJpgUtilization function as declared in nvml/nvml.h +func nvmlDeviceGetJpgUtilization(nvmlDevice nvmlDevice, Utilization *uint32, SamplingPeriodUs *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cUtilization, _ := (*C.uint)(unsafe.Pointer(Utilization)), cgoAllocsUnknown + cSamplingPeriodUs, _ := (*C.uint)(unsafe.Pointer(SamplingPeriodUs)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetJpgUtilization(cnvmlDevice, cUtilization, cSamplingPeriodUs) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetOfaUtilization function as declared in nvml/nvml.h +func nvmlDeviceGetOfaUtilization(nvmlDevice nvmlDevice, Utilization *uint32, SamplingPeriodUs *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cUtilization, _ := (*C.uint)(unsafe.Pointer(Utilization)), cgoAllocsUnknown + cSamplingPeriodUs, _ := (*C.uint)(unsafe.Pointer(SamplingPeriodUs)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetOfaUtilization(cnvmlDevice, cUtilization, cSamplingPeriodUs) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetFBCStats function as declared in nvml/nvml.h func nvmlDeviceGetFBCStats(nvmlDevice nvmlDevice, FbcStats *FBCStats) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -1134,6 +1221,15 @@ func nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice nvmlDevice, InfoCount return __v } +// nvmlDeviceGetRunningProcessDetailList function as declared in nvml/nvml.h +func nvmlDeviceGetRunningProcessDetailList(nvmlDevice nvmlDevice, Plist *ProcessDetailList) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPlist, _ := (*C.nvmlProcessDetailList_t)(unsafe.Pointer(Plist)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetRunningProcessDetailList(cnvmlDevice, cPlist) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceOnSameBoard function as declared in nvml/nvml.h func nvmlDeviceOnSameBoard(Device1 nvmlDevice, Device2 nvmlDevice, OnSameBoard *int32) Return { cDevice1, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown @@ -1249,6 +1345,128 @@ func nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice nvmlDevice, AdaptiveClockSt return __v } +// nvmlDeviceGetBusType function as declared in nvml/nvml.h +func nvmlDeviceGetBusType(nvmlDevice nvmlDevice, _type *BusType) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + c_type, _ := (*C.nvmlBusType_t)(unsafe.Pointer(_type)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetBusType(cnvmlDevice, c_type) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpuFabricInfo function as declared in nvml/nvml.h +func nvmlDeviceGetGpuFabricInfo(nvmlDevice nvmlDevice, GpuFabricInfo *GpuFabricInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cGpuFabricInfo, _ := (*C.nvmlGpuFabricInfo_t)(unsafe.Pointer(GpuFabricInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpuFabricInfo(cnvmlDevice, cGpuFabricInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpuFabricInfoV function as declared in nvml/nvml.h +func nvmlDeviceGetGpuFabricInfoV(nvmlDevice nvmlDevice, GpuFabricInfo *GpuFabricInfoV) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cGpuFabricInfo, _ := (*C.nvmlGpuFabricInfoV_t)(unsafe.Pointer(GpuFabricInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpuFabricInfoV(cnvmlDevice, cGpuFabricInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetConfComputeCapabilities function as declared in nvml/nvml.h +func nvmlSystemGetConfComputeCapabilities(Capabilities *ConfComputeSystemCaps) Return { + cCapabilities, _ := (*C.nvmlConfComputeSystemCaps_t)(unsafe.Pointer(Capabilities)), cgoAllocsUnknown + __ret := C.nvmlSystemGetConfComputeCapabilities(cCapabilities) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetConfComputeState function as declared in nvml/nvml.h +func nvmlSystemGetConfComputeState(State *ConfComputeSystemState) Return { + cState, _ := (*C.nvmlConfComputeSystemState_t)(unsafe.Pointer(State)), cgoAllocsUnknown + __ret := C.nvmlSystemGetConfComputeState(cState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetConfComputeMemSizeInfo function as declared in nvml/nvml.h +func nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice nvmlDevice, MemInfo *ConfComputeMemSizeInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMemInfo, _ := (*C.nvmlConfComputeMemSizeInfo_t)(unsafe.Pointer(MemInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetConfComputeMemSizeInfo(cnvmlDevice, cMemInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetConfComputeGpusReadyState function as declared in nvml/nvml.h +func nvmlSystemGetConfComputeGpusReadyState(IsAcceptingWork *uint32) Return { + cIsAcceptingWork, _ := (*C.uint)(unsafe.Pointer(IsAcceptingWork)), cgoAllocsUnknown + __ret := C.nvmlSystemGetConfComputeGpusReadyState(cIsAcceptingWork) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetConfComputeProtectedMemoryUsage function as declared in nvml/nvml.h +func nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice nvmlDevice, Memory *Memory) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMemory, _ := (*C.nvmlMemory_t)(unsafe.Pointer(Memory)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetConfComputeProtectedMemoryUsage(cnvmlDevice, cMemory) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetConfComputeGpuCertificate function as declared in nvml/nvml.h +func nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice nvmlDevice, GpuCert *ConfComputeGpuCertificate) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cGpuCert, _ := (*C.nvmlConfComputeGpuCertificate_t)(unsafe.Pointer(GpuCert)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetConfComputeGpuCertificate(cnvmlDevice, cGpuCert) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetConfComputeGpuAttestationReport function as declared in nvml/nvml.h +func nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice nvmlDevice, GpuAtstReport *ConfComputeGpuAttestationReport) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cGpuAtstReport, _ := (*C.nvmlConfComputeGpuAttestationReport_t)(unsafe.Pointer(GpuAtstReport)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetConfComputeGpuAttestationReport(cnvmlDevice, cGpuAtstReport) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetConfComputeKeyRotationThresholdInfo function as declared in nvml/nvml.h +func nvmlSystemGetConfComputeKeyRotationThresholdInfo(PKeyRotationThrInfo *ConfComputeGetKeyRotationThresholdInfo) Return { + cPKeyRotationThrInfo, _ := (*C.nvmlConfComputeGetKeyRotationThresholdInfo_t)(unsafe.Pointer(PKeyRotationThrInfo)), cgoAllocsUnknown + __ret := C.nvmlSystemGetConfComputeKeyRotationThresholdInfo(cPKeyRotationThrInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetConfComputeSettings function as declared in nvml/nvml.h +func nvmlSystemGetConfComputeSettings(Settings *SystemConfComputeSettings) Return { + cSettings, _ := (*C.nvmlSystemConfComputeSettings_t)(unsafe.Pointer(Settings)), cgoAllocsUnknown + __ret := C.nvmlSystemGetConfComputeSettings(cSettings) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGspFirmwareVersion function as declared in nvml/nvml.h +func nvmlDeviceGetGspFirmwareVersion(nvmlDevice nvmlDevice, Version *byte) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cVersion, _ := (*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGspFirmwareVersion(cnvmlDevice, cVersion) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGspFirmwareMode function as declared in nvml/nvml.h +func nvmlDeviceGetGspFirmwareMode(nvmlDevice nvmlDevice, IsEnabled *uint32, DefaultMode *uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cIsEnabled, _ := (*C.uint)(unsafe.Pointer(IsEnabled)), cgoAllocsUnknown + cDefaultMode, _ := (*C.uint)(unsafe.Pointer(DefaultMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGspFirmwareMode(cnvmlDevice, cIsEnabled, cDefaultMode) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetAccountingMode function as declared in nvml/nvml.h func nvmlDeviceGetAccountingMode(nvmlDevice nvmlDevice, Mode *EnableState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -1349,6 +1567,35 @@ func nvmlDeviceGetArchitecture(nvmlDevice nvmlDevice, Arch *DeviceArchitecture) return __v } +// nvmlDeviceGetClkMonStatus function as declared in nvml/nvml.h +func nvmlDeviceGetClkMonStatus(nvmlDevice nvmlDevice, Status *ClkMonStatus) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cStatus, _ := (*C.nvmlClkMonStatus_t)(unsafe.Pointer(Status)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetClkMonStatus(cnvmlDevice, cStatus) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetProcessUtilization function as declared in nvml/nvml.h +func nvmlDeviceGetProcessUtilization(nvmlDevice nvmlDevice, Utilization *ProcessUtilizationSample, ProcessSamplesCount *uint32, LastSeenTimeStamp uint64) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cUtilization, _ := (*C.nvmlProcessUtilizationSample_t)(unsafe.Pointer(Utilization)), cgoAllocsUnknown + cProcessSamplesCount, _ := (*C.uint)(unsafe.Pointer(ProcessSamplesCount)), cgoAllocsUnknown + cLastSeenTimeStamp, _ := (C.ulonglong)(LastSeenTimeStamp), cgoAllocsUnknown + __ret := C.nvmlDeviceGetProcessUtilization(cnvmlDevice, cUtilization, cProcessSamplesCount, cLastSeenTimeStamp) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetProcessesUtilizationInfo function as declared in nvml/nvml.h +func nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice nvmlDevice, ProcesesUtilInfo *ProcessesUtilizationInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cProcesesUtilInfo, _ := (*C.nvmlProcessesUtilizationInfo_t)(unsafe.Pointer(ProcesesUtilInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetProcessesUtilizationInfo(cnvmlDevice, cProcesesUtilInfo) + __v := (Return)(__ret) + return __v +} + // nvmlUnitSetLedState function as declared in nvml/nvml.h func nvmlUnitSetLedState(nvmlUnit nvmlUnit, Color LedColor) Return { cnvmlUnit, _ := *(*C.nvmlUnit_t)(unsafe.Pointer(&nvmlUnit)), cgoAllocsUnknown @@ -1435,54 +1682,154 @@ func nvmlDeviceSetMemoryLockedClocks(nvmlDevice nvmlDevice, MinMemClockMHz uint3 // nvmlDeviceResetMemoryLockedClocks function as declared in nvml/nvml.h func nvmlDeviceResetMemoryLockedClocks(nvmlDevice nvmlDevice) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - __ret := C.nvmlDeviceResetMemoryLockedClocks(cnvmlDevice) + __ret := C.nvmlDeviceResetMemoryLockedClocks(cnvmlDevice) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetApplicationsClocks function as declared in nvml/nvml.h +func nvmlDeviceSetApplicationsClocks(nvmlDevice nvmlDevice, MemClockMHz uint32, GraphicsClockMHz uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMemClockMHz, _ := (C.uint)(MemClockMHz), cgoAllocsUnknown + cGraphicsClockMHz, _ := (C.uint)(GraphicsClockMHz), cgoAllocsUnknown + __ret := C.nvmlDeviceSetApplicationsClocks(cnvmlDevice, cMemClockMHz, cGraphicsClockMHz) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceResetApplicationsClocks function as declared in nvml/nvml.h +func nvmlDeviceResetApplicationsClocks(nvmlDevice nvmlDevice) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + __ret := C.nvmlDeviceResetApplicationsClocks(cnvmlDevice) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetAutoBoostedClocksEnabled function as declared in nvml/nvml.h +func nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice nvmlDevice, Enabled EnableState) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cEnabled, _ := (C.nvmlEnableState_t)(Enabled), cgoAllocsUnknown + __ret := C.nvmlDeviceSetAutoBoostedClocksEnabled(cnvmlDevice, cEnabled) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetDefaultAutoBoostedClocksEnabled function as declared in nvml/nvml.h +func nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice nvmlDevice, Enabled EnableState, Flags uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cEnabled, _ := (C.nvmlEnableState_t)(Enabled), cgoAllocsUnknown + cFlags, _ := (C.uint)(Flags), cgoAllocsUnknown + __ret := C.nvmlDeviceSetDefaultAutoBoostedClocksEnabled(cnvmlDevice, cEnabled, cFlags) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetDefaultFanSpeed_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice nvmlDevice, Fan uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + __ret := C.nvmlDeviceSetDefaultFanSpeed_v2(cnvmlDevice, cFan) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetFanControlPolicy function as declared in nvml/nvml.h +func nvmlDeviceSetFanControlPolicy(nvmlDevice nvmlDevice, Fan uint32, Policy FanControlPolicy) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cPolicy, _ := (C.nvmlFanControlPolicy_t)(Policy), cgoAllocsUnknown + __ret := C.nvmlDeviceSetFanControlPolicy(cnvmlDevice, cFan, cPolicy) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetTemperatureThreshold function as declared in nvml/nvml.h +func nvmlDeviceSetTemperatureThreshold(nvmlDevice nvmlDevice, ThresholdType TemperatureThresholds, Temp *int32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cThresholdType, _ := (C.nvmlTemperatureThresholds_t)(ThresholdType), cgoAllocsUnknown + cTemp, _ := (*C.int)(unsafe.Pointer(Temp)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetTemperatureThreshold(cnvmlDevice, cThresholdType, cTemp) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetPowerManagementLimit function as declared in nvml/nvml.h +func nvmlDeviceSetPowerManagementLimit(nvmlDevice nvmlDevice, Limit uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cLimit, _ := (C.uint)(Limit), cgoAllocsUnknown + __ret := C.nvmlDeviceSetPowerManagementLimit(cnvmlDevice, cLimit) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetGpuOperationMode function as declared in nvml/nvml.h +func nvmlDeviceSetGpuOperationMode(nvmlDevice nvmlDevice, Mode GpuOperationMode) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cMode, _ := (C.nvmlGpuOperationMode_t)(Mode), cgoAllocsUnknown + __ret := C.nvmlDeviceSetGpuOperationMode(cnvmlDevice, cMode) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetAPIRestriction function as declared in nvml/nvml.h +func nvmlDeviceSetAPIRestriction(nvmlDevice nvmlDevice, ApiType RestrictedAPI, IsRestricted EnableState) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cApiType, _ := (C.nvmlRestrictedAPI_t)(ApiType), cgoAllocsUnknown + cIsRestricted, _ := (C.nvmlEnableState_t)(IsRestricted), cgoAllocsUnknown + __ret := C.nvmlDeviceSetAPIRestriction(cnvmlDevice, cApiType, cIsRestricted) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetFanSpeed_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetFanSpeed_v2(nvmlDevice nvmlDevice, Fan uint32, Speed uint32) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cSpeed, _ := (C.uint)(Speed), cgoAllocsUnknown + __ret := C.nvmlDeviceSetFanSpeed_v2(cnvmlDevice, cFan, cSpeed) __v := (Return)(__ret) return __v } -// nvmlDeviceSetApplicationsClocks function as declared in nvml/nvml.h -func nvmlDeviceSetApplicationsClocks(nvmlDevice nvmlDevice, MemClockMHz uint32, GraphicsClockMHz uint32) Return { +// nvmlDeviceSetGpcClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceSetGpcClkVfOffset(nvmlDevice nvmlDevice, Offset int32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cMemClockMHz, _ := (C.uint)(MemClockMHz), cgoAllocsUnknown - cGraphicsClockMHz, _ := (C.uint)(GraphicsClockMHz), cgoAllocsUnknown - __ret := C.nvmlDeviceSetApplicationsClocks(cnvmlDevice, cMemClockMHz, cGraphicsClockMHz) + cOffset, _ := (C.int)(Offset), cgoAllocsUnknown + __ret := C.nvmlDeviceSetGpcClkVfOffset(cnvmlDevice, cOffset) __v := (Return)(__ret) return __v } -// nvmlDeviceGetClkMonStatus function as declared in nvml/nvml.h -func nvmlDeviceGetClkMonStatus(nvmlDevice nvmlDevice, Status *ClkMonStatus) Return { +// nvmlDeviceSetMemClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceSetMemClkVfOffset(nvmlDevice nvmlDevice, Offset int32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cStatus, _ := (*C.nvmlClkMonStatus_t)(unsafe.Pointer(Status)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetClkMonStatus(cnvmlDevice, cStatus) + cOffset, _ := (C.int)(Offset), cgoAllocsUnknown + __ret := C.nvmlDeviceSetMemClkVfOffset(cnvmlDevice, cOffset) __v := (Return)(__ret) return __v } -// nvmlDeviceSetPowerManagementLimit function as declared in nvml/nvml.h -func nvmlDeviceSetPowerManagementLimit(nvmlDevice nvmlDevice, Limit uint32) Return { +// nvmlDeviceSetConfComputeUnprotectedMemSize function as declared in nvml/nvml.h +func nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice nvmlDevice, SizeKiB uint64) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cLimit, _ := (C.uint)(Limit), cgoAllocsUnknown - __ret := C.nvmlDeviceSetPowerManagementLimit(cnvmlDevice, cLimit) + cSizeKiB, _ := (C.ulonglong)(SizeKiB), cgoAllocsUnknown + __ret := C.nvmlDeviceSetConfComputeUnprotectedMemSize(cnvmlDevice, cSizeKiB) __v := (Return)(__ret) return __v } -// nvmlDeviceSetGpuOperationMode function as declared in nvml/nvml.h -func nvmlDeviceSetGpuOperationMode(nvmlDevice nvmlDevice, Mode GpuOperationMode) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cMode, _ := (C.nvmlGpuOperationMode_t)(Mode), cgoAllocsUnknown - __ret := C.nvmlDeviceSetGpuOperationMode(cnvmlDevice, cMode) +// nvmlSystemSetConfComputeGpusReadyState function as declared in nvml/nvml.h +func nvmlSystemSetConfComputeGpusReadyState(IsAcceptingWork uint32) Return { + cIsAcceptingWork, _ := (C.uint)(IsAcceptingWork), cgoAllocsUnknown + __ret := C.nvmlSystemSetConfComputeGpusReadyState(cIsAcceptingWork) __v := (Return)(__ret) return __v } -// nvmlDeviceSetAPIRestriction function as declared in nvml/nvml.h -func nvmlDeviceSetAPIRestriction(nvmlDevice nvmlDevice, ApiType RestrictedAPI, IsRestricted EnableState) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cApiType, _ := (C.nvmlRestrictedAPI_t)(ApiType), cgoAllocsUnknown - cIsRestricted, _ := (C.nvmlEnableState_t)(IsRestricted), cgoAllocsUnknown - __ret := C.nvmlDeviceSetAPIRestriction(cnvmlDevice, cApiType, cIsRestricted) +// nvmlSystemSetConfComputeKeyRotationThresholdInfo function as declared in nvml/nvml.h +func nvmlSystemSetConfComputeKeyRotationThresholdInfo(PKeyRotationThrInfo *ConfComputeSetKeyRotationThresholdInfo) Return { + cPKeyRotationThrInfo, _ := (*C.nvmlConfComputeSetKeyRotationThresholdInfo_t)(unsafe.Pointer(PKeyRotationThrInfo)), cgoAllocsUnknown + __ret := C.nvmlSystemSetConfComputeKeyRotationThresholdInfo(cPKeyRotationThrInfo) __v := (Return)(__ret) return __v } @@ -1759,41 +2106,86 @@ func nvmlDeviceSetVirtualizationMode(nvmlDevice nvmlDevice, VirtualMode GpuVirtu return __v } -// nvmlDeviceGetGridLicensableFeatures_v4 function as declared in nvml/nvml.h -func nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice nvmlDevice, PGridLicensableFeatures *GridLicensableFeatures) Return { +// nvmlDeviceGetVgpuHeterogeneousMode function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice nvmlDevice, PHeterogeneousMode *VgpuHeterogeneousMode) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPGridLicensableFeatures, _ := (*C.nvmlGridLicensableFeatures_t)(unsafe.Pointer(PGridLicensableFeatures)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGridLicensableFeatures_v4(cnvmlDevice, cPGridLicensableFeatures) + cPHeterogeneousMode, _ := (*C.nvmlVgpuHeterogeneousMode_t)(unsafe.Pointer(PHeterogeneousMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuHeterogeneousMode(cnvmlDevice, cPHeterogeneousMode) __v := (Return)(__ret) return __v } -// nvmlDeviceGetProcessUtilization function as declared in nvml/nvml.h -func nvmlDeviceGetProcessUtilization(nvmlDevice nvmlDevice, Utilization *ProcessUtilizationSample, ProcessSamplesCount *uint32, LastSeenTimeStamp uint64) Return { +// nvmlDeviceSetVgpuHeterogeneousMode function as declared in nvml/nvml.h +func nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice nvmlDevice, PHeterogeneousMode *VgpuHeterogeneousMode) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cUtilization, _ := (*C.nvmlProcessUtilizationSample_t)(unsafe.Pointer(Utilization)), cgoAllocsUnknown - cProcessSamplesCount, _ := (*C.uint)(unsafe.Pointer(ProcessSamplesCount)), cgoAllocsUnknown - cLastSeenTimeStamp, _ := (C.ulonglong)(LastSeenTimeStamp), cgoAllocsUnknown - __ret := C.nvmlDeviceGetProcessUtilization(cnvmlDevice, cUtilization, cProcessSamplesCount, cLastSeenTimeStamp) + cPHeterogeneousMode, _ := (*C.nvmlVgpuHeterogeneousMode_t)(unsafe.Pointer(PHeterogeneousMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetVgpuHeterogeneousMode(cnvmlDevice, cPHeterogeneousMode) __v := (Return)(__ret) return __v } -// nvmlDeviceGetGspFirmwareVersion function as declared in nvml/nvml.h -func nvmlDeviceGetGspFirmwareVersion(nvmlDevice nvmlDevice, Version *byte) Return { +// nvmlVgpuInstanceGetPlacementId function as declared in nvml/nvml.h +func nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance nvmlVgpuInstance, PPlacement *VgpuPlacementId) Return { + cnvmlVgpuInstance, _ := (C.nvmlVgpuInstance_t)(nvmlVgpuInstance), cgoAllocsUnknown + cPPlacement, _ := (*C.nvmlVgpuPlacementId_t)(unsafe.Pointer(PPlacement)), cgoAllocsUnknown + __ret := C.nvmlVgpuInstanceGetPlacementId(cnvmlVgpuInstance, cPPlacement) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetVgpuTypeSupportedPlacements function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice nvmlDevice, nvmlVgpuTypeId nvmlVgpuTypeId, PPlacementList *VgpuPlacementList) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cVersion, _ := (*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGspFirmwareVersion(cnvmlDevice, cVersion) + cnvmlVgpuTypeId, _ := (C.nvmlVgpuTypeId_t)(nvmlVgpuTypeId), cgoAllocsUnknown + cPPlacementList, _ := (*C.nvmlVgpuPlacementList_t)(unsafe.Pointer(PPlacementList)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuTypeSupportedPlacements(cnvmlDevice, cnvmlVgpuTypeId, cPPlacementList) __v := (Return)(__ret) return __v } -// nvmlDeviceGetGspFirmwareMode function as declared in nvml/nvml.h -func nvmlDeviceGetGspFirmwareMode(nvmlDevice nvmlDevice, IsEnabled *uint32, DefaultMode *uint32) Return { +// nvmlDeviceGetVgpuTypeCreatablePlacements function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice nvmlDevice, nvmlVgpuTypeId nvmlVgpuTypeId, PPlacementList *VgpuPlacementList) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cIsEnabled, _ := (*C.uint)(unsafe.Pointer(IsEnabled)), cgoAllocsUnknown - cDefaultMode, _ := (*C.uint)(unsafe.Pointer(DefaultMode)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGspFirmwareMode(cnvmlDevice, cIsEnabled, cDefaultMode) + cnvmlVgpuTypeId, _ := (C.nvmlVgpuTypeId_t)(nvmlVgpuTypeId), cgoAllocsUnknown + cPPlacementList, _ := (*C.nvmlVgpuPlacementList_t)(unsafe.Pointer(PPlacementList)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuTypeCreatablePlacements(cnvmlDevice, cnvmlVgpuTypeId, cPPlacementList) + __v := (Return)(__ret) + return __v +} + +// nvmlVgpuTypeGetGspHeapSize function as declared in nvml/nvml.h +func nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId nvmlVgpuTypeId, GspHeapSize *uint64) Return { + cnvmlVgpuTypeId, _ := (C.nvmlVgpuTypeId_t)(nvmlVgpuTypeId), cgoAllocsUnknown + cGspHeapSize, _ := (*C.ulonglong)(unsafe.Pointer(GspHeapSize)), cgoAllocsUnknown + __ret := C.nvmlVgpuTypeGetGspHeapSize(cnvmlVgpuTypeId, cGspHeapSize) + __v := (Return)(__ret) + return __v +} + +// nvmlVgpuTypeGetFbReservation function as declared in nvml/nvml.h +func nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId nvmlVgpuTypeId, FbReservation *uint64) Return { + cnvmlVgpuTypeId, _ := (C.nvmlVgpuTypeId_t)(nvmlVgpuTypeId), cgoAllocsUnknown + cFbReservation, _ := (*C.ulonglong)(unsafe.Pointer(FbReservation)), cgoAllocsUnknown + __ret := C.nvmlVgpuTypeGetFbReservation(cnvmlVgpuTypeId, cFbReservation) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetVgpuCapabilities function as declared in nvml/nvml.h +func nvmlDeviceSetVgpuCapabilities(nvmlDevice nvmlDevice, Capability DeviceVgpuCapability, State EnableState) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cCapability, _ := (C.nvmlDeviceVgpuCapability_t)(Capability), cgoAllocsUnknown + cState, _ := (C.nvmlEnableState_t)(State), cgoAllocsUnknown + __ret := C.nvmlDeviceSetVgpuCapabilities(cnvmlDevice, cCapability, cState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGridLicensableFeatures_v4 function as declared in nvml/nvml.h +func nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice nvmlDevice, PGridLicensableFeatures *GridLicensableFeatures) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPGridLicensableFeatures, _ := (*C.nvmlGridLicensableFeatures_t)(unsafe.Pointer(PGridLicensableFeatures)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGridLicensableFeatures_v4(cnvmlDevice, cPGridLicensableFeatures) __v := (Return)(__ret) return __v } @@ -2116,6 +2508,16 @@ func nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId nvmlVgpuTypeId, Capability VgpuC return __v } +// nvmlVgpuInstanceGetMdevUUID function as declared in nvml/nvml.h +func nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance nvmlVgpuInstance, MdevUuid *byte, Size uint32) Return { + cnvmlVgpuInstance, _ := (C.nvmlVgpuInstance_t)(nvmlVgpuInstance), cgoAllocsUnknown + cMdevUuid, _ := (*C.char)(unsafe.Pointer(MdevUuid)), cgoAllocsUnknown + cSize, _ := (C.uint)(Size), cgoAllocsUnknown + __ret := C.nvmlVgpuInstanceGetMdevUUID(cnvmlVgpuInstance, cMdevUuid, cSize) + __v := (Return)(__ret) + return __v +} + // nvmlVgpuInstanceGetMetadata function as declared in nvml/nvml.h func nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance nvmlVgpuInstance, nvmlVgpuMetadata *nvmlVgpuMetadata, BufferSize *uint32) Return { cnvmlVgpuInstance, _ := (C.nvmlVgpuInstance_t)(nvmlVgpuInstance), cgoAllocsUnknown @@ -2174,20 +2576,20 @@ func nvmlDeviceGetVgpuSchedulerState(nvmlDevice nvmlDevice, PSchedulerState *Vgp return __v } -// nvmlDeviceSetVgpuSchedulerState function as declared in nvml/nvml.h -func nvmlDeviceSetVgpuSchedulerState(nvmlDevice nvmlDevice, PSchedulerState *VgpuSchedulerSetState) Return { +// nvmlDeviceGetVgpuSchedulerCapabilities function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice nvmlDevice, PCapabilities *VgpuSchedulerCapabilities) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPSchedulerState, _ := (*C.nvmlVgpuSchedulerSetState_t)(unsafe.Pointer(PSchedulerState)), cgoAllocsUnknown - __ret := C.nvmlDeviceSetVgpuSchedulerState(cnvmlDevice, cPSchedulerState) + cPCapabilities, _ := (*C.nvmlVgpuSchedulerCapabilities_t)(unsafe.Pointer(PCapabilities)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuSchedulerCapabilities(cnvmlDevice, cPCapabilities) __v := (Return)(__ret) return __v } -// nvmlDeviceGetVgpuSchedulerCapabilities function as declared in nvml/nvml.h -func nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice nvmlDevice, PCapabilities *VgpuSchedulerCapabilities) Return { +// nvmlDeviceSetVgpuSchedulerState function as declared in nvml/nvml.h +func nvmlDeviceSetVgpuSchedulerState(nvmlDevice nvmlDevice, PSchedulerState *VgpuSchedulerSetState) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPCapabilities, _ := (*C.nvmlVgpuSchedulerCapabilities_t)(unsafe.Pointer(PCapabilities)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetVgpuSchedulerCapabilities(cnvmlDevice, cPCapabilities) + cPSchedulerState, _ := (*C.nvmlVgpuSchedulerSetState_t)(unsafe.Pointer(PSchedulerState)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetVgpuSchedulerState(cnvmlDevice, cPSchedulerState) __v := (Return)(__ret) return __v } @@ -2221,6 +2623,15 @@ func nvmlDeviceGetVgpuUtilization(nvmlDevice nvmlDevice, LastSeenTimeStamp uint6 return __v } +// nvmlDeviceGetVgpuInstancesUtilizationInfo function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice nvmlDevice, VgpuUtilInfo *VgpuInstancesUtilizationInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cVgpuUtilInfo, _ := (*C.nvmlVgpuInstancesUtilizationInfo_t)(unsafe.Pointer(VgpuUtilInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuInstancesUtilizationInfo(cnvmlDevice, cVgpuUtilInfo) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetVgpuProcessUtilization function as declared in nvml/nvml.h func nvmlDeviceGetVgpuProcessUtilization(nvmlDevice nvmlDevice, LastSeenTimeStamp uint64, VgpuProcessSamplesCount *uint32, UtilizationSamples *VgpuProcessUtilizationSample) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown @@ -2232,6 +2643,15 @@ func nvmlDeviceGetVgpuProcessUtilization(nvmlDevice nvmlDevice, LastSeenTimeStam return __v } +// nvmlDeviceGetVgpuProcessesUtilizationInfo function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice nvmlDevice, VgpuProcUtilInfo *VgpuProcessesUtilizationInfo) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cVgpuProcUtilInfo, _ := (*C.nvmlVgpuProcessesUtilizationInfo_t)(unsafe.Pointer(VgpuProcUtilInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuProcessesUtilizationInfo(cnvmlDevice, cVgpuProcUtilInfo) + __v := (Return)(__ret) + return __v +} + // nvmlVgpuInstanceGetAccountingMode function as declared in nvml/nvml.h func nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance nvmlVgpuInstance, Mode *EnableState) Return { cnvmlVgpuInstance, _ := (C.nvmlVgpuInstance_t)(nvmlVgpuInstance), cgoAllocsUnknown @@ -2572,121 +2992,6 @@ func nvmlDeviceGetDeviceHandleFromMigDeviceHandle(MigDevice nvmlDevice, nvmlDevi return __v } -// nvmlDeviceGetBusType function as declared in nvml/nvml.h -func nvmlDeviceGetBusType(nvmlDevice nvmlDevice, _type *BusType) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - c_type, _ := (*C.nvmlBusType_t)(unsafe.Pointer(_type)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetBusType(cnvmlDevice, c_type) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetDynamicPstatesInfo function as declared in nvml/nvml.h -func nvmlDeviceGetDynamicPstatesInfo(nvmlDevice nvmlDevice, PDynamicPstatesInfo *GpuDynamicPstatesInfo) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPDynamicPstatesInfo, _ := (*C.nvmlGpuDynamicPstatesInfo_t)(unsafe.Pointer(PDynamicPstatesInfo)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetDynamicPstatesInfo(cnvmlDevice, cPDynamicPstatesInfo) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceSetFanSpeed_v2 function as declared in nvml/nvml.h -func nvmlDeviceSetFanSpeed_v2(nvmlDevice nvmlDevice, Fan uint32, Speed uint32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cFan, _ := (C.uint)(Fan), cgoAllocsUnknown - cSpeed, _ := (C.uint)(Speed), cgoAllocsUnknown - __ret := C.nvmlDeviceSetFanSpeed_v2(cnvmlDevice, cFan, cSpeed) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetGpcClkVfOffset function as declared in nvml/nvml.h -func nvmlDeviceGetGpcClkVfOffset(nvmlDevice nvmlDevice, Offset *int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGpcClkVfOffset(cnvmlDevice, cOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceSetGpcClkVfOffset function as declared in nvml/nvml.h -func nvmlDeviceSetGpcClkVfOffset(nvmlDevice nvmlDevice, Offset int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cOffset, _ := (C.int)(Offset), cgoAllocsUnknown - __ret := C.nvmlDeviceSetGpcClkVfOffset(cnvmlDevice, cOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetMemClkVfOffset function as declared in nvml/nvml.h -func nvmlDeviceGetMemClkVfOffset(nvmlDevice nvmlDevice, Offset *int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetMemClkVfOffset(cnvmlDevice, cOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceSetMemClkVfOffset function as declared in nvml/nvml.h -func nvmlDeviceSetMemClkVfOffset(nvmlDevice nvmlDevice, Offset int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cOffset, _ := (C.int)(Offset), cgoAllocsUnknown - __ret := C.nvmlDeviceSetMemClkVfOffset(cnvmlDevice, cOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetMinMaxClockOfPState function as declared in nvml/nvml.h -func nvmlDeviceGetMinMaxClockOfPState(nvmlDevice nvmlDevice, _type ClockType, Pstate Pstates, MinClockMHz *uint32, MaxClockMHz *uint32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - c_type, _ := (C.nvmlClockType_t)(_type), cgoAllocsUnknown - cPstate, _ := (C.nvmlPstates_t)(Pstate), cgoAllocsUnknown - cMinClockMHz, _ := (*C.uint)(unsafe.Pointer(MinClockMHz)), cgoAllocsUnknown - cMaxClockMHz, _ := (*C.uint)(unsafe.Pointer(MaxClockMHz)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetMinMaxClockOfPState(cnvmlDevice, c_type, cPstate, cMinClockMHz, cMaxClockMHz) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetSupportedPerformanceStates function as declared in nvml/nvml.h -func nvmlDeviceGetSupportedPerformanceStates(nvmlDevice nvmlDevice, Pstates *Pstates, Size uint32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cPstates, _ := (*C.nvmlPstates_t)(unsafe.Pointer(Pstates)), cgoAllocsUnknown - cSize, _ := (C.uint)(Size), cgoAllocsUnknown - __ret := C.nvmlDeviceGetSupportedPerformanceStates(cnvmlDevice, cPstates, cSize) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetGpcClkMinMaxVfOffset function as declared in nvml/nvml.h -func nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice nvmlDevice, MinOffset *int32, MaxOffset *int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown - cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGpcClkMinMaxVfOffset(cnvmlDevice, cMinOffset, cMaxOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetMemClkMinMaxVfOffset function as declared in nvml/nvml.h -func nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice nvmlDevice, MinOffset *int32, MaxOffset *int32) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown - cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetMemClkMinMaxVfOffset(cnvmlDevice, cMinOffset, cMaxOffset) - __v := (Return)(__ret) - return __v -} - -// nvmlDeviceGetGpuFabricInfo function as declared in nvml/nvml.h -func nvmlDeviceGetGpuFabricInfo(nvmlDevice nvmlDevice, GpuFabricInfo *GpuFabricInfo) Return { - cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown - cGpuFabricInfo, _ := (*C.nvmlGpuFabricInfo_t)(unsafe.Pointer(GpuFabricInfo)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetGpuFabricInfo(cnvmlDevice, cGpuFabricInfo) - __v := (Return)(__ret) - return __v -} - // nvmlGpmMetricsGet function as declared in nvml/nvml.h func nvmlGpmMetricsGet(MetricsGet *nvmlGpmMetricsGetType) Return { cMetricsGet, _ := (*C.nvmlGpmMetricsGet_t)(unsafe.Pointer(MetricsGet)), cgoAllocsUnknown @@ -2739,20 +3044,20 @@ func nvmlGpmQueryDeviceSupport(nvmlDevice nvmlDevice, GpmSupport *GpmSupport) Re return __v } -// nvmlDeviceCcuGetStreamState function as declared in nvml/nvml.h -func nvmlDeviceCcuGetStreamState(nvmlDevice nvmlDevice, State *uint32) Return { +// nvmlGpmQueryIfStreamingEnabled function as declared in nvml/nvml.h +func nvmlGpmQueryIfStreamingEnabled(nvmlDevice nvmlDevice, State *uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown cState, _ := (*C.uint)(unsafe.Pointer(State)), cgoAllocsUnknown - __ret := C.nvmlDeviceCcuGetStreamState(cnvmlDevice, cState) + __ret := C.nvmlGpmQueryIfStreamingEnabled(cnvmlDevice, cState) __v := (Return)(__ret) return __v } -// nvmlDeviceCcuSetStreamState function as declared in nvml/nvml.h -func nvmlDeviceCcuSetStreamState(nvmlDevice nvmlDevice, State uint32) Return { +// nvmlGpmSetStreamingEnabled function as declared in nvml/nvml.h +func nvmlGpmSetStreamingEnabled(nvmlDevice nvmlDevice, State uint32) Return { cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown cState, _ := (C.uint)(State), cgoAllocsUnknown - __ret := C.nvmlDeviceCcuSetStreamState(cnvmlDevice, cState) + __ret := C.nvmlGpmSetStreamingEnabled(cnvmlDevice, cState) __v := (Return)(__ret) return __v } @@ -2766,6 +3071,40 @@ func nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice nvmlDevice, Info *NvL return __v } +// nvmlSystemSetNvlinkBwMode function as declared in nvml/nvml.h +func nvmlSystemSetNvlinkBwMode(NvlinkBwMode uint32) Return { + cNvlinkBwMode, _ := (C.uint)(NvlinkBwMode), cgoAllocsUnknown + __ret := C.nvmlSystemSetNvlinkBwMode(cNvlinkBwMode) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetNvlinkBwMode function as declared in nvml/nvml.h +func nvmlSystemGetNvlinkBwMode(NvlinkBwMode *uint32) Return { + cNvlinkBwMode, _ := (*C.uint)(unsafe.Pointer(NvlinkBwMode)), cgoAllocsUnknown + __ret := C.nvmlSystemGetNvlinkBwMode(cNvlinkBwMode) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetPowerManagementLimit_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice nvmlDevice, PowerValue *PowerValue_v2) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cPowerValue, _ := (*C.nvmlPowerValue_v2_t)(unsafe.Pointer(PowerValue)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetPowerManagementLimit_v2(cnvmlDevice, cPowerValue) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetSramEccErrorStatus function as declared in nvml/nvml.h +func nvmlDeviceGetSramEccErrorStatus(nvmlDevice nvmlDevice, Status *EccSramErrorStatus) Return { + cnvmlDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&nvmlDevice)), cgoAllocsUnknown + cStatus, _ := (*C.nvmlEccSramErrorStatus_t)(unsafe.Pointer(Status)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSramEccErrorStatus(cnvmlDevice, cStatus) + __v := (Return)(__ret) + return __v +} + // nvmlInit_v1 function as declared in nvml/nvml.h func nvmlInit_v1() Return { __ret := C.nvmlInit() diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h index 8c71ff8ab..1e4eb12dc 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h @@ -1,7 +1,7 @@ -/*** NVML VERSION: 12.0.76 ***/ -/*** From https://api.anaconda.org/download/nvidia/cuda-nvml-dev/12.0.76/linux-64/cuda-nvml-dev-12.0.76-0.tar.bz2 ***/ +/*** NVML VERSION: 12.4.127 ***/ +/*** From https://api.anaconda.org/download/nvidia/cuda-nvml-dev/12.4.127/linux-64/cuda-nvml-dev-12.4.127-0.tar.bz2 ***/ /* - * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * Copyright 1993-2023 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * @@ -97,8 +97,8 @@ extern "C" { /** * NVML API versioning support */ -#define NVML_API_VERSION 11 -#define NVML_API_VERSION_STR "11" +#define NVML_API_VERSION 12 +#define NVML_API_VERSION_STR "12" /** * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this @@ -158,6 +158,27 @@ typedef struct */ #define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 +/** + * PCI information about a GPU device. + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + unsigned int baseClass; //!< The 8-bit PCI base class code + unsigned int subClass; //!< The 8-bit PCI sub class code + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) +} nvmlPciInfoExt_v1_t; +typedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t; +#define nvmlPciInfoExt_v1 NVML_STRUCT_VERSION(PciInfoExt, 1) + /** * PCI information about a GPU device. */ @@ -232,7 +253,6 @@ typedef struct nvmlMemory_st * Memory allocation information for a device (v2). * * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output. - * @note The \ref nvmlMemory_v2_t.used amount also includes the \ref nvmlMemory_v2_t.reserved amount. */ typedef struct nvmlMemory_v2_st { @@ -240,7 +260,7 @@ typedef struct nvmlMemory_v2_st unsigned long long total; //!< Total physical device memory (in bytes) unsigned long long reserved; //!< Device memory (in bytes) reserved for system use (driver or firmware) unsigned long long free; //!< Unallocated device memory (in bytes) - unsigned long long used; //!< Allocated device memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping + unsigned long long used; //!< Allocated device memory (in bytes). } nvmlMemory_v2_t; #define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2) @@ -280,23 +300,41 @@ typedef struct nvmlProcessInfo_v2_st // 0xFFFFFFFF otherwise. unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to // 0xFFFFFFFF otherwise. -} nvmlProcessInfo_v2_t; +} nvmlProcessInfo_v2_t, nvmlProcessInfo_t; /** - * Information about running compute processes on the GPU - * Version 2 adds versioning for the struct + * Information about running process on the GPU with protected memory */ -typedef struct nvmlProcessInfo_st +typedef struct { - unsigned int pid; //!< Process ID - unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. - //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported - //! because Windows KMD manages all the memory and not the NVIDIA driver - unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to - // 0xFFFFFFFF otherwise. - unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to - // 0xFFFFFFFF otherwise. -} nvmlProcessInfo_t; + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver + unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is + // set to 0xFFFFFFFF otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId + // is set to 0xFFFFFFFF otherwise. + unsigned long long usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes. +} nvmlProcessDetail_v1_t; + +/** + * Information about all running processes on the GPU for the given mode + */ +typedef struct +{ + unsigned int version; //!< Struct version, MUST be nvmlProcessDetailList_v1 + unsigned int mode; //!< Process mode(Compute/Graphics/MPSCompute) + unsigned int numProcArrayEntries; //!< Number of process entries in procArray + nvmlProcessDetail_v1_t *procArray; //!< Process array +} nvmlProcessDetailList_v1_t; + +typedef nvmlProcessDetailList_v1_t nvmlProcessDetailList_t; + +/** + * nvmlProcessDetailList version + */ +#define nvmlProcessDetailList_v1 NVML_STRUCT_VERSION(ProcessDetailList, 1) typedef struct nvmlDeviceAttributes_st { @@ -311,6 +349,16 @@ typedef struct nvmlDeviceAttributes_st unsigned long long memorySizeMB; //!< Device memory size (in MiB) } nvmlDeviceAttributes_t; +/** + * C2C Mode information for a device + */ +typedef struct +{ + unsigned int isC2cEnabled; +} nvmlC2cModeInfo_v1_t; + +#define nvmlC2cModeInfo_v1 NVML_STRUCT_VERSION(C2cModeInfo, 1) + /** * Possible values that classify the remap availability for each bank. The max * field will contain the number of banks that have maximum remap availability @@ -447,6 +495,7 @@ typedef enum nvmlGpuP2PStatus_enum { NVML_P2P_STATUS_OK = 0, NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, NVML_P2P_STATUS_GPU_NOT_SUPPORTED, NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, NVML_P2P_STATUS_DISABLED_BY_REGKEY, @@ -459,11 +508,16 @@ typedef enum nvmlGpuP2PStatus_enum typedef enum nvmlGpuP2PCapsIndex_enum { NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN + NVML_P2P_CAPS_INDEX_WRITE = 1, + NVML_P2P_CAPS_INDEX_NVLINK = 2, + NVML_P2P_CAPS_INDEX_ATOMICS = 3, + NVML_P2P_CAPS_INDEX_PCI = 4, + /* + * DO NOT USE! NVML_P2P_CAPS_INDEX_PROP is deprecated. + * Use NVML_P2P_CAPS_INDEX_PCI instead. + */ + NVML_P2P_CAPS_INDEX_PROP = NVML_P2P_CAPS_INDEX_PCI, + NVML_P2P_CAPS_INDEX_UNKNOWN = 5, }nvmlGpuP2PCapsIndex_t; /** @@ -502,6 +556,9 @@ typedef enum nvmlSamplingType_enum NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + NVML_MODULE_POWER_SAMPLES = 7, //!< To represent module power samples for total module starting Grace Hopper + NVML_JPG_UTILIZATION_SAMPLES = 8, //!< To represent percent of time during which NVJPG remains busy + NVML_OFA_UTILIZATION_SAMPLES = 9, //!< To represent percent of time during which NVOFA remains busy // Keep this last NVML_SAMPLINGTYPE_COUNT @@ -529,6 +586,7 @@ typedef enum nvmlValueType_enum NVML_VALUE_TYPE_UNSIGNED_LONG = 2, NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + NVML_VALUE_TYPE_SIGNED_INT = 5, // Keep this last NVML_VALUE_TYPE_COUNT @@ -541,6 +599,7 @@ typedef enum nvmlValueType_enum typedef union nvmlValue_st { double dVal; //!< If the value is double + int siVal; //!< If the value is signed int unsigned int uiVal; //!< If the value is unsigned int unsigned long ulVal; //!< If the value is unsigned long unsigned long long ullVal; //!< If the value is unsigned long long @@ -977,7 +1036,10 @@ typedef enum nvmlReturn_enum NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported - NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated + NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated + NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request + NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found + NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; @@ -1108,7 +1170,6 @@ typedef enum nvmlVgpuCapability_enum NVML_VGPU_CAP_COUNT } nvmlVgpuCapability_t; - /** * vGPU driver queryable capabilities */ @@ -1119,15 +1180,19 @@ typedef enum nvmlVgpuDriverCapability_enum NVML_VGPU_DRIVER_CAP_COUNT } nvmlVgpuDriverCapability_t; - /** * Device vGPU queryable capabilities */ typedef enum nvmlDeviceVgpuCapability_enum { - NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0, //!< Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations - NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1, //!< Supports concurrent execution of timesliced vGPU profiles of differing types - NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2, //!< Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes + NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0, //!< Query if the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations + NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1, //!< Query if the GPU support concurrent execution of timesliced vGPU profiles of differing types + NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2, //!< Query if the GPU support concurrent execution of timesliced vGPU profiles of differing framebuffer sizes + NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3, //!< Query the GPU's read_device_buffer expected bandwidth capacity in megabytes per second + NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4, //!< Query the GPU's write_device_buffer expected bandwidth capacity in megabytes per second + NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING = 5, //!< Query if vGPU profiles on the GPU supports migration data streaming + NVML_DEVICE_VGPU_CAP_MINI_QUARTER_GPU = 6, //!< Set/Get support for mini-quarter vGPU profiles + NVML_DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU = 7, //!< Set/Get support for compute media engine vGPU profiles // Keep this last NVML_DEVICE_VGPU_CAP_COUNT } nvmlDeviceVgpuCapability_t; @@ -1154,6 +1219,8 @@ typedef enum nvmlDeviceVgpuCapability_enum #define INVALID_GPU_INSTANCE_ID 0xFFFFFFFF +#define NVML_INVALID_VGPU_PLACEMENT_ID 0xFFFF + /*! * Macros for vGPU instance's virtualization capabilities bitfield. */ @@ -1180,6 +1247,41 @@ typedef unsigned int nvmlVgpuTypeId_t; typedef unsigned int nvmlVgpuInstance_t; +/** + * Structure to store the vGPU heterogeneous mode of device -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int mode; //!< The vGPU heterogeneous mode +} nvmlVgpuHeterogeneousMode_v1_t; +typedef nvmlVgpuHeterogeneousMode_v1_t nvmlVgpuHeterogeneousMode_t; +#define nvmlVgpuHeterogeneousMode_v1 NVML_STRUCT_VERSION(VgpuHeterogeneousMode, 1) + +/** + * Structure to store the placement ID of vGPU instance -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int placementId; //!< Placement ID of the active vGPU instance +} nvmlVgpuPlacementId_v1_t; +typedef nvmlVgpuPlacementId_v1_t nvmlVgpuPlacementId_t; +#define nvmlVgpuPlacementId_v1 NVML_STRUCT_VERSION(VgpuPlacementId, 1) + +/** + * Structure to store the list of vGPU placements -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int placementSize; //!< The number of slots occupied by the vGPU type + unsigned int count; //!< Count of placement IDs fetched + unsigned int *placementIds; //!< Placement IDs for the vGPU type +} nvmlVgpuPlacementList_v1_t; +typedef nvmlVgpuPlacementList_v1_t nvmlVgpuPlacementList_t; +#define nvmlVgpuPlacementList_v1 NVML_STRUCT_VERSION(VgpuPlacementList, 1) + /** * Structure to store Utilization Value and vgpuInstance */ @@ -1193,6 +1295,35 @@ typedef struct nvmlVgpuInstanceUtilizationSample_st nvmlValue_t decUtil; //!< Decoder Util Value } nvmlVgpuInstanceUtilizationSample_t; +/** + * Structure to store Utilization Value and vgpuInstance Info -- Version 1 + */ +typedef struct +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value + nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value + nvmlValue_t encUtil; //!< Encoder Util Value + nvmlValue_t decUtil; //!< Decoder Util Value + nvmlValue_t jpgUtil; //!< Jpeg Util Value + nvmlValue_t ofaUtil; //!< Ofa Util Value +} nvmlVgpuInstanceUtilizationInfo_v1_t; + +/** + * Structure to store recent utilization for vGPU instances running on a device -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + nvmlValueType_t sampleValType; //!< Hold the type of returned sample values + unsigned int vgpuInstanceCount; //!< Hold the number of vGPU instances + unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp + nvmlVgpuInstanceUtilizationInfo_v1_t *vgpuUtilArray; //!< The array (allocated by caller) in which vGPU utilization are returned +} nvmlVgpuInstancesUtilizationInfo_v1_t; +typedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t; +#define nvmlVgpuInstancesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuInstancesUtilizationInfo, 1) + /** * Structure to store Utilization Value, vgpuInstance and subprocess information */ @@ -1208,6 +1339,36 @@ typedef struct nvmlVgpuProcessUtilizationSample_st unsigned int decUtil; //!< Decoder Util Value } nvmlVgpuProcessUtilizationSample_t; +/** + * Structure to store Utilization Value, vgpuInstance and subprocess information for process running on vGPU instance -- version 1 + */ +typedef struct +{ + char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + unsigned int pid; //!< PID of process running within the vGPU VM + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; //!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value + unsigned int jpgUtil; //!< Jpeg Util Value + unsigned int ofaUtil; //!< Ofa Util Value +} nvmlVgpuProcessUtilizationInfo_v1_t; + +/** + * Structure to store recent utilization, vgpuInstance and subprocess information for processes running on vGPU instances active on a device -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int vgpuProcessCount; //!< Hold the number of processes running on vGPU instances + unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp + nvmlVgpuProcessUtilizationInfo_v1_t *vgpuProcUtilArray; //!< The array (allocated by caller) in which utilization of processes running on vGPU instances are returned +} nvmlVgpuProcessesUtilizationInfo_v1_t; +typedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t; +#define nvmlVgpuProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuProcessesUtilizationInfo, 1) + /** * vGPU scheduler policies */ @@ -1220,6 +1381,10 @@ typedef struct nvmlVgpuProcessUtilizationSample_st #define NVML_SCHEDULER_SW_MAX_LOG_ENTRIES 200 +#define NVML_VGPU_SCHEDULER_ARR_DEFAULT 0 +#define NVML_VGPU_SCHEDULER_ARR_DISABLE 1 +#define NVML_VGPU_SCHEDULER_ARR_ENABLE 2 + typedef struct { unsigned int avgFactor; unsigned int timeslice; @@ -1260,7 +1425,7 @@ typedef struct nvmlVgpuSchedulerLog_st { unsigned int engineId; //!< Engine whose software runlist log entries are fetched unsigned int schedulerPolicy; //!< Scheduler policy - unsigned int isEnabledARR; //!< Flag to check Adaptive Round Robin scheduler mode + unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*. nvmlVgpuSchedulerParams_t schedulerParams; unsigned int entriesCount; //!< Count of log entries fetched nvmlVgpuSchedulerLogEntry_t logEntries[NVML_SCHEDULER_SW_MAX_LOG_ENTRIES]; @@ -1272,7 +1437,7 @@ typedef struct nvmlVgpuSchedulerLog_st typedef struct nvmlVgpuSchedulerGetState_st { unsigned int schedulerPolicy; //!< Scheduler policy - unsigned int isEnabledARR; //!< Flag to check Adaptive Round Robin scheduler mode + unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*. nvmlVgpuSchedulerParams_t schedulerParams; } nvmlVgpuSchedulerGetState_t; @@ -1302,7 +1467,7 @@ typedef union typedef struct nvmlVgpuSchedulerSetState_st { unsigned int schedulerPolicy; //!< Scheduler policy - unsigned int enableARRMode; //!< Flag to enable/disable Adaptive Round Robin scheduler + unsigned int enableARRMode; //!< Adaptive Round Robin scheduler nvmlVgpuSchedulerSetParams_t schedulerParams; } nvmlVgpuSchedulerSetState_t; @@ -1365,6 +1530,34 @@ typedef struct nvmlProcessUtilizationSample_st unsigned int decUtil; //!< Decoder Util Value } nvmlProcessUtilizationSample_t; +/** + * Structure to store utilization value and process Id -- version 1 + */ +typedef struct +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + unsigned int pid; //!< PID of process + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; //!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value + unsigned int jpgUtil; //!< Jpeg Util Value + unsigned int ofaUtil; //!< Ofa Util Value +} nvmlProcessUtilizationInfo_v1_t; + +/** + * Structure to store utilization and process ID for each running process -- version 1 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + unsigned int processSamplesCount; //!< Caller-supplied array size, and returns number of processes running + unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp + nvmlProcessUtilizationInfo_v1_t *procUtilArray; //!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA +} nvmlProcessesUtilizationInfo_v1_t; +typedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t; +#define nvmlProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(ProcessesUtilizationInfo, 1) + /** * Structure to store license expiry date and time values */ @@ -1402,6 +1595,29 @@ typedef struct nvmlGridLicensableFeatures_st nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. } nvmlGridLicensableFeatures_t; +/** + * Structure to store SRAM uncorrectable error counters + */ +typedef struct +{ + unsigned int version; //!< the API version number + unsigned long long aggregateUncParity; //!< aggregate uncorrectable parity error count + unsigned long long aggregateUncSecDed; //!< aggregate uncorrectable SEC-DED error count + unsigned long long aggregateCor; //!< aggregate correctable error count + unsigned long long volatileUncParity; //!< volatile uncorrectable parity error count + unsigned long long volatileUncSecDed; //!< volatile uncorrectable SEC-DED error count + unsigned long long volatileCor; //!< volatile correctable error count + unsigned long long aggregateUncBucketL2; //!< aggregate uncorrectable error count for L2 cache bucket + unsigned long long aggregateUncBucketSm; //!< aggregate uncorrectable error count for SM bucket + unsigned long long aggregateUncBucketPcie; //!< aggregate uncorrectable error count for PCIE bucket + unsigned long long aggregateUncBucketMcu; //!< aggregate uncorrectable error count for Microcontroller bucket + unsigned long long aggregateUncBucketOther; //!< aggregate uncorrectable error count for Other bucket + unsigned int bThresholdExceeded; //!< if the error threshold of field diag is exceeded +} nvmlEccSramErrorStatus_v1_t; + +typedef nvmlEccSramErrorStatus_v1_t nvmlEccSramErrorStatus_t; +#define nvmlEccSramErrorStatus_v1 NVML_STRUCT_VERSION(EccSramErrorStatus, 1) + /** * GSP firmware */ @@ -1415,11 +1631,8 @@ typedef struct nvmlGridLicensableFeatures_st #define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture #define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture #define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture - #define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture - #define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture - #define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture #define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer @@ -1452,8 +1665,9 @@ typedef unsigned int nvmlFanControlPolicy_t; /** * Device Power Source */ -#define NVML_POWER_SOURCE_AC 0x00000000 -#define NVML_POWER_SOURCE_BATTERY 0x00000001 +#define NVML_POWER_SOURCE_AC 0x00000000 +#define NVML_POWER_SOURCE_BATTERY 0x00000001 +#define NVML_POWER_SOURCE_UNDERSIZED 0x00000002 typedef unsigned int nvmlPowerSource_t; @@ -1752,21 +1966,70 @@ typedef struct nvmlGpuDynamicPstatesInfo_st #define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 157 //!< NVLink data ECC Error Counter for Link 9 #define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 158 //!< NVLink data ECC Error Counter for Link 10 #define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 -#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NvLink data ECC Error Counter total for all Links +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NVLink data ECC Error Counter total for all Links + +#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 161 //!< NVLink Replay Error Counter +#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 //!< NVLink Recovery Error Counter +#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 //!< NVLink CRC Error Counter +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 //!< NVLink Speed in MBps +#define NVML_FI_DEV_NVLINK_GET_STATE 165 //!< NVLink State - Active,Inactive +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 //!< NVLink Version + +#define NVML_FI_DEV_NVLINK_GET_POWER_STATE 167 //!< NVLink Power state. 0=HIGH_SPEED 1=LOW_SPEED +#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD 168 //!< NVLink length of idle period (in units of 100us) before transitioning links to sleep state -#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 161 -#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 -#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 -#define NVML_FI_DEV_NVLINK_GET_SPEED 164 -#define NVML_FI_DEV_NVLINK_GET_STATE 165 -#define NVML_FI_DEV_NVLINK_GET_VERSION 166 +#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER 169 //!< Device PEX error recovery counter -#define NVML_FI_DEV_NVLINK_GET_POWER_STATE 167 -#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD 168 +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +#define NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS 173 +#define NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED 174 +#define NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR 175 +#define NVML_FI_DEV_PCIE_COUNT_BAD_TLP 176 +#define NVML_FI_DEV_PCIE_COUNT_NAKS_SENT 177 +#define NVML_FI_DEV_PCIE_COUNT_BAD_DLLP 178 +#define NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR 179 +#define NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR 180 +#define NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ 181 +#define NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR 182 +#define NVML_FI_DEV_PCIE_COUNT_LANE_ERROR 183 + +#define NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED 184 + +/** + * Retrieves power usage for this GPU in milliwatts. + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and + * \ref nvmlDeviceGetPowerUsage. + * + * scopeId needs to be specified. It signifies: + * 0 - GPU Only Scope - Metrics for GPU are retrieved + * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved. + * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported + */ +#define NVML_FI_DEV_POWER_AVERAGE 185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures. +#define NVML_FI_DEV_POWER_INSTANT 186 //!< Current GPU power, supported on all architectures. +#define NVML_FI_DEV_POWER_MIN_LIMIT 187 //!< Minimum power limit in milliwatts. +#define NVML_FI_DEV_POWER_MAX_LIMIT 188 //!< Maximum power limit in milliwatts. +#define NVML_FI_DEV_POWER_DEFAULT_LIMIT 189 //!< Default power limit in milliwatts (limit which device boots with). +#define NVML_FI_DEV_POWER_CURRENT_LIMIT 190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band). +#define NVML_FI_DEV_ENERGY 191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU. +#define NVML_FI_DEV_POWER_REQUESTED_LIMIT 192 //!< Power limit requested by NVML or any other userspace client. + +/** + * GPU T.Limit temperature thresholds in degree Celsius + * + * These fields are supported on Ada and later architectures and supersedes \ref nvmlDeviceGetTemperatureThreshold. + */ +#define NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT 193 //!< T.Limit temperature after which GPU may shut down for HW protection +#define NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT 194 //!< T.Limit temperature after which GPU may begin HW slowdown +#define NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT 195 //!< T.Limit temperature after which GPU may begin SW slowdown due to memory temperature +#define NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT 196 //!< T.Limit temperature after which GPU may be throttled below base clock -#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER 169 +#define NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE 199 //!< MIG mode independent, MIG query capable device. 1=yes. 0=no. -#define NVML_FI_MAX 170 //!< One greater than the largest field ID defined above +#define NVML_FI_MAX 200 //!< One greater than the largest field ID defined above /** * Information for a Field Value Sample @@ -1981,7 +2244,7 @@ typedef struct nvmlEventData_st /** @} */ /***************************************************************************************************/ -/** @addtogroup nvmlClocksThrottleReasons +/** @addtogroup nvmlClocksEventReasons * @{ */ /***************************************************************************************************/ @@ -1989,28 +2252,28 @@ typedef struct nvmlEventData_st /** Nothing is running on the GPU and the clocks are dropping to Idle state * \note This limiter may be removed in a later release */ -#define nvmlClocksThrottleReasonGpuIdle 0x0000000000000001LL +#define nvmlClocksEventReasonGpuIdle 0x0000000000000001LL /** GPU clocks are limited by current setting of applications clocks * * @see nvmlDeviceSetApplicationsClocks * @see nvmlDeviceGetApplicationsClock */ -#define nvmlClocksThrottleReasonApplicationsClocksSetting 0x0000000000000002LL +#define nvmlClocksEventReasonApplicationsClocksSetting 0x0000000000000002LL /** * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting * as the name describes the situation more accurately. */ -#define nvmlClocksThrottleReasonUserDefinedClocks nvmlClocksThrottleReasonApplicationsClocksSetting +#define nvmlClocksThrottleReasonUserDefinedClocks nvmlClocksEventReasonApplicationsClocksSetting -/** SW Power Scaling algorithm is reducing the clocks below requested clocks +/** The clocks have been optimized to ensure not to exceed currently set power limits * * @see nvmlDeviceGetPowerUsage * @see nvmlDeviceSetPowerManagementLimit * @see nvmlDeviceGetPowerManagementLimit */ -#define nvmlClocksThrottleReasonSwPowerCap 0x0000000000000004LL +#define nvmlClocksEventReasonSwPowerCap 0x0000000000000004LL /** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * @@ -2036,16 +2299,16 @@ typedef struct nvmlEventData_st * holding this one at lower clocks. * */ -#define nvmlClocksThrottleReasonSyncBoost 0x0000000000000010LL +#define nvmlClocksEventReasonSyncBoost 0x0000000000000010LL /** SW Thermal Slowdown * - * This is an indicator of one or more of the following: - * - Current GPU temperature above the GPU Max Operating Temperature - * - Current memory temperature above the Memory Max Operating Temperature + * The current clocks have been optimized to ensure the the following is true: + * - Current GPU temperature does not exceed GPU Max Operating Temperature + * - Current memory temperature does not exceeed Memory Max Operating Temperature * */ -#define nvmlClocksThrottleReasonSwThermalSlowdown 0x0000000000000020LL +#define nvmlClocksEventReasonSwThermalSlowdown 0x0000000000000020LL /** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * @@ -2073,28 +2336,61 @@ typedef struct nvmlEventData_st * * @see bug 1997531 */ -#define nvmlClocksThrottleReasonDisplayClockSetting 0x0000000000000100LL +#define nvmlClocksEventReasonDisplayClockSetting 0x0000000000000100LL /** Bit mask representing no clocks throttling * * Clocks are as high as possible. * */ -#define nvmlClocksThrottleReasonNone 0x0000000000000000LL +#define nvmlClocksEventReasonNone 0x0000000000000000LL /** Bit mask representing all supported clocks throttling reasons * New reasons might be added to this list in the future */ -#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \ - | nvmlClocksThrottleReasonGpuIdle \ - | nvmlClocksThrottleReasonApplicationsClocksSetting \ - | nvmlClocksThrottleReasonSwPowerCap \ +#define nvmlClocksEventReasonAll (nvmlClocksThrottleReasonNone \ + | nvmlClocksEventReasonGpuIdle \ + | nvmlClocksEventReasonApplicationsClocksSetting \ + | nvmlClocksEventReasonSwPowerCap \ | nvmlClocksThrottleReasonHwSlowdown \ - | nvmlClocksThrottleReasonSyncBoost \ - | nvmlClocksThrottleReasonSwThermalSlowdown \ + | nvmlClocksEventReasonSyncBoost \ + | nvmlClocksEventReasonSwThermalSlowdown \ | nvmlClocksThrottleReasonHwThermalSlowdown \ | nvmlClocksThrottleReasonHwPowerBrakeSlowdown \ - | nvmlClocksThrottleReasonDisplayClockSetting \ + | nvmlClocksEventReasonDisplayClockSetting \ ) + +/** + * @deprecated Use \ref nvmlClocksEventReasonGpuIdle instead + */ +#define nvmlClocksThrottleReasonGpuIdle nvmlClocksEventReasonGpuIdle +/** + * @deprecated Use \ref nvmlClocksEventReasonApplicationsClocksSetting instead + */ +#define nvmlClocksThrottleReasonApplicationsClocksSetting nvmlClocksEventReasonApplicationsClocksSetting +/** + * @deprecated Use \ref nvmlClocksEventReasonSyncBoost instead + */ +#define nvmlClocksThrottleReasonSyncBoost nvmlClocksEventReasonSyncBoost +/** + * @deprecated Use \ref nvmlClocksEventReasonSwPowerCap instead + */ +#define nvmlClocksThrottleReasonSwPowerCap nvmlClocksEventReasonSwPowerCap +/** + * @deprecated Use \ref nvmlClocksEventReasonSwThermalSlowdown instead + */ +#define nvmlClocksThrottleReasonSwThermalSlowdown nvmlClocksEventReasonSwThermalSlowdown +/** + * @deprecated Use \ref nvmlClocksEventReasonDisplayClockSetting instead + */ +#define nvmlClocksThrottleReasonDisplayClockSetting nvmlClocksEventReasonDisplayClockSetting +/** + * @deprecated Use \ref nvmlClocksEventReasonNone instead + */ +#define nvmlClocksThrottleReasonNone nvmlClocksEventReasonNone +/** + * @deprecated Use \ref nvmlClocksEventReasonAll instead + */ +#define nvmlClocksThrottleReasonAll nvmlClocksEventReasonAll /** @} */ /***************************************************************************************************/ @@ -2150,8 +2446,10 @@ typedef struct nvmlAccountingStats_st { */ typedef enum nvmlEncoderQueryType_enum { - NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder - NVML_ENCODER_QUERY_HEVC = 1 //!< HEVC encoder + NVML_ENCODER_QUERY_H264 = 0x00, //!< H264 encoder + NVML_ENCODER_QUERY_HEVC = 0x01, //!< HEVC encoder + NVML_ENCODER_QUERY_AV1 = 0x02, //!< AV1 encoder + NVML_ENCODER_QUERY_UNKNOWN = 0xFF //!< Unknown encoder }nvmlEncoderType_t; /** @@ -2182,7 +2480,7 @@ typedef struct nvmlEncoderSessionInfo_st */ typedef enum nvmlFBCSessionType_enum { - NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon + NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda NVML_FBC_SESSION_TYPE_VID, //!< Vid @@ -2252,6 +2550,143 @@ typedef enum nvmlPcieLinkState_enum /** @} */ +/***************************************************************************************************/ +/** @defgroup nvmlSystem/nvmlDevice definitions related to Confidential Computing + * @{ + */ +/***************************************************************************************************/ +/** + * Confidential Compute CPU Capabilities values + */ +#define NVML_CC_SYSTEM_CPU_CAPS_NONE 0 +#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV 1 +#define NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX 2 + +/** + * Confidenial Compute GPU Capabilities values + */ +#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0 +#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE 1 + +typedef struct nvmlConfComputeSystemCaps_st { + unsigned int cpuCaps; + unsigned int gpusCaps; +} nvmlConfComputeSystemCaps_t; + +/** + * Confidential Compute DevTools Mode values + */ +#define NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF 0 +#define NVML_CC_SYSTEM_DEVTOOLS_MODE_ON 1 + +/** + * Confidential Compute Environment values + */ +#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0 +#define NVML_CC_SYSTEM_ENVIRONMENT_SIM 1 +#define NVML_CC_SYSTEM_ENVIRONMENT_PROD 2 + +/** + * Confidential Compute Feature Status values + */ +#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 +#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 + +typedef struct nvmlConfComputeSystemState_st { + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; +} nvmlConfComputeSystemState_t; + +/** + * Confidential Compute Multigpu mode values + */ +#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 +#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 + +/** + * Confidential Compute System settings + */ +typedef struct { + unsigned int version; + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; + unsigned int multiGpuMode; +} nvmlSystemConfComputeSettings_v1_t; + +typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; +#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) + +/** + * Protected memory size + */ +typedef struct +nvmlConfComputeMemSizeInfo_st +{ + unsigned long long protectedMemSizeKib; + unsigned long long unprotectedMemSizeKib; +} nvmlConfComputeMemSizeInfo_t; + +/** + * Confidential Compute GPUs/System Ready State values + */ +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0 +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE 1 + +/** + * GPU Certificate Details + */ +#define NVML_GPU_CERT_CHAIN_SIZE 0x1000 +#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1400 + +typedef struct nvmlConfComputeGpuCertificate_st { + unsigned int certChainSize; + unsigned int attestationCertChainSize; + unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE]; + unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE]; +} nvmlConfComputeGpuCertificate_t; + +/** + * GPU Attestation Report + */ +#define NVML_CC_GPU_CEC_NONCE_SIZE 0x20 +#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000 +#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000 +#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0 +#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1 +#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN 50 +#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX 75 + +typedef struct nvmlConfComputeGpuAttestationReport_st { + unsigned int isCecAttestationReportPresent; + unsigned int attestationReportSize; + unsigned int cecAttestationReportSize; + unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE]; + unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE]; + unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE]; +} nvmlConfComputeGpuAttestationReport_t; + +typedef struct nvmlConfComputeSetKeyRotationThresholdInfo_st { + unsigned int version; + unsigned long long maxAttackerAdvantage; +} nvmlConfComputeSetKeyRotationThresholdInfo_v1_t; + +typedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t nvmlConfComputeSetKeyRotationThresholdInfo_t; +#define nvmlConfComputeSetKeyRotationThresholdInfo_v1 \ + NVML_STRUCT_VERSION(ConfComputeSetKeyRotationThresholdInfo, 1) + +typedef struct nvmlConfComputeGetKeyRotationThresholdInfo_st { + unsigned int version; + unsigned long long attackerAdvantage; +} nvmlConfComputeGetKeyRotationThresholdInfo_v1_t; + +typedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRotationThresholdInfo_t; +#define nvmlConfComputeGetKeyRotationThresholdInfo_v1 \ + NVML_STRUCT_VERSION(ConfComputeGetKeyRotationThresholdInfo, 1) + +/** @} */ + #define NVML_GPU_FABRIC_UUID_LEN 16 #define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 @@ -2262,11 +2697,79 @@ typedef enum nvmlPcieLinkState_enum typedef unsigned char nvmlGpuFabricState_t; typedef struct { - char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs + unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". - unsigned int partitionId; //!< ID of the fabric partition to which this GPU belongs + unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs nvmlGpuFabricState_t state; //!< Current state of GPU registration process } nvmlGpuFabricInfo_t; + +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 +#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 + +#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 +#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x11 + +/** + * GPU Fabric Health Status Mask for various fields can be obtained + * using the below macro. + * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW) + */ +#define NVML_GPU_FABRIC_HEALTH_GET(var, type) \ + (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \ + (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type)) + +/** + * GPU Fabric Health Status Mask for various fields can be tested + * using the below macro. + * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE) + */ +#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \ + (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \ + NVML_GPU_FABRIC_HEALTH_MASK##type##val) + +/** +* GPU Fabric information (v2). +* +* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field +* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask +* field to the end. This structure is not backwards-compatible with +* \ref nvmlGpuFabricInfo_t. +*/ +typedef struct { + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2) + unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs + nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". + unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs + nvmlGpuFabricState_t state; //!< Current state of GPU registration process + unsigned int healthMask; //!< GPU Fabric health Status Mask +} nvmlGpuFabricInfo_v2_t; + +typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; + +/** +* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. +*/ +#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) + +/** + * Device Scope - This is useful to retrieve the telemetry at GPU and module (e.g. GPU + CPU) level + */ +#define NVML_POWER_SCOPE_GPU 0U //!< Targets only GPU +#define NVML_POWER_SCOPE_MODULE 1U //!< Targets the whole module +#define NVML_POWER_SCOPE_MEMORY 2U //!< Targets the GPU Memory + +typedef unsigned char nvmlPowerScopeType_t; + +typedef struct +{ + unsigned int version; //!< Structure format version (must be 1) + nvmlPowerScopeType_t powerScope; //!< [in] Device type: GPU or Total Module + unsigned int powerValueMw; //!< [out] Power value to retrieve or set in milliwatts +} nvmlPowerValue_v2_t; + +#define nvmlPowerValue_v2 NVML_STRUCT_VERSION(PowerValue, 2) + /** @} */ /***************************************************************************************************/ @@ -2531,34 +3034,73 @@ nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); */ nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitQueries Unit Queries - * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. - * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by - * calling \ref nvmlUnitGetHandleByIndex(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of units in the system. +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. * * For S-class products. * - * @param unitCount Reference in which to return the number of units + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. + * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc * * @return - * - \ref NVML_SUCCESS if \a unitCount has been set + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small */ -nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); /** - * Acquire the handle for a particular unit, based on its index. + * Retrieve the set of GPUs that have a CPU affinity with the given CPU number + * For all products. + * Supported on Linux only. + * + * @param cpuNumber The CPU number + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. + * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitQueries Unit Queries + * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. + * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by + * calling \ref nvmlUnitGetHandleByIndex(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of units in the system. + * + * For S-class products. + * + * @param unitCount Reference in which to return the number of units + * + * @return + * - \ref NVML_SUCCESS if \a unitCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); + +/** + * Acquire the handle for a particular unit, based on its index. * * For S-class products. * @@ -2696,24 +3238,6 @@ nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_ */ nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. - * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); /** @} */ /***************************************************************************************************/ @@ -3008,6 +3532,37 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index */ nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); +/* +* Get a unique identifier for the device module on the baseboard +* +* This API retrieves a unique identifier for each GPU module that exists on a given baseboard. +* For non-baseboard products, this ID would always be 0. +* +* @param device The identifier of the target device +* @param moduleId Unique identifier for the GPU module +* +* @return +* - \ref NVML_SUCCESS if \a moduleId has been successfully retrieved +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a moduleId is invalid +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetModuleId(nvmlDevice_t device, unsigned int *moduleId); + +/** + * Retrieves the Device's C2C Mode information + * + * @param device The identifier of the target device + * @param c2cModeInfo Output struct containing the device's C2C Mode info + * + * @return + * - \ref NVML_SUCCESS if \a C2C Mode Infor query is successful + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetC2cModeInfoV(nvmlDevice_t device, nvmlC2cModeInfo_v1_t *c2cModeInfo); /***************************************************************************************************/ @@ -3147,6 +3702,19 @@ nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); */ nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); +/** + * Get the NUMA node of the given GPU device. + * This only applies to platforms where the GPUs are NUMA nodes. + * + * @param[in] device The device handle + * @param[out] node NUMA node ID of the device + * + * @returns + * - \ref NVML_SUCCESS if the NUMA node is retrieved successfully + * - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device \a node is invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node); /** * Retrieve the common ancestor for two devices * For all products. @@ -3186,25 +3754,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, n */ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. - * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - /** * Retrieve the status for a given p2p capability index between a given pair of GPU * @@ -3248,31 +3797,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t d */ nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); -/** - * Retrieve the MDEV UUID of a vGPU instance. - * - * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * MDEV UUID is displayed only on KVM platform. - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); - /** * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for * each GPU will have the form /dev/nvidia[minor number]. @@ -3408,6 +3932,27 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t devi */ nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); +/** + * Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. + * + * For all products with an inforom. + * + * @param device The identifier of the target device + * @param timestamp The start timestamp of the last BBX Flush + * @param durationUs The duration (us) of the last BBX Flush + * + * @return + * - \ref NVML_SUCCESS if \a timestamp and \a durationUs are successfully retrieved + * - \ref NVML_ERROR_NOT_READY if the BBX object has not been flushed yet + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetLastBBXFlushTime(nvmlDevice_t device, unsigned long long *timestamp, + unsigned long *durationUs); + /** * Retrieves the display mode for the device. * @@ -3482,6 +4027,25 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableS */ nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); +/** + * Retrieves PCI attributes of this device. + * + * For all products. + * + * See \ref nvmlPciInfoExt_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info + * + * @return + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfoExt(nvmlDevice_t device, nvmlPciInfoExt_t *pci); + /** * Retrieves the PCI attributes of this device. * @@ -3683,6 +4247,20 @@ nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t */ nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); +/** + * Retrieve the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved GPCCLK VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); + /** * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. * Can be changed using \ref nvmlDeviceSetApplicationsClocks. @@ -3725,33 +4303,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClo */ nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - /** * Retrieves the clock speed for the clock specified by the clock type and clock ID. * @@ -3871,107 +4422,44 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, u nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); /** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. + * Retrieves the intended operating speed of the device's fan. * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. + * For all discrete products with dedicated fans. * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. * * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to + * @param speed Reference in which to return the fan speed percentage * * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled + * - \ref NVML_SUCCESS if \a speed has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); + /** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running + * Retrieves the intended operating speed of the device's specified fan. * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. + * For all discrete products with dedicated fans. * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - -/** - * Retrieves the intended operating speed of the device's fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. - * This value may exceed 100% in certain cases. - * - * @param device The identifier of the target device - * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - - -/** - * Retrieves the intended operating speed of the device's specified fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. - * This value may exceed 100% in certain cases. - * - * @param device The identifier of the target device - * @param fan The index of the target fan, zero indexed. - * @param speed Reference in which to return the fan speed percentage + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. + * @param speed Reference in which to return the fan speed percentage * * @return * - \ref NVML_SUCCESS if \a speed has been set @@ -4011,24 +4499,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int */ nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); -/** - * Sets the speed of the fan control policy to default. - * - * For all cuda-capable discrete products with fans - * - * @param device The identifier of the target device - * @param fan The index of the fan, starting at zero - * - * return - * NVML_SUCCESS if speed has been adjusted - * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * NVML_ERROR_INVALID_ARGUMENT if device is invalid - * NVML_ERROR_NOT_SUPPORTED if the device does not support this - * (doesn't have fans) - * NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); - /** * Retrieves the min and max fan speed that user can set for the GPU fan. * @@ -4070,29 +4540,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned i nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, nvmlFanControlPolicy_t *policy); -/** - * Sets current fan control policy. - * - * For Maxwell &tm; or newer fully supported devices. - * - * Requires privileged user. - * - * For all cuda-capable discrete products with fans - * - * device The identifier of the target \a device - * policy The fan control \a policy to set - * - * return - * NVML_SUCCESS if \a policy has been set - * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference - * a fan that exists. - * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell - * NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, - nvmlFanControlPolicy_t policy); - /** * Retrieves the number of fans on the device. * @@ -4139,6 +4586,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatu * * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. * + * Note: This API is no longer the preferred interface for retrieving the following temperature thresholds + * on Ada and later architectures: NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, + * NVML_TEMPERATURE_THRESHOLD_MEM_MAX and NVML_TEMPERATURE_THRESHOLD_GPU_MAX. + * + * Support for reading these temperature thresholds for Ada and later architectures would be removed from this + * API in future releases. Please use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_TEMPERATURE_* fields to retrieve + * temperature thresholds on these architectures. + * * @param device The identifier of the target device * @param thresholdType The type of threshold value queried * @param temp Reference in which to return the temperature reading @@ -4152,26 +4607,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatu */ nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); -/** - * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. - * - * For Maxwell &tm; or newer fully supported devices. - * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. - * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value to be set - * @param temp Reference which hold the value to be set - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); - /** * Used to execute a list of thermal system instructions. * @@ -4210,50 +4645,60 @@ nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); /** - * Retrieves current clocks throttling reasons. + * Retrieves current clocks event reasons. * * For all fully supported products. * * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. * * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle + * @param clocksEventReasons Reference in which to return bitmask of active clocks event * reasons * * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set + * - \ref NVML_SUCCESS if \a clocksEventReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksEventReasons is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons + * @see nvmlClocksEventReasons + * @see nvmlDeviceGetSupportedClocksEventReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, unsigned long long *clocksEventReasons); + +/** + * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead */ nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); /** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons + * Retrieves bitmask of supported clocks event reasons that can be returned by + * \ref nvmlDeviceGetCurrentClocksEventReasons * * For all fully supported products. * * This method is not supported in virtual machines running virtual GPU (vGPU). * * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons + * @param supportedClocksEventReasons Reference in which to return bitmask of supported + * clocks event reasons * * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set + * - \ref NVML_SUCCESS if \a supportedClocksEventReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksEventReasons is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons + * @see nvmlClocksEventReasons + * @see nvmlDeviceGetCurrentClocksEventReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t device, unsigned long long *supportedClocksEventReasons); + +/** + * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead */ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); @@ -4279,6 +4724,113 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t de */ nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); +/** + * Retrieve performance monitor samples from the associated subdevice. + * + * @param device + * @param pDynamicPstatesInfo + * + * @return + * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); + +/** + * Retrieve the MemClk (Memory Clock) VF offset value. + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved MemClk VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Retrieve min and max clocks of some clock domain for a given PState + * + * @param device The identifier of the target device + * @param type Clock domain + * @param pstate PState to query + * @param minClockMHz Reference in which to return min clock frequency + * @param maxClockMHz Reference in which to return max clock frequency + * + * @return + * - \ref NVML_SUCCESS if everything worked + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both + * \a minClockMHz and \a maxClockMHz are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, + unsigned int * minClockMHz, unsigned int * maxClockMHz); + +/** + * Get all supported Performance States (P-States) for the device. + * + * The returned array would contain a contiguous list of valid P-States supported by + * the device. If the number of supported P-States is fewer than the size of the array + * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * + * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. + * + * @param device The identifier of the target device + * @param pstates Container to return the list of performance states + * supported by device + * @param size Size of the supplied \a pstates array in bytes + * + * @return + * - \ref NVML_SUCCESS if \a pstates array has been retrieved + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the the container supplied was not large enough to + * hold the resulting list + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, + nvmlPstates_t *pstates, unsigned int size); + +/** + * Retrieve the GPCCLK min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved GPCCLK VF min offset value + * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Retrieve the MemClk (Memory Clock) min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved MemClk VF min offset value + * @param[out] maxOffset The retrieved MemClk VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + /** * This API has been deprecated. * @@ -4378,7 +4930,12 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t devic * * For Fermi &tm; or newer fully supported devices. * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. On Ampere + * (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval. On GA100 and + * older architectures, instantaneous power is returned. + * + * See \ref NVML_FI_DEV_POWER_AVERAGE and \ref NVML_FI_DEV_POWER_INSTANT to query specific power + * values. * * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. * @@ -4478,6 +5035,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuO * * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information. * + * @note On systems where GPUs are NUMA nodes, the accuracy of FB memory utilization + * provided by this API depends on the memory accounting of the operating system. + * This is because FB memory is managed by the operating system instead of the NVIDIA GPU driver. + * Typically, pages allocated from FB memory are not released even after + * the process terminates to enhance performance. In scenarios where + * the operating system is under memory pressure, it may resort to utilizing FB memory. + * Such actions can result in discrepancies in the accuracy of memory reporting. + * * @param device The identifier of the target device * @param memory Reference in which to return the memory information * @@ -4834,10 +5399,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned in * Retrieves information about active encoder sessions on a target device. * * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. @@ -4881,33 +5446,75 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); /** -* Retrieves the active frame buffer capture sessions statistics for a given device. -* -* For Maxwell &tm; or newer fully supported devices. -* -* @param device The identifier of the target device -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats -* -* @return -* - \ref NVML_SUCCESS if \a fbcStats is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); - -/** -* Retrieves information about active frame buffer capture sessions on a target device. -* -* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The -* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions -* written to the buffer. -* -* If the supplied buffer is not large enough to accomodate the active session array, the function returns -* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. -* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return -* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. + * Retrieves the current utilization and sampling size in microseconds for the JPG + * + * %TURING_OR_NEWER% + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for jpg utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetJpgUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current utilization and sampling size in microseconds for the OFA (Optical Flow Accelerator) + * + * %TURING_OR_NEWER% + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for ofa utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetOfaUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** +* Retrieves the active frame buffer capture sessions statistics for a given device. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param device The identifier of the target device +* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats +* +* @return +* - \ref NVML_SUCCESS if \a fbcStats is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); + +/** +* Retrieves information about active frame buffer capture sessions on a target device. +* +* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The +* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions +* written to the buffer. +* +* If the supplied buffer is not large enough to accommodate the active session array, the function returns +* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. +* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return +* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. * * For Maxwell &tm; or newer fully supported devices. * @@ -5126,6 +5733,57 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t devic */ nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); +/** + * Get information about running processes on a device for input context + * + * %HOPPER_OR_NEWER% + * + * This function returns information only about running processes (e.g. CUDA application which have + * active context). + * + * To determine the size of the @ref plist->procArray array to allocate, call the function with + * @ref plist->numProcArrayEntries set to zero and @ref plist->procArray set to NULL. The return + * code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type + * @ref plist->mode to report on, in which case the @ref plist->numProcArrayEntries field will + * indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type + * @ref plist->mode exist). + * + * The usedGpuMemory field returned is all of the memory used by the application. + * The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a plist->procArray table in case new processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in + * vGPU Host virtualization mode. + * Protected memory usage is currently not available in MIG mode and in windows. + * + * @param device The device handle or MIG device handle + * @param plist Reference in which to process detail list + * @param plist->version The api version + * @param plist->mode The process mode + * @param plist->procArray Reference in which to return the process information + * @param plist->numProcArrayEntries Proc array size of returned entries + * + * @return + * - \ref NVML_SUCCESS if \a plist->numprocArrayEntries and \a plist->procArray have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a plist->numprocArrayEntries indicates that the \a plist->procArray is too small + * \a plist->numprocArrayEntries will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a plist is NULL, \a plist->version is invalid, + * \a plist->mode is invalid, + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t *plist); + /** * Check if the GPU devices are on the same physical board. * @@ -5369,7 +6027,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *p * Gets the device's Adaptive Clock status * * @param device The identifier of the target device - * @param adaptiveClockStatus The current adaptive clocking status + * @param adaptiveClockStatus The current adaptive clocking status, either + * @ref NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED + * or @ref NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED * * @return * - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved @@ -5382,152 +6042,407 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *p nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); /** - * @} + * Get the type of the GPU Bus (PCIe, PCI, ...) + * + * @param device The identifier of the target device + * @param type The PCI Bus type + * + * return + * - \ref NVML_SUCCESS if the bus \a type is successfully retreived + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \device is invalid or \type is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ +nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); -/** @addtogroup nvmlAccountingStats - * @{ - */ -/** - * Queries the state of per process accounting mode. + /** + * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead * - * For Kepler &tm; or newer fully supported devices. + * Get fabric information associated with the device. + * + * %HOPPER_OR_NEWER% + * + * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager + * Upon successful registration, the GPU is added to the NVLink fabric to enable + * peer-to-peer communication. + * This API reports the current state of the GPU in the NVLink fabric + * along with other useful information. * - * See \ref nvmlDeviceGetAccountingStats for more details. - * See \ref nvmlDeviceSetAccountingMode * * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode + * @param gpuFabricInfo Information about GPU fabric state * * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); /** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. +* Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned +* \ref nvmlGpuFabricInfo_v2_t or later output structure. +* +* @note The caller must set the \ref nvmlGpuFabricInfoV_t.version field to the +* appropriate version prior to calling this function. For example: +* \code +* nvmlGpuFabricInfoV_t fabricInfo = +* { .version = nvmlGpuFabricInfo_v2 }; +* nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device,&fabricInfo); +* \endcode +* +* %HOPPER_OR_NEWER% +* +* @param device The identifier of the target device +* @param gpuFabricInfo Information about GPU fabric state +* +* @return +* - \ref NVML_SUCCESS Upon success +* - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, + nvmlGpuFabricInfoV_t *gpuFabricInfo); + +/** + * Get Conf Computing System capabilities. * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. - * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * @param capabilities System CC capabilities * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * @return + * - \ref NVML_SUCCESS if \a capabilities were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); + +/** + * Get Conf Computing System State. * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats + * @param state System CC State * * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_SUCCESS if \a state were successfully queried * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); + +/** + * Get Conf Computing Protected and Unprotected Memory Sizes. * - * @see nvmlDeviceGetAccountingBufferSize + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device handle + * @param memInfo Protected/Unprotected Memory sizes + * + * @return + * - \ref NVML_SUCCESS if \a memInfo were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); /** - * Queries list of processes that can be queried for accounting stats. The list of processes returned - * can be in running or terminated state. + * Get Conf Computing GPUs ready state. * - * For Kepler &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * @param isAcceptingWork Returns GPU current work accepting state, + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE * - * For more details see \ref nvmlDeviceGetAccountingStats. + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); + +/** + * Get Conf Computing protected memory usage. * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids + * @param memory Reference in which to return the memory information * * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_SUCCESS if \a memory has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); /** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. + * Get Conf Computing Gpu certificate details. * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. + * @param gpuCert Reference in which to return the gpu certificate information * * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_SUCCESS if \a gpu certificate info has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, + nvmlConfComputeGpuCertificate_t *gpuCert); /** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * Get Conf Computing Gpu attestation report. * - * For Kepler &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into + * @param device The identifier of the target device + * @param gpuAtstReport Reference in which to return the gpu attestation report * * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. + * - \ref NVML_SUCCESS if \a gpu attestation report has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, + nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); +/** + * Get Conf Computing key rotation threshold detail. + * + * %HOPPER_OR_NEWER% + * Supported on Linux, Windows TCC. + * + * @param pKeyRotationThrInfo Reference in which to return the key rotation threshold data + * + * @return + * - \ref NVML_SUCCESS if \a gpu key rotation threshold info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeKeyRotationThresholdInfo( + nvmlConfComputeGetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); + +/** + * Get Conf Computing System Settings. + * + * %HOPPER_OR_NEWER% + * Supported on Linux, Windows TCC. + * + * @param settings System CC settings + * + * @return + * - \ref NVML_SUCCESS if the query is success + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counters is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeSettings(nvmlSystemConfComputeSettings_t *settings); + +/** + * Retrieve GSP firmware version. + * + * The caller passes in buffer via \a version and corresponding GSP firmware numbered version + * is returned with the same parameter in string format. + * + * @param device Device handle + * @param version The retrieved GSP firmware version + * + * @return + * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); + +/** + * Retrieve GSP firmware mode. + * + * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with + * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * + * @param device Device handle + * @param isEnabled Pointer to specify if GSP firmware is enabled + * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * + * @return + * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Queries the state of per process accounting mode. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlDeviceGetAccountingStats for more details. + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Queries process's accounting stats. + * + * For Kepler &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. + * Accounting stats can be queried during life time of the process and after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * + * @param device The identifier of the target device + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL + * - \ref NVML_ERROR_NOT_FOUND if process stats were not found + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Queries list of processes that can be queried for accounting stats. The list of processes returned + * can be in running or terminated state. + * + * For Kepler &tm; or newer fully supported devices. + * + * To just query the number of processes ready to be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlDeviceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to + * expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); + +/** + * Returns the number of processes that the circular buffer with accounting pids can hold. + * + * For Kepler &tm; or newer fully supported devices. + * + * This is the maximum number of processes that accounting information will be stored for before information + * about oldest processes will get overwritten by information about new processes. + * + * @param device The identifier of the target device + * @param bufferSize Reference in which to provide the size (in number of elements) + * of the circular buffer for accounting stats. + * + * @return + * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingStats + * @see nvmlDeviceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); + +/** @} */ + +/** @addtogroup nvmlDeviceQueries + * @{ + */ + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or * \a addresses is NULL @@ -5543,7 +6458,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageReti * The address information provided from this API is the hardware address of the page that was retired. Note * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 * - * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's + * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's * retirement. * * For Kepler &tm; or newer fully supported devices. @@ -5643,6 +6558,119 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvml */ nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); +/** + * Retrieves the frequency monitor fault status for the device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root user. + * + * See \ref nvmlClkMonStatus_t for details on decoding the status output. + * + * @param device The identifier of the target device + * @param status Reference in which to return the clkmon fault status + * + * @return + * - \ref NVML_SUCCESS if \a status has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetClkMonStatus() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. If no valid sample entries are found since the lastSeenTimeStamp, NVML_ERROR_NOT_FOUND + * is returned. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); + +/** + * Retrieves the recent utilization and process ID for all running processes + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder, jpeg decoder, OFA (Optical Flow Accelerator) + * for all running processes. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a procesesUtilInfo->procUtilArray. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. + * + * The caller should allocate a buffer of size processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t). If the buffer is too small, the API will + * return \a NVML_ERROR_INSUFFICIENT_SIZE, with the recommended minimal buffer size at \a procesesUtilInfo->processSamplesCount. The caller should + * invoke the function again with the allocated buffer passed in \a procesesUtilInfo->procUtilArray, and \a procesesUtilInfo->processSamplesCount + * set to the number no less than the recommended value by the previous API return. + * + * On successful return, the function updates \a procesesUtilInfo->processSamplesCount with the number of process utilization info structures + * that were actually written. This may differ from a previously read value as instances are created or destroyed. + * + * \a procesesUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a procesesUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * \a procesesUtilInfo->version is the version number of the structure nvmlProcessesUtilizationInfo_t, the caller should set the correct version + * number to retrieve the specific version of processes utilization information. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param procesesUtilInfo Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t. + + * @return + * - \ref NVML_SUCCESS if \a procesesUtilInfo->procUtilArray has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a procesesUtilInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a procesesUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. + * The caller should check the minimul array size from the returned procesesUtilInfo->processSamplesCount, and call + * the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, nvmlProcessesUtilizationInfo_t *procesesUtilInfo); + /** @} */ /***************************************************************************************************/ @@ -5880,7 +6908,7 @@ typedef enum nvmlClockLimitId_enum { * Set clocks that device will lock to. * * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. - * Setting this will supercede application clock values and take effect regardless if a cuda app is running. + * Setting this will supersede application clock values and take effect regardless if a cuda app is running. * See /ref nvmlDeviceSetApplicationsClocks * * Can be used as a setting to request constant performance. @@ -5957,109 +6985,236 @@ nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); * After system reboot or driver reload applications clocks go back to their default value. * See \ref nvmlDeviceResetMemoryLockedClocks. * - * For Ampere &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minMemClockMHz Requested minimum memory clock in MHz + * @param maxMemClockMHz Requested maximum memory clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); + +/** + * Resets the memory clock to the default value + * + * This is the memory clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * @see nvmlDeviceSetMemoryLockedClocks + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); + +/** + * Set clocks that applications will lock to. + * + * Sets the clocks that compute and graphics applications will be running at. + * e.g. CUDA driver requests these clocks during context creation which means this property + * defines clocks at which CUDA applications will be running unless some overspec event + * occurs (e.g. over power, over thermal or external HW brake). + * + * Can be used as a setting to request constant performance. + * + * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * + * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call + * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting + * above the clock value being set. + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * for details on how to list available clocks combinations. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetApplicationsClocks. + * + * @param device The identifier of the target device + * @param memClockMHz Requested memory clock in MHz + * @param graphicsClockMHz Requested graphics clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); + +/** + * Resets the application clock to the default value + * + * This is the applications clock that will be used after system reboot or driver reload. + * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, + * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above + * base clocks as thermal limits allow. + * + * @see nvmlDeviceGetApplicationsClock + * @see nvmlDeviceSetApplicationsClocks + * + * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); + +/** + * Try to set the current state of Auto Boosted clocks on a device. + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * Non-root users may use this API by default but can be restricted by root from using this API by calling + * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. + * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. * * @param device The identifier of the target device - * @param minMemClockMHz Requested minimum memory clock in MHz - * @param maxMemClockMHz Requested maximum memory clock in MHz + * @param enabled What state to try to set Auto Boosted clocks of the target device to * * @return - * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * */ -nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); +nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); /** - * Resets the memory clock to the default value + * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will + * return to when no compute running processes (e.g. CUDA application which have an active context) are running * - * This is the memory clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. * - * @see nvmlDeviceSetMemoryLockedClocks + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. * - * For Ampere &tm; or newer fully supported devices. + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. * * @param device The identifier of the target device + * @param enabled What state to try to set default Auto Boosted clocks of the target device to + * @param flags Flags that change the default behavior. Currently Unused. * * @return - * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * */ -nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); /** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). + * Sets the speed of the fan control policy to default. * - * Can be used as a setting to request constant performance. + * For all cuda-capable discrete products with fans * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * @param device The identifier of the target device + * @param fan The index of the fan, starting at zero * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); + +/** + * Sets current fan control policy. * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. + * For Maxwell &tm; or newer fully supported devices. * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. + * Requires privileged user. * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. + * For all cuda-capable discrete products with fans * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz + * device The identifier of the target \a device + * policy The fan control \a policy to set * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * return + * NVML_SUCCESS if \a policy has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference + * a fan that exists. + * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell + * NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); +nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, + nvmlFanControlPolicy_t policy); /** - * Retrieves the frequency monitor fault status for the device. + * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. * - * For Ampere &tm; or newer fully supported devices. - * Requires root user. + * For Maxwell &tm; or newer fully supported devices. * - * See \ref nvmlClkMonStatus_t for details on decoding the status output. + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. * * @param device The identifier of the target device - * @param status Reference in which to return the clkmon fault status - * + * @param thresholdType The type of threshold value to be set + * @param temp Reference which hold the value to be set * @return - * - \ref NVML_SUCCESS if \a status has been set + * - \ref NVML_SUCCESS if \a temp has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetClkMonStatus() */ -nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); +nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); /** * Set new power limit of this device. @@ -6147,6 +7302,118 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuO */ nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); +/** + * Sets the speed of a specified fan. + * + * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor + * the temperature and adjust the fan speed accordingly. + * If you set the fan speed too low you can burn your GPU! + * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. + * + * For all cuda-capable discrete products with fans that are Maxwell or Newer. + * + * device The identifier of the target device + * fan The index of the fan, starting at zero + * speed The target speed of the fan [0-100] in % of max speed + * + * return + * NVML_SUCCESS if the fan speed has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, + * or if the fan index doesn't reference an actual fan. + * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. + * NVML_ERROR_UNKNOWN if there was an unexpected error. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); + +/** + * Set the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[in] offset The GPCCLK VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. + * @param[in] device The identifier of the target device + * @param[in] offset The MemClk VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Set Conf Computing Unprotected Memory Size. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device Handle + * @param sizeKiB Unprotected Memory size to be set in KiB + * + * @return + * - \ref NVML_SUCCESS if \a sizeKiB successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); + +/** + * Set Conf Computing GPUs ready state. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); + +/** + * Set Conf Computing key rotation threshold. + * + * %HOPPER_OR_NEWER% + * Supported on Linux, Windows TCC. + * + * This function is to set the confidential compute key rotation threshold parameters. + * @ref pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from + * NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. + * Default value is 60. + * + * @param pKeyRotationThrInfo Reference to the key rotation threshold data + * + * @return + * - \ref NVML_SUCCESS if \a key rotation threashold max attacker advantage has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeKeyRotationThresholdInfo( + nvmlConfComputeSetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); + /** * @} */ @@ -6824,118 +8091,206 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGp * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred */ -nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); +nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); + +/** + * This method is used to set the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a virtualMode is set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a virtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. + * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); + +/** + * Get the vGPU heterogeneous mode for the device. + * + * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. + * + * On successful return, the function returns \a pHeterogeneousMode->mode with the current vGPU heterogeneous mode. + * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should + * set the correct version number to retrieve the vGPU heterogeneous mode. + * \a pHeterogeneousMode->mode can either be \ref NVML_FEATURE_ENABLED or \ref NVML_FEATURE_DISABLED. + * + * @param device The identifier of the target device + * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature + * - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice_t device, nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); + +/** + * Enable or disable vGPU heterogeneous mode for the device. + * + * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. + * + * API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode + * set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of this API + * is expected to shutdown the vGPU VMs and retry setting the \a mode. + * On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode. + * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should + * set the correct version number to set the vGPU heterogeneous mode. + * + * @param device Identifier of the target device + * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid + * - \ref NVML_ERROR_IN_USE If the \a device is in use + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature + * - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice_t device, const nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); /** - * This method is used to set the virtualization mode corresponding to the GPU. + * Query the placement ID of active vGPU instance. * - * For Kepler &tm; or newer fully supported devices. + * When in vGPU heterogeneous mode, this function returns a valid placement ID as \a pPlacement->placementId + * else NVML_INVALID_VGPU_PLACEMENT_ID is returned. + * \a pPlacement->version is the version number of the structure nvmlVgpuPlacementId_t, the caller should + * set the correct version number to get placement id of the vGPU instance \a vgpuInstance. * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * @param vgpuInstance Identifier of the target vGPU instance + * @param pPlacement Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t * * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. + * - \ref NVML_SUCCESS If information is successfully retrieved + * - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL + * - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacement is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuPlacementId_t *pPlacement); /** - * Retrieve the vGPU Software licensable features. + * Query the supported vGPU placement ID of the vGPU type. * - * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. + * An array of supported vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the + * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be + * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned + * This function will return supported placement IDs even if GPU is not in vGPU heterogeneous mode. + * + * @param device Identifier of the target device + * @param vgpuTypeId Handle to vGPU type. The vGPU type ID + * @param pPlacementList Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t * * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacementList is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); /** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values - * are returned as "unsigned int" values. + * Query the creatable vGPU placement ID of the vGPU type. * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the + * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be + * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). + * The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the + * vGPU instance is running. * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. + * The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode. * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type. The vGPU type ID + * @param pPlacementList Pointer to the list of vGPU placement structure \a nvmlVgpuPlacementList_t * - * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_VERSION_MISMATCH If the version of \a pPlacementList is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); + +/** + * Retrieve the static GSP heap size of the vGPU type in bytes * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param vgpuTypeId Handle to vGPU type + * @param gspHeapSize Reference to return the GSP heap size value + * @return + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a gspHeapSize is NULL + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *gspHeapSize); +/** + * Retrieve the static framebuffer reservation of the vGPU type in bytes + * + * @param vgpuTypeId Handle to vGPU type + * @param fbReservation Reference to return the framebuffer reservation * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a fbReservation is NULL + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbReservation); /** - * Retrieve GSP firmware version. + * Set the desirable vGPU capability of a device * - * The caller passes in buffer via \a version and corresponding GSP firmware numbered version - * is returned with the same parameter in string format. + * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set. + * See \ref nvmlEnableState_t for available state. * - * @param device Device handle - * @param version The retrieved GSP firmware version + * @param device The identifier of the target device + * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be set + * @param state The target capability mode * * @return - * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a capability is invalid, or \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state, or \a device not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN On any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, nvmlEnableState_t state); /** - * Retrieve GSP firmware mode. + * Retrieve the vGPU Software licensable features. * - * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with - * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. * - * @param device Device handle - * @param isEnabled Pointer to specify if GSP firmware is enabled - * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned * * @return - * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); /** @} */ @@ -6972,14 +8327,14 @@ nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t ca * Retrieve the requested vGPU capability for GPU. * * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried. - * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability - * is supported. + * The return value in \a capResult reports a non-zero value indicating that the capability + * is supported, and also reports the capability's data based on the queried capability. * * For Maxwell &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is supported + * @param capResult Specifies that the queried capability is supported, and also returns capability's data * * @return * - \ref NVML_SUCCESS successful completion @@ -6997,7 +8352,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDevi * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount * is used to return the number of vGPU types written to the buffer. * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. @@ -7026,9 +8381,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned i * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable * list will be restricted to whatever vGPU type is already running on the device. * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. + * To query the number of vGPU types that can be created for the GPU, call this function with *vgpuCount = 0. * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. * * @param device The identifier of the target device @@ -7108,7 +8463,7 @@ nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTy * * @param vgpuTypeId Handle to vGPU type * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value + * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value * * @return * - \ref NVML_SUCCESS successful completion @@ -7247,10 +8602,10 @@ nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeI * Retrieve the active vGPU instances on a device. * * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances + * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances * written to the buffer. * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns + * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. @@ -7451,7 +8806,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, * @param encoderCapacity Reference to an unsigned int for the encoder capacity * * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived + * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system @@ -7504,7 +8859,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInst * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. @@ -7534,7 +8889,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuI * For Maxwell &tm; or newer fully supported devices. * * @param vgpuInstance Identifier of the target vGPU instance -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats +* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats * * @return * - \ref NVML_SUCCESS if \a fbcStats is fetched @@ -7552,7 +8907,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * -* If the supplied buffer is not large enough to accomodate the active session array, the function returns +* If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. * To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. @@ -7633,6 +8988,31 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance */ nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); +/** + * Retrieve the MDEV UUID of a vGPU instance. + * + * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * MDEV UUID is displayed only on KVM platform. + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); + /** @} */ /***************************************************************************************************/ @@ -7662,7 +9042,7 @@ typedef struct nvmlVgpuMetadata_st char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host unsigned int reserved[6]; //!< Reserved for internal use - unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld + unsigned int vgpuVirtualizationCaps; //!< vGPU virtualization capabilities bitfield unsigned int guestVgpuVersion; //!< vGPU version of guest driver unsigned int opaqueDataSize; //!< Size of opaque data field in bytes char opaqueData[4]; //!< Opaque data @@ -7676,7 +9056,7 @@ typedef struct nvmlVgpuPgpuMetadata_st unsigned int version; //!< Current version of the structure unsigned int revision; //!< Current revision of the structure char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld + unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield unsigned int reserved[5]; //!< Reserved for internal use nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver unsigned int opaqueDataSize; //!< Size of opaque data field in bytes @@ -7774,7 +9154,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpu * * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. + * with the physical GPU is limited, a limit code indicates the factor limiting compatability. * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). * * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to @@ -7833,6 +9213,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpu /** * Returns the vGPU scheduler state. + * The information returned in \a nvmlVgpuSchedulerGetState_t is not relevant if the BEST EFFORT policy is set. * * For Pascal &tm; or newer fully supported devices. * @@ -7847,31 +9228,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpu */ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState); -/** - * Sets the vGPU scheduler state. - * - * For Pascal &tm; or newer fully supported devices. - * - * The scheduler state change won’t persist across module load/unload. - * Scheduler state and params will be allowed to set only when no VM is running. - * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode=1 then - * provide avgFactorForARR and frequency as input. If enableARRMode is disabled - * then provide timeslice as input. - * - * @param device The identifier of the target \a device - * @param pSchedulerState vGPU \a pSchedulerState to set - * - * @return - * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid - * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, - * reboot is required to overcome from this error. - * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode - * or if any vGPU instance currently exists on the \a device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); - /** * Returns the vGPU scheduler capabilities. * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from @@ -7895,6 +9251,31 @@ nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVg */ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities); +/** + * Sets the vGPU scheduler state. + * + * For Pascal &tm; or newer fully supported devices. + * + * The scheduler state change won't persist across module load/unload. + * Scheduler state and params will be allowed to set only when no VM is running. + * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode is enabled then + * provide avgFactorForARR and frequency as input. If enableARRMode is disabled + * then provide timeslice as input. + * + * @param device The identifier of the target \a device + * @param pSchedulerState vGPU \a pSchedulerState to set + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid + * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, + * reboot is required to overcome from this error. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * or if any vGPU instance currently exists on the \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); + /* * Virtual GPU (vGPU) version * @@ -8012,6 +9393,52 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); +/** + * Retrieves recent utilization for vGPU instances running on a physical GPU (device). + * + * For Kepler &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for vGPU + * instances running on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied + * buffer pointed at by \a vgpuUtilInfo->vgpuUtilArray. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a vgpuUtilInfo->sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a vgpuUtilInfo->vgpuUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuUtilInfo->vgpuInstanceCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t). Invoke the function again with + * the allocated buffer passed in \a vgpuUtilInfo->vgpuUtilArray, and \a vgpuUtilInfo->vgpuInstanceCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuUtilInfo->vgpuInstanceCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * \a vgpuUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a vgpuUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param vgpuUtilInfo Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a vgpuUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. + * The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call + * the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice_t device, + nvmlVgpuInstancesUtilizationInfo_t *vgpuUtilInfo); + /** * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). * @@ -8058,6 +9485,52 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, unsigned int *vgpuProcessSamplesCount, nvmlVgpuProcessUtilizationSample_t *utilizationSamples); + +/** + * Retrieves recent utilization for processes running on vGPU instances on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for processes running + * on vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied + * buffer pointed at by \a vgpuProcUtilInfo->vgpuProcUtilArray. One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a vgpuProcUtilInfo->vgpuProcUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current processes' count + * running on vGPU instances in \a vgpuProcUtilInfo->vgpuProcessCount. The caller should allocate a buffer of size + * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a vgpuProcUtilInfo->vgpuProcUtilArray, and \a vgpuProcUtilInfo->vgpuProcessCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a vgpuProcUtilInfo->vgpuProcessCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * vgpuProcUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set vgpuProcUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param vgpuProcUtilInfo Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuProcUtilInfo is null + * - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a vgpuProcUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount + * is too small to return samples for all processes on vGPU instances currently executing on the device. + * The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, + * and call the function again with a buffer of size + * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice_t device, nvmlVgpuProcessesUtilizationInfo_t *vgpuProcUtilInfo); + /** * Queries the state of per process accounting mode on vGPU. * @@ -8269,6 +9742,22 @@ nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlEx #define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 #define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA +/** + * MIG GPU instance profile capability. + * + * Bit field values representing MIG profile capabilities + * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities + */ +#define NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P 0x1 + +/** + * MIG compute instance profile capability. + * + * Bit field values representing MIG profile capabilities + * \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities + */ +/* No capabilities for compute profiles currently exposed */ + typedef struct nvmlGpuInstancePlacement_st { unsigned int start; //!< Index of first occupied memory slice @@ -8303,9 +9792,36 @@ typedef struct nvmlGpuInstanceProfileInfo_st */ typedef struct nvmlGpuInstanceProfileInfo_v2_st { - unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlGpuInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. + */ +#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) + +/** + * GPU instance profile information (v3). + * + * Version 3 removes isP2pSupported field and adds the \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities + * field \ref nvmlGpuInstanceProfileInfo_t. + */ +typedef struct nvmlGpuInstanceProfileInfo_v3_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v3) unsigned int id; //!< Unique profile ID within the device - unsigned int isP2pSupported; //!< Peer-to-Peer support unsigned int sliceCount; //!< GPU Slice count unsigned int instanceCount; //!< GPU instance count unsigned int multiprocessorCount; //!< Streaming Multiprocessor count @@ -8316,12 +9832,13 @@ typedef struct nvmlGpuInstanceProfileInfo_v2_st unsigned int ofaCount; //!< OFA Engine count unsigned long long memorySizeMB; //!< Memory size in MBytes char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name -} nvmlGpuInstanceProfileInfo_v2_t; + unsigned int capabilities; //!< Additional capabilities +} nvmlGpuInstanceProfileInfo_v3_t; /** - * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v3_t.version. */ -#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) +#define nvmlGpuInstanceProfileInfo_v3 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 3) typedef struct nvmlGpuInstanceInfo_st { @@ -8342,13 +9859,13 @@ typedef struct * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the * detailed information about a compute instance such as profile ID, engine counts */ -#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 -#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 -#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 -#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 -#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 -#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 -#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 +#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 #define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 #define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 @@ -8405,6 +9922,33 @@ typedef struct nvmlComputeInstanceProfileInfo_v2_st */ #define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) +/** + * Compute instance profile information (v3). + * + * Version 3 adds the \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities field + * \ref nvmlComputeInstanceProfileInfo_t. + */ +typedef struct nvmlComputeInstanceProfileInfo_v3_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v3) + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name + unsigned int capabilities; //!< Additional capabilities +} nvmlComputeInstanceProfileInfo_v3_t; + +/** + * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v3_t.version. + */ +#define nvmlComputeInstanceProfileInfo_v3 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 3) + typedef struct nvmlComputeInstanceInfo_st { nvmlDevice_t device; //!< Parent device @@ -8477,7 +10021,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); /** - * Get GPU instance profile information. + * Get GPU instance profile information * * Information provided by this API is immutable throughout the lifetime of a MIG mode. * @@ -8492,7 +10036,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *cur * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profile isn't supported * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, @@ -8533,7 +10077,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, u * Get GPU instance placements. * * A placement represents the location of a GPU instance within a device. This API only returns all the possible - * placements for the given profile. + * placements for the given profile regardless of whether MIG is enabled or not. * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will * fail if there is overlap with the already occupied memory slices. * @@ -8552,7 +10096,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, u * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profileId isn't supported * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, @@ -8946,365 +10490,157 @@ nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuIns * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, - nvmlComputeInstance_t *computeInstance); - -/** - * Get compute instance information. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param computeInstance The compute instance handle - * @param info Return compute instance information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); - -/** - * Test if the given handle refers to a MIG device. - * - * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. - * These overloaded references can be used (with some restrictions) interchangeably - * with a GPU device handle to execute queries at a per-compute instance granularity. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device NVML handle to test - * @param isMigDevice True when handle refers to a MIG device - * - * @return - * - \ref NVML_SUCCESS if \a device status was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); - -/** - * Get GPU instance ID for the given MIG device handle. - * - * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device Target MIG device handle - * @param id GPU instance ID - * - * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); - -/** - * Get compute instance ID for the given MIG device handle. - * - * Compute instance IDs are unique per GPU instance and remain valid until the compute instance - * is destroyed. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device Target MIG device handle - * @param id Compute instance ID - * - * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); - -/** - * Get the maximum number of MIG devices that can exist under a given parent NVML device. - * - * Returns zero if MIG is not supported or enabled. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device Target device handle - * @param count Count of MIG devices - * - * @return - * - \ref NVML_SUCCESS if \a count was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); - -/** - * Get MIG device handle for the given index under its parent NVML device. - * - * If the compute instance is destroyed either explicitly or by destroying, - * resetting or unbinding the parent GPU instance or the GPU device itself - * the MIG device handle would remain invalid and must be requested again - * using this API. Handles may be reused and their properties can change in - * the process. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device Reference to the parent GPU device handle - * @param index Index of the MIG device - * @param migDevice Reference to the MIG device handle - * - * @return - * - \ref NVML_SUCCESS if \a migDevice handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, - nvmlDevice_t *migDevice); - -/** - * Get parent device handle from a MIG device handle. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param migDevice MIG device handle - * @param device Device handle - * - * @return - * - \ref NVML_SUCCESS if \a device handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); - -/** - * Get the type of the GPU Bus (PCIe, PCI, ...) - * - * @param device The identifier of the target device - * @param type The PCI Bus type - * - * return - * - \ref NVML_SUCCESS if the bus \a type is successfully retreived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \device is invalid or \type is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); - -/** - * Retrieve performance monitor samples from the associated subdevice. - * - * @param device - * @param pDynamicPstatesInfo - * - * @return - * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); - -/** - * Sets the speed of a specified fan. - * - * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor - * the temperature and adjust the fan speed accordingly. - * If you set the fan speed too low you can burn your GPU! - * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. - * - * For all cuda-capable discrete products with fans that are Maxwell or Newer. - * - * device The identifier of the target device - * fan The index of the fan, starting at zero - * speed The target speed of the fan [0-100] in % of max speed - * - * return - * NVML_SUCCESS if the fan speed has been set - * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, - * or if the fan index doesn't reference an actual fan. - * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. - * NVML_ERROR_UNKNOWN if there was an unexpected error. - */ -nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); - -/** - * Retrieve the GPCCLK VF offset value - * @param[in] device The identifier of the target device - * @param[out] offset The retrieved GPCCLK VF offset value - * - * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); - -/** - * Set the GPCCLK VF offset value - * @param[in] device The identifier of the target device - * @param[in] offset The GPCCLK VF offset value to set - * - * @return - * - \ref NVML_SUCCESS if \a offset has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, + nvmlComputeInstance_t *computeInstance); /** - * Retrieve the MemClk (Memory Clock) VF offset value. - * @param[in] device The identifier of the target device - * @param[out] offset The retrieved MemClk VF offset value + * Get compute instance information. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param computeInstance The compute instance handle + * @param info Return compute instance information * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); /** - * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. - * @param[in] device The identifier of the target device - * @param[in] offset The MemClk VF offset value to set + * Test if the given handle refers to a MIG device. + * + * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. + * These overloaded references can be used (with some restrictions) interchangeably + * with a GPU device handle to execute queries at a per-compute instance granularity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device NVML handle to test + * @param isMigDevice True when handle refers to a MIG device * * @return - * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_SUCCESS if \a device status was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); +nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); /** - * Retrieve min and max clocks of some clock domain for a given PState + * Get GPU instance ID for the given MIG device handle. * - * @param device The identifier of the target device - * @param type Clock domain - * @param pstate PState to query - * @param minClockMHz Reference in which to return min clock frequency - * @param maxClockMHz Reference in which to return max clock frequency + * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id GPU instance ID * * @return - * - \ref NVML_SUCCESS if everything worked + * - \ref NVML_SUCCESS if instance ID was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both - * \a minClockMHz and \a maxClockMHz are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, - unsigned int * minClockMHz, unsigned int * maxClockMHz); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); /** - * Get all supported Performance States (P-States) for the device. + * Get compute instance ID for the given MIG device handle. * - * The returned array would contain a contiguous list of valid P-States supported by - * the device. If the number of supported P-States is fewer than the size of the array - * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * Compute instance IDs are unique per GPU instance and remain valid until the compute instance + * is destroyed. * - * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. * - * @param device The identifier of the target device - * @param pstates Container to return the list of performance states - * supported by device - * @param size Size of the supplied \a pstates array in bytes + * @param device Target MIG device handle + * @param id Compute instance ID * * @return - * - \ref NVML_SUCCESS if \a pstates array has been retrieved - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the the container supplied was not large enough to - * hold the resulting list + * - \ref NVML_SUCCESS if instance ID was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, - nvmlPstates_t *pstates, unsigned int size); +nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); /** - * Retrieve the GPCCLK min max VF offset value. - * @param[in] device The identifier of the target device - * @param[out] minOffset The retrieved GPCCLK VF min offset value - * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * Get the maximum number of MIG devices that can exist under a given parent NVML device. + * + * Returns zero if MIG is not supported or enabled. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target device handle + * @param count Count of MIG devices * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_SUCCESS if \a count was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, - int *minOffset, int *maxOffset); +nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); /** - * Retrieve the MemClk (Memory Clock) min max VF offset value. - * @param[in] device The identifier of the target device - * @param[out] minOffset The retrieved MemClk VF min offset value - * @param[out] maxOffset The retrieved MemClk VF max offset value + * Get MIG device handle for the given index under its parent NVML device. + * + * If the compute instance is destroyed either explicitly or by destroying, + * resetting or unbinding the parent GPU instance or the GPU device itself + * the MIG device handle would remain invalid and must be requested again + * using this API. Handles may be reused and their properties can change in + * the process. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Reference to the parent GPU device handle + * @param index Index of the MIG device + * @param migDevice Reference to the MIG device handle * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_SUCCESS if \a migDevice handle was successfully created * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, - int *minOffset, int *maxOffset); +nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, + nvmlDevice_t *migDevice); /** - * Get fabric information associated with the device. - * - * %HOPPER_OR_NEWER% + * Get parent device handle from a MIG device handle. * - * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager - * Upon successful registration, the GPU is added to the NVLink fabric to enable - * peer-to-peer communication. - * This API reports the current state of the GPU in the NVLink fabric - * along with other useful information. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. * - * @param device The identifier of the target device - * @param gpuFabricInfo Information about GPU fabric state + * @param migDevice MIG device handle + * @param device Device handle * * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric + * - \ref NVML_SUCCESS if \a device handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); +nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); + +/** @} */ // @defgroup nvmlMultiInstanceGPU -/** @} */ /***************************************************************************************************/ /** @defgroup GPM NVML GPM @@ -9316,81 +10652,81 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabr */ /***************************************************************************************************/ -/* GPM Metric Identifiers */ +/** + * GPM Metric Identifiers + */ typedef enum { - NVML_GPM_METRIC_GRAPHICS_UTIL = 1, /* Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 */ - NVML_GPM_METRIC_SM_UTIL = 2, /* Percentage of SMs that were busy. 0.0 - 100.0 */ - NVML_GPM_METRIC_SM_OCCUPANCY = 3, /* Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 */ - NVML_GPM_METRIC_INTEGER_UTIL = 4, /* Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, /* Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, /* Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, /* Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, /* Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_DRAM_BW_UTIL = 10, /* Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP64_UTIL = 11, /* Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP32_UTIL = 12, /* Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP16_UTIL = 13, /* Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, /* PCIe traffic from this GPU in MiB/sec */ - NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, /* PCIe traffic to this GPU in MiB/sec */ - NVML_GPM_METRIC_NVDEC_0_UTIL = 30, /* Percent utilization of NVDEC 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_1_UTIL = 31, /* Percent utilization of NVDEC 1. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_2_UTIL = 32, /* Percent utilization of NVDEC 2. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_3_UTIL = 33, /* Percent utilization of NVDEC 3. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_4_UTIL = 34, /* Percent utilization of NVDEC 4. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_5_UTIL = 35, /* Percent utilization of NVDEC 5. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_6_UTIL = 36, /* Percent utilization of NVDEC 6. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_7_UTIL = 37, /* Percent utilization of NVDEC 7. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_0_UTIL = 40, /* Percent utilization of NVJPG 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_1_UTIL = 41, /* Percent utilization of NVJPG 1. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_2_UTIL = 42, /* Percent utilization of NVJPG 2. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_3_UTIL = 43, /* Percent utilization of NVJPG 3. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_4_UTIL = 44, /* Percent utilization of NVJPG 4. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_5_UTIL = 45, /* Percent utilization of NVJPG 5. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_6_UTIL = 46, /* Percent utilization of NVJPG 6. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_7_UTIL = 47, /* Percent utilization of NVJPG 7. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVOFA_0_UTIL = 50, /* Percent utilization of NVOFA 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, /* NvLink read bandwidth for all links in MiB/sec */ - NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, /* NvLink write bandwidth for all links in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, /* NvLink read bandwidth for link 0 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, /* NvLink write bandwidth for link 0 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, /* NvLink read bandwidth for link 1 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, /* NvLink write bandwidth for link 1 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, /* NvLink read bandwidth for link 2 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, /* NvLink write bandwidth for link 2 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, /* NvLink read bandwidth for link 3 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, /* NvLink write bandwidth for link 3 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, /* NvLink read bandwidth for link 4 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, /* NvLink write bandwidth for link 4 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, /* NvLink read bandwidth for link 5 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, /* NvLink write bandwidth for link 5 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, /* NvLink read bandwidth for link 6 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, /* NvLink write bandwidth for link 6 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, /* NvLink read bandwidth for link 7 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, /* NvLink write bandwidth for link 7 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, /* NvLink read bandwidth for link 8 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, /* NvLink write bandwidth for link 8 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, /* NvLink read bandwidth for link 9 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, /* NvLink write bandwidth for link 9 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, /* NvLink read bandwidth for link 10 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, /* NvLink write bandwidth for link 10 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, /* NvLink read bandwidth for link 11 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, /* NvLink write bandwidth for link 11 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, /* NvLink read bandwidth for link 12 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, /* NvLink write bandwidth for link 12 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, /* NvLink read bandwidth for link 13 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, /* NvLink write bandwidth for link 13 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, /* NvLink read bandwidth for link 14 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, /* NvLink write bandwidth for link 14 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, /* NvLink read bandwidth for link 15 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, /* NvLink write bandwidth for link 15 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, /* NvLink read bandwidth for link 16 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, /* NvLink write bandwidth for link 16 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, /* NvLink read bandwidth for link 17 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, /* NvLink write bandwidth for link 17 in MiB/sec */ - NVML_GPM_METRIC_MAX = 98, /* Maximum value above +1. Note that changing this - should also change NVML_GPM_METRICS_GET_VERSION - due to struct size change */ + NVML_GPM_METRIC_GRAPHICS_UTIL = 1, //!< Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 + NVML_GPM_METRIC_SM_UTIL = 2, //!< Percentage of SMs that were busy. 0.0 - 100.0 + NVML_GPM_METRIC_SM_OCCUPANCY = 3, //!< Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 + NVML_GPM_METRIC_INTEGER_UTIL = 4, //!< Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 + NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, //!< Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, //!< Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, //!< Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, //!< Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_DRAM_BW_UTIL = 10, //!< Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP64_UTIL = 11, //!< Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 + NVML_GPM_METRIC_FP32_UTIL = 12, //!< Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 + NVML_GPM_METRIC_FP16_UTIL = 13, //!< Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 + NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, //!< PCIe traffic from this GPU in MiB/sec + NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, //!< PCIe traffic to this GPU in MiB/sec + NVML_GPM_METRIC_NVDEC_0_UTIL = 30, //!< Percent utilization of NVDEC 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_1_UTIL = 31, //!< Percent utilization of NVDEC 1. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_2_UTIL = 32, //!< Percent utilization of NVDEC 2. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_3_UTIL = 33, //!< Percent utilization of NVDEC 3. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_4_UTIL = 34, //!< Percent utilization of NVDEC 4. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_5_UTIL = 35, //!< Percent utilization of NVDEC 5. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_6_UTIL = 36, //!< Percent utilization of NVDEC 6. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_7_UTIL = 37, //!< Percent utilization of NVDEC 7. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_0_UTIL = 40, //!< Percent utilization of NVJPG 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_1_UTIL = 41, //!< Percent utilization of NVJPG 1. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_2_UTIL = 42, //!< Percent utilization of NVJPG 2. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_3_UTIL = 43, //!< Percent utilization of NVJPG 3. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_4_UTIL = 44, //!< Percent utilization of NVJPG 4. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_5_UTIL = 45, //!< Percent utilization of NVJPG 5. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_6_UTIL = 46, //!< Percent utilization of NVJPG 6. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_7_UTIL = 47, //!< Percent utilization of NVJPG 7. 0.0 - 100.0 + NVML_GPM_METRIC_NVOFA_0_UTIL = 50, //!< Percent utilization of NVOFA 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, //!< NvLink read bandwidth for all links in MiB/sec + NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, //!< NvLink write bandwidth for all links in MiB/sec + NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, //!< NvLink read bandwidth for link 0 in MiB/sec + NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, //!< NvLink write bandwidth for link 0 in MiB/sec + NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, //!< NvLink read bandwidth for link 1 in MiB/sec + NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, //!< NvLink write bandwidth for link 1 in MiB/sec + NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, //!< NvLink read bandwidth for link 2 in MiB/sec + NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, //!< NvLink write bandwidth for link 2 in MiB/sec + NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, //!< NvLink read bandwidth for link 3 in MiB/sec + NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, //!< NvLink write bandwidth for link 3 in MiB/sec + NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, //!< NvLink read bandwidth for link 4 in MiB/sec + NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, //!< NvLink write bandwidth for link 4 in MiB/sec + NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, //!< NvLink read bandwidth for link 5 in MiB/sec + NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, //!< NvLink write bandwidth for link 5 in MiB/sec + NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, //!< NvLink read bandwidth for link 6 in MiB/sec + NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, //!< NvLink write bandwidth for link 6 in MiB/sec + NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, //!< NvLink read bandwidth for link 7 in MiB/sec + NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, //!< NvLink write bandwidth for link 7 in MiB/sec + NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, //!< NvLink read bandwidth for link 8 in MiB/sec + NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, //!< NvLink write bandwidth for link 8 in MiB/sec + NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, //!< NvLink read bandwidth for link 9 in MiB/sec + NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, //!< NvLink write bandwidth for link 9 in MiB/sec + NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, //!< NvLink read bandwidth for link 10 in MiB/sec + NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, //!< NvLink write bandwidth for link 10 in MiB/sec + NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, //!< NvLink read bandwidth for link 11 in MiB/sec + NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, //!< NvLink write bandwidth for link 11 in MiB/sec + NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, //!< NvLink read bandwidth for link 12 in MiB/sec + NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, //!< NvLink write bandwidth for link 12 in MiB/sec + NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, //!< NvLink read bandwidth for link 13 in MiB/sec + NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, //!< NvLink write bandwidth for link 13 in MiB/sec + NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, //!< NvLink read bandwidth for link 14 in MiB/sec + NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, //!< NvLink write bandwidth for link 14 in MiB/sec + NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, //!< NvLink read bandwidth for link 15 in MiB/sec + NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, //!< NvLink write bandwidth for link 15 in MiB/sec + NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, //!< NvLink read bandwidth for link 16 in MiB/sec + NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, //!< NvLink write bandwidth for link 16 in MiB/sec + NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, //!< NvLink read bandwidth for link 17 in MiB/sec + NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, //!< NvLink write bandwidth for link 17 in MiB/sec + NVML_GPM_METRIC_MAX = 98, //!< Maximum value above +1. Note that changing this should also change NVML_GPM_METRICS_GET_VERSION due to struct size change } nvmlGpmMetricId_t; /** @} */ // @defgroup nvmlGpmEnums @@ -9402,8 +10738,9 @@ typedef enum */ /***************************************************************************************************/ -/* Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc() - Free this with nvmlGpmSampleFree() */ +/** + * Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc(). Free this with nvmlGpmSampleFree(). + */ typedef struct { struct nvmlGpmSample_st* handle; @@ -9415,30 +10752,38 @@ typedef struct { char *unit; } nvmlGpmMetricMetricInfo_t; +/** + * GPM metric information. + */ typedef struct { - unsigned int metricId; /* IN: NVML_GPM_METRIC_? #define of which metric to retrieve */ - nvmlReturn_t nvmlReturn; /* OUT: Status of this metric. If this is nonzero, then value is not valid */ - double value; /* OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) */ - nvmlGpmMetricMetricInfo_t metricInfo; /* OUT: Metric name and unit. Those can be NULL if not defined */ + unsigned int metricId; //!< IN: NVML_GPM_METRIC_? #define of which metric to retrieve + nvmlReturn_t nvmlReturn; //!< OUT: Status of this metric. If this is nonzero, then value is not valid + double value; //!< OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) + nvmlGpmMetricMetricInfo_t metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined } nvmlGpmMetric_t; +/** + * GPM buffer information. + */ typedef struct { - unsigned int version; /* IN: Set to NVML_GPM_METRICS_GET_VERSION */ - unsigned int numMetrics; /* IN: How many metrics to retrieve in metrics[] */ - nvmlGpmSample_t sample1; /* IN: Sample buffer */ - nvmlGpmSample_t sample2; /* IN: Sample buffer */ - nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; /* IN/OUT: Array of metrics. Set metricId on call. - see nvmlReturn and value on return */ + unsigned int version; //!< IN: Set to NVML_GPM_METRICS_GET_VERSION + unsigned int numMetrics; //!< IN: How many metrics to retrieve in metrics[] + nvmlGpmSample_t sample1; //!< IN: Sample buffer + nvmlGpmSample_t sample2; //!< IN: Sample buffer + nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; //!< IN/OUT: Array of metrics. Set metricId on call. See nvmlReturn and value on return } nvmlGpmMetricsGet_t; #define NVML_GPM_METRICS_GET_VERSION 1 +/** + * GPM device information. + */ typedef struct { - unsigned int version; /* IN: Set to NVML_GPM_SUPPORT_VERSION */ - unsigned int isSupportedDevice; /* OUT: Indicates device support */ + unsigned int version; //!< IN: Set to NVML_GPM_SUPPORT_VERSION + unsigned int isSupportedDevice; //!< OUT: Indicates device support } nvmlGpmSupport_t; #define NVML_GPM_SUPPORT_VERSION 1 @@ -9454,10 +10799,9 @@ typedef struct /** * Calculate GPM metrics from two samples. * + * For Hopper &tm; or newer fully supported devices. * - * @param metricsGet IN/OUT: populated nvmlGpmMetricsGet_t struct - * - * %HOPPER_OR_NEWER% + * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct * * @return * - \ref NVML_SUCCESS on success @@ -9469,7 +10813,7 @@ nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); /** * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param gpmSample Sample to free * @@ -9484,7 +10828,7 @@ nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); * Allocate a sample buffer to be used with NVML GPM . You will need to allocate * at least two of these buffers to use with the NVML GPM feature * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param gpmSample Where the allocated sample will be stored * @@ -9500,7 +10844,7 @@ nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); * two samples are gathered, you can call nvmlGpmMetricGet on those samples to * retrive metrics * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param device Device to get samples for * @param gpmSample Buffer to read samples into @@ -9517,7 +10861,7 @@ nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSa * After two samples are gathered, you can call nvmlGpmMetricGet on those * samples to retrive metrics * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param device Device to get samples for * @param gpuInstanceId MIG GPU Instance ID @@ -9533,7 +10877,7 @@ nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuIn * Indicate whether the supplied device supports GPM * * @param device NVML device to query for - * @param gpmSupport Structure to indicate GPM support. Indicates + * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates * GPM support per system for the supplied device * * @return @@ -9542,58 +10886,45 @@ nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuIn */ nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); -/** @} */ // @defgroup nvmlGpmFunctions -/** @} */ // @defgroup GPM - -/***************************************************************************************************/ -/** @defgroup nvmlDevice definitions related to Counter Collection Unit - * @{ - */ -/***************************************************************************************************/ - -/* CCU Stream State */ -#define NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE 0 -#define NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE 1 - +/* GPM Stream State */ /** - * Get counter collection unit stream state. + * Get GPM stream state. * * %HOPPER_OR_NEWER% * Supported on Linux, Windows TCC. * * @param device The identifier of the target device - * @param state Returns counter collection unit stream state - * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE or - * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE + * @param state Returns GPM stream state + * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED * * @return - * - \ref NVML_SUCCESS if \a current counter collection unit stream state were successfully queried + * - \ref NVML_SUCCESS if \a current GPM stream state were successfully queried * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a state is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceCcuGetStreamState(nvmlDevice_t device, unsigned int *state); +nvmlReturn_t DECLDIR nvmlGpmQueryIfStreamingEnabled(nvmlDevice_t device, unsigned int *state); /** - * Set counter collection unit stream state. + * Set GPM stream state. * * %HOPPER_OR_NEWER% * Supported on Linux, Windows TCC. * * @param device The identifier of the target device - * @param state Counter collection unit stream state, - * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE or - * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE + * @param state GPM stream state, + * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED * * @return - * - \ref NVML_SUCCESS if \a current counter collection unit stream state is successfully set + * - \ref NVML_SUCCESS if \a current GPM stream state is successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceCcuSetStreamState(nvmlDevice_t device, unsigned int state); +nvmlReturn_t DECLDIR nvmlGpmSetStreamingEnabled(nvmlDevice_t device, unsigned int state); -/** @} */ // @defgroup CCU +/** @} */ // @defgroup nvmlGpmFunctions +/** @} */ // @defgroup GPM #define NVML_NVLINK_POWER_STATE_HIGH_SPEED 0x0 #define NVML_NVLINK_POWER_STATE_LOW 0x1 @@ -9626,6 +10957,87 @@ typedef struct nvmlNvLinkPowerThres_st **/ nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info); +/** + * Set the global nvlink bandwith mode + * + * @param nvlinkBwMode nvlink bandwidth mode + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided + * - \ref NVML_ERROR_IN_USE if P2P object exists + * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. + * - \ref NVML_ERROR_NO_PERMISSION if not root user + */ +nvmlReturn_t DECLDIR nvmlSystemSetNvlinkBwMode(unsigned int nvlinkBwMode); + +/** + * Get the global nvlink bandwith mode + * + * @param nvlinkBwMode reference of nvlink bandwidth mode + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. + * - \ref NVML_ERROR_NO_PERMISSION if not root user + */ +nvmlReturn_t DECLDIR nvmlSystemGetNvlinkBwMode(unsigned int *nvlinkBwMode); + +/** + * Set new power limit of this device. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * See \ref nvmlPowerValue_v2_t for more information on the struct. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. + * + * @param device The identifier of the target device + * @param powerValue Power management limit in milliwatts to set + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see NVML_FI_DEV_POWER_AVERAGE + * @see NVML_FI_DEV_POWER_INSTANT + * @see NVML_FI_DEV_POWER_MIN_LIMIT + * @see NVML_FI_DEV_POWER_MAX_LIMIT + * @see NVML_FI_DEV_POWER_CURRENT_LIMIT + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); + +/** + * Get SRAM ECC error status of this device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct. + * + * @param device The identifier of the target device + * @param status Returns SRAM ECC error status + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counters is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_VERSION_MISMATCH if the version of \a nvmlEccSramErrorStatus_t is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device, + nvmlEccSramErrorStatus_t *status); /** * NVML API versioning support */ @@ -9653,7 +11065,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); - #endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS #if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) @@ -9679,6 +11090,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInsta #undef nvmlGetBlacklistDeviceInfoByIndex #undef nvmlDeviceGetGpuInstancePossiblePlacements #undef nvmlVgpuInstanceGetLicenseInfo +#undef nvmlDeviceSetPowerManagementLimit #endif diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/system.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/system.go index 03145b288..bee396415 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/system.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/system.go @@ -79,3 +79,60 @@ func (l *library) SystemGetTopologyGpuSet(cpuNumber int) ([]Device, Return) { ret = nvmlSystemGetTopologyGpuSet(uint32(cpuNumber), &count, &deviceArray[0]) return convertSlice[nvmlDevice, Device](deviceArray), ret } + +// nvml.SystemGetConfComputeCapabilities() +func (l *library) SystemGetConfComputeCapabilities() (ConfComputeSystemCaps, Return) { + var capabilities ConfComputeSystemCaps + ret := nvmlSystemGetConfComputeCapabilities(&capabilities) + return capabilities, ret +} + +// nvml.SystemGetConfComputeState() +func SystemGetConfComputeState() (ConfComputeSystemState, Return) { + var state ConfComputeSystemState + ret := nvmlSystemGetConfComputeState(&state) + return state, ret +} + +// nvml.SystemGetConfComputeGpusReadyState() +func SystemGetConfComputeGpusReadyState() (uint32, Return) { + var isAcceptingWork uint32 + ret := nvmlSystemGetConfComputeGpusReadyState(&isAcceptingWork) + return isAcceptingWork, ret +} + +// nvml.SystemSetConfComputeGpusReadyState() +func SystemSetConfComputeGpusReadyState(isAcceptingWork uint32) Return { + return nvmlSystemSetConfComputeGpusReadyState(isAcceptingWork) +} + +// nvml.SystemSetNvlinkBwMode() +func SystemSetNvlinkBwMode(nvlinkBwMode uint32) Return { + return nvmlSystemSetNvlinkBwMode(nvlinkBwMode) +} + +// nvml.SystemGetNvlinkBwMode() +func SystemGetNvlinkBwMode() (uint32, Return) { + var nvlinkBwMode uint32 + ret := nvmlSystemGetNvlinkBwMode(&nvlinkBwMode) + return nvlinkBwMode, ret +} + +// nvml.SystemGetConfComputeKeyRotationThresholdInfo() +func (l *library) SystemGetConfComputeKeyRotationThresholdInfo() (ConfComputeGetKeyRotationThresholdInfo, Return) { + var keyRotationThresholdInfo ConfComputeGetKeyRotationThresholdInfo + ret := nvmlSystemGetConfComputeKeyRotationThresholdInfo(&keyRotationThresholdInfo) + return keyRotationThresholdInfo, ret +} + +// nvml.SystemGetConfComputeSettings() +func (l *library) SystemGetConfComputeSettings() (SystemConfComputeSettings, Return) { + var settings SystemConfComputeSettings + ret := nvmlSystemGetConfComputeSettings(&settings) + return settings, ret +} + +// nvml.SystemSetConfComputeKeyRotationThresholdInfo() +func (l *library) SystemSetConfComputeKeyRotationThresholdInfo(keyRotationThresholdInfo ConfComputeSetKeyRotationThresholdInfo) Return { + return nvmlSystemSetConfComputeKeyRotationThresholdInfo(&keyRotationThresholdInfo) +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go index 6a57bab3b..6ee33a6ab 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go @@ -9,6 +9,30 @@ type nvmlDevice struct { Handle *_Ctype_struct_nvmlDevice_st } +type PciInfoExt_v1 struct { + Version uint32 + Domain uint32 + Bus uint32 + Device uint32 + PciDeviceId uint32 + PciSubSystemId uint32 + BaseClass uint32 + SubClass uint32 + BusId [32]int8 +} + +type PciInfoExt struct { + Version uint32 + Domain uint32 + Bus uint32 + Device uint32 + PciDeviceId uint32 + PciSubSystemId uint32 + BaseClass uint32 + SubClass uint32 + BusId [32]int8 +} + type PciInfo struct { BusIdLegacy [16]int8 Domain uint32 @@ -70,6 +94,28 @@ type ProcessInfo struct { ComputeInstanceId uint32 } +type ProcessDetail_v1 struct { + Pid uint32 + UsedGpuMemory uint64 + GpuInstanceId uint32 + ComputeInstanceId uint32 + UsedGpuCcProtectedMemory uint64 +} + +type ProcessDetailList_v1 struct { + Version uint32 + Mode uint32 + NumProcArrayEntries uint32 + ProcArray *ProcessDetail_v1 +} + +type ProcessDetailList struct { + Version uint32 + Mode uint32 + NumProcArrayEntries uint32 + ProcArray *ProcessDetail_v1 +} + type DeviceAttributes struct { MultiprocessorCount uint32 SharedCopyEngineCount uint32 @@ -82,6 +128,10 @@ type DeviceAttributes struct { MemorySizeMB uint64 } +type C2cModeInfo_v1 struct { + IsC2cEnabled uint32 +} + type RowRemapperHistogramValues struct { Max uint32 High uint32 @@ -147,6 +197,40 @@ type nvmlVgpuTypeId uint32 type nvmlVgpuInstance uint32 +type VgpuHeterogeneousMode_v1 struct { + Version uint32 + Mode uint32 +} + +type VgpuHeterogeneousMode struct { + Version uint32 + Mode uint32 +} + +type VgpuPlacementId_v1 struct { + Version uint32 + PlacementId uint32 +} + +type VgpuPlacementId struct { + Version uint32 + PlacementId uint32 +} + +type VgpuPlacementList_v1 struct { + Version uint32 + PlacementSize uint32 + Count uint32 + PlacementIds *uint32 +} + +type VgpuPlacementList struct { + Version uint32 + PlacementSize uint32 + Count uint32 + PlacementIds *uint32 +} + type VgpuInstanceUtilizationSample struct { VgpuInstance uint32 TimeStamp uint64 @@ -156,6 +240,34 @@ type VgpuInstanceUtilizationSample struct { DecUtil [8]byte } +type VgpuInstanceUtilizationInfo_v1 struct { + TimeStamp uint64 + VgpuInstance uint32 + Pad_cgo_0 [4]byte + SmUtil [8]byte + MemUtil [8]byte + EncUtil [8]byte + DecUtil [8]byte + JpgUtil [8]byte + OfaUtil [8]byte +} + +type VgpuInstancesUtilizationInfo_v1 struct { + Version uint32 + SampleValType uint32 + VgpuInstanceCount uint32 + LastSeenTimeStamp uint64 + VgpuUtilArray *VgpuInstanceUtilizationInfo_v1 +} + +type VgpuInstancesUtilizationInfo struct { + Version uint32 + SampleValType uint32 + VgpuInstanceCount uint32 + LastSeenTimeStamp uint64 + VgpuUtilArray *VgpuInstanceUtilizationInfo_v1 +} + type VgpuProcessUtilizationSample struct { VgpuInstance uint32 Pid uint32 @@ -167,6 +279,33 @@ type VgpuProcessUtilizationSample struct { DecUtil uint32 } +type VgpuProcessUtilizationInfo_v1 struct { + ProcessName [64]int8 + TimeStamp uint64 + VgpuInstance uint32 + Pid uint32 + SmUtil uint32 + MemUtil uint32 + EncUtil uint32 + DecUtil uint32 + JpgUtil uint32 + OfaUtil uint32 +} + +type VgpuProcessesUtilizationInfo_v1 struct { + Version uint32 + VgpuProcessCount uint32 + LastSeenTimeStamp uint64 + VgpuProcUtilArray *VgpuProcessUtilizationInfo_v1 +} + +type VgpuProcessesUtilizationInfo struct { + Version uint32 + VgpuProcessCount uint32 + LastSeenTimeStamp uint64 + VgpuProcUtilArray *VgpuProcessUtilizationInfo_v1 +} + type VgpuSchedulerParamsVgpuSchedDataWithARR struct { AvgFactor uint32 Timeslice uint32 @@ -192,7 +331,7 @@ type VgpuSchedulerLogEntry struct { type VgpuSchedulerLog struct { EngineId uint32 SchedulerPolicy uint32 - IsEnabledARR uint32 + ArrMode uint32 SchedulerParams [8]byte EntriesCount uint32 LogEntries [200]VgpuSchedulerLogEntry @@ -200,7 +339,7 @@ type VgpuSchedulerLog struct { type VgpuSchedulerGetState struct { SchedulerPolicy uint32 - IsEnabledARR uint32 + ArrMode uint32 SchedulerParams [8]byte } @@ -260,6 +399,32 @@ type ProcessUtilizationSample struct { DecUtil uint32 } +type ProcessUtilizationInfo_v1 struct { + TimeStamp uint64 + Pid uint32 + SmUtil uint32 + MemUtil uint32 + EncUtil uint32 + DecUtil uint32 + JpgUtil uint32 + OfaUtil uint32 + Pad_cgo_0 [4]byte +} + +type ProcessesUtilizationInfo_v1 struct { + Version uint32 + ProcessSamplesCount uint32 + LastSeenTimeStamp uint64 + ProcUtilArray *ProcessUtilizationInfo_v1 +} + +type ProcessesUtilizationInfo struct { + Version uint32 + ProcessSamplesCount uint32 + LastSeenTimeStamp uint64 + ProcUtilArray *ProcessUtilizationInfo_v1 +} + type GridLicenseExpiry struct { Year uint32 Month uint16 @@ -286,6 +451,40 @@ type GridLicensableFeatures struct { GridLicensableFeatures [3]GridLicensableFeature } +type EccSramErrorStatus_v1 struct { + Version uint32 + AggregateUncParity uint64 + AggregateUncSecDed uint64 + AggregateCor uint64 + VolatileUncParity uint64 + VolatileUncSecDed uint64 + VolatileCor uint64 + AggregateUncBucketL2 uint64 + AggregateUncBucketSm uint64 + AggregateUncBucketPcie uint64 + AggregateUncBucketMcu uint64 + AggregateUncBucketOther uint64 + BThresholdExceeded uint32 + Pad_cgo_0 [4]byte +} + +type EccSramErrorStatus struct { + Version uint32 + AggregateUncParity uint64 + AggregateUncSecDed uint64 + AggregateCor uint64 + VolatileUncParity uint64 + VolatileUncSecDed uint64 + VolatileCor uint64 + AggregateUncBucketL2 uint64 + AggregateUncBucketSm uint64 + AggregateUncBucketPcie uint64 + AggregateUncBucketMcu uint64 + AggregateUncBucketOther uint64 + BThresholdExceeded uint32 + Pad_cgo_0 [4]byte +} + type DeviceArchitecture uint32 type BusType uint32 @@ -408,16 +607,110 @@ type FBCSessionInfo struct { AverageLatency uint32 } +type ConfComputeSystemCaps struct { + CpuCaps uint32 + GpusCaps uint32 +} + +type ConfComputeSystemState struct { + Environment uint32 + CcFeature uint32 + DevToolsMode uint32 +} + +type SystemConfComputeSettings_v1 struct { + Version uint32 + Environment uint32 + CcFeature uint32 + DevToolsMode uint32 + MultiGpuMode uint32 +} + +type SystemConfComputeSettings struct { + Version uint32 + Environment uint32 + CcFeature uint32 + DevToolsMode uint32 + MultiGpuMode uint32 +} + +type ConfComputeMemSizeInfo struct { + ProtectedMemSizeKib uint64 + UnprotectedMemSizeKib uint64 +} + +type ConfComputeGpuCertificate struct { + CertChainSize uint32 + AttestationCertChainSize uint32 + CertChain [4096]uint8 + AttestationCertChain [5120]uint8 +} + +type ConfComputeGpuAttestationReport struct { + IsCecAttestationReportPresent uint32 + AttestationReportSize uint32 + CecAttestationReportSize uint32 + Nonce [32]uint8 + AttestationReport [8192]uint8 + CecAttestationReport [4096]uint8 +} + +type ConfComputeSetKeyRotationThresholdInfo_v1 struct { + Version uint32 + MaxAttackerAdvantage uint64 +} + +type ConfComputeSetKeyRotationThresholdInfo struct { + Version uint32 + MaxAttackerAdvantage uint64 +} + +type ConfComputeGetKeyRotationThresholdInfo_v1 struct { + Version uint32 + AttackerAdvantage uint64 +} + +type ConfComputeGetKeyRotationThresholdInfo struct { + Version uint32 + AttackerAdvantage uint64 +} + type GpuFabricState byte type GpuFabricInfo struct { - ClusterUuid [16]int8 + ClusterUuid [16]uint8 Status uint32 - PartitionId uint32 + CliqueId uint32 State uint8 Pad_cgo_0 [3]byte } +type GpuFabricInfo_v2 struct { + Version uint32 + ClusterUuid [16]uint8 + Status uint32 + CliqueId uint32 + State uint8 + HealthMask uint32 +} + +type GpuFabricInfoV struct { + Version uint32 + ClusterUuid [16]uint8 + Status uint32 + CliqueId uint32 + State uint8 + HealthMask uint32 +} + +type PowerScopeType byte + +type PowerValue_v2 struct { + Version uint32 + PowerScope uint8 + PowerValueMw uint32 +} + type AffinityScope uint32 type VgpuVersion struct { @@ -494,6 +787,23 @@ type GpuInstanceProfileInfo_v2 struct { Name [96]int8 } +type GpuInstanceProfileInfo_v3 struct { + Version uint32 + Id uint32 + SliceCount uint32 + InstanceCount uint32 + MultiprocessorCount uint32 + CopyEngineCount uint32 + DecoderCount uint32 + EncoderCount uint32 + JpegCount uint32 + OfaCount uint32 + MemorySizeMB uint64 + Name [96]int8 + Capabilities uint32 + Pad_cgo_0 [4]byte +} + type nvmlGpuInstanceInfo struct { Device nvmlDevice Id uint32 @@ -536,6 +846,21 @@ type ComputeInstanceProfileInfo_v2 struct { Name [96]int8 } +type ComputeInstanceProfileInfo_v3 struct { + Version uint32 + Id uint32 + SliceCount uint32 + InstanceCount uint32 + MultiprocessorCount uint32 + SharedCopyEngineCount uint32 + SharedDecoderCount uint32 + SharedEncoderCount uint32 + SharedJpegCount uint32 + SharedOfaCount uint32 + Name [96]int8 + Capabilities uint32 +} + type nvmlComputeInstanceInfo struct { Device nvmlDevice GpuInstance nvmlGpuInstance diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go index 9997a275b..c1ecb2d0e 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/zz_generated.api.go @@ -22,8 +22,6 @@ package nvml var ( ComputeInstanceDestroy = libnvml.ComputeInstanceDestroy ComputeInstanceGetInfo = libnvml.ComputeInstanceGetInfo - DeviceCcuGetStreamState = libnvml.DeviceCcuGetStreamState - DeviceCcuSetStreamState = libnvml.DeviceCcuSetStreamState DeviceClearAccountingPids = libnvml.DeviceClearAccountingPids DeviceClearCpuAffinity = libnvml.DeviceClearCpuAffinity DeviceClearEccErrorCounts = libnvml.DeviceClearEccErrorCounts @@ -49,12 +47,17 @@ var ( DeviceGetBrand = libnvml.DeviceGetBrand DeviceGetBridgeChipInfo = libnvml.DeviceGetBridgeChipInfo DeviceGetBusType = libnvml.DeviceGetBusType + DeviceGetC2cModeInfoV = libnvml.DeviceGetC2cModeInfoV DeviceGetClkMonStatus = libnvml.DeviceGetClkMonStatus DeviceGetClock = libnvml.DeviceGetClock DeviceGetClockInfo = libnvml.DeviceGetClockInfo DeviceGetComputeInstanceId = libnvml.DeviceGetComputeInstanceId DeviceGetComputeMode = libnvml.DeviceGetComputeMode DeviceGetComputeRunningProcesses = libnvml.DeviceGetComputeRunningProcesses + DeviceGetConfComputeGpuAttestationReport = libnvml.DeviceGetConfComputeGpuAttestationReport + DeviceGetConfComputeGpuCertificate = libnvml.DeviceGetConfComputeGpuCertificate + DeviceGetConfComputeMemSizeInfo = libnvml.DeviceGetConfComputeMemSizeInfo + DeviceGetConfComputeProtectedMemoryUsage = libnvml.DeviceGetConfComputeProtectedMemoryUsage DeviceGetCount = libnvml.DeviceGetCount DeviceGetCpuAffinity = libnvml.DeviceGetCpuAffinity DeviceGetCpuAffinityWithinScope = libnvml.DeviceGetCpuAffinityWithinScope @@ -62,6 +65,7 @@ var ( DeviceGetCudaComputeCapability = libnvml.DeviceGetCudaComputeCapability DeviceGetCurrPcieLinkGeneration = libnvml.DeviceGetCurrPcieLinkGeneration DeviceGetCurrPcieLinkWidth = libnvml.DeviceGetCurrPcieLinkWidth + DeviceGetCurrentClocksEventReasons = libnvml.DeviceGetCurrentClocksEventReasons DeviceGetCurrentClocksThrottleReasons = libnvml.DeviceGetCurrentClocksThrottleReasons DeviceGetDecoderUtilization = libnvml.DeviceGetDecoderUtilization DeviceGetDefaultApplicationsClock = libnvml.DeviceGetDefaultApplicationsClock @@ -87,6 +91,7 @@ var ( DeviceGetGpcClkMinMaxVfOffset = libnvml.DeviceGetGpcClkMinMaxVfOffset DeviceGetGpcClkVfOffset = libnvml.DeviceGetGpcClkVfOffset DeviceGetGpuFabricInfo = libnvml.DeviceGetGpuFabricInfo + DeviceGetGpuFabricInfoV = libnvml.DeviceGetGpuFabricInfoV DeviceGetGpuInstanceById = libnvml.DeviceGetGpuInstanceById DeviceGetGpuInstanceId = libnvml.DeviceGetGpuInstanceId DeviceGetGpuInstancePossiblePlacements = libnvml.DeviceGetGpuInstancePossiblePlacements @@ -110,6 +115,8 @@ var ( DeviceGetInforomImageVersion = libnvml.DeviceGetInforomImageVersion DeviceGetInforomVersion = libnvml.DeviceGetInforomVersion DeviceGetIrqNum = libnvml.DeviceGetIrqNum + DeviceGetJpgUtilization = libnvml.DeviceGetJpgUtilization + DeviceGetLastBBXFlushTime = libnvml.DeviceGetLastBBXFlushTime DeviceGetMPSComputeRunningProcesses = libnvml.DeviceGetMPSComputeRunningProcesses DeviceGetMaxClockInfo = libnvml.DeviceGetMaxClockInfo DeviceGetMaxCustomerBoostClock = libnvml.DeviceGetMaxCustomerBoostClock @@ -128,10 +135,12 @@ var ( DeviceGetMinMaxClockOfPState = libnvml.DeviceGetMinMaxClockOfPState DeviceGetMinMaxFanSpeed = libnvml.DeviceGetMinMaxFanSpeed DeviceGetMinorNumber = libnvml.DeviceGetMinorNumber + DeviceGetModuleId = libnvml.DeviceGetModuleId DeviceGetMultiGpuBoard = libnvml.DeviceGetMultiGpuBoard DeviceGetName = libnvml.DeviceGetName DeviceGetNumFans = libnvml.DeviceGetNumFans DeviceGetNumGpuCores = libnvml.DeviceGetNumGpuCores + DeviceGetNumaNodeId = libnvml.DeviceGetNumaNodeId DeviceGetNvLinkCapability = libnvml.DeviceGetNvLinkCapability DeviceGetNvLinkErrorCounter = libnvml.DeviceGetNvLinkErrorCounter DeviceGetNvLinkRemoteDeviceType = libnvml.DeviceGetNvLinkRemoteDeviceType @@ -140,8 +149,10 @@ var ( DeviceGetNvLinkUtilizationControl = libnvml.DeviceGetNvLinkUtilizationControl DeviceGetNvLinkUtilizationCounter = libnvml.DeviceGetNvLinkUtilizationCounter DeviceGetNvLinkVersion = libnvml.DeviceGetNvLinkVersion + DeviceGetOfaUtilization = libnvml.DeviceGetOfaUtilization DeviceGetP2PStatus = libnvml.DeviceGetP2PStatus DeviceGetPciInfo = libnvml.DeviceGetPciInfo + DeviceGetPciInfoExt = libnvml.DeviceGetPciInfoExt DeviceGetPcieLinkMaxSpeed = libnvml.DeviceGetPcieLinkMaxSpeed DeviceGetPcieReplayCounter = libnvml.DeviceGetPcieReplayCounter DeviceGetPcieSpeed = libnvml.DeviceGetPcieSpeed @@ -157,13 +168,17 @@ var ( DeviceGetPowerState = libnvml.DeviceGetPowerState DeviceGetPowerUsage = libnvml.DeviceGetPowerUsage DeviceGetProcessUtilization = libnvml.DeviceGetProcessUtilization + DeviceGetProcessesUtilizationInfo = libnvml.DeviceGetProcessesUtilizationInfo DeviceGetRemappedRows = libnvml.DeviceGetRemappedRows DeviceGetRetiredPages = libnvml.DeviceGetRetiredPages DeviceGetRetiredPagesPendingStatus = libnvml.DeviceGetRetiredPagesPendingStatus DeviceGetRetiredPages_v2 = libnvml.DeviceGetRetiredPages_v2 DeviceGetRowRemapperHistogram = libnvml.DeviceGetRowRemapperHistogram + DeviceGetRunningProcessDetailList = libnvml.DeviceGetRunningProcessDetailList DeviceGetSamples = libnvml.DeviceGetSamples DeviceGetSerial = libnvml.DeviceGetSerial + DeviceGetSramEccErrorStatus = libnvml.DeviceGetSramEccErrorStatus + DeviceGetSupportedClocksEventReasons = libnvml.DeviceGetSupportedClocksEventReasons DeviceGetSupportedClocksThrottleReasons = libnvml.DeviceGetSupportedClocksThrottleReasons DeviceGetSupportedEventTypes = libnvml.DeviceGetSupportedEventTypes DeviceGetSupportedGraphicsClocks = libnvml.DeviceGetSupportedGraphicsClocks @@ -182,11 +197,16 @@ var ( DeviceGetUtilizationRates = libnvml.DeviceGetUtilizationRates DeviceGetVbiosVersion = libnvml.DeviceGetVbiosVersion DeviceGetVgpuCapabilities = libnvml.DeviceGetVgpuCapabilities + DeviceGetVgpuHeterogeneousMode = libnvml.DeviceGetVgpuHeterogeneousMode + DeviceGetVgpuInstancesUtilizationInfo = libnvml.DeviceGetVgpuInstancesUtilizationInfo DeviceGetVgpuMetadata = libnvml.DeviceGetVgpuMetadata DeviceGetVgpuProcessUtilization = libnvml.DeviceGetVgpuProcessUtilization + DeviceGetVgpuProcessesUtilizationInfo = libnvml.DeviceGetVgpuProcessesUtilizationInfo DeviceGetVgpuSchedulerCapabilities = libnvml.DeviceGetVgpuSchedulerCapabilities DeviceGetVgpuSchedulerLog = libnvml.DeviceGetVgpuSchedulerLog DeviceGetVgpuSchedulerState = libnvml.DeviceGetVgpuSchedulerState + DeviceGetVgpuTypeCreatablePlacements = libnvml.DeviceGetVgpuTypeCreatablePlacements + DeviceGetVgpuTypeSupportedPlacements = libnvml.DeviceGetVgpuTypeSupportedPlacements DeviceGetVgpuUtilization = libnvml.DeviceGetVgpuUtilization DeviceGetViolationStatus = libnvml.DeviceGetViolationStatus DeviceGetVirtualizationMode = libnvml.DeviceGetVirtualizationMode @@ -207,6 +227,7 @@ var ( DeviceSetApplicationsClocks = libnvml.DeviceSetApplicationsClocks DeviceSetAutoBoostedClocksEnabled = libnvml.DeviceSetAutoBoostedClocksEnabled DeviceSetComputeMode = libnvml.DeviceSetComputeMode + DeviceSetConfComputeUnprotectedMemSize = libnvml.DeviceSetConfComputeUnprotectedMemSize DeviceSetCpuAffinity = libnvml.DeviceSetCpuAffinity DeviceSetDefaultAutoBoostedClocksEnabled = libnvml.DeviceSetDefaultAutoBoostedClocksEnabled DeviceSetDefaultFanSpeed_v2 = libnvml.DeviceSetDefaultFanSpeed_v2 @@ -224,7 +245,10 @@ var ( DeviceSetNvLinkUtilizationControl = libnvml.DeviceSetNvLinkUtilizationControl DeviceSetPersistenceMode = libnvml.DeviceSetPersistenceMode DeviceSetPowerManagementLimit = libnvml.DeviceSetPowerManagementLimit + DeviceSetPowerManagementLimit_v2 = libnvml.DeviceSetPowerManagementLimit_v2 DeviceSetTemperatureThreshold = libnvml.DeviceSetTemperatureThreshold + DeviceSetVgpuCapabilities = libnvml.DeviceSetVgpuCapabilities + DeviceSetVgpuHeterogeneousMode = libnvml.DeviceSetVgpuHeterogeneousMode DeviceSetVgpuSchedulerState = libnvml.DeviceSetVgpuSchedulerState DeviceSetVirtualizationMode = libnvml.DeviceSetVirtualizationMode DeviceValidateInforom = libnvml.DeviceValidateInforom @@ -243,9 +267,11 @@ var ( GpmMigSampleGet = libnvml.GpmMigSampleGet GpmQueryDeviceSupport = libnvml.GpmQueryDeviceSupport GpmQueryDeviceSupportV = libnvml.GpmQueryDeviceSupportV + GpmQueryIfStreamingEnabled = libnvml.GpmQueryIfStreamingEnabled GpmSampleAlloc = libnvml.GpmSampleAlloc GpmSampleFree = libnvml.GpmSampleFree GpmSampleGet = libnvml.GpmSampleGet + GpmSetStreamingEnabled = libnvml.GpmSetStreamingEnabled GpuInstanceCreateComputeInstance = libnvml.GpuInstanceCreateComputeInstance GpuInstanceCreateComputeInstanceWithPlacement = libnvml.GpuInstanceCreateComputeInstanceWithPlacement GpuInstanceDestroy = libnvml.GpuInstanceDestroy @@ -260,6 +286,9 @@ var ( InitWithFlags = libnvml.InitWithFlags SetVgpuVersion = libnvml.SetVgpuVersion Shutdown = libnvml.Shutdown + SystemGetConfComputeCapabilities = libnvml.SystemGetConfComputeCapabilities + SystemGetConfComputeKeyRotationThresholdInfo = libnvml.SystemGetConfComputeKeyRotationThresholdInfo + SystemGetConfComputeSettings = libnvml.SystemGetConfComputeSettings SystemGetCudaDriverVersion = libnvml.SystemGetCudaDriverVersion SystemGetCudaDriverVersion_v2 = libnvml.SystemGetCudaDriverVersion_v2 SystemGetDriverVersion = libnvml.SystemGetDriverVersion @@ -267,6 +296,7 @@ var ( SystemGetNVMLVersion = libnvml.SystemGetNVMLVersion SystemGetProcessName = libnvml.SystemGetProcessName SystemGetTopologyGpuSet = libnvml.SystemGetTopologyGpuSet + SystemSetConfComputeKeyRotationThresholdInfo = libnvml.SystemSetConfComputeKeyRotationThresholdInfo UnitGetCount = libnvml.UnitGetCount UnitGetDevices = libnvml.UnitGetDevices UnitGetFanSpeedInfo = libnvml.UnitGetFanSpeedInfo @@ -319,8 +349,6 @@ var ( type Interface interface { ComputeInstanceDestroy(ComputeInstance) Return ComputeInstanceGetInfo(ComputeInstance) (ComputeInstanceInfo, Return) - DeviceCcuGetStreamState(Device) (int, Return) - DeviceCcuSetStreamState(Device, int) Return DeviceClearAccountingPids(Device) Return DeviceClearCpuAffinity(Device) Return DeviceClearEccErrorCounts(Device, EccCounterType) Return @@ -346,12 +374,17 @@ type Interface interface { DeviceGetBrand(Device) (BrandType, Return) DeviceGetBridgeChipInfo(Device) (BridgeChipHierarchy, Return) DeviceGetBusType(Device) (BusType, Return) + DeviceGetC2cModeInfoV(Device) C2cModeInfoHandler DeviceGetClkMonStatus(Device) (ClkMonStatus, Return) DeviceGetClock(Device, ClockType, ClockId) (uint32, Return) DeviceGetClockInfo(Device, ClockType) (uint32, Return) DeviceGetComputeInstanceId(Device) (int, Return) DeviceGetComputeMode(Device) (ComputeMode, Return) DeviceGetComputeRunningProcesses(Device) ([]ProcessInfo, Return) + DeviceGetConfComputeGpuAttestationReport(Device) (ConfComputeGpuAttestationReport, Return) + DeviceGetConfComputeGpuCertificate(Device) (ConfComputeGpuCertificate, Return) + DeviceGetConfComputeMemSizeInfo(Device) (ConfComputeMemSizeInfo, Return) + DeviceGetConfComputeProtectedMemoryUsage(Device) (Memory, Return) DeviceGetCount() (int, Return) DeviceGetCpuAffinity(Device, int) ([]uint, Return) DeviceGetCpuAffinityWithinScope(Device, int, AffinityScope) ([]uint, Return) @@ -359,6 +392,7 @@ type Interface interface { DeviceGetCudaComputeCapability(Device) (int, int, Return) DeviceGetCurrPcieLinkGeneration(Device) (int, Return) DeviceGetCurrPcieLinkWidth(Device) (int, Return) + DeviceGetCurrentClocksEventReasons(Device) (uint64, Return) DeviceGetCurrentClocksThrottleReasons(Device) (uint64, Return) DeviceGetDecoderUtilization(Device) (uint32, uint32, Return) DeviceGetDefaultApplicationsClock(Device, ClockType) (uint32, Return) @@ -384,11 +418,12 @@ type Interface interface { DeviceGetGpcClkMinMaxVfOffset(Device) (int, int, Return) DeviceGetGpcClkVfOffset(Device) (int, Return) DeviceGetGpuFabricInfo(Device) (GpuFabricInfo, Return) + DeviceGetGpuFabricInfoV(Device) GpuFabricInfoHandler DeviceGetGpuInstanceById(Device, int) (GpuInstance, Return) DeviceGetGpuInstanceId(Device) (int, Return) DeviceGetGpuInstancePossiblePlacements(Device, *GpuInstanceProfileInfo) ([]GpuInstancePlacement, Return) DeviceGetGpuInstanceProfileInfo(Device, int) (GpuInstanceProfileInfo, Return) - DeviceGetGpuInstanceProfileInfoV(Device, int) GpuInstanceProfileInfoV + DeviceGetGpuInstanceProfileInfoV(Device, int) GpuInstanceProfileInfoHandler DeviceGetGpuInstanceRemainingCapacity(Device, *GpuInstanceProfileInfo) (int, Return) DeviceGetGpuInstances(Device, *GpuInstanceProfileInfo) ([]GpuInstance, Return) DeviceGetGpuMaxPcieLinkGeneration(Device) (int, Return) @@ -407,6 +442,8 @@ type Interface interface { DeviceGetInforomImageVersion(Device) (string, Return) DeviceGetInforomVersion(Device, InforomObject) (string, Return) DeviceGetIrqNum(Device) (int, Return) + DeviceGetJpgUtilization(Device) (uint32, uint32, Return) + DeviceGetLastBBXFlushTime(Device) (uint64, uint, Return) DeviceGetMPSComputeRunningProcesses(Device) ([]ProcessInfo, Return) DeviceGetMaxClockInfo(Device, ClockType) (uint32, Return) DeviceGetMaxCustomerBoostClock(Device, ClockType) (uint32, Return) @@ -425,10 +462,12 @@ type Interface interface { DeviceGetMinMaxClockOfPState(Device, ClockType, Pstates) (uint32, uint32, Return) DeviceGetMinMaxFanSpeed(Device) (int, int, Return) DeviceGetMinorNumber(Device) (int, Return) + DeviceGetModuleId(Device) (int, Return) DeviceGetMultiGpuBoard(Device) (int, Return) DeviceGetName(Device) (string, Return) DeviceGetNumFans(Device) (int, Return) DeviceGetNumGpuCores(Device) (int, Return) + DeviceGetNumaNodeId(Device) (int, Return) DeviceGetNvLinkCapability(Device, int, NvLinkCapability) (uint32, Return) DeviceGetNvLinkErrorCounter(Device, int, NvLinkErrorCounter) (uint64, Return) DeviceGetNvLinkRemoteDeviceType(Device, int) (IntNvLinkDeviceType, Return) @@ -437,8 +476,10 @@ type Interface interface { DeviceGetNvLinkUtilizationControl(Device, int, int) (NvLinkUtilizationControl, Return) DeviceGetNvLinkUtilizationCounter(Device, int, int) (uint64, uint64, Return) DeviceGetNvLinkVersion(Device, int) (uint32, Return) + DeviceGetOfaUtilization(Device) (uint32, uint32, Return) DeviceGetP2PStatus(Device, Device, GpuP2PCapsIndex) (GpuP2PStatus, Return) DeviceGetPciInfo(Device) (PciInfo, Return) + DeviceGetPciInfoExt(Device) (PciInfoExt, Return) DeviceGetPcieLinkMaxSpeed(Device) (uint32, Return) DeviceGetPcieReplayCounter(Device) (int, Return) DeviceGetPcieSpeed(Device) (int, Return) @@ -454,13 +495,17 @@ type Interface interface { DeviceGetPowerState(Device) (Pstates, Return) DeviceGetPowerUsage(Device) (uint32, Return) DeviceGetProcessUtilization(Device, uint64) ([]ProcessUtilizationSample, Return) + DeviceGetProcessesUtilizationInfo(Device) (ProcessesUtilizationInfo, Return) DeviceGetRemappedRows(Device) (int, int, bool, bool, Return) DeviceGetRetiredPages(Device, PageRetirementCause) ([]uint64, Return) DeviceGetRetiredPagesPendingStatus(Device) (EnableState, Return) DeviceGetRetiredPages_v2(Device, PageRetirementCause) ([]uint64, []uint64, Return) DeviceGetRowRemapperHistogram(Device) (RowRemapperHistogramValues, Return) + DeviceGetRunningProcessDetailList(Device) (ProcessDetailList, Return) DeviceGetSamples(Device, SamplingType, uint64) (ValueType, []Sample, Return) DeviceGetSerial(Device) (string, Return) + DeviceGetSramEccErrorStatus(Device) (EccSramErrorStatus, Return) + DeviceGetSupportedClocksEventReasons(Device) (uint64, Return) DeviceGetSupportedClocksThrottleReasons(Device) (uint64, Return) DeviceGetSupportedEventTypes(Device) (uint64, Return) DeviceGetSupportedGraphicsClocks(Device, int) (int, uint32, Return) @@ -479,11 +524,16 @@ type Interface interface { DeviceGetUtilizationRates(Device) (Utilization, Return) DeviceGetVbiosVersion(Device) (string, Return) DeviceGetVgpuCapabilities(Device, DeviceVgpuCapability) (bool, Return) + DeviceGetVgpuHeterogeneousMode(Device) (VgpuHeterogeneousMode, Return) + DeviceGetVgpuInstancesUtilizationInfo(Device) (VgpuInstancesUtilizationInfo, Return) DeviceGetVgpuMetadata(Device) (VgpuPgpuMetadata, Return) DeviceGetVgpuProcessUtilization(Device, uint64) ([]VgpuProcessUtilizationSample, Return) + DeviceGetVgpuProcessesUtilizationInfo(Device) (VgpuProcessesUtilizationInfo, Return) DeviceGetVgpuSchedulerCapabilities(Device) (VgpuSchedulerCapabilities, Return) DeviceGetVgpuSchedulerLog(Device) (VgpuSchedulerLog, Return) DeviceGetVgpuSchedulerState(Device) (VgpuSchedulerGetState, Return) + DeviceGetVgpuTypeCreatablePlacements(Device, VgpuTypeId) (VgpuPlacementList, Return) + DeviceGetVgpuTypeSupportedPlacements(Device, VgpuTypeId) (VgpuPlacementList, Return) DeviceGetVgpuUtilization(Device, uint64) (ValueType, []VgpuInstanceUtilizationSample, Return) DeviceGetViolationStatus(Device, PerfPolicyType) (ViolationTime, Return) DeviceGetVirtualizationMode(Device) (GpuVirtualizationMode, Return) @@ -504,6 +554,7 @@ type Interface interface { DeviceSetApplicationsClocks(Device, uint32, uint32) Return DeviceSetAutoBoostedClocksEnabled(Device, EnableState) Return DeviceSetComputeMode(Device, ComputeMode) Return + DeviceSetConfComputeUnprotectedMemSize(Device, uint64) Return DeviceSetCpuAffinity(Device) Return DeviceSetDefaultAutoBoostedClocksEnabled(Device, EnableState, uint32) Return DeviceSetDefaultFanSpeed_v2(Device, int) Return @@ -521,7 +572,10 @@ type Interface interface { DeviceSetNvLinkUtilizationControl(Device, int, int, *NvLinkUtilizationControl, bool) Return DeviceSetPersistenceMode(Device, EnableState) Return DeviceSetPowerManagementLimit(Device, uint32) Return + DeviceSetPowerManagementLimit_v2(Device, *PowerValue_v2) Return DeviceSetTemperatureThreshold(Device, TemperatureThresholds, int) Return + DeviceSetVgpuCapabilities(Device, DeviceVgpuCapability, EnableState) Return + DeviceSetVgpuHeterogeneousMode(Device, VgpuHeterogeneousMode) Return DeviceSetVgpuSchedulerState(Device, *VgpuSchedulerSetState) Return DeviceSetVirtualizationMode(Device, GpuVirtualizationMode) Return DeviceValidateInforom(Device) Return @@ -540,16 +594,18 @@ type Interface interface { GpmMigSampleGet(Device, int, GpmSample) Return GpmQueryDeviceSupport(Device) (GpmSupport, Return) GpmQueryDeviceSupportV(Device) GpmSupportV + GpmQueryIfStreamingEnabled(Device) (uint32, Return) GpmSampleAlloc() (GpmSample, Return) GpmSampleFree(GpmSample) Return GpmSampleGet(Device, GpmSample) Return + GpmSetStreamingEnabled(Device, uint32) Return GpuInstanceCreateComputeInstance(GpuInstance, *ComputeInstanceProfileInfo) (ComputeInstance, Return) GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, *ComputeInstanceProfileInfo, *ComputeInstancePlacement) (ComputeInstance, Return) GpuInstanceDestroy(GpuInstance) Return GpuInstanceGetComputeInstanceById(GpuInstance, int) (ComputeInstance, Return) GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) GpuInstanceGetComputeInstanceProfileInfo(GpuInstance, int, int) (ComputeInstanceProfileInfo, Return) - GpuInstanceGetComputeInstanceProfileInfoV(GpuInstance, int, int) ComputeInstanceProfileInfoV + GpuInstanceGetComputeInstanceProfileInfoV(GpuInstance, int, int) ComputeInstanceProfileInfoHandler GpuInstanceGetComputeInstanceRemainingCapacity(GpuInstance, *ComputeInstanceProfileInfo) (int, Return) GpuInstanceGetComputeInstances(GpuInstance, *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) GpuInstanceGetInfo(GpuInstance) (GpuInstanceInfo, Return) @@ -557,6 +613,9 @@ type Interface interface { InitWithFlags(uint32) Return SetVgpuVersion(*VgpuVersion) Return Shutdown() Return + SystemGetConfComputeCapabilities() (ConfComputeSystemCaps, Return) + SystemGetConfComputeKeyRotationThresholdInfo() (ConfComputeGetKeyRotationThresholdInfo, Return) + SystemGetConfComputeSettings() (SystemConfComputeSettings, Return) SystemGetCudaDriverVersion() (int, Return) SystemGetCudaDriverVersion_v2() (int, Return) SystemGetDriverVersion() (string, Return) @@ -564,6 +623,7 @@ type Interface interface { SystemGetNVMLVersion() (string, Return) SystemGetProcessName(int) (string, Return) SystemGetTopologyGpuSet(int) ([]Device, Return) + SystemSetConfComputeKeyRotationThresholdInfo(ConfComputeSetKeyRotationThresholdInfo) Return UnitGetCount() (int, Return) UnitGetDevices(Unit) ([]Device, Return) UnitGetFanSpeedInfo(Unit) (UnitFanSpeeds, Return) @@ -614,8 +674,6 @@ type Interface interface { // //go:generate moq -out mock/device.go -pkg mock . Device:Device type Device interface { - CcuGetStreamState() (int, Return) - CcuSetStreamState(int) Return ClearAccountingPids() Return ClearCpuAffinity() Return ClearEccErrorCounts(EccCounterType) Return @@ -640,18 +698,24 @@ type Device interface { GetBrand() (BrandType, Return) GetBridgeChipInfo() (BridgeChipHierarchy, Return) GetBusType() (BusType, Return) + GetC2cModeInfoV() C2cModeInfoHandler GetClkMonStatus() (ClkMonStatus, Return) GetClock(ClockType, ClockId) (uint32, Return) GetClockInfo(ClockType) (uint32, Return) GetComputeInstanceId() (int, Return) GetComputeMode() (ComputeMode, Return) GetComputeRunningProcesses() ([]ProcessInfo, Return) + GetConfComputeGpuAttestationReport() (ConfComputeGpuAttestationReport, Return) + GetConfComputeGpuCertificate() (ConfComputeGpuCertificate, Return) + GetConfComputeMemSizeInfo() (ConfComputeMemSizeInfo, Return) + GetConfComputeProtectedMemoryUsage() (Memory, Return) GetCpuAffinity(int) ([]uint, Return) GetCpuAffinityWithinScope(int, AffinityScope) ([]uint, Return) GetCreatableVgpus() ([]VgpuTypeId, Return) GetCudaComputeCapability() (int, int, Return) GetCurrPcieLinkGeneration() (int, Return) GetCurrPcieLinkWidth() (int, Return) + GetCurrentClocksEventReasons() (uint64, Return) GetCurrentClocksThrottleReasons() (uint64, Return) GetDecoderUtilization() (uint32, uint32, Return) GetDefaultApplicationsClock(ClockType) (uint32, Return) @@ -677,11 +741,12 @@ type Device interface { GetGpcClkMinMaxVfOffset() (int, int, Return) GetGpcClkVfOffset() (int, Return) GetGpuFabricInfo() (GpuFabricInfo, Return) + GetGpuFabricInfoV() GpuFabricInfoHandler GetGpuInstanceById(int) (GpuInstance, Return) GetGpuInstanceId() (int, Return) GetGpuInstancePossiblePlacements(*GpuInstanceProfileInfo) ([]GpuInstancePlacement, Return) GetGpuInstanceProfileInfo(int) (GpuInstanceProfileInfo, Return) - GetGpuInstanceProfileInfoV(int) GpuInstanceProfileInfoV + GetGpuInstanceProfileInfoV(int) GpuInstanceProfileInfoHandler GetGpuInstanceRemainingCapacity(*GpuInstanceProfileInfo) (int, Return) GetGpuInstances(*GpuInstanceProfileInfo) ([]GpuInstance, Return) GetGpuMaxPcieLinkGeneration() (int, Return) @@ -696,6 +761,8 @@ type Device interface { GetInforomImageVersion() (string, Return) GetInforomVersion(InforomObject) (string, Return) GetIrqNum() (int, Return) + GetJpgUtilization() (uint32, uint32, Return) + GetLastBBXFlushTime() (uint64, uint, Return) GetMPSComputeRunningProcesses() ([]ProcessInfo, Return) GetMaxClockInfo(ClockType) (uint32, Return) GetMaxCustomerBoostClock(ClockType) (uint32, Return) @@ -714,10 +781,12 @@ type Device interface { GetMinMaxClockOfPState(ClockType, Pstates) (uint32, uint32, Return) GetMinMaxFanSpeed() (int, int, Return) GetMinorNumber() (int, Return) + GetModuleId() (int, Return) GetMultiGpuBoard() (int, Return) GetName() (string, Return) GetNumFans() (int, Return) GetNumGpuCores() (int, Return) + GetNumaNodeId() (int, Return) GetNvLinkCapability(int, NvLinkCapability) (uint32, Return) GetNvLinkErrorCounter(int, NvLinkErrorCounter) (uint64, Return) GetNvLinkRemoteDeviceType(int) (IntNvLinkDeviceType, Return) @@ -726,8 +795,10 @@ type Device interface { GetNvLinkUtilizationControl(int, int) (NvLinkUtilizationControl, Return) GetNvLinkUtilizationCounter(int, int) (uint64, uint64, Return) GetNvLinkVersion(int) (uint32, Return) + GetOfaUtilization() (uint32, uint32, Return) GetP2PStatus(Device, GpuP2PCapsIndex) (GpuP2PStatus, Return) GetPciInfo() (PciInfo, Return) + GetPciInfoExt() (PciInfoExt, Return) GetPcieLinkMaxSpeed() (uint32, Return) GetPcieReplayCounter() (int, Return) GetPcieSpeed() (int, Return) @@ -743,13 +814,17 @@ type Device interface { GetPowerState() (Pstates, Return) GetPowerUsage() (uint32, Return) GetProcessUtilization(uint64) ([]ProcessUtilizationSample, Return) + GetProcessesUtilizationInfo() (ProcessesUtilizationInfo, Return) GetRemappedRows() (int, int, bool, bool, Return) GetRetiredPages(PageRetirementCause) ([]uint64, Return) GetRetiredPagesPendingStatus() (EnableState, Return) GetRetiredPages_v2(PageRetirementCause) ([]uint64, []uint64, Return) GetRowRemapperHistogram() (RowRemapperHistogramValues, Return) + GetRunningProcessDetailList() (ProcessDetailList, Return) GetSamples(SamplingType, uint64) (ValueType, []Sample, Return) GetSerial() (string, Return) + GetSramEccErrorStatus() (EccSramErrorStatus, Return) + GetSupportedClocksEventReasons() (uint64, Return) GetSupportedClocksThrottleReasons() (uint64, Return) GetSupportedEventTypes() (uint64, Return) GetSupportedGraphicsClocks(int) (int, uint32, Return) @@ -768,18 +843,25 @@ type Device interface { GetUtilizationRates() (Utilization, Return) GetVbiosVersion() (string, Return) GetVgpuCapabilities(DeviceVgpuCapability) (bool, Return) + GetVgpuHeterogeneousMode() (VgpuHeterogeneousMode, Return) + GetVgpuInstancesUtilizationInfo() (VgpuInstancesUtilizationInfo, Return) GetVgpuMetadata() (VgpuPgpuMetadata, Return) GetVgpuProcessUtilization(uint64) ([]VgpuProcessUtilizationSample, Return) + GetVgpuProcessesUtilizationInfo() (VgpuProcessesUtilizationInfo, Return) GetVgpuSchedulerCapabilities() (VgpuSchedulerCapabilities, Return) GetVgpuSchedulerLog() (VgpuSchedulerLog, Return) GetVgpuSchedulerState() (VgpuSchedulerGetState, Return) + GetVgpuTypeCreatablePlacements(VgpuTypeId) (VgpuPlacementList, Return) + GetVgpuTypeSupportedPlacements(VgpuTypeId) (VgpuPlacementList, Return) GetVgpuUtilization(uint64) (ValueType, []VgpuInstanceUtilizationSample, Return) GetViolationStatus(PerfPolicyType) (ViolationTime, Return) GetVirtualizationMode() (GpuVirtualizationMode, Return) GpmMigSampleGet(int, GpmSample) Return GpmQueryDeviceSupport() (GpmSupport, Return) GpmQueryDeviceSupportV() GpmSupportV + GpmQueryIfStreamingEnabled() (uint32, Return) GpmSampleGet(GpmSample) Return + GpmSetStreamingEnabled(uint32) Return IsMigDeviceHandle() (bool, Return) OnSameBoard(Device) (int, Return) RegisterEvents(uint64, EventSet) Return @@ -793,6 +875,7 @@ type Device interface { SetApplicationsClocks(uint32, uint32) Return SetAutoBoostedClocksEnabled(EnableState) Return SetComputeMode(ComputeMode) Return + SetConfComputeUnprotectedMemSize(uint64) Return SetCpuAffinity() Return SetDefaultAutoBoostedClocksEnabled(EnableState, uint32) Return SetDefaultFanSpeed_v2(int) Return @@ -810,7 +893,10 @@ type Device interface { SetNvLinkUtilizationControl(int, int, *NvLinkUtilizationControl, bool) Return SetPersistenceMode(EnableState) Return SetPowerManagementLimit(uint32) Return + SetPowerManagementLimit_v2(*PowerValue_v2) Return SetTemperatureThreshold(TemperatureThresholds, int) Return + SetVgpuCapabilities(DeviceVgpuCapability, EnableState) Return + SetVgpuHeterogeneousMode(VgpuHeterogeneousMode) Return SetVgpuSchedulerState(*VgpuSchedulerSetState) Return SetVirtualizationMode(GpuVirtualizationMode) Return ValidateInforom() Return @@ -827,7 +913,7 @@ type GpuInstance interface { GetComputeInstanceById(int) (ComputeInstance, Return) GetComputeInstancePossiblePlacements(*ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) GetComputeInstanceProfileInfo(int, int) (ComputeInstanceProfileInfo, Return) - GetComputeInstanceProfileInfoV(int, int) ComputeInstanceProfileInfoV + GetComputeInstanceProfileInfoV(int, int) ComputeInstanceProfileInfoHandler GetComputeInstanceRemainingCapacity(*ComputeInstanceProfileInfo) (int, Return) GetComputeInstances(*ComputeInstanceProfileInfo) ([]ComputeInstance, Return) GetInfo() (GpuInstanceInfo, Return) @@ -906,6 +992,7 @@ type VgpuInstance interface { type VgpuTypeId interface { GetCapabilities(VgpuCapability) (bool, Return) GetClass() (string, Return) + GetCreatablePlacements(Device) (VgpuPlacementList, Return) GetDeviceID() (uint64, uint64, Return) GetFrameRateLimit() (uint32, Return) GetFramebufferSize() (uint64, Return) @@ -916,4 +1003,5 @@ type VgpuTypeId interface { GetName() (string, Return) GetNumDisplayHeads() (int, Return) GetResolution(int) (uint32, uint32, Return) + GetSupportedPlacements(Device) (VgpuPlacementList, Return) } diff --git a/vendor/modules.txt b/vendor/modules.txt index 0114626a0..909fb3cd3 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -31,7 +31,7 @@ github.com/Microsoft/hcsshim/osversion ## explicit; go 1.20 github.com/NVIDIA/go-gpuallocator/gpuallocator github.com/NVIDIA/go-gpuallocator/internal/links -# github.com/NVIDIA/go-nvlib v0.5.0 +# github.com/NVIDIA/go-nvlib v0.6.0 ## explicit; go 1.20 github.com/NVIDIA/go-nvlib/pkg/nvlib/device github.com/NVIDIA/go-nvlib/pkg/nvlib/info @@ -39,7 +39,7 @@ github.com/NVIDIA/go-nvlib/pkg/nvpci github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes github.com/NVIDIA/go-nvlib/pkg/nvpci/mmio github.com/NVIDIA/go-nvlib/pkg/pciids -# github.com/NVIDIA/go-nvml v0.12.0-6 +# github.com/NVIDIA/go-nvml v0.12.4-0 ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml From 2f2541e0a23e2c8e0476b1a9f572ac155ab442ee Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 18 Jun 2024 09:55:31 +0000 Subject: [PATCH 05/10] Renaming GetClass method to GetPIEClass Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 2 +- internal/resource/cuda-device.go | 2 +- internal/resource/device_mock.go | 28 +++++++++---------- internal/resource/nvml-device.go | 2 +- internal/resource/nvml-mig-device.go | 2 +- internal/resource/sysfs-device.go | 2 +- internal/resource/testing/resource-testing.go | 2 +- internal/resource/types.go | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index 96ce7d615..f06fae9a5 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -242,7 +242,7 @@ func getModeForClasses(classes []string) string { func getDeviceClasses(devices []resource.Device) ([]string, error) { seenClasses := make(map[string]bool) for _, d := range devices { - class, err := d.GetClass() + class, err := d.GetPIEClass() if err != nil { return nil, err } diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index 142b03ef3..16cdbf8fc 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -97,6 +97,6 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } -func (d *cudaDevice) GetClass() (string, error) { +func (d *cudaDevice) GetPIEClass() (string, error) { return "unknown", nil } diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index 27cb8f897..d1b261b65 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -21,7 +21,7 @@ var _ Device = &DeviceMock{} // panic("mock out the GetAttributes method") // }, // GetClassFunc: func() (string, error) { -// panic("mock out the GetClass method") +// panic("mock out the GetPIEClass method") // }, // GetCudaComputeCapabilityFunc: func() (int, int, error) { // panic("mock out the GetCudaComputeCapability method") @@ -54,8 +54,8 @@ type DeviceMock struct { // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (map[string]interface{}, error) - // GetClassFunc mocks the GetClass method. - GetClassFunc func() (string, error) + // GetPIEClassFunc mocks the GetClass method. + GetPIEClassFunc func() (string, error) // GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method. GetCudaComputeCapabilityFunc func() (int, int, error) @@ -83,8 +83,8 @@ type DeviceMock struct { // GetAttributes holds details about calls to the GetAttributes method. GetAttributes []struct { } - // GetClass holds details about calls to the GetClass method. - GetClass []struct { + // GetPIEClass holds details about calls to the GetClass method. + GetPIEClass []struct { } // GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method. GetCudaComputeCapability []struct { @@ -146,29 +146,29 @@ func (mock *DeviceMock) GetAttributesCalls() []struct { return calls } -// GetClass calls GetClassFunc. -func (mock *DeviceMock) GetClass() (string, error) { - if mock.GetClassFunc == nil { - panic("DeviceMock.GetClassFunc: method is nil but Device.GetClass was just called") +// GetPIEClass calls GetPIEClassFunc. +func (mock *DeviceMock) GetPIEClass() (string, error) { + if mock.GetPIEClassFunc == nil { + panic("DeviceMock.GetClassFunc: method is nil but Device.GetPIEClass was just called") } callInfo := struct { }{} mock.lockGetClass.Lock() - mock.calls.GetClass = append(mock.calls.GetClass, callInfo) + mock.calls.GetPIEClass = append(mock.calls.GetPIEClass, callInfo) mock.lockGetClass.Unlock() - return mock.GetClassFunc() + return mock.GetPIEClassFunc() } // GetClassCalls gets all the calls that were made to GetClass. // Check the length with: // -// len(mockedDevice.GetClassCalls()) -func (mock *DeviceMock) GetClassCalls() []struct { +// len(mockedDevice.GetPIEClassCalls()) +func (mock *DeviceMock) GetPIEClassCalls() []struct { } { var calls []struct { } mock.lockGetClass.RLock() - calls = mock.calls.GetClass + calls = mock.calls.GetPIEClass mock.lockGetClass.RUnlock() return calls } diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index 830bbcbc2..ccc6aa2ab 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -88,7 +88,7 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { return info.Total / (1024 * 1024), nil } -func (d nvmlDevice) GetClass() (string, error) { +func (d nvmlDevice) GetPIEClass() (string, error) { pciBusID, err := d.GetPCIBusID() if err != nil { return "", err diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index 4cad926c9..27a1bb68b 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -134,7 +134,7 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { } } -func (d nvmlMigDevice) GetClass() (string, error) { +func (d nvmlMigDevice) GetPIEClass() (string, error) { info, retVal := d.MigDevice.GetPciInfo() if retVal != nvml.SUCCESS { return "", retVal diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index 5f54dcf96..5dcf0012f 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -65,7 +65,7 @@ func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } -func (d vfioDevice) GetClass() (string, error) { +func (d vfioDevice) GetPIEClass() (string, error) { class := fmt.Sprintf("%#06x\n", d.nvidiaPCIDevice.Class) return class, nil } diff --git a/internal/resource/testing/resource-testing.go b/internal/resource/testing/resource-testing.go index b012d5314..e2b0989da 100644 --- a/internal/resource/testing/resource-testing.go +++ b/internal/resource/testing/resource-testing.go @@ -51,7 +51,7 @@ func NewDeviceMock(migEnabled bool) *DeviceMock { IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, - GetClassFunc: func() (string, error) { return "0x030000", nil }, + GetPIEClassFunc: func() (string, error) { return "0x030000", nil }, }} return &d } diff --git a/internal/resource/types.go b/internal/resource/types.go index 5e89e7972..3c9f6d64d 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -39,5 +39,5 @@ type Device interface { GetTotalMemoryMB() (uint64, error) GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) - GetClass() (string, error) + GetPIEClass() (string, error) } From 4e2818b81160991f1393f08b9ed387edcb070e56 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 18 Jun 2024 10:13:31 +0000 Subject: [PATCH 06/10] Changing signature of GetPIEClass method Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 15 ++-- internal/resource/cuda-device.go | 4 +- internal/resource/device_mock.go | 74 +++++++++---------- internal/resource/nvml-device.go | 8 +- internal/resource/nvml-mig-device.go | 8 +- internal/resource/sysfs-device.go | 5 +- internal/resource/testing/resource-testing.go | 2 +- internal/resource/types.go | 2 +- 8 files changed, 59 insertions(+), 59 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index f06fae9a5..8b95980a4 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -22,6 +22,8 @@ import ( "strconv" "strings" + "github.com/NVIDIA/go-nvlib/pkg/nvpci" + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" "github.com/NVIDIA/k8s-device-plugin/internal/resource" ) @@ -220,7 +222,7 @@ func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { return labels, nil } -func getModeForClasses(classes []string) string { +func getModeForClasses(classes []uint32) string { if len(classes) == 0 { return "unknown" } @@ -230,27 +232,26 @@ func getModeForClasses(classes []string) string { } } switch classes[0] { - case "0x030000": + case nvpci.PCIVgaControllerClass: return "graphics" - case "0x030200": + case nvpci.PCI3dControllerClass: return "compute" default: return "unknown" } } -func getDeviceClasses(devices []resource.Device) ([]string, error) { - seenClasses := make(map[string]bool) +func getDeviceClasses(devices []resource.Device) ([]uint32, error) { + seenClasses := make(map[uint32]bool) for _, d := range devices { class, err := d.GetPIEClass() if err != nil { return nil, err } - class = strings.TrimSpace(class) seenClasses[class] = true } - var classes []string + var classes []uint32 for class := range seenClasses { classes = append(classes, class) } diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index 16cdbf8fc..4eaf73a41 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -97,6 +97,6 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } -func (d *cudaDevice) GetPIEClass() (string, error) { - return "unknown", nil +func (d *cudaDevice) GetPIEClass() (uint32, error) { + return 0, nil } diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index d1b261b65..e48a26076 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -20,9 +20,6 @@ var _ Device = &DeviceMock{} // GetAttributesFunc: func() (map[string]interface{}, error) { // panic("mock out the GetAttributes method") // }, -// GetClassFunc: func() (string, error) { -// panic("mock out the GetPIEClass method") -// }, // GetCudaComputeCapabilityFunc: func() (int, int, error) { // panic("mock out the GetCudaComputeCapability method") // }, @@ -35,6 +32,9 @@ var _ Device = &DeviceMock{} // GetNameFunc: func() (string, error) { // panic("mock out the GetName method") // }, +// GetPIEClassFunc: func() (uint32, error) { +// panic("mock out the GetPIEClass method") +// }, // GetTotalMemoryMBFunc: func() (uint64, error) { // panic("mock out the GetTotalMemoryMB method") // }, @@ -54,9 +54,6 @@ type DeviceMock struct { // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (map[string]interface{}, error) - // GetPIEClassFunc mocks the GetClass method. - GetPIEClassFunc func() (string, error) - // GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method. GetCudaComputeCapabilityFunc func() (int, int, error) @@ -69,6 +66,9 @@ type DeviceMock struct { // GetNameFunc mocks the GetName method. GetNameFunc func() (string, error) + // GetPIEClassFunc mocks the GetPIEClass method. + GetPIEClassFunc func() (uint32, error) + // GetTotalMemoryMBFunc mocks the GetTotalMemoryMB method. GetTotalMemoryMBFunc func() (uint64, error) @@ -83,9 +83,6 @@ type DeviceMock struct { // GetAttributes holds details about calls to the GetAttributes method. GetAttributes []struct { } - // GetPIEClass holds details about calls to the GetClass method. - GetPIEClass []struct { - } // GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method. GetCudaComputeCapability []struct { } @@ -98,6 +95,9 @@ type DeviceMock struct { // GetName holds details about calls to the GetName method. GetName []struct { } + // GetPIEClass holds details about calls to the GetPIEClass method. + GetPIEClass []struct { + } // GetTotalMemoryMB holds details about calls to the GetTotalMemoryMB method. GetTotalMemoryMB []struct { } @@ -109,11 +109,11 @@ type DeviceMock struct { } } lockGetAttributes sync.RWMutex - lockGetClass sync.RWMutex lockGetCudaComputeCapability sync.RWMutex lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex lockGetMigDevices sync.RWMutex lockGetName sync.RWMutex + lockGetPIEClass sync.RWMutex lockGetTotalMemoryMB sync.RWMutex lockIsMigCapable sync.RWMutex lockIsMigEnabled sync.RWMutex @@ -146,33 +146,6 @@ func (mock *DeviceMock) GetAttributesCalls() []struct { return calls } -// GetPIEClass calls GetPIEClassFunc. -func (mock *DeviceMock) GetPIEClass() (string, error) { - if mock.GetPIEClassFunc == nil { - panic("DeviceMock.GetClassFunc: method is nil but Device.GetPIEClass was just called") - } - callInfo := struct { - }{} - mock.lockGetClass.Lock() - mock.calls.GetPIEClass = append(mock.calls.GetPIEClass, callInfo) - mock.lockGetClass.Unlock() - return mock.GetPIEClassFunc() -} - -// GetClassCalls gets all the calls that were made to GetClass. -// Check the length with: -// -// len(mockedDevice.GetPIEClassCalls()) -func (mock *DeviceMock) GetPIEClassCalls() []struct { -} { - var calls []struct { - } - mock.lockGetClass.RLock() - calls = mock.calls.GetPIEClass - mock.lockGetClass.RUnlock() - return calls -} - // GetCudaComputeCapability calls GetCudaComputeCapabilityFunc. func (mock *DeviceMock) GetCudaComputeCapability() (int, int, error) { if mock.GetCudaComputeCapabilityFunc == nil { @@ -281,6 +254,33 @@ func (mock *DeviceMock) GetNameCalls() []struct { return calls } +// GetPIEClass calls GetPIEClassFunc. +func (mock *DeviceMock) GetPIEClass() (uint32, error) { + if mock.GetPIEClassFunc == nil { + panic("DeviceMock.GetPIEClassFunc: method is nil but Device.GetPIEClass was just called") + } + callInfo := struct { + }{} + mock.lockGetPIEClass.Lock() + mock.calls.GetPIEClass = append(mock.calls.GetPIEClass, callInfo) + mock.lockGetPIEClass.Unlock() + return mock.GetPIEClassFunc() +} + +// GetPIEClassCalls gets all the calls that were made to GetPIEClass. +// Check the length with: +// +// len(mockedDevice.GetPIEClassCalls()) +func (mock *DeviceMock) GetPIEClassCalls() []struct { +} { + var calls []struct { + } + mock.lockGetPIEClass.RLock() + calls = mock.calls.GetPIEClass + mock.lockGetPIEClass.RUnlock() + return calls +} + // GetTotalMemoryMB calls GetTotalMemoryMBFunc. func (mock *DeviceMock) GetTotalMemoryMB() (uint64, error) { if mock.GetTotalMemoryMBFunc == nil { diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index ccc6aa2ab..f323cdacd 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -88,14 +88,14 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { return info.Total / (1024 * 1024), nil } -func (d nvmlDevice) GetPIEClass() (string, error) { +func (d nvmlDevice) GetPIEClass() (uint32, error) { pciBusID, err := d.GetPCIBusID() if err != nil { - return "", err + return 0, err } nvDevice, err := nvpci.New().GetGPUByPciBusID(pciBusID) if err != nil { - return "", err + return 0, err } - return fmt.Sprintf("%#06x", nvDevice.Class), nil + return nvDevice.Class, nil } diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index 27a1bb68b..dd0a3a2f7 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -134,10 +134,10 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { } } -func (d nvmlMigDevice) GetPIEClass() (string, error) { +func (d nvmlMigDevice) GetPIEClass() (uint32, error) { info, retVal := d.MigDevice.GetPciInfo() if retVal != nvml.SUCCESS { - return "", retVal + return 0, retVal } var bytes []byte for _, char := range info.BusId { @@ -149,7 +149,7 @@ func (d nvmlMigDevice) GetPIEClass() (string, error) { pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) nvDevice, err := nvpci.New().GetGPUByPciBusID(pciID) if err != nil { - return "", err + return 0, err } - return fmt.Sprintf("%#06x", nvDevice.Class), nil + return nvDevice.Class, nil } diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index 5dcf0012f..a9b754b47 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -65,7 +65,6 @@ func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } -func (d vfioDevice) GetPIEClass() (string, error) { - class := fmt.Sprintf("%#06x\n", d.nvidiaPCIDevice.Class) - return class, nil +func (d vfioDevice) GetPIEClass() (uint32, error) { + return d.nvidiaPCIDevice.Class, nil } diff --git a/internal/resource/testing/resource-testing.go b/internal/resource/testing/resource-testing.go index e2b0989da..13ba459be 100644 --- a/internal/resource/testing/resource-testing.go +++ b/internal/resource/testing/resource-testing.go @@ -51,7 +51,7 @@ func NewDeviceMock(migEnabled bool) *DeviceMock { IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, - GetPIEClassFunc: func() (string, error) { return "0x030000", nil }, + GetPIEClassFunc: func() (uint32, error) { return 0x030000, nil }, }} return &d } diff --git a/internal/resource/types.go b/internal/resource/types.go index 3c9f6d64d..02c81e74b 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -39,5 +39,5 @@ type Device interface { GetTotalMemoryMB() (uint64, error) GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) - GetPIEClass() (string, error) + GetPIEClass() (uint32, error) } From ac2cb436236aae0c0a3f329b35fc4838bef64478 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Mon, 24 Jun 2024 22:38:59 +0000 Subject: [PATCH 07/10] Adding Unit test and fixing typos Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 16 ++-- internal/lm/nvml_test.go | 86 +++++++++++++++++++ internal/resource/cuda-device.go | 2 +- internal/resource/device_mock.go | 42 ++++----- internal/resource/nvml-device.go | 2 +- internal/resource/nvml-mig-device.go | 3 +- internal/resource/sysfs-device.go | 2 +- internal/resource/testing/resource-testing.go | 9 +- internal/resource/types.go | 2 +- 9 files changed, 126 insertions(+), 38 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index 8b95980a4..b93ba85e4 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -73,7 +73,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e return nil, fmt.Errorf("error creating resource labeler: %v", err) } - gpuModeLabeler, err := newGPUModeLabeler(manager) + gpuModeLabeler, err := newGPUModeLabeler(devices) if err != nil { return nil, fmt.Errorf("error creating resource labeler: %v", err) } @@ -202,15 +202,9 @@ func isMPSCapable(manager resource.Manager) (bool, error) { return true, nil } -func newGPUModeLabeler(manager resource.Manager) (Labeler, error) { - devices, err := manager.GetDevices() - if err != nil { - return nil, err - } - if len(devices) == 0 { - // no devices, return empty labels - return empty{}, nil - } +// newGPUModeLabeler creates a new labeler that reports the mode of GPUs on the node. +// GPUs can be in Graphics or Compute mode. +func newGPUModeLabeler(devices []resource.Device) (Labeler, error) { classes, err := getDeviceClasses(devices) if err != nil { return nil, err @@ -244,7 +238,7 @@ func getModeForClasses(classes []uint32) string { func getDeviceClasses(devices []resource.Device) ([]uint32, error) { seenClasses := make(map[uint32]bool) for _, d := range devices { - class, err := d.GetPIEClass() + class, err := d.GetPCIClass() if err != nil { return nil, err } diff --git a/internal/lm/nvml_test.go b/internal/lm/nvml_test.go index f3ec71806..073721cd8 100644 --- a/internal/lm/nvml_test.go +++ b/internal/lm/nvml_test.go @@ -204,3 +204,89 @@ func TestSharingLabeler(t *testing.T) { }) } } + +func TestGPUModeLabeler(t *testing.T) { + testCases := []struct { + description string + devices []resource.Device + expectedError bool + expectedLabels map[string]string + }{ + { + description: "single device with compute PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x030000), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "graphics", + }, + }, + { + description: "single device with graphics PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x030200), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "compute", + }, + }, + { + description: "single device with switch PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x068000), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "unknown", + }, + }, + { + description: "multiple device have same graphics PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x030200), + rt.NewDeviceWithPCIClassMock(0x030200), + rt.NewDeviceWithPCIClassMock(0x030200), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "compute", + }, + }, + { + description: "multiple device have same compute PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x030000), + rt.NewDeviceWithPCIClassMock(0x030000), + rt.NewDeviceWithPCIClassMock(0x030000), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "graphics", + }, + }, + { + description: "multiple device with some with graphics and others with compute PCI class", + devices: []resource.Device{ + rt.NewDeviceWithPCIClassMock(0x030000), + rt.NewDeviceWithPCIClassMock(0x030200), + rt.NewDeviceWithPCIClassMock(0x030000), + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.mode": "unknown", + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + + gpuModeLabeler, _ := newGPUModeLabeler(tc.devices) + + labels, err := gpuModeLabeler.Labels() + if tc.expectedError { + require.Error(t, err) + } else { + require.NoError(t, err) + } + + require.EqualValues(t, tc.expectedLabels, labels) + }) + } +} diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go index 4eaf73a41..a4f4bc4a4 100644 --- a/internal/resource/cuda-device.go +++ b/internal/resource/cuda-device.go @@ -97,6 +97,6 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) { return false, nil } -func (d *cudaDevice) GetPIEClass() (uint32, error) { +func (d *cudaDevice) GetPCIClass() (uint32, error) { return 0, nil } diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go index e48a26076..1024ec96b 100644 --- a/internal/resource/device_mock.go +++ b/internal/resource/device_mock.go @@ -32,8 +32,8 @@ var _ Device = &DeviceMock{} // GetNameFunc: func() (string, error) { // panic("mock out the GetName method") // }, -// GetPIEClassFunc: func() (uint32, error) { -// panic("mock out the GetPIEClass method") +// GetPCIClassFunc: func() (uint32, error) { +// panic("mock out the GetPCIClass method") // }, // GetTotalMemoryMBFunc: func() (uint64, error) { // panic("mock out the GetTotalMemoryMB method") @@ -66,8 +66,8 @@ type DeviceMock struct { // GetNameFunc mocks the GetName method. GetNameFunc func() (string, error) - // GetPIEClassFunc mocks the GetPIEClass method. - GetPIEClassFunc func() (uint32, error) + // GetPCIClassFunc mocks the GetPCIClass method. + GetPCIClassFunc func() (uint32, error) // GetTotalMemoryMBFunc mocks the GetTotalMemoryMB method. GetTotalMemoryMBFunc func() (uint64, error) @@ -95,8 +95,8 @@ type DeviceMock struct { // GetName holds details about calls to the GetName method. GetName []struct { } - // GetPIEClass holds details about calls to the GetPIEClass method. - GetPIEClass []struct { + // GetPCIClass holds details about calls to the GetPCIClass method. + GetPCIClass []struct { } // GetTotalMemoryMB holds details about calls to the GetTotalMemoryMB method. GetTotalMemoryMB []struct { @@ -113,7 +113,7 @@ type DeviceMock struct { lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex lockGetMigDevices sync.RWMutex lockGetName sync.RWMutex - lockGetPIEClass sync.RWMutex + lockGetPCIClass sync.RWMutex lockGetTotalMemoryMB sync.RWMutex lockIsMigCapable sync.RWMutex lockIsMigEnabled sync.RWMutex @@ -254,30 +254,30 @@ func (mock *DeviceMock) GetNameCalls() []struct { return calls } -// GetPIEClass calls GetPIEClassFunc. -func (mock *DeviceMock) GetPIEClass() (uint32, error) { - if mock.GetPIEClassFunc == nil { - panic("DeviceMock.GetPIEClassFunc: method is nil but Device.GetPIEClass was just called") +// GetPCIClass calls GetPCIClassFunc. +func (mock *DeviceMock) GetPCIClass() (uint32, error) { + if mock.GetPCIClassFunc == nil { + panic("DeviceMock.GetPCIClassFunc: method is nil but Device.GetPCIClass was just called") } callInfo := struct { }{} - mock.lockGetPIEClass.Lock() - mock.calls.GetPIEClass = append(mock.calls.GetPIEClass, callInfo) - mock.lockGetPIEClass.Unlock() - return mock.GetPIEClassFunc() + mock.lockGetPCIClass.Lock() + mock.calls.GetPCIClass = append(mock.calls.GetPCIClass, callInfo) + mock.lockGetPCIClass.Unlock() + return mock.GetPCIClassFunc() } -// GetPIEClassCalls gets all the calls that were made to GetPIEClass. +// GetPCIClassCalls gets all the calls that were made to GetPCIClass. // Check the length with: // -// len(mockedDevice.GetPIEClassCalls()) -func (mock *DeviceMock) GetPIEClassCalls() []struct { +// len(mockedDevice.GetPCIClassCalls()) +func (mock *DeviceMock) GetPCIClassCalls() []struct { } { var calls []struct { } - mock.lockGetPIEClass.RLock() - calls = mock.calls.GetPIEClass - mock.lockGetPIEClass.RUnlock() + mock.lockGetPCIClass.RLock() + calls = mock.calls.GetPCIClass + mock.lockGetPCIClass.RUnlock() return calls } diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go index f323cdacd..1184657d2 100644 --- a/internal/resource/nvml-device.go +++ b/internal/resource/nvml-device.go @@ -88,7 +88,7 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) { return info.Total / (1024 * 1024), nil } -func (d nvmlDevice) GetPIEClass() (uint32, error) { +func (d nvmlDevice) GetPCIClass() (uint32, error) { pciBusID, err := d.GetPCIBusID() if err != nil { return 0, err diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index dd0a3a2f7..031aa5222 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -134,11 +134,12 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { } } -func (d nvmlMigDevice) GetPIEClass() (uint32, error) { +func (d nvmlMigDevice) GetPCIClass() (uint32, error) { info, retVal := d.MigDevice.GetPciInfo() if retVal != nvml.SUCCESS { return 0, retVal } + var bytes []byte for _, char := range info.BusId { if char == 0 { diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go index a9b754b47..105229fe4 100644 --- a/internal/resource/sysfs-device.go +++ b/internal/resource/sysfs-device.go @@ -65,6 +65,6 @@ func (d vfioDevice) IsMigCapable() (bool, error) { return false, nil } -func (d vfioDevice) GetPIEClass() (uint32, error) { +func (d vfioDevice) GetPCIClass() (uint32, error) { return d.nvidiaPCIDevice.Class, nil } diff --git a/internal/resource/testing/resource-testing.go b/internal/resource/testing/resource-testing.go index 13ba459be..1704a2067 100644 --- a/internal/resource/testing/resource-testing.go +++ b/internal/resource/testing/resource-testing.go @@ -51,7 +51,14 @@ func NewDeviceMock(migEnabled bool) *DeviceMock { IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, - GetPIEClassFunc: func() (uint32, error) { return 0x030000, nil }, + GetPCIClassFunc: func() (uint32, error) { return 0x030000, nil }, + }} + return &d +} + +func NewDeviceWithPCIClassMock(pciClass uint32) *DeviceMock { + d := DeviceMock{resource.DeviceMock{ + GetPCIClassFunc: func() (uint32, error) { return pciClass, nil }, }} return &d } diff --git a/internal/resource/types.go b/internal/resource/types.go index 02c81e74b..ec89ec579 100644 --- a/internal/resource/types.go +++ b/internal/resource/types.go @@ -39,5 +39,5 @@ type Device interface { GetTotalMemoryMB() (uint64, error) GetDeviceHandleFromMigDeviceHandle() (Device, error) GetCudaComputeCapability() (int, int, error) - GetPIEClass() (uint32, error) + GetPCIClass() (uint32, error) } From a2e6a7cd98a06a8b6c01172d5374dd0480fedf3b Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Mon, 24 Jun 2024 23:27:05 +0000 Subject: [PATCH 08/10] Updating README with details about GPU modes Signed-off-by: Vishesh Tanksale --- docs/gpu-feature-discovery/README.md | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/gpu-feature-discovery/README.md b/docs/gpu-feature-discovery/README.md index e29cf3316..57e63acb7 100644 --- a/docs/gpu-feature-discovery/README.md +++ b/docs/gpu-feature-discovery/README.md @@ -194,22 +194,22 @@ Environment variables override the command line options if they conflict. This is the list of the labels generated by NVIDIA GPU Feature Discovery and their meaning: -| Label Name | Value Type | Meaning | Example | -| -------------------------------| ---------- | -------------------------------------------- | -------------- | -| nvidia.com/cuda.driver.major | Integer | Major of the version of NVIDIA driver | 418 | -| nvidia.com/cuda.driver.minor | Integer | Minor of the version of NVIDIA driver | 30 | -| nvidia.com/cuda.driver.rev | Integer | Revision of the version of NVIDIA driver | 40 | -| nvidia.com/cuda.runtime.major | Integer | Major of the version of CUDA | 10 | -| nvidia.com/cuda.runtime.minor | Integer | Minor of the version of CUDA | 1 | -| nvidia.com/gfd.timestamp | Integer | Timestamp of the generated labels (optional) | 1555019244 | -| nvidia.com/gpu.compute.major | Integer | Major of the compute capabilities | 3 | -| nvidia.com/gpu.compute.minor | Integer | Minor of the compute capabilities | 3 | -| nvidia.com/gpu.count | Integer | Number of GPUs | 2 | -| nvidia.com/gpu.family | String | Architecture family of the GPU | kepler | -| nvidia.com/gpu.machine | String | Machine type | DGX-1 | -| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 | -| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 | -| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU | compute | +| Label Name | Value Type | Meaning | Example | +| -------------------------------| ---------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------------- | +| nvidia.com/cuda.driver.major | Integer | Major of the version of NVIDIA driver | 418 | +| nvidia.com/cuda.driver.minor | Integer | Minor of the version of NVIDIA driver | 30 | +| nvidia.com/cuda.driver.rev | Integer | Revision of the version of NVIDIA driver | 40 | +| nvidia.com/cuda.runtime.major | Integer | Major of the version of CUDA | 10 | +| nvidia.com/cuda.runtime.minor | Integer | Minor of the version of CUDA | 1 | +| nvidia.com/gfd.timestamp | Integer | Timestamp of the generated labels (optional) | 1555019244 | +| nvidia.com/gpu.compute.major | Integer | Major of the compute capabilities | 3 | +| nvidia.com/gpu.compute.minor | Integer | Minor of the compute capabilities | 3 | +| nvidia.com/gpu.count | Integer | Number of GPUs | 2 | +| nvidia.com/gpu.family | String | Architecture family of the GPU | kepler | +| nvidia.com/gpu.machine | String | Machine type | DGX-1 | +| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 | +| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 | +| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU. Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute | Depending on the MIG strategy used, the following set of labels may also be available (or override the default values for some of the labels listed above): From 82c22b636cecd4ab317210c8d61a4be3629161f0 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 25 Jun 2024 19:18:59 +0000 Subject: [PATCH 09/10] Updating the method to get class for MIG devices Signed-off-by: Vishesh Tanksale --- internal/resource/nvml-mig-device.go | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go index 031aa5222..8ef933ff5 100644 --- a/internal/resource/nvml-mig-device.go +++ b/internal/resource/nvml-mig-device.go @@ -135,22 +135,6 @@ func totalMemory(attr map[string]interface{}) (uint64, error) { } func (d nvmlMigDevice) GetPCIClass() (uint32, error) { - info, retVal := d.MigDevice.GetPciInfo() - if retVal != nvml.SUCCESS { - return 0, retVal - } - - var bytes []byte - for _, char := range info.BusId { - if char == 0 { - break - } - bytes = append(bytes, byte(char)) - } - pciID := strings.ToLower(strings.TrimPrefix(string(bytes), "0000")) - nvDevice, err := nvpci.New().GetGPUByPciBusID(pciID) - if err != nil { - return 0, err - } - return nvDevice.Class, nil + // GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode. + return nvpci.PCI3dControllerClass, nil } From 7d0e25285cea102fdacde06f870ddf464682c233 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Thu, 27 Jun 2024 06:35:41 +0000 Subject: [PATCH 10/10] Adding log messages and updating unit test cases Signed-off-by: Vishesh Tanksale --- internal/lm/nvml.go | 3 +++ internal/resource/testing/resource-testing.go | 2 +- tests/expected-output-mig-single.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index b93ba85e4..0b5ed6e9a 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -22,6 +22,8 @@ import ( "strconv" "strings" + "k8s.io/klog/v2" + "github.com/NVIDIA/go-nvlib/pkg/nvpci" spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" @@ -222,6 +224,7 @@ func getModeForClasses(classes []uint32) string { } for _, class := range classes { if class != classes[0] { + klog.Infof("Not all GPU devices belong to the same class %#06x ", classes) return "unknown" } } diff --git a/internal/resource/testing/resource-testing.go b/internal/resource/testing/resource-testing.go index 1704a2067..d4dc6216d 100644 --- a/internal/resource/testing/resource-testing.go +++ b/internal/resource/testing/resource-testing.go @@ -51,7 +51,7 @@ func NewDeviceMock(migEnabled bool) *DeviceMock { IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, - GetPCIClassFunc: func() (uint32, error) { return 0x030000, nil }, + GetPCIClassFunc: func() (uint32, error) { return 0, nil }, }} return &d } diff --git a/tests/expected-output-mig-single.txt b/tests/expected-output-mig-single.txt index eaf3bfd3c..13691a5c6 100644 --- a/tests/expected-output-mig-single.txt +++ b/tests/expected-output-mig-single.txt @@ -30,5 +30,5 @@ nvidia\.com\/gpu\.engines\.jpeg=[0-9]+ nvidia\.com\/gpu\.engines\.ofa=[0-9]+ nvidia\.com\/gpu\.slices\.gi=[0-9]+ nvidia\.com\/gpu\.slices\.ci=[0-9]+ -nvidia\.com\/gpu\.mode=[unknown|compute|graphics] +nvidia\.com\/gpu\.mode=[compute] nvidia\.com\/mps\.capable=[true|false]