Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions examples/devices/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package main

import (
"fmt"
"os"

"github.com/NVIDIA/go-gpuallocator/gpuallocator"
)

// main obtains the device list from gpuallocator.NewDevices and prints the
// number of devices followed by, for each device, its index in the list and
// the output of its Details method.
//
// If the device list cannot be obtained, the error is written to standard
// error and the process exits with status 1.
func main() {
	devices, err := gpuallocator.NewDevices()
	if err != nil {
		// Report failures on stderr so that stdout only ever carries the
		// device listing and stays safe to pipe into other tools.
		fmt.Fprintf(os.Stderr, "error getting devices: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("Found %d devices:\n", len(devices))
	for i, d := range devices {
		fmt.Printf("device %d:\n", i)
		fmt.Printf("%s\n", d.Details())
	}
}
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ module github.com/NVIDIA/go-gpuallocator

go 1.20

require github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d
require github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a

require github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f // indirect
require (
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f // indirect
github.com/google/uuid v1.4.0 // indirect
)

replace (
k8s.io/api => k8s.io/api v0.18.2
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d h1:XxRHS7eNkZVcPpZZmUcoT4oO8FEcoYKn06sooQh5niU=
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d/go.mod h1:HPFNPAYqQeoos58MKUboWsdZMu71EzSQrbmd+QBRD40=
github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a h1:EH7wiaq9+NYDgCBJEcGa3HTO2Sz6dRlmO2y9yMxA5jE=
github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a/go.mod h1:U82N6/xKp6OnoqpALBH0C5SO59Buu4sX1Z3rQtBsBKQ=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f h1:FTblgO87K1vPB8tcwM5EOFpFf6UpsrlDpErPm25mFWE=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4=
github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down
21 changes: 11 additions & 10 deletions gpuallocator/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,36 +98,37 @@ func (o *deviceListBuilder) build() (DeviceList, error) {
_ = o.nvmllib.Shutdown()
}()

nvmlDevices, err := o.devicelib.GetDevices()
if err != nil {
return nil, fmt.Errorf("failed to get devices: %v", err)
}

var devices DeviceList
err := o.devicelib.VisitDevices(func(i int, d device.Device) error {
for i, d := range nvmlDevices {
device, err := newDevice(i, d)
if err != nil {
return fmt.Errorf("failed to construct linked device: %v", err)
return nil, fmt.Errorf("failed to construct linked device: %v", err)
}
devices = append(devices, device)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to get devices: %v", err)
}

for i, d1 := range devices {
for j, d2 := range devices {
for i, d1 := range nvmlDevices {
for j, d2 := range nvmlDevices {
if i != j {
p2plink, err := links.GetP2PLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting P2PLink for devices (%v, %v): %v", i, j, err)
}
if p2plink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, p2plink})
devices[i].Links[j] = append(devices[i].Links[j], P2PLink{devices[j], p2plink})
}

nvlink, err := links.GetNVLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting NVLink for devices (%v, %v): %v", i, j, err)
}
if nvlink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, nvlink})
devices[i].Links[j] = append(devices[i].Links[j], P2PLink{devices[j], nvlink})
}
}
}
Expand Down
78 changes: 67 additions & 11 deletions internal/links/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,62 @@ const (
EighteenNVLINKLinks
)

// String returns the string representation of the P2PLink type.
//
// Each value listed in the switch below is rendered as the name of its
// constant. Any other value is rendered as "UNKNOWN (<n>)", where <n> is the
// value converted to an unsigned integer.
func (l P2PLinkType) String() string {
	switch l {
	case P2PLinkCrossCPU:
		return "P2PLinkCrossCPU"
	case P2PLinkSameCPU:
		return "P2PLinkSameCPU"
	case P2PLinkHostBridge:
		return "P2PLinkHostBridge"
	case P2PLinkMultiSwitch:
		return "P2PLinkMultiSwitch"
	case P2PLinkSingleSwitch:
		return "P2PLinkSingleSwitch"
	case P2PLinkSameBoard:
		return "P2PLinkSameBoard"
	case SingleNVLINKLink:
		return "SingleNVLINKLink"
	case TwoNVLINKLinks:
		return "TwoNVLINKLinks"
	case ThreeNVLINKLinks:
		return "ThreeNVLINKLinks"
	case FourNVLINKLinks:
		return "FourNVLINKLinks"
	case FiveNVLINKLinks:
		return "FiveNVLINKLinks"
	case SixNVLINKLinks:
		return "SixNVLINKLinks"
	case SevenNVLINKLinks:
		return "SevenNVLINKLinks"
	case EightNVLINKLinks:
		return "EightNVLINKLinks"
	case NineNVLINKLinks:
		return "NineNVLINKLinks"
	case TenNVLINKLinks:
		return "TenNVLINKLinks"
	case ElevenNVLINKLinks:
		return "ElevenNVLINKLinks"
	case TwelveNVLINKLinks:
		return "TwelveNVLINKLinks"
	case ThirteenNVLINKLinks:
		return "ThirteenNVLINKLinks"
	case FourteenNVLINKLinks:
		return "FourteenNVLINKLinks"
	case FifteenNVLINKLinks:
		return "FifteenNVLINKLinks"
	case SixteenNVLINKLinks:
		return "SixteenNVLINKLinks"
	case SeventeenNVLINKLinks:
		return "SeventeenNVLINKLinks"
	case EighteenNVLINKLinks:
		return "EighteenNVLINKLinks"
	default:
		// Include the numeric value so an unexpected link type can still be
		// identified from logs.
		return fmt.Sprintf("UNKNOWN (%v)", uint(l))
	}
}

// GetP2PLink gets the peer-to-peer connectivity between two devices.
func GetP2PLink(dev1 device.Device, dev2 device.Device) (P2PLinkType, error) {
level, ret := dev1.GetTopologyCommonAncestor(dev2)
Expand Down Expand Up @@ -149,23 +205,23 @@ func getAllNvLinkRemotePciInfo(dev device.Device) ([]PciInfo, error) {
var pciInfos []PciInfo
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := dev.GetNvLinkState(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get nvlink state: %v", ret)
}

if state == nvml.FEATURE_ENABLED {
pciInfo, ret := dev.GetNvLinkRemotePciInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get remote pci info: %v", ret)
}
pciInfos = append(pciInfos, PciInfo(pciInfo))
if state != nvml.FEATURE_ENABLED {
continue
}
pciInfo, ret := dev.GetNvLinkRemotePciInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get remote pci info: %v", ret)
}
pciInfos = append(pciInfos, PciInfo(pciInfo))
}

return pciInfos, nil
Expand Down
94 changes: 94 additions & 0 deletions vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/identifier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 8 additions & 3 deletions vendor/github.com/NVIDIA/go-nvlib/pkg/nvml/device.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading