Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module github.com/NVIDIA/go-gpuallocator

go 1.15

require github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4
require github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d

replace (
k8s.io/api => k8s.io/api v0.18.2
Expand Down
713 changes: 16 additions & 697 deletions go.sum

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions gpuallocator/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"fmt"
"runtime"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)

// Allocator defines the primary object for allocating and freeing the
Expand Down Expand Up @@ -45,12 +45,14 @@ func NewBestEffortAllocator() (*Allocator, error) {

// NewAllocator creates a new Allocator using the given allocation policy
func NewAllocator(policy Policy) (*Allocator, error) {
err := nvml.Init()
if err != nil {
return nil, fmt.Errorf("error initializing NVML: %v", err)
nvmllib := nvml.New()
if ret := nvmllib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("error initializing NVML: %v", ret)
}

devices, err := NewDevices()
devices, err := NewDevices(
WithNvmlLib(nvmllib),
)
if err != nil {
return nil, fmt.Errorf("error enumerating GPU devices: %v", err)
}
Expand All @@ -59,7 +61,7 @@ func NewAllocator(policy Policy) (*Allocator, error) {

runtime.SetFinalizer(allocator, func(allocator *Allocator) {
// Explicitly ignore any errors from nvml.Shutdown().
_ = nvml.Shutdown()
_ = nvmllib.Shutdown()
})

return allocator, nil
Expand Down
16 changes: 15 additions & 1 deletion gpuallocator/besteffort_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package gpuallocator
import (
"fmt"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
// TODO: We rename this import to reduce the changes required below.
// This can be removed once the link-specifics have been migrated into go-nvlib.
nvml "github.com/NVIDIA/go-gpuallocator/internal/links"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only provides constants as it stands.

)

type bestEffortPolicy struct{}
Expand Down Expand Up @@ -349,6 +351,18 @@ func calculateGPUPairScore(gpu0 *Device, gpu1 *Device) int {
score += 1100
case nvml.TwelveNVLINKLinks:
score += 1200
case nvml.ThirteenNVLINKLinks:
score += 1300
case nvml.FourteenNVLINKLinks:
score += 1400
case nvml.FifteenNVLINKLinks:
score += 1500
case nvml.SixteenNVLINKLinks:
score += 1600
case nvml.SeventeenNVLINKLinks:
score += 1700
case nvml.EighteenNVLINKLinks:
score += 1800
}
}

Expand Down
12 changes: 6 additions & 6 deletions gpuallocator/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
"sort"
"testing"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
// TODO: We rename this import to reduce the changes required below.
// This can be removed once the link-specifics have been migrated into go-nvlib.
nvml "github.com/NVIDIA/go-gpuallocator/internal/links"
)

const pad = ^int(0)
Expand Down Expand Up @@ -110,13 +112,11 @@ func RunPolicyAllocTests(t *testing.T, policy Policy, tests []PolicyAllocTest) {

func NewTestGPU(index int) *TestGPU {
return &TestGPU{
Index: index,
Device: &nvml.Device{
nvlibDevice: nvlibDevice{
UUID: fmt.Sprintf("GPU-%d", index),
PCI: nvml.PCIInfo{
BusID: fmt.Sprintf("GPU-%d", index),
},
PCI: struct{ BusID string }{fmt.Sprintf("GPU-%d", index)},
},
Index: index,
Links: make(map[int][]P2PLink),
}
}
Expand Down
117 changes: 96 additions & 21 deletions gpuallocator/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,61 +7,126 @@ import (
"sort"
"strings"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/NVIDIA/go-gpuallocator/internal/links"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)

// Device represents a GPU device as reported by NVML, including all of its
// Point-to-Point link information.
type Device struct {
*nvml.Device
nvlibDevice
Index int
Links map[int][]P2PLink
}

// nvlibDevice pairs an nvlib device.Device with a small set of properties that
// newDevice reads from that device once, at construction time, and stores here.
type nvlibDevice struct {
// Device is the underlying nvlib device. It is embedded, so its methods are
// promoted onto nvlibDevice (and onto Device, which embeds nvlibDevice).
device.Device
// The previous binding implementation used to cache specific device properties.
// These should be considered deprecated and the functions associated with device.Device
// should be used instead.
//
// UUID is the value returned by the device's GetUUID when it was constructed.
UUID string
// PCI.BusID is the bus ID derived from the device's GetPciInfo result when it
// was constructed.
PCI struct {
BusID string
}
// CPUAffinity is derived from the same GetPciInfo result as PCI.BusID.
// NOTE(review): presumably nil when the affinity could not be determined;
// confirm against links.PciInfo.CPUAffinity.
CPUAffinity *uint
}

// newDevice wraps the nvlib device d, found at enumeration index i, in a Device.
//
// The UUID and PCI information are queried from d once and cached on the
// returned value; the Links map is allocated but left empty for the caller to
// populate. An error is returned if either query reports anything other than
// nvml.SUCCESS.
func newDevice(i int, d device.Device) (*Device, error) {
	uuid, ret := d.GetUUID()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device uuid: %v", ret)
	}

	pciInfo, ret := d.GetPciInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device pci info: %v", ret)
	}

	// Derive the cached (legacy) properties from the PCI info, bus ID first.
	busID := links.PciInfo(pciInfo).BusID()
	affinity := links.PciInfo(pciInfo).CPUAffinity()

	cached := nvlibDevice{
		Device:      d,
		UUID:        uuid,
		CPUAffinity: affinity,
	}
	cached.PCI.BusID = busID

	// The result is not named "device" so that the imported device package is
	// not shadowed inside this function.
	return &Device{
		nvlibDevice: cached,
		Index:       i,
		Links:       make(map[int][]P2PLink),
	}, nil
}

// P2PLink represents a Point-to-Point link between two GPU devices. The link
// is between the Device struct this struct is embedded in and the GPU Device
// contained in the P2PLink struct itself.
type P2PLink struct {
GPU *Device
Type nvml.P2PLinkType
Type links.P2PLinkType
}

// DeviceList stores an ordered list of devices.
// It is the type returned by NewDevices and NewDevicesFrom, and it can be
// narrowed to a chosen set of UUIDs with Filter.
type DeviceList []*Device

// DeviceSet is used to hold and manipulate a set of unique GPU devices.
// It maps a string key to the corresponding *Device.
// NOTE(review): the key is presumably the device UUID; confirm against the
// methods that insert into the set.
type DeviceSet map[string]*Device

// NewDevices creates a list of Devices from all available nvml.Devices.
func NewDevices() ([]*Device, error) {
count, err := nvml.GetDeviceCount()
if err != nil {
return nil, fmt.Errorf("error calling nvml.GetDeviceCount: %v", err)
// NewDevices creates a list of Devices from all available nvml.Devices.
//
// The supplied options are applied in order to a fresh builder. Anything they
// leave unset is defaulted: the NVML library falls back to nvml.New(), and the
// device library falls back to device.New configured with that NVML library.
// The resulting builder is then asked to build the list.
func NewDevices(opts ...deviceListOption) (DeviceList, error) {
	builder := new(deviceListBuilder)
	for _, apply := range opts {
		apply(builder)
	}

	// Default the NVML library first: the default device library depends on it.
	if builder.nvmllib == nil {
		builder.nvmllib = nvml.New()
	}
	if builder.devicelib == nil {
		builder.devicelib = device.New(device.WithNvml(builder.nvmllib))
	}

	return builder.build()
}

devices := []*Device{}
for i := 0; i < int(count); i++ {
device, err := nvml.NewDevice(uint(i))
// build uses the configured options to build a DeviceList.
func (o *deviceListBuilder) build() (DeviceList, error) {
if err := o.nvmllib.Init(); err != nvml.SUCCESS {
return nil, fmt.Errorf("error calling nvml.Init: %v", err)
}
defer func() {
_ = o.nvmllib.Shutdown()
}()

var devices DeviceList
err := o.devicelib.VisitDevices(func(i int, d device.Device) error {
device, err := newDevice(i, d)
if err != nil {
return nil, fmt.Errorf("error creating nvml.Device %v: %v", i, err)
return fmt.Errorf("failed to construct linked device: %v", err)
}

devices = append(devices, &Device{device, i, make(map[int][]P2PLink)})
devices = append(devices, device)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to get devices: %v", err)
}

for i, d1 := range devices {
for j, d2 := range devices {
if d1 != d2 {
p2plink, err := nvml.GetP2PLink(d1.Device, d2.Device)
if i != j {
p2plink, err := links.GetP2PLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting P2PLink for devices (%v, %v): %v", i, j, err)
}
if p2plink != nvml.P2PLinkUnknown {
if p2plink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, p2plink})
}

nvlink, err := nvml.GetNVLink(d1.Device, d2.Device)
nvlink, err := links.GetNVLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting NVLink for devices (%v, %v): %v", i, j, err)
}
if nvlink != nvml.P2PLinkUnknown {
if nvlink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, nvlink})
}
}
Expand All @@ -72,15 +137,25 @@ func NewDevices() ([]*Device, error) {
}

// NewDevicesFrom creates a list of Devices from the specific set of GPU uuids passed in.
func NewDevicesFrom(uuids []string) ([]*Device, error) {
// NewDevicesFrom enumerates all devices with the default options and then
// restricts the result to the GPUs named in uuids by delegating to
// DeviceList.Filter. An enumeration error is returned unchanged.
func NewDevicesFrom(uuids []string) (DeviceList, error) {
	var (
		all DeviceList
		err error
	)
	if all, err = NewDevices(); err != nil {
		return nil, err
	}
	return all.Filter(uuids)
}

// Filter filters out the selected devices from the list.
// If the supplied list of uuids is nil, no filtering is performed.
// Note that the specified uuids must exist in the list of devices.
func (d DeviceList) Filter(uuids []string) (DeviceList, error) {
if uuids == nil {
return d, nil
}

filtered := []*Device{}
for _, uuid := range uuids {
for _, device := range devices {
for _, device := range d {
if device.UUID == uuid {
filtered = append(filtered, device)
break
Expand Down
44 changes: 44 additions & 0 deletions gpuallocator/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/**
# Copyright 2023 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package gpuallocator

import (
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)

// deviceListBuilder stores the options required to build a list of linked devices.
// Fields are set through deviceListOption values; NewDevices fills in a default
// for any field that is still nil before building.
type deviceListBuilder struct {
// nvmllib is the NVML library implementation (set by WithNvmlLib).
nvmllib nvml.Interface
// devicelib is the library used to enumerate devices (set by WithDeviceLib).
devicelib device.Interface
}

// deviceListOption is a functional option that mutates a deviceListBuilder.
// NewDevices applies each supplied option, in order, before building the list.
type deviceListOption func(*deviceListBuilder)

// WithNvmlLib returns an option that stores nvmllib as the NVML library on the
// device list builder it is applied to.
func WithNvmlLib(nvmllib nvml.Interface) deviceListOption {
	setNvmlLib := func(b *deviceListBuilder) {
		b.nvmllib = nvmllib
	}
	return setNvmlLib
}

// WithDeviceLib returns an option that stores devicelib as the library used for
// device enumeration on the device list builder it is applied to.
func WithDeviceLib(devicelib device.Interface) deviceListOption {
	setDeviceLib := func(b *deviceListBuilder) {
		b.devicelib = devicelib
	}
	return setDeviceLib
}
Loading