Skip to content

Commit

Permalink
Merge pull request #43 from Project-HAMi/addcm
Browse files Browse the repository at this point in the history
Add configMap, which is the config-center of volcano-vgpu
  • Loading branch information
archlitchi authored Dec 31, 2024
2 parents b54431c + e622377 commit 2e0c4f9
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 27 deletions.
5 changes: 2 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@ require (
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4
github.com/fsnotify/fsnotify v1.4.9
github.com/mitchellh/gox v1.0.1 // indirect
github.com/prometheus/client_golang v1.0.0
github.com/prometheus/common v0.4.1
github.com/spf13/cobra v0.0.5
github.com/spf13/viper v1.3.2
github.com/stretchr/testify v1.9.0
github.com/urfave/cli/v2 v2.4.0
golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
google.golang.org/grpc v1.32.0
google.golang.org/protobuf v1.34.2
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/yaml.v2 v2.2.8
k8s.io/api v0.18.2
k8s.io/apimachinery v0.18.2
k8s.io/client-go v0.18.2
Expand Down
7 changes: 0 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,6 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
github.com/hashicorp/go-version v1.0.0 h1:21MVWPKDphxa7ineQQTrCU5brh7OuVVAzGOCnnCPtE8=
github.com/hashicorp/go-version v1.0.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
github.com/hashicorp/golang-lru v0.0.0-20180201235237-0fb14efe8c47/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU=
Expand Down Expand Up @@ -373,10 +371,6 @@ github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrk
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/go-ps v0.0.0-20170309133038-4fdf99ab2936/go.mod h1:r1VsdOzOPt1ZSrGZWFoNhsAedKnEd6r9Np1+5blZCWk=
github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
github.com/mitchellh/gox v1.0.1 h1:x0jD3dcHk9a9xPSDN6YEL4xL6Qz0dvNYm8yZqui5chI=
github.com/mitchellh/gox v1.0.1/go.mod h1:ED6BioOGXMswlXa2zxfh/xdd5QhwYliBFn9V18Ap4z4=
github.com/mitchellh/iochan v1.0.0 h1:C+X3KsSTLFVBr/tK1eYN/vs4rJcvsiLU338UhYPJWeY=
github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY=
github.com/mitchellh/mapstructure v0.0.0-20180220230111-00c29f56e238/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
Expand Down Expand Up @@ -568,7 +562,6 @@ golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495 h1:I6A9Ag9FpEKOjcKrRNjQkPHawoXIhKyTGfvvjFAiiAk=
golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
Expand Down
83 changes: 83 additions & 0 deletions pkg/plugin/vgpu/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@ limitations under the License.

package config

// NvidiaConfig holds the NVIDIA-specific settings decoded from YAML. The
// Config type carries it under the "nvidia" key.
//
// Two YAML keys differ from their Go field names: ResourcePriority is read
// from "resourcePriorityName" and MigGeometriesList from "knownMigGeometries".
type NvidiaConfig struct {
	ResourceCountName            string `yaml:"resourceCountName"`
	ResourceMemoryName           string `yaml:"resourceMemoryName"`
	ResourceCoreName             string `yaml:"resourceCoreName"`
	ResourceMemoryPercentageName string `yaml:"resourceMemoryPercentageName"`
	// The fields below are aligned as their own group.
	ResourcePriority    string                 `yaml:"resourcePriorityName"`
	OverwriteEnv        bool                   `yaml:"overwriteEnv"`
	DefaultMemory       int32                  `yaml:"defaultMemory"`
	DefaultCores        int32                  `yaml:"defaultCores"`
	DefaultGPUNum       int32                  `yaml:"defaultGPUNum"`
	DeviceSplitCount    uint                   `yaml:"deviceSplitCount"`
	DeviceMemoryScaling float64                `yaml:"deviceMemoryScaling"`
	DeviceCoreScaling   float64                `yaml:"deviceCoreScaling"`
	DisableCoreLimit    bool                   `yaml:"disableCoreLimit"`
	MigGeometriesList   []AllowedMigGeometries `yaml:"knownMigGeometries"`
	GPUMemoryFactor     uint                   `yaml:"gpuMemoryFactor"`
}

var (
DeviceSplitCount uint
GPUMemoryFactor uint
Expand All @@ -24,3 +42,68 @@ var (
RuntimeSocketFlag string
DisableCoreLimit bool
)

// MigTemplate is one entry of a MIG geometry as written in YAML: a template
// name together with its memory and count values.
// NOTE(review): the unit of Memory is not visible here — confirm with callers.
type MigTemplate struct {
	Name   string `yaml:"name"`
	Memory int32  `yaml:"memory"`
	Count  int32  `yaml:"count"`
}

// MigTemplateUsage is the JSON form of a single MIG template together with a
// flag telling whether it is in use. Zero-valued fields are omitted from the
// encoded JSON.
type MigTemplateUsage struct {
	Name   string `json:"name,omitempty"`
	Memory int32  `json:"memory,omitempty"`
	InUse  bool   `json:"inuse,omitempty"`
}

// Geometry is an ordered list of MigTemplate entries.
type Geometry []MigTemplate

// MIGS is an ordered list of MigTemplateUsage entries.
type MIGS []MigTemplateUsage

// MigInUse pairs an index with the usage list that belongs to it.
// NOTE(review): Index presumably selects a geometry — confirm with callers.
type MigInUse struct {
	Index     int32
	UsageList MIGS
}

// AllowedMigGeometries lists, for a set of device models, the geometries
// decoded from the "allowedGeometries" YAML key.
type AllowedMigGeometries struct {
	Models     []string   `yaml:"models"`
	Geometries []Geometry `yaml:"allowedGeometries"`
}

// Config is the top-level document of the device configuration YAML. Its only
// section is the NVIDIA configuration stored under the "nvidia" key.
type Config struct {
	NvidiaConfig NvidiaConfig `yaml:"nvidia"`
}

// MigPartedSpec is a versioned collection of named MIG configurations. It can
// be encoded as either JSON or YAML under the same keys.
// NOTE(review): the layout looks like the mig-parted config format — confirm.
type MigPartedSpec struct {
	Version    string                        `json:"version" yaml:"version"`
	MigConfigs map[string]MigConfigSpecSlice `json:"mig-configs,omitempty" yaml:"mig-configs,omitempty"`
}

// MigConfigSpec defines the spec to declare the desired MIG configuration for
// a set of GPUs. DeviceFilter is left untyped, so its decoded shape depends on
// the input document.
type MigConfigSpec struct {
	DeviceFilter interface{}      `json:"device-filter,omitempty" yaml:"device-filter,flow,omitempty"`
	Devices      []int32          `json:"devices" yaml:"devices,flow"`
	MigEnabled   bool             `json:"mig-enabled" yaml:"mig-enabled"`
	MigDevices   map[string]int32 `json:"mig-devices" yaml:"mig-devices"`
}

// MigConfigSpecSlice represents a slice of 'MigConfigSpec'.
type MigConfigSpecSlice []MigConfigSpec

// FilterDevice selects devices either by UUID or by index.
type FilterDevice struct {
	// UUID is the device ID.
	UUID []string `json:"uuid"`
	// Index is the device index.
	Index []uint `json:"index"`
}

// DevicePluginConfigs is the JSON document holding per-node device plugin
// settings. Each element of Nodeconfig carries the settings for the node
// named by its "name" key.
type DevicePluginConfigs struct {
	Nodeconfig []struct {
		Name                string        `json:"name"`
		OperatingMode       string        `json:"operatingmode"`
		Devicememoryscaling float64       `json:"devicememoryscaling"`
		Devicecorescaling   float64       `json:"devicecorescaling"`
		Devicesplitcount    uint          `json:"devicesplitcount"`
		Migstrategy         string        `json:"migstrategy"`
		FilterDevice        *FilterDevice `json:"filterdevices"`
	} `json:"nodeconfig"`
}
78 changes: 71 additions & 7 deletions pkg/plugin/vgpu/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package vgpu

import (
"encoding/json"
"errors"
"fmt"
"log"
Expand Down Expand Up @@ -65,8 +66,11 @@ type NvidiaDevicePlugin struct {
deviceListEnvvar string
allocatePolicy gpuallocator.Policy
socket string
schedulerConfig config.NvidiaConfig
operatingMode string

virtualDevices []*pluginapi.Device
migCurrent config.MigPartedSpec

server *grpc.Server
cachedDevices []*Device
Expand All @@ -76,21 +80,81 @@ type NvidiaDevicePlugin struct {
migStrategy string
}

// DevicePluginFilterDevice lists the devices that the device plugin should
// filter out and not register. It stays nil until readFromConfigFile finds a
// matching node entry whose filter names at least one UUID or index.
var DevicePluginFilterDevice *config.FilterDevice

// readFromConfigFile applies per-node overrides from /config/config.json to
// sConfig and returns the operating mode for this node.
//
// The file is decoded as config.DevicePluginConfigs. Every nodeconfig entry
// whose name equals the NODE_NAME environment variable is applied in file
// order: positive devicememoryscaling, devicecorescaling and devicesplitcount
// values overwrite the matching sConfig fields, a filterdevices entry that
// names at least one UUID or index is stored in DevicePluginFilterDevice, and
// a non-empty operatingmode replaces the default mode "hami-core".
//
// On a read or decode failure it returns an empty mode and the wrapped error;
// sConfig is left untouched in that case.
func readFromConfigFile(sConfig *config.NvidiaConfig) (string, error) {
	const configPath = "/config/config.json"

	jsonbyte, err := os.ReadFile(configPath)
	if err != nil {
		return "", fmt.Errorf("reading %s: %w", configPath, err)
	}
	var deviceConfigs config.DevicePluginConfigs
	if err := json.Unmarshal(jsonbyte, &deviceConfigs); err != nil {
		return "", fmt.Errorf("decoding %s: %w", configPath, err)
	}
	klog.Infof("Device Plugin Configs: %v", deviceConfigs)

	mode := "hami-core"
	// The node name cannot change while the loop runs, so read it once.
	nodeName := os.Getenv("NODE_NAME")
	for _, val := range deviceConfigs.Nodeconfig {
		if val.Name != nodeName {
			continue
		}
		klog.Infof("Reading config from file %s", val.Name)
		// Zero means "not set" for the numeric overrides, so only positive
		// values replace what sConfig already holds.
		if val.Devicememoryscaling > 0 {
			sConfig.DeviceMemoryScaling = val.Devicememoryscaling
		}
		if val.Devicecorescaling > 0 {
			sConfig.DeviceCoreScaling = val.Devicecorescaling
		}
		if val.Devicesplitcount > 0 {
			sConfig.DeviceSplitCount = val.Devicesplitcount
		}
		if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) {
			DevicePluginFilterDevice = val.FilterDevice
		}
		if len(val.OperatingMode) > 0 {
			mode = val.OperatingMode
		}
		klog.Infof("FilterDevice: %v", val.FilterDevice)
	}
	return mode, nil
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(resourceName string, deviceCache *DeviceCache, allocatePolicy gpuallocator.Policy, socket string) *NvidiaDevicePlugin {
return &NvidiaDevicePlugin{
deviceCache: deviceCache,
resourceName: resourceName,
allocatePolicy: allocatePolicy,
socket: socket,
migStrategy: "none",

configs, err := util.LoadConfigFromCM("volcano-vgpu-device-config")
if err != nil {
klog.InfoS("configMap not found", err.Error())
}
nvidiaConfig := config.NvidiaConfig{}
if configs != nil {
nvidiaConfig = configs.NvidiaConfig
}
nvidiaConfig.DeviceSplitCount = config.DeviceSplitCount
nvidiaConfig.DeviceCoreScaling = config.DeviceCoresScaling
nvidiaConfig.GPUMemoryFactor = config.GPUMemoryFactor
mode, err := readFromConfigFile(&nvidiaConfig)
if err != nil {
klog.InfoS("readFrom device cm error", err.Error())
return nil
}
klog.Infoln("Loaded config=", nvidiaConfig)
dp := &NvidiaDevicePlugin{
deviceCache: deviceCache,
resourceName: resourceName,
allocatePolicy: allocatePolicy,
socket: socket,
migStrategy: "none",
operatingMode: mode,
schedulerConfig: nvidiaConfig,
// These will be reinitialized every
// time the plugin server is restarted.
server: nil,
health: nil,
stop: nil,
}
return dp
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
Expand Down
20 changes: 12 additions & 8 deletions pkg/plugin/vgpu/util/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.

package util

import "volcano.sh/k8s-device-plugin/pkg/plugin/vgpu/config"

const (
AssignedTimeAnnotations = "volcano.sh/vgpu-time"
AssignedIDsAnnotations = "volcano.sh/vgpu-ids-new"
Expand Down Expand Up @@ -89,12 +91,14 @@ type ContainerDevices []ContainerDevice
// PodDevices is a list of ContainerDevices values.
// NOTE(review): presumably one entry per container of a pod — confirm against callers.
type PodDevices []ContainerDevices

type DeviceInfo struct {
Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
Count int32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"`
Devmem int32 `protobuf:"varint,3,opt,name=devmem,proto3" json:"devmem,omitempty"`
Type string `protobuf:"bytes,4,opt,name=type,proto3" json:"type,omitempty"`
Health bool `protobuf:"varint,5,opt,name=health,proto3" json:"health,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
Count int32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"`
Devmem int32 `protobuf:"varint,3,opt,name=devmem,proto3" json:"devmem,omitempty"`
Type string `protobuf:"bytes,4,opt,name=type,proto3" json:"type,omitempty"`
Health bool `protobuf:"varint,5,opt,name=health,proto3" json:"health,omitempty"`
Mode string `json:"mode,omitempty"`
MIGTemplate []config.Geometry `json:"migtemplate,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
36 changes: 36 additions & 0 deletions pkg/plugin/vgpu/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ import (
"strconv"
"strings"

"gopkg.in/yaml.v2"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8stypes "k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"volcano.sh/k8s-device-plugin/pkg/lock"
"volcano.sh/k8s-device-plugin/pkg/plugin/vgpu/config"
)

var DevicesToHandle []string
Expand Down Expand Up @@ -337,3 +339,37 @@ func PatchPodAnnotations(pod *v1.Pod, annotations map[string]string) error {
}
return err
}

// LoadConfigFromCM fetches the ConfigMap named cmName and decodes its
// "device-config.yaml" entry into a config.Config.
//
// The ConfigMap is looked up in the "kube-system" namespace first. If that
// lookup fails for any reason, "volcano-system" is tried, and only the error
// of that second lookup is returned. An error is also returned when the
// ConfigMap has no "device-config.yaml" key or when its content is not valid
// YAML for config.Config.
func LoadConfigFromCM(cmName string) (*config.Config, error) {
	// NOTE(review): whatever NewClient returns is discarded here; if it can
	// fail, GetClient below may hand back an unusable client — confirm.
	lock.NewClient()
	cm, err := lock.GetClient().CoreV1().ConfigMaps("kube-system").Get(context.Background(), cmName, metav1.GetOptions{})
	if err != nil {
		cm, err = lock.GetClient().CoreV1().ConfigMaps("volcano-system").Get(context.Background(), cmName, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
	}
	data, ok := cm.Data["device-config.yaml"]
	if !ok {
		// Name the key that was actually looked up so the message can be
		// matched against the ConfigMap contents.
		return nil, errors.New("device-config.yaml not found")
	}
	var yamlData config.Config
	if err := yaml.Unmarshal([]byte(data), &yamlData); err != nil {
		return nil, err
	}
	return &yamlData, nil
}

// LoadConfig reads the YAML file at path and decodes it into a config.Config.
// It returns the read or decode error unchanged and a nil config on failure.
func LoadConfig(path string) (*config.Config, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}
	parsed := new(config.Config)
	if decodeErr := yaml.Unmarshal(raw, parsed); decodeErr != nil {
		return nil, decodeErr
	}
	return parsed, nil
}
Loading

0 comments on commit 2e0c4f9

Please sign in to comment.