From a02335fc6a93466d4b1aad4e713690b5c73cfb84 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 8 Aug 2022 15:01:41 +0300 Subject: [PATCH 01/13] Add "prefix" option to GPU plugin for scalability testing Devices can be faked for scalability testing when non-standard paths are used (GPU plugin code assumes container paths to match host paths, and container runtime prevents creating fake files under real paths). Note: If one wants to run both normal GPU plugin and faked one in same cluster, all nodes providing fake "i915" resources should be labeled differently from ones with real GPU plugin + devices, so that real GPU workloads can be limited to correct nodes with a suitable nodeSelector. Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/gpu_plugin.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 9ecb9d003..eef2d9239 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -1,4 +1,4 @@ -// Copyright 2017-2021 Intel Corporation. All Rights Reserved. +// Copyright 2017-2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -367,8 +367,10 @@ func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi } func main() { + var prefix string var opts cliOptions + flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable 'i915_monitoring' (= all GPUs) resource") flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") @@ -393,7 +395,15 @@ func main() { klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy) - plugin := newDevicePlugin(sysfsDrmDirectory, devfsDriDirectory, opts) + var sysfs, devfs string + if prefix != "" { + sysfs = prefix + sysfsDrmDirectory + devfs = prefix + devfsDriDirectory + } else { + sysfs = sysfsDrmDirectory + devfs = devfsDriDirectory + } + plugin := newDevicePlugin(sysfs, devfs, opts) manager := dpapi.NewManager(namespace, plugin) manager.Run() } From 460fce14a8b031ebaf3b9ae555aa9f9eec3928ca Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Thu, 11 Aug 2022 16:11:50 +0300 Subject: [PATCH 02/13] More detailed log for number of found GPU devices / resource types Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/gpu_plugin.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index eef2d9239..bd39f9499 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -240,9 +240,13 @@ func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error { klog.Warning("Failed to scan: ", err) } - found := len(devTree) + found := 0 + for key := range devTree { + found += len(devTree[key]) + } if found != previouslyFound { - klog.V(1).Info("GPU scan update: devices found: ", found) + klog.V(1).Infof("GPU scan update: %d device resources (with %dx sharing) of 
%d types found", + found, dp.options.sharedDevNum, len(devTree)) previouslyFound = found } From f49ca25b0e1d9705a4916ad451b002bc74ea1b5e Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 8 Aug 2022 15:09:58 +0300 Subject: [PATCH 03/13] Add code for generating fake GPU sysfs + devfs files Based on input JSON file --- cmd/gpu_plugin/fakedev/generator.go | 256 ++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 cmd/gpu_plugin/fakedev/generator.go diff --git a/cmd/gpu_plugin/fakedev/generator.go b/cmd/gpu_plugin/fakedev/generator.go new file mode 100644 index 000000000..e7bd82dae --- /dev/null +++ b/cmd/gpu_plugin/fakedev/generator.go @@ -0,0 +1,256 @@ +// Copyright 2021-2022 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//--------------------------------------------------------------- +// sysfs SPECIFICATION +// +// sys/class/drm/cardX/ +// sys/class/drm/cardX/lmem_total_bytes (gpu memory size, number) +// sys/class/drm/cardX/device/ +// sys/class/drm/cardX/device/vendor (0x8086) +// sys/class/drm/cardX/device/sriov_numvfs (PF only, number of VF GPUs, number) +// sys/class/drm/cardX/device/drm/ +// sys/class/drm/cardX/device/drm/cardX/ +// sys/class/drm/cardX/device/drm/renderD1XX/ +// sys/class/drm/cardX/device/numa_node (Numa node index[1], number) +// [1] indexing these: /sys/devices/system/node/nodeX/ +//--------------------------------------------------------------- +// devfs SPECIFICATION +// +// dev/dri/cardX +// dev/dri/renderD1XX +//--------------------------------------------------------------- + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "log" + "os" +) + +const ( + dirMode = 0775 + fileMode = 0644 + cardBase = 0 + renderBase = 128 + maxDevs = 128 + sysfsPath = "sys" + devfsPath = "dev" + mib = 1024.0 * 1024.0 +) + +var verbose bool + +type genOptions struct { + Info string // verbal config description + DevCount int // how many devices to fake + TilesPerDev int // per-device tile count + DevMemSize int // available per-device device-local memory, in bytes + DevsPerNode int // How many devices per Numa node + VfsPerPf int // How many SR-IOV VFs per PF + Capabilities map[string]string // device capabilities mapping for NFD hook + // fields for counting what was generated + files int + dirs int +} + +func addSysfsDriTree(root string, opts *genOptions, i int) error { + card := cardBase + i + base := fmt.Sprintf("%s/class/drm/card%d", root, card) + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } + opts.dirs++ + + data := []byte(fmt.Sprintf("%d", opts.DevMemSize)) + file := fmt.Sprintf("%s/lmem_total_bytes", base) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + path := 
fmt.Sprintf("%s/device/drm/card%d", base, card) + if err := os.MkdirAll(path, dirMode); err != nil { + return err + } + opts.dirs++ + + path = fmt.Sprintf("%s/device/drm/renderD%d", base, renderBase+i) + if err := os.Mkdir(path, dirMode); err != nil { + return err + } + opts.dirs++ + + data = []byte("0x8086") + file = fmt.Sprintf("%s/device/vendor", base) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + node := 0 + if opts.DevsPerNode > 0 { + node = i / opts.DevsPerNode + } + data = []byte(fmt.Sprintf("%d", node)) + file = fmt.Sprintf("%s/device/numa_node", base) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + if opts.VfsPerPf > 0 && i%(opts.VfsPerPf+1) == 0 { + data = []byte(fmt.Sprintf("%d", opts.VfsPerPf)) + file = fmt.Sprintf("%s/device/sriov_numvfs", base) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + } + for tile := 0; tile < opts.TilesPerDev; tile++ { + path := fmt.Sprintf("%s/gt/gt%d", base, tile) + if err := os.MkdirAll(path, dirMode); err != nil { + return err + } + opts.dirs++ + } + return nil +} + +func addDevfsDriTree(root string, opts *genOptions, i int) error { + base := fmt.Sprintf("%s/dri", root) + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } + opts.dirs++ + + data := []byte("fakegpu") + file := fmt.Sprintf("%s/card%d", base, cardBase+i) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + file = fmt.Sprintf("%s/renderD%d", base, renderBase+i) + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + return nil +} + +func addDebugfsDriTree(root string, opts *genOptions, i int) error { + base := fmt.Sprintf("%s/kernel/debug/dri/%d", root, i) + os.MkdirAll(base, dirMode) + opts.dirs++ + + path := fmt.Sprintf("%s/i915_capabilities", base) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 
fileMode) + if err != nil { + return err + } + opts.files++ + defer f.Close() + + // keys are in random order which provides extra testing for NFD label parsing code + for key, value := range opts.Capabilities { + line := fmt.Sprintf("%s: %s\n", key, value) + if _, err = f.WriteString(line); err != nil { + return err + } + } + return nil +} + +// generateDriFiles generarates the fake sysfs + debugfs + devfs dirs & files according to given options +func generateDriFiles(opts genOptions) { + entries, _ := os.ReadDir(sysfsPath) + if len(entries) > 0 { + log.Printf("WARN: '%s' has already %d entries", sysfsPath, len(entries)) + } + // real devfs entries are needed, so check just dri subdir + entries, _ = os.ReadDir(devfsPath + "/dri") + if len(entries) > 0 { + log.Printf("WARN: '%s/dri' has already %d entries", devfsPath, len(entries)) + } + log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", + sysfsPath, devfsPath) + if opts.Info != "" { + log.Println(opts.Info) + } + opts.dirs, opts.files = 0, 0 + for i := 0; i < opts.DevCount; i++ { + if err := addSysfsDriTree(sysfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: sysfs tree generation failed: %v", err) + } + if err := addDebugfsDriTree(sysfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: debugfs tree generation failed: %v", err) + } + if err := addDevfsDriTree(devfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: devfs tree generation failed: %v", err) + } + } + log.Printf("Done, created %d dirs and %d file entries.", opts.dirs, opts.files) +} + +// getOptions parses options from given JSON file, validates and returns them +func getOptions(name string) genOptions { + if name == "" { + log.Fatal("ERROR: no fake device spec provided") + } + var err error + var data []byte + if data, err = os.ReadFile(name); err != nil { + log.Fatalf("ERROR: reading JSON spec file '%s' failed: %v", name, err) + } + if verbose { + log.Printf("Using fake device spec: %v\n", string(data)) 
+ } + var opts genOptions + if err = json.Unmarshal(data, &opts); err != nil { + log.Fatalf("ERROR: Unmarshaling JSON spec file '%s' failed: %v", name, err) + } + if opts.DevCount < 1 || opts.DevCount > maxDevs { + log.Fatalf("ERROR: invalid device count: 1 <= %d <= %d", opts.DevCount, maxDevs) + } + if opts.VfsPerPf > 0 { + if opts.TilesPerDev > 0 || opts.DevsPerNode > 0 { + log.Fatalf("ERROR: SR-IOV VFs (%d) with device tiles (%d) or Numa nodes (%d) is unsupported for faking", + opts.VfsPerPf, opts.TilesPerDev, opts.DevsPerNode) + } + if opts.DevCount%(opts.VfsPerPf+1) != 0 { + log.Fatalf("ERROR: %d devices cannot be evenly split to between set of 1 SR-IOV PF + %d VFs", + opts.DevCount, opts.VfsPerPf) + } + } + if opts.DevsPerNode > opts.DevCount { + log.Fatalf("ERROR: DevsPerNode (%d) > DevCount (%d)", opts.DevsPerNode, opts.DevCount) + } + if opts.DevMemSize%mib != 0 { + log.Fatalf("ERROR: Invalid memory size (%f MiB), not even MiB", float64(opts.DevMemSize)/mib) + } + return opts +} + +func main() { + var name string + flag.StringVar(&name, "json", "", "JSON spec for fake device sysfs, debugfs and devfs content") + flag.BoolVar(&verbose, "verbose", false, "More verbose output") + flag.Parse() + + generateDriFiles(getOptions(name)) +} From 7ecd8a3bbfda59467ab309d2063d982ff7b92303 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 8 Aug 2022 17:51:06 +0300 Subject: [PATCH 04/13] Remove pre-existing fake sysfs & devfs content + more info Fake devfs directory is mounted from host so OCI runtime can "mount" device files also to workloads requesting fake devices. This means that those files can persist over fake GPU plugin life-time, so earlier files need to be removed, as they may not match. Also, DaemonSet restarts failing init containers, so errors about directories generated on previous generator run would prevent getting logs of the real error from first generator run. 
--- cmd/gpu_plugin/fakedev/generator.go | 30 ++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/cmd/gpu_plugin/fakedev/generator.go b/cmd/gpu_plugin/fakedev/generator.go index e7bd82dae..fb1932c9e 100644 --- a/cmd/gpu_plugin/fakedev/generator.go +++ b/cmd/gpu_plugin/fakedev/generator.go @@ -38,6 +38,7 @@ import ( "encoding/json" "flag" "fmt" + "io/fs" "log" "os" ) @@ -178,30 +179,37 @@ func addDebugfsDriTree(root string, opts *genOptions, i int) error { // generateDriFiles generarates the fake sysfs + debugfs + devfs dirs & files according to given options func generateDriFiles(opts genOptions) { - entries, _ := os.ReadDir(sysfsPath) - if len(entries) > 0 { - log.Printf("WARN: '%s' has already %d entries", sysfsPath, len(entries)) + if opts.Info != "" { + log.Printf("Config: '%s'", opts.Info) } // real devfs entries are needed, so check just dri subdir - entries, _ = os.ReadDir(devfsPath + "/dri") + path := devfsPath + "/dri" + entries, _ := os.ReadDir(path) + if len(entries) > 0 { + if entries[0].Type()&fs.ModeDevice != 0 { + log.Fatalf("ERROR: real device(s) in '%s' - trying to overwrite real devfs?", path) + } + log.Printf("WARN: removing already existing %d entries from '%s'", len(entries), path) + os.RemoveAll(path) + } + entries, _ = os.ReadDir(sysfsPath) if len(entries) > 0 { - log.Printf("WARN: '%s/dri' has already %d entries", devfsPath, len(entries)) + log.Printf("WARN: removing already existing %d entries from '%s'", len(entries), sysfsPath) + os.RemoveAll(sysfsPath) } log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", sysfsPath, devfsPath) - if opts.Info != "" { - log.Println(opts.Info) - } + opts.dirs, opts.files = 0, 0 for i := 0; i < opts.DevCount; i++ { if err := addSysfsDriTree(sysfsPath, &opts, i); err != nil { - log.Fatalf("ERROR: sysfs tree generation failed: %v", err) + log.Fatalf("ERROR: dev-%d sysfs tree generation failed: %v", i, err) } if err := 
addDebugfsDriTree(sysfsPath, &opts, i); err != nil { - log.Fatalf("ERROR: debugfs tree generation failed: %v", err) + log.Fatalf("ERROR: dev-%d debugfs tree generation failed: %v", i, err) } if err := addDevfsDriTree(devfsPath, &opts, i); err != nil { - log.Fatalf("ERROR: devfs tree generation failed: %v", err) + log.Fatalf("ERROR: dev-%d devfs tree generation failed: %v", i, err) } } log.Printf("Done, created %d dirs and %d file entries.", opts.dirs, opts.files) From ca83c870a4e81d51d8d233ed51eed42ee3f27c71 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 10 Aug 2022 19:03:41 +0300 Subject: [PATCH 05/13] Container runtime requires device files to be real devices Represent fake GPU devices with null devices: https://www.kernel.org/doc/Documentation/admin-guide/devices.txt The real devfs check also needed changing, and removal warnings were simplified, as there's always just one entry. --- cmd/gpu_plugin/fakedev/generator.go | 36 +++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/cmd/gpu_plugin/fakedev/generator.go b/cmd/gpu_plugin/fakedev/generator.go index fb1932c9e..e63650785 100644 --- a/cmd/gpu_plugin/fakedev/generator.go +++ b/cmd/gpu_plugin/fakedev/generator.go @@ -38,9 +38,10 @@ import ( "encoding/json" "flag" "fmt" - "io/fs" "log" "os" + + "golang.org/x/sys/unix" ) const ( @@ -52,6 +53,10 @@ const ( sysfsPath = "sys" devfsPath = "dev" mib = 1024.0 * 1024.0 + // null device major, minor on linux + devNullMajor = 1 + devNullMinor = 3 + devNullType = unix.S_IFCHR ) var verbose bool @@ -67,6 +72,7 @@ type genOptions struct { // fields for counting what was generated files int dirs int + devs int } func addSysfsDriTree(root string, opts *genOptions, i int) error { @@ -139,18 +145,20 @@ func addDevfsDriTree(root string, opts *genOptions, i int) error { } opts.dirs++ - data := []byte("fakegpu") + mode := uint32(fileMode | devNullType) + devid := int(unix.Mkdev(uint32(devNullMajor), uint32(devNullMinor))) + file := 
fmt.Sprintf("%s/card%d", base, cardBase+i) - if err := os.WriteFile(file, data, fileMode); err != nil { + if err := unix.Mknod(file, mode, devid); err != nil { return err } - opts.files++ + opts.devs++ file = fmt.Sprintf("%s/renderD%d", base, renderBase+i) - if err := os.WriteFile(file, data, fileMode); err != nil { + if err := unix.Mknod(file, mode, devid); err != nil { return err } - opts.files++ + opts.devs++ return nil } @@ -182,19 +190,17 @@ func generateDriFiles(opts genOptions) { if opts.Info != "" { log.Printf("Config: '%s'", opts.Info) } - // real devfs entries are needed, so check just dri subdir - path := devfsPath + "/dri" - entries, _ := os.ReadDir(path) + entries, _ := os.ReadDir(devfsPath) if len(entries) > 0 { - if entries[0].Type()&fs.ModeDevice != 0 { - log.Fatalf("ERROR: real device(s) in '%s' - trying to overwrite real devfs?", path) + if len(entries) > 1 || entries[0].Name() != "dri" { + log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", devfsPath, entries[0].Name()) } - log.Printf("WARN: removing already existing %d entries from '%s'", len(entries), path) - os.RemoveAll(path) + log.Printf("WARN: removing already existing %s'", devfsPath) + os.RemoveAll(devfsPath) } entries, _ = os.ReadDir(sysfsPath) if len(entries) > 0 { - log.Printf("WARN: removing already existing %d entries from '%s'", len(entries), sysfsPath) + log.Printf("WARN: removing already existing '%s'", sysfsPath) os.RemoveAll(sysfsPath) } log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", @@ -212,7 +218,7 @@ func generateDriFiles(opts genOptions) { log.Fatalf("ERROR: dev-%d devfs tree generation failed: %v", i, err) } } - log.Printf("Done, created %d dirs and %d file entries.", opts.dirs, opts.files) + log.Printf("Done, created %d dirs, %d devices and %d files.", opts.dirs, opts.devs, opts.files) } // getOptions parses options from given JSON file, validates and returns them From 
1e0e04d7a63a17d3cd26ce4a8b791c1093726af7 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 19 Aug 2022 20:15:26 +0300 Subject: [PATCH 06/13] Apply golang-ci-lint suggestions to device generator Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/fakedev/generator.go | 46 +++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/cmd/gpu_plugin/fakedev/generator.go b/cmd/gpu_plugin/fakedev/generator.go index e63650785..0f0d54bf5 100644 --- a/cmd/gpu_plugin/fakedev/generator.go +++ b/cmd/gpu_plugin/fakedev/generator.go @@ -53,7 +53,7 @@ const ( sysfsPath = "sys" devfsPath = "dev" mib = 1024.0 * 1024.0 - // null device major, minor on linux + // null device major, minor on linux. devNullMajor = 1 devNullMinor = 3 devNullType = unix.S_IFCHR @@ -62,13 +62,13 @@ const ( var verbose bool type genOptions struct { + Capabilities map[string]string // device capabilities mapping for NFD hook Info string // verbal config description DevCount int // how many devices to fake TilesPerDev int // per-device tile count DevMemSize int // available per-device device-local memory, in bytes DevsPerNode int // How many devices per Numa node VfsPerPf int // How many SR-IOV VFs per PF - Capabilities map[string]string // device capabilities mapping for NFD hook // fields for counting what was generated files int dirs int @@ -78,6 +78,7 @@ type genOptions struct { func addSysfsDriTree(root string, opts *genOptions, i int) error { card := cardBase + i base := fmt.Sprintf("%s/class/drm/card%d", root, card) + if err := os.MkdirAll(base, dirMode); err != nil { return err } @@ -85,6 +86,7 @@ func addSysfsDriTree(root string, opts *genOptions, i int) error { data := []byte(fmt.Sprintf("%d", opts.DevMemSize)) file := fmt.Sprintf("%s/lmem_total_bytes", base) + if err := os.WriteFile(file, data, fileMode); err != nil { return err } @@ -104,6 +106,7 @@ func addSysfsDriTree(root string, opts *genOptions, i int) error { data = []byte("0x8086") file = 
fmt.Sprintf("%s/device/vendor", base) + if err := os.WriteFile(file, data, fileMode); err != nil { return err } @@ -113,8 +116,10 @@ func addSysfsDriTree(root string, opts *genOptions, i int) error { if opts.DevsPerNode > 0 { node = i / opts.DevsPerNode } + data = []byte(fmt.Sprintf("%d", node)) file = fmt.Sprintf("%s/device/numa_node", base) + if err := os.WriteFile(file, data, fileMode); err != nil { return err } @@ -123,11 +128,13 @@ func addSysfsDriTree(root string, opts *genOptions, i int) error { if opts.VfsPerPf > 0 && i%(opts.VfsPerPf+1) == 0 { data = []byte(fmt.Sprintf("%d", opts.VfsPerPf)) file = fmt.Sprintf("%s/device/sriov_numvfs", base) + if err := os.WriteFile(file, data, fileMode); err != nil { return err } opts.files++ } + for tile := 0; tile < opts.TilesPerDev; tile++ { path := fmt.Sprintf("%s/gt/gt%d", base, tile) if err := os.MkdirAll(path, dirMode); err != nil { @@ -135,6 +142,7 @@ func addSysfsDriTree(root string, opts *genOptions, i int) error { } opts.dirs++ } + return nil } @@ -159,21 +167,25 @@ func addDevfsDriTree(root string, opts *genOptions, i int) error { return err } opts.devs++ + return nil } func addDebugfsDriTree(root string, opts *genOptions, i int) error { base := fmt.Sprintf("%s/kernel/debug/dri/%d", root, i) - os.MkdirAll(base, dirMode) + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } opts.dirs++ path := fmt.Sprintf("%s/i915_capabilities", base) f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, fileMode) + if err != nil { return err } - opts.files++ defer f.Close() + opts.files++ // keys are in random order which provides extra testing for NFD label parsing code for key, value := range opts.Capabilities { @@ -182,27 +194,32 @@ func addDebugfsDriTree(root string, opts *genOptions, i int) error { return err } } + return nil } -// generateDriFiles generarates the fake sysfs + debugfs + devfs dirs & files according to given options +// generateDriFiles generates the fake sysfs + debugfs + devfs 
dirs & files according to given options. func generateDriFiles(opts genOptions) { if opts.Info != "" { log.Printf("Config: '%s'", opts.Info) } + entries, _ := os.ReadDir(devfsPath) if len(entries) > 0 { if len(entries) > 1 || entries[0].Name() != "dri" { log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", devfsPath, entries[0].Name()) } + log.Printf("WARN: removing already existing %s'", devfsPath) os.RemoveAll(devfsPath) } + entries, _ = os.ReadDir(sysfsPath) if len(entries) > 0 { log.Printf("WARN: removing already existing '%s'", sysfsPath) os.RemoveAll(sysfsPath) } + log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", sysfsPath, devfsPath) @@ -211,9 +228,11 @@ func generateDriFiles(opts genOptions) { if err := addSysfsDriTree(sysfsPath, &opts, i); err != nil { log.Fatalf("ERROR: dev-%d sysfs tree generation failed: %v", i, err) } + if err := addDebugfsDriTree(sysfsPath, &opts, i); err != nil { log.Fatalf("ERROR: dev-%d debugfs tree generation failed: %v", i, err) } + if err := addDevfsDriTree(devfsPath, &opts, i); err != nil { log.Fatalf("ERROR: dev-%d devfs tree generation failed: %v", i, err) } @@ -221,47 +240,56 @@ func generateDriFiles(opts genOptions) { log.Printf("Done, created %d dirs, %d devices and %d files.", opts.dirs, opts.devs, opts.files) } -// getOptions parses options from given JSON file, validates and returns them +// getOptions parses options from given JSON file, validates and returns them. 
func getOptions(name string) genOptions { if name == "" { log.Fatal("ERROR: no fake device spec provided") } - var err error - var data []byte - if data, err = os.ReadFile(name); err != nil { + + data, err := os.ReadFile(name) + if err != nil { log.Fatalf("ERROR: reading JSON spec file '%s' failed: %v", name, err) } + if verbose { log.Printf("Using fake device spec: %v\n", string(data)) } + var opts genOptions if err = json.Unmarshal(data, &opts); err != nil { log.Fatalf("ERROR: Unmarshaling JSON spec file '%s' failed: %v", name, err) } + if opts.DevCount < 1 || opts.DevCount > maxDevs { log.Fatalf("ERROR: invalid device count: 1 <= %d <= %d", opts.DevCount, maxDevs) } + if opts.VfsPerPf > 0 { if opts.TilesPerDev > 0 || opts.DevsPerNode > 0 { log.Fatalf("ERROR: SR-IOV VFs (%d) with device tiles (%d) or Numa nodes (%d) is unsupported for faking", opts.VfsPerPf, opts.TilesPerDev, opts.DevsPerNode) } + if opts.DevCount%(opts.VfsPerPf+1) != 0 { log.Fatalf("ERROR: %d devices cannot be evenly split to between set of 1 SR-IOV PF + %d VFs", opts.DevCount, opts.VfsPerPf) } } + if opts.DevsPerNode > opts.DevCount { log.Fatalf("ERROR: DevsPerNode (%d) > DevCount (%d)", opts.DevsPerNode, opts.DevCount) } + if opts.DevMemSize%mib != 0 { log.Fatalf("ERROR: Invalid memory size (%f MiB), not even MiB", float64(opts.DevMemSize)/mib) } + return opts } func main() { var name string + flag.StringVar(&name, "json", "", "JSON spec for fake device sysfs, debugfs and devfs content") flag.BoolVar(&verbose, "verbose", false, "More verbose output") flag.Parse() From e5403064fecf50794b44f48bb40e47eedbe09f32 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 8 Aug 2022 16:12:05 +0300 Subject: [PATCH 07/13] Use normal GPU plugin deployment pod spec as base With latest devices release. 
--- .../fakedev/intel-gpu-plugin-fake.yaml | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml diff --git a/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml new file mode 100644 index 000000000..9e2ab1fd3 --- /dev/null +++ b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml @@ -0,0 +1,76 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: intel-gpu-plugin + namespace: inteldeviceplugins-system + labels: + app: intel-gpu-plugin +spec: + selector: + matchLabels: + app: intel-gpu-plugin + template: + metadata: + labels: + app: intel-gpu-plugin + spec: + serviceAccountName: resource-reader-sa + initContainers: + - name: intel-gpu-initcontainer + image: intel/intel-gpu-initcontainer:0.24.0 + imagePullPolicy: IfNotPresent + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + volumeMounts: + - mountPath: /etc/kubernetes/node-feature-discovery/source.d/ + name: nfd-source-hooks + containers: + - name: intel-gpu-plugin + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + image: intel/intel-gpu-plugin:0.24.0 + imagePullPolicy: IfNotPresent + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + command: [ + "/usr/local/bin/intel_gpu_device_plugin", + "-shared-dev-num", "300", + "-enable-monitoring", + "-resource-manager", + "-v", "2" + ] + volumeMounts: + - name: devfs + mountPath: /dev/dri + readOnly: true + - name: sysfs + mountPath: /sys/class/drm + readOnly: true + - name: kubeletsockets + mountPath: /var/lib/kubelet/device-plugins + - name: podresources + mountPath: /var/lib/kubelet/pod-resources + volumes: + - name: devfs + hostPath: + path: /dev/dri + - name: sysfs + hostPath: + path: /sys/class/drm + - name: kubeletsockets + hostPath: + path: /var/lib/kubelet/device-plugins + - name: podresources + hostPath: + path: 
/var/lib/kubelet/pod-resources + - name: nfd-source-hooks + hostPath: + path: /etc/kubernetes/node-feature-discovery/source.d/ + type: DirectoryOrCreate + nodeSelector: + kubernetes.io/arch: amd64 From d5ff61318b05340f82bb41fcd7d2859d27374039 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 19 Aug 2022 17:37:02 +0300 Subject: [PATCH 08/13] Add 8x DG1 configMap for fake GPU device generator Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/fakedev/fake-8x-DG1.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 cmd/gpu_plugin/fakedev/fake-8x-DG1.yaml diff --git a/cmd/gpu_plugin/fakedev/fake-8x-DG1.yaml b/cmd/gpu_plugin/fakedev/fake-8x-DG1.yaml new file mode 100644 index 000000000..b49f673c5 --- /dev/null +++ b/cmd/gpu_plugin/fakedev/fake-8x-DG1.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: fake-gen-config + namespace: validation +immutable: false +data: + fakedev.json: |- + { + "Info": "8x 4 GiB DG1 [Iris Xe MAX Graphics] GPUs", + "DevCount": 8, + "DevMemSize": 4294967296, + "Capabilities": { + "platform": "fake_DG1", + "graphics version": "12.10", + "media version": "12", + "display version": "12" + } + } From 269788ea50ec084f4f495ed7d1ab74f0c020b335 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 19 Aug 2022 17:21:03 +0300 Subject: [PATCH 09/13] Switch Intel plugin pod to use faked devices Signed-off-by: Eero Tamminen --- .../fakedev/intel-gpu-plugin-fake.yaml | 56 +++++++++++++++---- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml index 9e2ab1fd3..468259c6b 100644 --- a/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml +++ b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml @@ -16,6 +16,26 @@ spec: spec: serviceAccountName: resource-reader-sa initContainers: + - name: fakedev-generator + image: intel/fakedev-generator:latest + securityContext: + runAsUser: 0 + 
readOnlyRootFilesystem: false + allowPrivilegeEscalation: false + # files are generated under CWD + workingDir: /tmp/fakedev + volumeMounts: + - name: devfs + mountPath: /tmp/fakedev/dev + readOnly: false + - name: sysfs + mountPath: /tmp/fakedev/sys + readOnly: false + - name: fake-conf + mountPath: /config + readOnly: true + # generate fake sysfs / devfs files for GPU plugin based on config + command: ["/generator", "-json", "/config/fakedev.json", "-verbose"] - name: intel-gpu-initcontainer image: intel/intel-gpu-initcontainer:0.24.0 imagePullPolicy: IfNotPresent @@ -23,8 +43,15 @@ spec: readOnlyRootFilesystem: true allowPrivilegeEscalation: false volumeMounts: - - mountPath: /etc/kubernetes/node-feature-discovery/source.d/ - name: nfd-source-hooks + - name: sysfs + mountPath: /host-sys + readOnly: true + - name: nfd-features + mountPath: /nfd + readOnly: false + workingDir: /usr/local/bin/gpu-sw + # convert generated sysfs content to NFD feature labels + command: ["sh", "-c", "./intel-gpu-nfdhook | tee /nfd/fake-gpu"] containers: - name: intel-gpu-plugin env: @@ -32,45 +59,50 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - image: intel/intel-gpu-plugin:0.24.0 - imagePullPolicy: IfNotPresent + image: intel/intel-gpu-plugin:latest securityContext: readOnlyRootFilesystem: true allowPrivilegeEscalation: false command: [ "/usr/local/bin/intel_gpu_device_plugin", - "-shared-dev-num", "300", + "-shared-dev-num", "2", + "-fake-mode", "/tmp/fakedev", "-enable-monitoring", "-resource-manager", "-v", "2" ] volumeMounts: + # combined GPU plugin / container runtime limitation: + # device files need to be mounted to same path on host and inside container - name: devfs - mountPath: /dev/dri + mountPath: /tmp/fakedev/dev readOnly: true - name: sysfs - mountPath: /sys/class/drm + mountPath: /tmp/fakedev/sys readOnly: true - name: kubeletsockets mountPath: /var/lib/kubelet/device-plugins - name: podresources mountPath: /var/lib/kubelet/pod-resources volumes: + - name: 
fake-conf + configMap: + name: fake-gen-config - name: devfs hostPath: - path: /dev/dri + path: /tmp/fakedev/dev + type: DirectoryOrCreate - name: sysfs - hostPath: - path: /sys/class/drm + emptyDir: {} - name: kubeletsockets hostPath: path: /var/lib/kubelet/device-plugins - name: podresources hostPath: path: /var/lib/kubelet/pod-resources - - name: nfd-source-hooks + - name: nfd-features hostPath: - path: /etc/kubernetes/node-feature-discovery/source.d/ + path: /etc/kubernetes/node-feature-discovery/features.d/ type: DirectoryOrCreate nodeSelector: kubernetes.io/arch: amd64 From b9038fa728846228b836563a2641f6ca67798519 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 19 Aug 2022 20:29:11 +0300 Subject: [PATCH 10/13] Apply golang-ci-lint suggestions to GPU plugin Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/gpu_plugin.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index bd39f9499..75e351ba5 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -244,9 +244,11 @@ func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error { for key := range devTree { found += len(devTree[key]) } + if found != previouslyFound { klog.V(1).Infof("GPU scan update: %d device resources (with %dx sharing) of %d types found", found, dp.options.sharedDevNum, len(devTree)) + previouslyFound = found } @@ -371,8 +373,10 @@ func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi } func main() { - var prefix string - var opts cliOptions + var ( + prefix string + opts cliOptions + ) flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable 'i915_monitoring' (= all GPUs) resource") @@ -407,6 +411,7 @@ func main() { sysfs = sysfsDrmDirectory devfs = devfsDriDirectory } + plugin := newDevicePlugin(sysfs, devfs, opts) manager := dpapi.NewManager(namespace, 
plugin) manager.Run() From ee1ac152a9fb71d7ab0c850f30c2a2f84a65ef84 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 22 Aug 2022 20:12:52 +0300 Subject: [PATCH 11/13] Trivialize GPU plugin -prefix option handling As suggested by Ukri. Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/gpu_plugin.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 75e351ba5..6fd3e428f 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -403,16 +403,7 @@ func main() { klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy) - var sysfs, devfs string - if prefix != "" { - sysfs = prefix + sysfsDrmDirectory - devfs = prefix + devfsDriDirectory - } else { - sysfs = sysfsDrmDirectory - devfs = devfsDriDirectory - } - - plugin := newDevicePlugin(sysfs, devfs, opts) + plugin := newDevicePlugin(prefix+sysfsDrmDirectory, prefix+devfsDriDirectory, opts) manager := dpapi.NewManager(namespace, plugin) manager.Run() } From dafd0795c6a4d2f82c1a6fa25c8b7b1a9bc53113 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 22 Aug 2022 20:10:09 +0300 Subject: [PATCH 12/13] Better error checks+logs for Mknod(), ReadDir() and RemoveAll() Give more detailed logging for the most likely failure, as Mknod() device node creation can fail as a normal user. Additional error checking done in the new dir removal helper function fixes Ukri's review comments. There's now an error if the to-be-removed fake sysfs has more content than expected (earlier such a check was only done for fake devfs content). 
Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/fakedev/generator.go | 49 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/cmd/gpu_plugin/fakedev/generator.go b/cmd/gpu_plugin/fakedev/generator.go index 0f0d54bf5..ca075c84e 100644 --- a/cmd/gpu_plugin/fakedev/generator.go +++ b/cmd/gpu_plugin/fakedev/generator.go @@ -36,8 +36,10 @@ package main import ( "encoding/json" + "errors" "flag" "fmt" + "io/fs" "log" "os" @@ -158,13 +160,15 @@ func addDevfsDriTree(root string, opts *genOptions, i int) error { file := fmt.Sprintf("%s/card%d", base, cardBase+i) if err := unix.Mknod(file, mode, devid); err != nil { - return err + return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w", + devNullMajor, devNullMinor, file, err) } opts.devs++ file = fmt.Sprintf("%s/renderD%d", base, renderBase+i) if err := unix.Mknod(file, mode, devid); err != nil { - return err + return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w", + devNullMajor, devNullMinor, file, err) } opts.devs++ @@ -198,28 +202,39 @@ func addDebugfsDriTree(root string, opts *genOptions, i int) error { return nil } -// generateDriFiles generates the fake sysfs + debugfs + devfs dirs & files according to given options. 
-func generateDriFiles(opts genOptions) { - if opts.Info != "" { - log.Printf("Config: '%s'", opts.Info) +func removeExistingDir(path, name string) { + entries, err := os.ReadDir(path) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + log.Fatalf("ERROR: ReadDir() failed on fake %s path '%s': %v", name, path, err) } - entries, _ := os.ReadDir(devfsPath) - if len(entries) > 0 { - if len(entries) > 1 || entries[0].Name() != "dri" { - log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", devfsPath, entries[0].Name()) - } + if len(entries) == 0 { + return + } + + if name == "sysfs" && len(entries) > 2 { + log.Fatalf("ERROR: >2 entries in '%s' - real sysfs?", path) + } - log.Printf("WARN: removing already existing %s'", devfsPath) - os.RemoveAll(devfsPath) + if name == "devfs" && (entries[0].Name() != "dri" || len(entries) > 1) { + log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", path, entries[0].Name()) } - entries, _ = os.ReadDir(sysfsPath) - if len(entries) > 0 { - log.Printf("WARN: removing already existing '%s'", sysfsPath) - os.RemoveAll(sysfsPath) + log.Printf("WARN: removing already existing fake %s path '%s'", name, path) + + if err = os.RemoveAll(path); err != nil { + log.Fatalf("ERROR: removing existing %s in '%s' failed: %v", name, path, err) + } +} + +// generateDriFiles generates the fake sysfs + debugfs + devfs dirs & files according to given options. +func generateDriFiles(opts genOptions) { + if opts.Info != "" { + log.Printf("Config: '%s'", opts.Info) } + removeExistingDir(devfsPath, "devfs") + removeExistingDir(sysfsPath, "sysfs") log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", sysfsPath, devfsPath) From 968e294639305adbce82c188b2359f5299993c46 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 22 Aug 2022 19:32:23 +0300 Subject: [PATCH 13/13] Fix -prefix option name Noticed by Tuomas. 
Signed-off-by: Eero Tamminen --- cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml index 468259c6b..34836b234 100644 --- a/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml +++ b/cmd/gpu_plugin/fakedev/intel-gpu-plugin-fake.yaml @@ -66,7 +66,7 @@ spec: command: [ "/usr/local/bin/intel_gpu_device_plugin", "-shared-dev-num", "2", - "-fake-mode", "/tmp/fakedev", + "-prefix", "/tmp/fakedev", "-enable-monitoring", "-resource-manager", "-v", "2"