From c15feea1f894e246740dd00a36a144b1e4bfa5aa Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 8 Aug 2022 15:09:58 +0300 Subject: [PATCH 1/5] Add code for generating fake GPU sysfs + devfs files To facilitate GPU plugin scalability testing on a real cluster. Pre-existing (fake) sysfs & devfs content needs to be removed first: * Fake devfs directory is mounted from host so OCI runtime can "mount" device files also to workloads requesting fake devices. This means that those files can persist over fake GPU plugin life-time, and earlier files need to be removed, as they may not match * DaemonSet restarts failing init containers, so errors about content created on previous generator run would prevent getting logs of the real error on first generator run * Before removal, check that removed directory content is as expected, to avoid accidentally removing host sysfs/devfs content (in case container was erroneously granted access to the real thing) Container runtime requires fake device files to be real devices: * Use NULL devices to represent fake GPU devices: https://www.kernel.org/doc/Documentation/admin-guide/devices.txt * Give more detailed logging for Mknod() failures as device node creation is the most likely operation to fail when container does not have the necessary access rights Created content is based on JSON config file (instead of e.g. commandline options) so that (configMap providing) it can be updated independently of the pod where generator is run. Signed-off-by: Eero Tamminen --- cmd/gpu_fakedev/gpu_fakedev.go | 313 +++++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 cmd/gpu_fakedev/gpu_fakedev.go diff --git a/cmd/gpu_fakedev/gpu_fakedev.go b/cmd/gpu_fakedev/gpu_fakedev.go new file mode 100644 index 000000000..ca075c84e --- /dev/null +++ b/cmd/gpu_fakedev/gpu_fakedev.go @@ -0,0 +1,313 @@ +// Copyright 2021-2022 Intel Corporation. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//--------------------------------------------------------------- +// sysfs SPECIFICATION +// +// sys/class/drm/cardX/ +// sys/class/drm/cardX/lmem_total_bytes (gpu memory size, number) +// sys/class/drm/cardX/device/ +// sys/class/drm/cardX/device/vendor (0x8086) +// sys/class/drm/cardX/device/sriov_numvfs (PF only, number of VF GPUs, number) +// sys/class/drm/cardX/device/drm/ +// sys/class/drm/cardX/device/drm/cardX/ +// sys/class/drm/cardX/device/drm/renderD1XX/ +// sys/class/drm/cardX/device/numa_node (Numa node index[1], number) +// [1] indexing these: /sys/devices/system/node/nodeX/ +//--------------------------------------------------------------- +// devfs SPECIFICATION +// +// dev/dri/cardX +// dev/dri/renderD1XX +//--------------------------------------------------------------- + +package main + +import ( + "encoding/json" + "errors" + "flag" + "fmt" + "io/fs" + "log" + "os" + + "golang.org/x/sys/unix" +) + +const ( + dirMode = 0775 + fileMode = 0644 + cardBase = 0 + renderBase = 128 + maxDevs = 128 + sysfsPath = "sys" + devfsPath = "dev" + mib = 1024.0 * 1024.0 + // null device major, minor on linux. 
+ devNullMajor = 1 + devNullMinor = 3 + devNullType = unix.S_IFCHR +) + +var verbose bool + +type genOptions struct { + Capabilities map[string]string // device capabilities mapping for NFD hook + Info string // verbal config description + DevCount int // how many devices to fake + TilesPerDev int // per-device tile count + DevMemSize int // available per-device device-local memory, in bytes + DevsPerNode int // How many devices per Numa node + VfsPerPf int // How many SR-IOV VFs per PF + // fields for counting what was generated + files int + dirs int + devs int +} + +func addSysfsDriTree(root string, opts *genOptions, i int) error { + card := cardBase + i + base := fmt.Sprintf("%s/class/drm/card%d", root, card) + + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } + opts.dirs++ + + data := []byte(fmt.Sprintf("%d", opts.DevMemSize)) + file := fmt.Sprintf("%s/lmem_total_bytes", base) + + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + path := fmt.Sprintf("%s/device/drm/card%d", base, card) + if err := os.MkdirAll(path, dirMode); err != nil { + return err + } + opts.dirs++ + + path = fmt.Sprintf("%s/device/drm/renderD%d", base, renderBase+i) + if err := os.Mkdir(path, dirMode); err != nil { + return err + } + opts.dirs++ + + data = []byte("0x8086") + file = fmt.Sprintf("%s/device/vendor", base) + + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + node := 0 + if opts.DevsPerNode > 0 { + node = i / opts.DevsPerNode + } + + data = []byte(fmt.Sprintf("%d", node)) + file = fmt.Sprintf("%s/device/numa_node", base) + + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + + if opts.VfsPerPf > 0 && i%(opts.VfsPerPf+1) == 0 { + data = []byte(fmt.Sprintf("%d", opts.VfsPerPf)) + file = fmt.Sprintf("%s/device/sriov_numvfs", base) + + if err := os.WriteFile(file, data, fileMode); err != nil { + return err + } + opts.files++ + } + + 
for tile := 0; tile < opts.TilesPerDev; tile++ { + path := fmt.Sprintf("%s/gt/gt%d", base, tile) + if err := os.MkdirAll(path, dirMode); err != nil { + return err + } + opts.dirs++ + } + + return nil +} + +func addDevfsDriTree(root string, opts *genOptions, i int) error { + base := fmt.Sprintf("%s/dri", root) + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } + opts.dirs++ + + mode := uint32(fileMode | devNullType) + devid := int(unix.Mkdev(uint32(devNullMajor), uint32(devNullMinor))) + + file := fmt.Sprintf("%s/card%d", base, cardBase+i) + if err := unix.Mknod(file, mode, devid); err != nil { + return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w", + devNullMajor, devNullMinor, file, err) + } + opts.devs++ + + file = fmt.Sprintf("%s/renderD%d", base, renderBase+i) + if err := unix.Mknod(file, mode, devid); err != nil { + return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w", + devNullMajor, devNullMinor, file, err) + } + opts.devs++ + + return nil +} + +func addDebugfsDriTree(root string, opts *genOptions, i int) error { + base := fmt.Sprintf("%s/kernel/debug/dri/%d", root, i) + if err := os.MkdirAll(base, dirMode); err != nil { + return err + } + opts.dirs++ + + path := fmt.Sprintf("%s/i915_capabilities", base) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, fileMode) + + if err != nil { + return err + } + defer f.Close() + opts.files++ + + // keys are in random order which provides extra testing for NFD label parsing code + for key, value := range opts.Capabilities { + line := fmt.Sprintf("%s: %s\n", key, value) + if _, err = f.WriteString(line); err != nil { + return err + } + } + + return nil +} + +func removeExistingDir(path, name string) { + entries, err := os.ReadDir(path) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + log.Fatalf("ERROR: ReadDir() failed on fake %s path '%s': %v", name, path, err) + } + + if len(entries) == 0 { + return + } + + if name == "sysfs" && 
len(entries) > 2 { + log.Fatalf("ERROR: >2 entries in '%s' - real sysfs?", path) + } + + if name == "devfs" && (entries[0].Name() != "dri" || len(entries) > 1) { + log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", path, entries[0].Name()) + } + + log.Printf("WARN: removing already existing fake %s path '%s'", name, path) + + if err = os.RemoveAll(path); err != nil { + log.Fatalf("ERROR: removing existing %s in '%s' failed: %v", name, path, err) + } +} + +// generateDriFiles generates the fake sysfs + debugfs + devfs dirs & files according to given options. +func generateDriFiles(opts genOptions) { + if opts.Info != "" { + log.Printf("Config: '%s'", opts.Info) + } + + removeExistingDir(devfsPath, "devfs") + removeExistingDir(sysfsPath, "sysfs") + log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'", + sysfsPath, devfsPath) + + opts.dirs, opts.files = 0, 0 + for i := 0; i < opts.DevCount; i++ { + if err := addSysfsDriTree(sysfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: dev-%d sysfs tree generation failed: %v", i, err) + } + + if err := addDebugfsDriTree(sysfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: dev-%d debugfs tree generation failed: %v", i, err) + } + + if err := addDevfsDriTree(devfsPath, &opts, i); err != nil { + log.Fatalf("ERROR: dev-%d devfs tree generation failed: %v", i, err) + } + } + log.Printf("Done, created %d dirs, %d devices and %d files.", opts.dirs, opts.devs, opts.files) +} + +// getOptions parses options from given JSON file, validates and returns them. 
+func getOptions(name string) genOptions { + if name == "" { + log.Fatal("ERROR: no fake device spec provided") + } + + data, err := os.ReadFile(name) + if err != nil { + log.Fatalf("ERROR: reading JSON spec file '%s' failed: %v", name, err) + } + + if verbose { + log.Printf("Using fake device spec: %v\n", string(data)) + } + + var opts genOptions + if err = json.Unmarshal(data, &opts); err != nil { + log.Fatalf("ERROR: Unmarshaling JSON spec file '%s' failed: %v", name, err) + } + + if opts.DevCount < 1 || opts.DevCount > maxDevs { + log.Fatalf("ERROR: invalid device count: 1 <= %d <= %d", opts.DevCount, maxDevs) + } + + if opts.VfsPerPf > 0 { + if opts.TilesPerDev > 0 || opts.DevsPerNode > 0 { + log.Fatalf("ERROR: SR-IOV VFs (%d) with device tiles (%d) or Numa nodes (%d) is unsupported for faking", + opts.VfsPerPf, opts.TilesPerDev, opts.DevsPerNode) + } + + if opts.DevCount%(opts.VfsPerPf+1) != 0 { + log.Fatalf("ERROR: %d devices cannot be evenly split to between set of 1 SR-IOV PF + %d VFs", + opts.DevCount, opts.VfsPerPf) + } + } + + if opts.DevsPerNode > opts.DevCount { + log.Fatalf("ERROR: DevsPerNode (%d) > DevCount (%d)", opts.DevsPerNode, opts.DevCount) + } + + if opts.DevMemSize%mib != 0 { + log.Fatalf("ERROR: Invalid memory size (%f MiB), not even MiB", float64(opts.DevMemSize)/mib) + } + + return opts +} + +func main() { + var name string + + flag.StringVar(&name, "json", "", "JSON spec for fake device sysfs, debugfs and devfs content") + flag.BoolVar(&verbose, "verbose", false, "More verbose output") + flag.Parse() + + generateDriFiles(getOptions(name)) +} From cc3aebbefcd10fe6d302ce645fb1685d52b8f689 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 24 Aug 2022 15:15:25 +0300 Subject: [PATCH 2/5] Add minimal example JSON to test "gpu_fakedev" generator Config file is suitably indented so that it can be directly appended to a suitable configMap header. 
Signed-off-by: Eero Tamminen --- cmd/gpu_fakedev/configs/8x-DG1.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 cmd/gpu_fakedev/configs/8x-DG1.json diff --git a/cmd/gpu_fakedev/configs/8x-DG1.json b/cmd/gpu_fakedev/configs/8x-DG1.json new file mode 100644 index 000000000..e9840b23f --- /dev/null +++ b/cmd/gpu_fakedev/configs/8x-DG1.json @@ -0,0 +1,8 @@ +{ + "Info": "8x 4 GiB DG1 [Iris Xe MAX Graphics] GPUs", + "DevCount": 8, + "DevMemSize": 4294967296, + "Capabilities": { + "platform": "fake_DG1" + } +} From 55c8aa591f90866a043f16b59d7b915593db0361 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 24 Aug 2022 20:16:09 +0300 Subject: [PATCH 3/5] Add container for "gpu_fakedev" Signed-off-by: Eero Tamminen --- .gitignore | 3 ++- build/docker/templates/intel-gpu-fakedev.Dockerfile.in | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 build/docker/templates/intel-gpu-fakedev.Dockerfile.in diff --git a/.gitignore b/.gitignore index c950689ab..d448e9d05 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ cmd/fpga_crihook/fpga_crihook cmd/dlb_plugin/dlb_plugin cmd/fpga_plugin/fpga_plugin cmd/fpga_tool/fpga_tool +cmd/gpu_fakedev/gpu_fakedev cmd/gpu_nfdhook/gpu_nfdhook cmd/gpu_plugin/gpu_plugin cmd/iaa_plugin/iaa_plugin @@ -32,4 +33,4 @@ _build _work *.tgz -charts/operator/crds \ No newline at end of file +charts/operator/crds diff --git a/build/docker/templates/intel-gpu-fakedev.Dockerfile.in b/build/docker/templates/intel-gpu-fakedev.Dockerfile.in new file mode 100644 index 000000000..600734504 --- /dev/null +++ b/build/docker/templates/intel-gpu-fakedev.Dockerfile.in @@ -0,0 +1,8 @@ +#define _ENTRYPOINT_ /usr/local/bin/intel_gpu_fakedev +ARG CMD=gpu_fakedev + +#include "default_plugin.docker" + +LABEL name='intel-gpu-fakedev' +LABEL summary='Fake device file generator for Intel® GPU plugin' +LABEL description='Fake device file generator provides fake sysfs+devfs content for Intel GPU plugin from its 
initcontainer, for scalability testing' From 9d4b52188e0e3df2d556920d37c2b00cd6bcbb8e Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 24 Aug 2022 20:04:03 +0300 Subject: [PATCH 4/5] Add "gpu_fakedev" documentation Signed-off-by: Eero Tamminen --- cmd/gpu_fakedev/README.md | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 cmd/gpu_fakedev/README.md diff --git a/cmd/gpu_fakedev/README.md b/cmd/gpu_fakedev/README.md new file mode 100644 index 000000000..94938ea41 --- /dev/null +++ b/cmd/gpu_fakedev/README.md @@ -0,0 +1,47 @@ +# Fake (GPU) device file generator + +Table of Contents +* [Introduction](#introduction) +* [Configuration](#configuration) +* [Potential improvements](#potential-improvements) +* [Related tools](#related-tools) + +## Introduction + +This is a tool for generating (large number of) fake device files for +k8s device scheduling scalability testing. But it can also be used +just to test (GPU) device plugin functionality without having +corresponding device HW. + +Its "intel-gpu-fakedev" container is intended to be run as first init +container in a device plugin pod, so that device plugin (and its NFD +labeler) see the fake (sysfs + devfs) files generated by the tool, +instead of real host sysfs and devfs content. + +## Configuration + +[Configs](configs/) subdirectory contains example JSON configuration +file(s) for the generator. Currently there's only one example JSON +file, but each new device variant adding feature(s) that have specific +support in device plugin, could have their own fake device config. + +## Potential improvements + +If support for mixed device environment is needed, tool can be updated +to use node / configuration file mapping. Such mappings could be e.g. +in configuration files themselves as node name include / exclude lists, +and tool would use first configuration file matching the node it's +running on. 
For now, one would need to use different pod / config +specs for different nodes to achieve that... + +Currently JSON config file options and the generated files are tied to +what GPU plugin uses, but if needed, they could be changed to fake +also sysfs + devfs device files used by other plugins. + +## Related tools + +[fakedev-exporter](https://github.com/intel/fakedev-exporter) project +can be used to schedule suitably configured fake workloads on the fake +devices, and to provide fake activity metrics for them to +Prometheus, that look like they were reported by real Prometheus +metric exporters for real workloads running on real devices. From ff5cc41317f872b5fd963fa2be512755455adbb5 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 24 Aug 2022 20:46:20 +0300 Subject: [PATCH 5/5] Add "intel-gpu-fakedev" container to CI Signed-off-by: Eero Tamminen --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f22e1a8f1..8c487f95f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -105,6 +105,7 @@ jobs: image: - intel-fpga-admissionwebhook - intel-fpga-initcontainer + - intel-gpu-fakedev - intel-gpu-initcontainer - intel-gpu-plugin - intel-fpga-plugin