diff --git a/Makefile b/Makefile index 4f3c1b1b..fce53dbe 100644 --- a/Makefile +++ b/Makefile @@ -76,12 +76,12 @@ TARGETS += kata-containers TARGETS += mdadm TARGETS += mei TARGETS += nut-client -TARGETS += nvidia-container-toolkit-lts -TARGETS += nvidia-container-toolkit-production TARGETS += nvidia-fabricmanager-lts TARGETS += nvidia-fabricmanager-production -TARGETS += nvidia-open-gpu-kernel-modules-lts -TARGETS += nvidia-open-gpu-kernel-modules-production +TARGETS += nvidia-driver-lts +TARGETS += nvidia-driver-production +TARGETS += nvidia-driver-proprietary-lts +TARGETS += nvidia-driver-proprietary-production TARGETS += qemu-guest-agent TARGETS += qlogic-firmware TARGETS += realtek-firmware diff --git a/go.work b/go.work index 9b082d97..1fd25fe2 100644 --- a/go.work +++ b/go.work @@ -2,7 +2,6 @@ go 1.22 use ( ./examples/hello-world-service/src - ./nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper - ./nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper + ./nvidia-gpu/nvidia-driver/service ./storage/iscsi-tools/iscsid-wrapper ) diff --git a/nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml b/nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml deleted file mode 100644 index 0571c0a9..00000000 --- a/nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nonfree-kmod-nvidia-lts - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia proprietary kernel modules built against a specific Talos version. 
- compatibility: - talos: - version: ">= v1.5.0" diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/files/nvidia.conf b/nvidia-gpu/nonfree/kmod-nvidia/production/files/nvidia.conf deleted file mode 100644 index 62b5f931..00000000 --- a/nvidia-gpu/nonfree/kmod-nvidia/production/files/nvidia.conf +++ /dev/null @@ -1,4 +0,0 @@ -blacklist nvidia -blacklist nvidia_uvm -blacklist nvidia_drm -blacklist nvidia_modeset diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml b/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml deleted file mode 100644 index f6ecfb81..00000000 --- a/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nonfree-kmod-nvidia-production - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia proprietary kernel modules built against a specific Talos version. - compatibility: - talos: - version: ">= v1.5.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/DEVELOPMENT.md b/nvidia-gpu/nvidia-container-toolkit/DEVELOPMENT.md deleted file mode 100644 index 4971a722..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/DEVELOPMENT.md +++ /dev/null @@ -1,30 +0,0 @@ -# development - -This document is intended as a guide to updating the `nvidia-container-toolkit` dependencies. - -## Components - -### [nvidia-container-cli](./nvidia-container-cli/) - -`nvidia-container-cli` is called by the `nvidia-container-runtime` to setup the required NVIDIA library mounts and NVIDIA device files for a workload container - -### [nvidia-container-runtime](./nvidia-container-runtime/) - -`nvidia-container-runtime` is the runtime used by `containerd` to run workload containers. It's mostly a wrapper around `runc` - -It also ships a tool called `nvidia-container-runtime-hook` which is used to setup OCI hooks. - -### [glibc](./glibc/) - -`nvidia-container-cli` is fully dependent on `glibc` to be able to access the NVIDIA shared objects. 
- -## Updating the nvidia driver version - -- Update the driver version in `pkgs` repo [here](https://github.com/siderolabs/pkgs/blob/master/nonfree/kmod-nvidia/pkg.yaml) -- Update the driver version [here](../vars.yaml) -- Update the version checksums [here](./nvidia-pkgs/pkg.yaml) - -## Updating the nvidia-container-toolkit version - -- Update the `libnvidia-container` version checksums and `REVISION` [here](./nvidia-container-cli/pkg.yaml) -- Update the `container-toolkit` version checksums and `GIT_COMMIT` [here](./nvidia-container-runtime/pkg.yaml) diff --git a/nvidia-gpu/nvidia-container-toolkit/README.md b/nvidia-gpu/nvidia-container-toolkit/README.md deleted file mode 100644 index 57b501f1..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/README.md +++ /dev/null @@ -1,91 +0,0 @@ -# NVIDIA Container toolkit extension - - -## Installation - -See [Installing Extensions](https://github.com/siderolabs/extensions#installing-extensions). - - -## Usage - -The following NVIDIA modules needs to be loaded, so add this to the talos config: - -```yaml -machine: - kernel: - modules: - - name: nvidia - - name: nvidia_uvm - - name: nvidia_drm - - name: nvidia_modeset -``` - -`nvidia-container-cli` loads BPF programs and requires relaxed KSPP setting for [bpf_jit_harden](https://sysctl-explorer.net/net/core/bpf_jit_harden/), so Talos default setting -should be overridden: - -```yaml -machine: - sysctls: - net.core.bpf_jit_harden: 1 -``` - -> Warning! This disables [KSPP best practices](https://kernsec.org/wiki/index.php/Kernel_Self_Protection_Project/Recommended_Settings#sysctls) setting. 
- -## Testing - -Apply the following manifest to create a runtime class that uses the extension: - -```yaml ---- -apiVersion: node.k8s.io/v1 -kind: RuntimeClass -metadata: - name: nvidia -handler: nvidia -``` - -Install the NVIDIA device plugin: - -```bash -helm repo add nvdp https://nvidia.github.io/k8s-device-plugin -helm repo update -helm install nvidia-device-plugin nvdp/nvidia-device-plugin --version=0.14.1 --set=runtimeClassName=nvidia -``` - -Apply the following manifest to run CUDA pod via nvidia runtime: - -```yaml ---- -apiVersion: v1 -kind: Pod -metadata: - name: gpu-operator-test -spec: - restartPolicy: OnFailure - runtimeClassName: nvidia - containers: - - name: cuda-vector-add - image: "nvidia/samples:vectoradd-cuda11.6.0" - resources: - limits: - nvidia.com/gpu: 1 -``` - - -The status can be viewed by running: - -```bash -❯ kubectl get pods -NAME READY STATUS RESTARTS AGE -gpu-operator-test 0/1 Completed 0 13s -``` - -```bash -❯ kubectl logs gpu-operator-test -[Vector addition of 50000 elements] -Copy input data from the host memory to the CUDA device -CUDA kernel launch with 196 blocks of 256 threads -Copy output data from the CUDA device to the host memory -Test PASSED -Done -``` diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml deleted file mode 100644 index a93c078a..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nvidia-container-toolkit-lts - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia runtime and it's dependencies using NVIDIA's runtime handler. 
- compatibility: - talos: - version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml deleted file mode 100644 index 38963327..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# the first part is the driver version and the second the container-toolkit version -VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/elfutils/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/elfutils/pkg.yaml deleted file mode 100644 index 52aeb8f8..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/elfutils/pkg.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: elfutils -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - m4 -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} - - stage: zlib - from: /rootfs -steps: - - sources: - - url: https://src.fedoraproject.org/lookaside/extras/elfutils/elfutils-{{ .ELFUTILS_VERSION }}.tar.bz2/sha512/e22d85f25317a79b36d370347e50284c9120c86f9830f08791b7b6a7b4ad89b9bf4c7c71129133b8d193a0edffb2a2c17987b7e48428b9670aff5ce918777e04/elfutils-{{ .ELFUTILS_VERSION }}.tar.bz2 - destination: elfutils.tar.bz2 - sha256: df76db71366d1d708365fc7a6c60ca48398f14367eb2b8954efc8897147ad871 - sha512: e22d85f25317a79b36d370347e50284c9120c86f9830f08791b7b6a7b4ad89b9bf4c7c71129133b8d193a0edffb2a2c17987b7e48428b9670aff5ce918777e04 - prepare: - - | - tar -xjf elfutils.tar.bz2 --strip-components=1 - - mkdir build - cd build - - export CFLAGS="${CFLAGS} -I/usr/local/glibc/include/ -L/usr/local/glibc/lib" - - ../configure \ - --prefix=/usr/local/glibc \ - --with-zstd=no \ - --disable-libdebuginfod \ - --disable-debuginfod \ - CFLAGS="${CFLAGS} -fPIC -Wno-error" - build: - - | - cd build - - make -j $(nproc) - install: - - | - cd build - make DESTDIR=/rootfs install - # we only need 
the libs and headers, remove everything else - find /rootfs/usr/local/ -type d \( -name bin -o -name sbin -o -name share \) -prune -exec rm -rf {} \; -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libcap2/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libcap2/pkg.yaml deleted file mode 100644 index efe1a6a1..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libcap2/pkg.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: libcap -variant: scratch -shell: /bin/bash -install: - - build-base - - bash -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} -steps: - - sources: - - url: https://kernel.org/pub/linux/libs/security/linux-privs/libcap2/libcap-{{ .LIBCAP_VERSION }}.tar.xz - destination: libcap.tar.xz - sha256: 23a6ef8aadaf1e3e875f633bb2d116cfef8952dba7bc7c569b13458e1952b30f - sha512: 4e0bf0efeccb654c409afe9727b2b53c1d4da8190d7a0a9848fc52550ff3e13502add3eacde04a68a5b7bec09e91df487f64c5746ba987f873236a9e53b3d4e8 - prepare: - - | - tar -xf libcap.tar.xz --strip-components=1 - build: - - | - make prefix=/usr/local/glibc lib=lib -j $(nproc) - install: - - | - make DESTDIR=/rootfs prefix=/usr/local lib=lib install - # we only need the libs and headers, remove everything else - find /rootfs/usr/local/ -type d \( -name bin -o -name sbin -o -name share \) -prune -exec rm -rf {} \; -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libseccomp/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libseccomp/pkg.yaml deleted file mode 100644 index e93f7b51..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libseccomp/pkg.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: libseccomp -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - gperf -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} -steps: - - sources: - - 
url: https://github.com/seccomp/libseccomp/releases/download/v{{ .LIBSECCOMP_VERSION }}/libseccomp-{{ .LIBSECCOMP_VERSION }}.tar.gz - destination: libseccomp.tar.gz - sha256: 248a2c8a4d9b9858aa6baf52712c34afefcf9c9e94b76dce02c1c9aa25fb3375 - sha512: f630e7a7e53a21b7ccb4d3e7b37616b89aeceba916677c8e3032830411d77a14c2d74dcf594cd193b1acc11f52595072e28316dc44300e54083d5d7b314a38da - prepare: - - | - tar -xzf libseccomp.tar.gz --strip-components=1 - - mkdir build - cd build - - ../configure \ - --prefix=/usr/local/glibc - build: - - | - cd build - make -j $(nproc) - install: - - | - cd build - make install DESTDIR=/rootfs - # we only need the libs and headers, remove everything else - find /rootfs/usr/local/ -type d \( -name bin -o -name sbin -o -name share \) -prune -exec rm -rf {} \; -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libtirpc/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libtirpc/pkg.yaml deleted file mode 100644 index 8d5a7aa1..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/libtirpc/pkg.yaml +++ /dev/null @@ -1,35 +0,0 @@ - -name: libtirpc -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - autoconf -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} -steps: - - sources: - - url: https://src.fedoraproject.org/lookaside/extras/libtirpc/libtirpc-{{ .LIBTIRPC_VERSION | replace "-" "." }}.tar.bz2/sha512/df0781a74ff9ded2d3c4f5eb7e05496b9f58eac8060c02c68331dc14c4a00304dcd19f46836f5756fe0d9d27095fd463d42dd696fcdff891516711b7d63deabe/libtirpc-{{ .LIBTIRPC_VERSION | replace "-" "." 
}}.tar.bz2 - destination: libtirpc.tar.bz2 - sha256: 6474e98851d9f6f33871957ddee9714fdcd9d8a5ee9abb5a98d63ea2e60e12f3 - sha512: df0781a74ff9ded2d3c4f5eb7e05496b9f58eac8060c02c68331dc14c4a00304dcd19f46836f5756fe0d9d27095fd463d42dd696fcdff891516711b7d63deabe - prepare: - - | - tar -xf libtirpc.tar.bz2 --strip-components=1 - - ./configure \ - --prefix=/usr/local/glibc \ - --disable-gssapi - build: - - | - make -j $(nproc) - install: - - | - mkdir -p /rootfs - - make install DESTDIR=/rootfs - rm -rf /rootfs/usr/local/share/man -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml deleted file mode 100644 index 874f74f7..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: nvidia-container-cli-lts -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - go - - coreutils - - sed - - curl - - rpcsvc-proto - - patch -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} - # nvidia-pkgs depends on glibc, - # so any stage depending on nvidia-container-cli will have the updated ld.so.cache, - # from both nvidia-pkgs and nvidia-container-cli - - stage: nvidia-pkgs-lts - - stage: libseccomp - from: /rootfs - - stage: libcap - from: /rootfs - - stage: elfutils - from: /rootfs - - stage: zlib - from: /rootfs - - stage: libtirpc - from: /rootfs -steps: - - sources: - - url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz - destination: libnvidia-container.tar.gz - sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff - sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548 - env: - SOURCE_DATE_EPOCH: {{ .BUILD_ARG_SOURCE_DATE_EPOCH }} - REVISION: {{ .LIBNVIDIA_CONTAINER_REF 
}} - LIB_VERSION: {{ .LIBNVIDIA_CONTAINER_VERSION | replace "v" "" }} - WITH_NVCGO: yes - WITH_LIBELF: yes - WITH_TIRPC: no # setting no means we'll use the system libtirpc - WITH_SECCOMP: yes - PKG_CONFIG_PATH: /usr/local/glibc/lib/pkgconfig # to find runtime libraries compiled in extensions (libseccomp) - PATH: "/usr/bin:{{ .PATH }}" # bldr doesn't have /usr/bin in PATH - prepare: - - | - mkdir libnvidia-container - tar -xzf libnvidia-container.tar.gz --strip-components=1 -C libnvidia-container - build: - - | - cd libnvidia-container - - # LDLIBS=-L/usr/local/glibc/lib is set so that libnvidia-container-cli libs which are hardcoded as -llibname and not using pkg-config - CPPFLAGS="-I/usr/local/glibc/include/tirpc" LDLIBS="-L/usr/local/glibc/lib -ltirpc -lelf -lseccomp" LDFLAGS='-Wl,--rpath=\$$ORIGIN/../glibc/\$$LIB' make - install: - - | - mkdir -p /rootfs - - cd libnvidia-container - - make install DESTDIR=/rootfs - - # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml deleted file mode 100644 index 347fa269..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: nvidia-container-cli-production -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - go - - coreutils - - sed - - curl - - rpcsvc-proto - - patch -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} - # nvidia-pkgs depends on glibc, - # so any stage depending on nvidia-container-cli will have the updated ld.so.cache, - # from both nvidia-pkgs and nvidia-container-cli - - stage: nvidia-pkgs-production - - stage: libseccomp - from: /rootfs - - stage: libcap - from: /rootfs - - stage: elfutils - from: /rootfs - - stage: zlib - from: /rootfs - - 
stage: libtirpc - from: /rootfs -steps: - - sources: - - url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz - destination: libnvidia-container.tar.gz - sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff - sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548 - env: - SOURCE_DATE_EPOCH: {{ .BUILD_ARG_SOURCE_DATE_EPOCH }} - REVISION: {{ .LIBNVIDIA_CONTAINER_REF }} - LIB_VERSION: {{ .LIBNVIDIA_CONTAINER_VERSION | replace "v" "" }} - WITH_NVCGO: yes - WITH_LIBELF: yes - WITH_TIRPC: no # setting no means we'll use the system libtirpc - WITH_SECCOMP: yes - PKG_CONFIG_PATH: /usr/local/glibc/lib/pkgconfig # to find runtime libraries compiled in extensions (libseccomp) - PATH: "/usr/bin:{{ .PATH }}" # bldr doesn't have /usr/bin in PATH - prepare: - - | - mkdir libnvidia-container - tar -xzf libnvidia-container.tar.gz --strip-components=1 -C libnvidia-container - build: - - | - cd libnvidia-container - - # LDLIBS=-L/usr/local/glibc/lib is set so that libnvidia-container-cli libs which are hardcoded as -llibname and not using pkg-config - CPPFLAGS="-I/usr/local/glibc/include/tirpc" LDLIBS="-L/usr/local/glibc/lib -ltirpc -lelf -lseccomp" LDFLAGS='-Wl,--rpath=\$$ORIGIN/../glibc/\$$LIB' make - install: - - | - mkdir -p /rootfs - - cd libnvidia-container - - make install DESTDIR=/rootfs - - # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/zlib/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/zlib/pkg.yaml deleted file mode 100644 index f500d4f7..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/zlib/pkg.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: zlib -variant: scratch -shell: /bin/bash -install: - - build-base - - bash 
-dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} -steps: - - sources: - - url: https://zlib.net/fossils/zlib-{{ .ZLIB_VERSION }}.tar.gz - destination: zlib.tar.gz - sha256: 9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 - sha512: 580677aad97093829090d4b605ac81c50327e74a6c2de0b85dd2e8525553f3ddde17556ea46f8f007f89e435493c9a20bc997d1ef1c1c2c23274528e3c46b94f - prepare: - - | - tar -xf zlib.tar.gz --strip-components=1 - mkdir build - cd build - - ../configure \ - --prefix=/usr/local/glibc - build: - - | - cd build - make -j $(nproc) - install: - - | - cd build - make DESTDIR=/rootfs install - # we only need the libs and headers, remove everything else - find /rootfs/usr/local/glibc -type d \( -name bin -o -name sbin -o -name share \) -prune -exec rm -rf {} \; -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/go.mod b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/go.mod deleted file mode 100644 index 3e5427ec..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module nvidia-container-runtime-wrapper - -go 1.22 - -require golang.org/x/sys v0.24.0 diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/main.go b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/main.go deleted file mode 100644 index e4ba2177..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/main.go +++ /dev/null @@ -1,54 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -package main - -import ( - "log" - "os" - "path/filepath" - - "golang.org/x/sys/unix" -) - -func main() { - cmdName := filepath.Base(os.Args[0]) - switch cmdName { - case - "nvidia-container-runtime", - "nvidia-container-runtime-hook", - "nvidia-container-runtime.cdi", - "nvidia-container-runtime.legacy", - "nvidia-container-toolkit", - "nvidia-ctk": - execCommand(cmdName, os.Args[1:]) - default: - log.Fatalf("nvidia-container-runtime-wrapper: unknown command %s\n", cmdName) - } -} - -func execCommand(cmdName string, args []string) { - environ := os.Environ() - environ = append(environ, "XDG_CONFIG_HOME=/usr/local/etc") - - realCmdName := cmdName + ".real" - - cmdArgs := []string{realCmdName} - - if cmdName == "nvidia-container-runtime-hook" { - cmdArgs = append( - cmdArgs, - "-config", - "/usr/local/etc/nvidia-container-runtime/config.toml", - ) - } - - cmdArgs = append(cmdArgs, args...) - - cmdFullPath := filepath.Join("/usr/local/bin", realCmdName) - - if err := unix.Exec(cmdFullPath, cmdArgs, environ); err != nil { - log.Fatalf("nvidia-container-runtime-wrapper: error execing %s %v\n", cmdFullPath, err) - } -} diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/pkg.yaml deleted file mode 100644 index 0eed9f78..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/pkg.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: nvidia-container-runtime-wrapper -variant: scratch -shell: /toolchain/bin/bash -dependencies: - - stage: base -steps: - - cachePaths: - - /.cache/go-build - - /go/pkg - build: - - | - export PATH=${PATH}:${TOOLCHAIN}/go/bin - - cp -r /pkg/* . 
- - CGO_ENABLED=0 go build -o nvidia-container-runtime-wrapper main.go - install: - - | - mkdir -p /rootfs/usr/local/bin - - cp nvidia-container-runtime-wrapper /rootfs/usr/local/bin/nvidia-container-runtime-wrapper -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.part b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.part deleted file mode 100644 index 8e04a27f..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.part +++ /dev/null @@ -1,7 +0,0 @@ -[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia] - privileged_without_host_devices = false - runtime_engine = "" - runtime_root = "" - runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia.options] - BinaryName = "/usr/local/bin/nvidia-container-runtime" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.toml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.toml deleted file mode 100644 index 91087fbc..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/nvidia-container-runtime.toml +++ /dev/null @@ -1,15 +0,0 @@ -disable-require = false - -[nvidia-container-cli] -#root = "/run/nvidia/driver" -#path = "/usr/bin/nvidia-container-cli" -environment = [] -debug = "/var/log/nvidia-container-cli.log" -ldcache = "/usr/local/glibc/etc/ld.so.cache" -ldconfig = "@/usr/local/glibc/sbin/ldconfig" -load-kmods = false -user = "0:0" -#no-cgroups = false - -[nvidia-container-runtime] -debug = "/var/log/nvidia-container-runtime.log" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/patches/ldcache_path.patch b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/patches/ldcache_path.patch deleted file mode 100644 index c3730b61..00000000 --- 
a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/patches/ldcache_path.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git internal/ldcache/ldcache.go internal/ldcache/ldcache.go -index 5493dc3..9c86e41 100644 ---- internal/ldcache/ldcache.go -+++ internal/ldcache/ldcache.go -@@ -32,7 +32,7 @@ import ( - log "github.com/sirupsen/logrus" - ) - --const ldcachePath = "/etc/ld.so.cache" -+const ldcachePath = "/usr/local/glibc/etc/ld.so.cache" - - const ( - magicString1 = "ld.so-1.7.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml deleted file mode 100644 index 325a6572..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml +++ /dev/null @@ -1,60 +0,0 @@ -name: nvidia-container-runtime -variant: scratch -shell: /bin/bash -install: - - build-base - - bash - - go - - patch -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} -steps: - - sources: - - url: https://github.com/NVIDIA/nvidia-container-toolkit/archive/refs/tags/{{ .CONTAINER_TOOLKIT_VERSION }}.tar.gz - destination: container-toolkit.tar.gz - sha256: 38a193444e0342c0a2c0d3664403e2c341eb77f1461b3f9172fd93c04de82165 - sha512: 691d4fc47ea60b730ec491b333aa8118bcfd62cdab20a42b84155c6a13484d920e758435b5029bbae4fbefce82352aa5764f1554992682f689c95615809fb83c - env: - GIT_COMMIT: {{ substr 0 7 .CONTAINER_TOOLKIT_REF }} # build is using short sha - prepare: - - | - mkdir -p container-toolkit - tar -xzf container-toolkit.tar.gz --strip-components=1 -C container-toolkit - - cd container-toolkit - patch -p0 < /pkg/patches/ldcache_path.patch - build: - - | - cd container-toolkit - - make cmds - install: - - | - mkdir -p /rootfs/usr/local/bin - - cd container-toolkit - - # let's copy all built binaries suffixing them with .real - # the wrapper binary will call the real binary with the same name - for file in $(find . 
-maxdepth 1 -type f -executable); do - clean_file=$(basename $file) - - # oci-nvidia-hook is a shell script calling nvidia-container-runtime-hook - if [[ $clean_file == "oci-nvidia-hook" ]]; then - ln -sv nvidia-container-runtime-hook /rootfs/usr/local/bin/$clean_file - - continue - fi - - ln -sv nvidia-container-runtime-wrapper /rootfs/usr/local/bin/$(basename $clean_file) - cp $clean_file /rootfs/usr/local/bin/$(basename $clean_file).real - done - - | - mkdir -p /rootfs/etc/cri/conf.d - cp /pkg/nvidia-container-runtime.part /rootfs/etc/cri/conf.d/nvidia-container-runtime.part - - mkdir -p /rootfs/usr/local/etc/nvidia-container-runtime - cp /pkg/nvidia-container-runtime.toml /rootfs/usr/local/etc/nvidia-container-runtime/config.toml -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.sum b/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.sum deleted file mode 100644 index d88e7bd7..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.sum +++ /dev/null @@ -1,2 +0,0 @@ -golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= -golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/pkg.yaml deleted file mode 100644 index dde8134e..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/pkg.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: nvidia-persistenced-wrapper -variant: scratch -shell: /toolchain/bin/bash -dependencies: - - stage: base -steps: - - cachePaths: - - /.cache/go-build - - /go/pkg - build: - - | - export PATH=${PATH}:${TOOLCHAIN}/go/bin - - cp -r /pkg/* . 
- - CGO_ENABLED=0 go build -o nvidia-persistenced-wrapper main.go - install: - - | - mkdir -p /rootfs/usr/local/bin - - cp nvidia-persistenced-wrapper /rootfs/usr/local/bin/nvidia-persistenced-wrapper -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/files/15-nvidia-device.rules b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/files/15-nvidia-device.rules deleted file mode 100644 index 9277b3cb..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/files/15-nvidia-device.rules +++ /dev/null @@ -1,5 +0,0 @@ -# This will create the device nvidia device nodes -ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/local/bin/nvidia-modprobe -c 0" - -# Create the device node for the nvidia-uvm module -ACTION=="add", DEVPATH=="/module/nvidia_uvm", SUBSYSTEM=="module", RUN+="/usr/local/bin/nvidia-modprobe -c 0 -u" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml deleted file mode 100644 index 7568662a..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: nvidia-pkgs-lts -variant: scratch -shell: /bin/bash -install: - - bash -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} - # depends on glibc to update ld.so.cache - # so any stage depending on nvidia-pkgs will have the updated cache - - stage: glibc -steps: - - sources: - # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-sbsa/nvidia_driver-linux-sbsa-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz - destination: nvidia.tar.xz - sha256: 970be3ae71332ca008f3e6589ae44a70aeffb9e29382980114e47b8fce7790d1 - sha512: 
bd730a51a77d897509381ecb22eb21a9f4e0c2419288f1c1c26f8ef00e887b1cc09718d1d4c9d613912560e48185ff03ea221865be5c0e590a20868c45a8ea00 - # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-x86_64/nvidia_driver-linux-x86_64-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz - destination: nvidia.tar.xz - sha256: e66527c5c016d0bee9050a7a8573e38be86aad58adee2f40e808c88a4d0c6e90 - sha512: 71624903e9d57a3f8a5dc7ffb2435991fe787b0609096e0e146d03ffef54bdb145940e8717510aa87cd6407c860e22938c840c126db7d4469c265f202db35e18 - # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - prepare: - - | - # the nvidia installer validates these packages are installed - ln -s /bin/true /bin/modprobe - ln -s /bin/true /bin/rmmod - ln -s /bin/true /bin/lsmod - ln -s /bin/true /bin/depmod - - tar xf nvidia.tar.xz --strip-components=1 - install: - - | - mkdir -p assets/{html,libglvnd_install_checker} - - cp -r bin/* assets/ - cp CHANGELOG assets/NVIDIA_Changelog - cp -r docs/* assets/html/ - cp -r etc/* assets/ - cp -r firmware assets/ - cp -r lib/* assets/ - cp LICENSE assets/ - cp -r man/man1/* assets/ - cp MANIFEST assets/.manifest - cp README assets/README.txt - cp -r sbin/* assets/ - cp -r share/* assets/ - cp -r supported-gpus assets/ - cp -r systemd assets/ - cp -r tests/glvnd/* assets/libglvnd_install_checker - # {{ if eq .ARCH "x86_64" }}cp -r wine/* assets/{{ end }} - - cd assets - - ./nvidia-installer --silent \ - --opengl-prefix=/rootfs/usr/local \ - --utility-prefix=/rootfs/usr/local \ - --utility-libdir=glibc/lib \ - --documentation-prefix=/rootfs/usr/local \ - --no-rpms \ - --no-kernel-modules \ - --log-file-name=/tmp/nvidia-installer.log \ - --no-distro-scripts \ - --no-wine-files \ - --no-kernel-module-source \ - --no-check-for-alternate-installs \ - 
--override-file-type-destination=NVIDIA_MODPROBE:/rootfs/usr/local/bin \ - --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_LTS_VERSION }} \ - --no-systemd \ - # {{ if eq .ARCH "x86_64" }}--no-install-compat32-libs{{ end }} - - # copy vulkan/OpenGL json files - mkdir -p /rootfs/{etc/vulkan,usr/share/{glvnd,egl}} - - cp -r /usr/share/glvnd/* /rootfs/usr/share/glvnd - cp -r /usr/share/egl/* /rootfs/usr/share/egl - cp -r /etc/vulkan/* /rootfs/etc/vulkan - - # mv over files from /usr/local/lib -> /usr/local/glibc/lib - mv /rootfs/usr/local/lib/* /rootfs/usr/local/glibc/lib/ - - # copy xorg files - mkdir -p /rootfs/usr/local/glibc/lib/nvidia/xorg - find /usr/lib/xorg/modules -type f -exec cp {} /rootfs/usr/local/glibc/lib/nvidia/xorg \; - - # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs - - mkdir -p /rootfs/usr/local/lib/containers/nvidia-persistenced \ - /rootfs/usr/local/etc/containers \ - /rootfs/usr/etc/udev/rules.d - - # copy udev rule - cp /pkg/files/15-nvidia-device.rules /rootfs/usr/etc/udev/rules.d -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules deleted file mode 100644 index 9277b3cb..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules +++ /dev/null @@ -1,5 +0,0 @@ -# This will create the device nvidia device nodes -ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/local/bin/nvidia-modprobe -c 0" - -# Create the device node for the nvidia-uvm module -ACTION=="add", DEVPATH=="/module/nvidia_uvm", SUBSYSTEM=="module", RUN+="/usr/local/bin/nvidia-modprobe -c 0 -u" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml deleted file mode 100644 index 
b9b43927..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml +++ /dev/null @@ -1,96 +0,0 @@ -name: nvidia-pkgs-production -variant: scratch -shell: /bin/bash -install: - - bash -dependencies: - - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} - # depends on glibc to update ld.so.cache - # so any stage depending on nvidia-pkgs will have the updated cache - - stage: glibc -steps: - - sources: - # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-sbsa/nvidia_driver-linux-sbsa-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz - destination: nvidia.tar.xz - sha256: dd2892ac0c97abe69dd9ccb5e09d2fd5b5ce010c64ce5eb0950a0f6fceb9b4dc - sha512: 9c1466d9ea09a01dda4de0a2b3270cc6a5093636554eadfb58c3e2957e053592f7d628c3d5b31dbb36702e187561cb7f955e9bf2ddb1adb28e7ca4568d39a0f0 - # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-x86_64/nvidia_driver-linux-x86_64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz - destination: nvidia.tar.xz - sha256: 7959e9e0e15863c9242f8a0bda0b3b67b39701956890ff159961f59e89f92158 - sha512: 89a4249bce2c15af56911afa6998c355d6522e2e7493e80ed9241a9d5009ccf2522bf7bceffc03673600bbfd0d89f3a46a3c21fb0f4977e6dc674648b4c6caea - # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - prepare: - - | - # the nvidia installer validates these packages are installed - ln -s /bin/true /bin/modprobe - ln -s /bin/true /bin/rmmod - ln -s /bin/true /bin/lsmod - ln -s /bin/true /bin/depmod - - tar xf nvidia.tar.xz --strip-components=1 - install: - - | - mkdir -p assets/html - - cp -r bin/* assets/ - cp CHANGELOG assets/NVIDIA_Changelog - cp -r docs/* assets/html/ - cp -r etc/* assets/ - cp 
-r firmware assets/ - cp -r lib/* assets/ - cp LICENSE assets/ - cp -r man/man1/* assets/ - cp MANIFEST assets/.manifest - cp README assets/README.txt - cp -r sbin/* assets/ - cp -r share/* assets/ - cp -r supported-gpus assets/ - cp -r systemd assets/ - # {{ if eq .ARCH "x86_64" }}cp -r wine/* assets/{{ end }} - - cd assets - - ./nvidia-installer --silent \ - --opengl-prefix=/rootfs/usr/local \ - --utility-prefix=/rootfs/usr/local \ - --utility-libdir=glibc/lib \ - --documentation-prefix=/rootfs/usr/local \ - --no-rpms \ - --no-kernel-modules \ - --log-file-name=/tmp/nvidia-installer.log \ - --no-distro-scripts \ - --no-wine-files \ - --no-kernel-module-source \ - --no-check-for-alternate-installs \ - --override-file-type-destination=NVIDIA_MODPROBE:/rootfs/usr/local/bin \ - --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }} \ - --no-systemd - - # copy vulkan/OpenGL json files - mkdir -p /rootfs/{etc/vulkan,usr/share/{glvnd,egl}} - - cp -r /usr/share/glvnd/* /rootfs/usr/share/glvnd - cp -r /usr/share/egl/* /rootfs/usr/share/egl - cp -r /etc/vulkan/* /rootfs/etc/vulkan - - # mv over files from /usr/local/lib -> /usr/local/glibc/lib - mv /rootfs/usr/local/lib/* /rootfs/usr/local/glibc/lib/ - - # copy xorg files - mkdir -p /rootfs/usr/local/glibc/lib/nvidia/xorg - find /usr/lib/xorg/modules -type f -exec cp {} /rootfs/usr/local/glibc/lib/nvidia/xorg \; - - # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs - - mkdir -p /rootfs/usr/local/lib/containers/nvidia-persistenced \ - /rootfs/usr/local/etc/containers \ - /rootfs/usr/etc/udev/rules.d - - # copy udev rule - cp /pkg/files/15-nvidia-device.rules /rootfs/usr/etc/udev/rules.d -finalize: - - from: /rootfs - to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml b/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml deleted file mode 100644 index 49fc70ab..00000000 --- 
a/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nvidia-container-toolkit-production - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia runtime and it's dependencies using NVIDIA's runtime handler. - compatibility: - talos: - version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml b/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml deleted file mode 100644 index 58e7f164..00000000 --- a/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# the first part is the driver version and the second the container-toolkit version -VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}" diff --git a/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/lts/pkg.yaml b/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/lts/pkg.yaml new file mode 100644 index 00000000..5cc36e4b --- /dev/null +++ b/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/lts/pkg.yaml @@ -0,0 +1,16 @@ +name: nvidia-proprietary-gpu-kernel-modules-lts +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base + # The pkgs version for a particular release of Talos as defined in + # https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-lts-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - install: + - | + mkdir -p /rootfs/lib/modules + cp -R /lib/modules/* /rootfs/lib/modules +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/production/pkg.yaml b/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/production/pkg.yaml new file mode 100644 index 00000000..da7e910f --- /dev/null +++ b/nvidia-gpu/nvidia-driver-proprietary/kernel-modules/production/pkg.yaml @@ -0,0 +1,16 @@ +name: nvidia-proprietary-gpu-kernel-modules-production 
+variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base + # The pkgs version for a particular release of Talos as defined in + # https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-production-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - install: + - | + mkdir -p /rootfs/lib/modules + cp -R /lib/modules/* /rootfs/lib/modules +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-driver-proprietary/lts/manifest.yaml b/nvidia-gpu/nvidia-driver-proprietary/lts/manifest.yaml new file mode 100644 index 00000000..dd18a10c --- /dev/null +++ b/nvidia-gpu/nvidia-driver-proprietary/lts/manifest.yaml @@ -0,0 +1,12 @@ +version: v1alpha1 +metadata: + name: nvidia-driver-proprietary-lts + version: "$VERSION" + author: Jean-Francois Roy + description: | + This system extension provides the NVIDIA proprietary GPU kernel modules and matching userspace + drivers, LTS version. It depends on the glibc extension. NVIDIA recommends using the open-source + GPU kernel modules on supported architectures. 
+ compatibility: + talos: + version: ">= v1.9.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-driver-proprietary/lts/nvidia-driver.yaml similarity index 70% rename from nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml rename to nvidia-gpu/nvidia-driver-proprietary/lts/nvidia-driver.yaml index ffd99967..d88fec6d 100644 --- a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml +++ b/nvidia-gpu/nvidia-driver-proprietary/lts/nvidia-driver.yaml @@ -1,7 +1,6 @@ -# https://download.nvidia.com/XFree86/Linux-x86_64/515.65.01/README/nvidia-persistenced.html -name: nvidia-persistenced +name: nvidia-driver container: - entrypoint: /usr/local/bin/nvidia-persistenced-wrapper + entrypoint: /nvidia-driver mounts: # device files - source: /dev @@ -25,13 +24,21 @@ container: options: - bind - ro - # nvidia libraries - - source: /usr/local/lib - destination: /usr/local/lib + # nvidia driver source + - source: /usr/local/glibc + destination: /usr/local/glibc type: bind options: - bind - ro + # nvidia driver destination + - source: /run/nvidia + destination: /run/nvidia + type: bind + options: + - rshared + - rbind + - rw # service state file - source: /var/run destination: /var/run @@ -40,13 +47,6 @@ container: - rshared - rbind - rw - # binaries - - source: /usr/local/bin - destination: /usr/local/bin - type: bind - options: - - bind - - ro depends: - service: cri # we need to depend on udevd so that the nvidia device files are created diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml b/nvidia-gpu/nvidia-driver-proprietary/lts/pkg.yaml similarity index 61% rename from nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml rename to nvidia-gpu/nvidia-driver-proprietary/lts/pkg.yaml index 4215aa5f..f01dfae6 100644 --- a/nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml +++ b/nvidia-gpu/nvidia-driver-proprietary/lts/pkg.yaml @@ -1,17 +1,12 @@ -name: nvidia-container-toolkit-lts +name: 
nvidia-driver-proprietary-lts variant: scratch shell: /toolchain/bin/bash dependencies: - stage: base - - stage: nvidia-container-cli-lts - - stage: elfutils - - stage: zlib - - stage: libcap - - stage: libseccomp - - stage: libtirpc - - stage: nvidia-container-runtime - - stage: nvidia-container-runtime-wrapper - - stage: nvidia-persistenced-wrapper + - stage: nvidia-driver-common + - stage: nvidia-driver-service + - stage: nvidia-driver-userspace-lts + - stage: nvidia-proprietary-gpu-kernel-modules-lts steps: - prepare: - | @@ -19,8 +14,7 @@ steps: install: - | mkdir -p /rootfs/usr/local/etc/containers - - cp /pkg/nvidia-persistenced.yaml /rootfs/usr/local/etc/containers/nvidia-persistenced.yaml + cp /pkg/nvidia-driver.yaml /rootfs/usr/local/etc/containers/ test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/nonfree/kmod-nvidia/lts/vars.yaml b/nvidia-gpu/nvidia-driver-proprietary/lts/vars.yaml similarity index 100% rename from nvidia-gpu/nonfree/kmod-nvidia/lts/vars.yaml rename to nvidia-gpu/nvidia-driver-proprietary/lts/vars.yaml diff --git a/nvidia-gpu/nvidia-driver-proprietary/production/manifest.yaml b/nvidia-gpu/nvidia-driver-proprietary/production/manifest.yaml new file mode 100644 index 00000000..a846102e --- /dev/null +++ b/nvidia-gpu/nvidia-driver-proprietary/production/manifest.yaml @@ -0,0 +1,12 @@ +version: v1alpha1 +metadata: + name: nvidia-driver-proprietary-production + version: "$VERSION" + author: Jean-Francois Roy + description: | + This system extension provides the NVIDIA proprietary GPU kernel modules and matching userspace + drivers, production version. It depends on the glibc extension. NVIDIA recommends using the + open-source GPU kernel modules on supported architectures. 
+ compatibility: + talos: + version: ">= v1.9.0" diff --git a/nvidia-gpu/nvidia-driver-proprietary/production/nvidia-driver.yaml b/nvidia-gpu/nvidia-driver-proprietary/production/nvidia-driver.yaml new file mode 100644 index 00000000..c1421693 --- /dev/null +++ b/nvidia-gpu/nvidia-driver-proprietary/production/nvidia-driver.yaml @@ -0,0 +1,50 @@ +name: nvidia-driver +container: + entrypoint: /nvidia-driver + mounts: + # device files + - source: /dev + destination: /dev + type: bind + options: + - rshared + - rbind + - rw + # glibc dynamic linker + - source: /lib64 + destination: /lib64 + type: bind + options: + - bind + - ro + # glibc root, which includes the nvidia driver userspace components + - source: /usr/local/glibc + destination: /usr/local/glibc + type: bind + options: + - bind + - ro + # nvidia driver destination + - source: /run/nvidia + destination: /run/nvidia + type: bind + options: + - rshared + - rbind + - rw + # service state file + - source: /var/run + destination: /var/run + type: bind + options: + - rshared + - rbind + - rw + security: + rootfsPropagation: shared +depends: + - service: cri + # we need to depend on udevd so that the nvidia device files are created + - service: udevd + - path: /sys/bus/pci/drivers/nvidia +restart: always diff --git a/nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml b/nvidia-gpu/nvidia-driver-proprietary/production/pkg.yaml similarity index 60% rename from nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml rename to nvidia-gpu/nvidia-driver-proprietary/production/pkg.yaml index 022f4d14..27536cc6 100644 --- a/nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml +++ b/nvidia-gpu/nvidia-driver-proprietary/production/pkg.yaml @@ -1,17 +1,12 @@ -name: nvidia-container-toolkit-production +name: nvidia-driver-proprietary-production variant: scratch shell: /toolchain/bin/bash dependencies: - stage: base - - stage: nvidia-container-cli-production - - stage: elfutils - - stage: zlib - - stage: libcap - - 
stage: libseccomp - - stage: libtirpc - - stage: nvidia-container-runtime - - stage: nvidia-container-runtime-wrapper - - stage: nvidia-persistenced-wrapper + - stage: nvidia-driver-common + - stage: nvidia-driver-service + - stage: nvidia-driver-userspace-production + - stage: nvidia-proprietary-gpu-kernel-modules-production steps: - prepare: - | @@ -19,7 +14,7 @@ steps: install: - | mkdir -p /rootfs/usr/local/etc/containers - cp /pkg/nvidia-persistenced.yaml /rootfs/usr/local/etc/containers/nvidia-persistenced.yaml + cp /pkg/nvidia-driver.yaml /rootfs/usr/local/etc/containers/ test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml b/nvidia-gpu/nvidia-driver-proprietary/production/vars.yaml similarity index 100% rename from nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml rename to nvidia-gpu/nvidia-driver-proprietary/production/vars.yaml diff --git a/nvidia-gpu/nonfree/kmod-nvidia/lts/files/nvidia.conf b/nvidia-gpu/nvidia-driver/common/files/nvidia.conf similarity index 77% rename from nvidia-gpu/nonfree/kmod-nvidia/lts/files/nvidia.conf rename to nvidia-gpu/nvidia-driver/common/files/nvidia.conf index 62b5f931..021b91c2 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/lts/files/nvidia.conf +++ b/nvidia-gpu/nvidia-driver/common/files/nvidia.conf @@ -1,4 +1,5 @@ blacklist nvidia blacklist nvidia_uvm -blacklist nvidia_drm blacklist nvidia_modeset +blacklist nvidia_peermem +blacklist nvidia_drm diff --git a/nvidia-gpu/nvidia-driver/common/pkg.yaml b/nvidia-gpu/nvidia-driver/common/pkg.yaml new file mode 100644 index 00000000..8550b5db --- /dev/null +++ b/nvidia-gpu/nvidia-driver/common/pkg.yaml @@ -0,0 +1,13 @@ +name: nvidia-driver-common +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base +steps: + - install: + - | + mkdir -p /rootfs/usr/local/lib/modprobe.d + cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf +finalize: + - from: /rootfs + to: /rootfs diff --git 
a/nvidia-gpu/nvidia-driver/kernel-modules/lts/pkg.yaml b/nvidia-gpu/nvidia-driver/kernel-modules/lts/pkg.yaml new file mode 100644 index 00000000..67f4ea78 --- /dev/null +++ b/nvidia-gpu/nvidia-driver/kernel-modules/lts/pkg.yaml @@ -0,0 +1,16 @@ +name: nvidia-open-gpu-kernel-modules-lts +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base + # The pkgs version for a particular release of Talos as defined in + # https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-lts-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - install: + - | + mkdir -p /rootfs/lib/modules + cp -R /lib/modules/* /rootfs/lib/modules +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-driver/kernel-modules/production/pkg.yaml b/nvidia-gpu/nvidia-driver/kernel-modules/production/pkg.yaml new file mode 100644 index 00000000..9e8f5873 --- /dev/null +++ b/nvidia-gpu/nvidia-driver/kernel-modules/production/pkg.yaml @@ -0,0 +1,16 @@ +name: nvidia-open-gpu-kernel-modules-production +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base + # The pkgs version for a particular release of Talos as defined in + # https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-production-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - install: + - | + mkdir -p /rootfs/lib/modules + cp -R /lib/modules/* /rootfs/lib/modules +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-driver/lts/manifest.yaml b/nvidia-gpu/nvidia-driver/lts/manifest.yaml new file mode 100644 index 00000000..8ed44b85 --- /dev/null +++ b/nvidia-gpu/nvidia-driver/lts/manifest.yaml @@ -0,0 +1,11 @@ +version: v1alpha1 +metadata: + name: nvidia-driver-lts + version: "$VERSION" + author: Jean-Francois Roy + description: | + This system extension provides the NVIDIA open-source GPU kernel modules and 
matching userspace + drivers, LTS version. It depends on the glibc extension. + compatibility: + talos: + version: ">= v1.9.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-driver/lts/nvidia-driver.yaml similarity index 70% rename from nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml rename to nvidia-gpu/nvidia-driver/lts/nvidia-driver.yaml index ffd99967..d88fec6d 100644 --- a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml +++ b/nvidia-gpu/nvidia-driver/lts/nvidia-driver.yaml @@ -1,7 +1,6 @@ -# https://download.nvidia.com/XFree86/Linux-x86_64/515.65.01/README/nvidia-persistenced.html -name: nvidia-persistenced +name: nvidia-driver container: - entrypoint: /usr/local/bin/nvidia-persistenced-wrapper + entrypoint: /nvidia-driver mounts: # device files - source: /dev @@ -25,13 +24,21 @@ container: options: - bind - ro - # nvidia libraries - - source: /usr/local/lib - destination: /usr/local/lib + # nvidia driver source + - source: /usr/local/glibc + destination: /usr/local/glibc type: bind options: - bind - ro + # nvidia driver destination + - source: /run/nvidia + destination: /run/nvidia + type: bind + options: + - rshared + - rbind + - rw # service state file - source: /var/run destination: /var/run @@ -40,13 +47,6 @@ container: - rshared - rbind - rw - # binaries - - source: /usr/local/bin - destination: /usr/local/bin - type: bind - options: - - bind - - ro depends: - service: cri # we need to depend on udevd so that the nvidia device files are created diff --git a/nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml b/nvidia-gpu/nvidia-driver/lts/pkg.yaml similarity index 51% rename from nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml rename to nvidia-gpu/nvidia-driver/lts/pkg.yaml index 9b96f0ff..315a50ef 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml +++ b/nvidia-gpu/nvidia-driver/lts/pkg.yaml @@ -1,23 +1,20 @@ -name: nonfree-kmod-nvidia-lts +name: 
nvidia-driver-lts variant: scratch shell: /toolchain/bin/bash dependencies: - - stage: base -# The pkgs version for a particular release of Talos as defined in -# https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs - - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-lts-pkg:{{ .BUILD_ARG_PKGS }}" + - stage: base + - stage: nvidia-driver-common + - stage: nvidia-driver-service + - stage: nvidia-driver-userspace-lts + - stage: nvidia-open-gpu-kernel-modules-lts steps: - prepare: - | sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml - - install: + install: - | - mkdir -p /rootfs/lib/modules \ - /rootfs/usr/local/lib/modprobe.d - - cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf - - cp -R /lib/modules/* /rootfs/lib/modules + mkdir -p /rootfs/usr/local/etc/containers + cp /pkg/nvidia-driver.yaml /rootfs/usr/local/etc/containers/ test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/nvidia-modules/lts/vars.yaml b/nvidia-gpu/nvidia-driver/lts/vars.yaml similarity index 100% rename from nvidia-gpu/nvidia-modules/lts/vars.yaml rename to nvidia-gpu/nvidia-driver/lts/vars.yaml diff --git a/nvidia-gpu/nvidia-driver/production/manifest.yaml b/nvidia-gpu/nvidia-driver/production/manifest.yaml new file mode 100644 index 00000000..33f714aa --- /dev/null +++ b/nvidia-gpu/nvidia-driver/production/manifest.yaml @@ -0,0 +1,11 @@ +version: v1alpha1 +metadata: + name: nvidia-driver-production + version: "$VERSION" + author: Jean-Francois Roy + description: | + This system extension provides the NVIDIA open-source GPU kernel modules and matching userspace + drivers, production version. It depends on the glibc extension. 
+ compatibility: + talos: + version: ">= v1.9.0" diff --git a/nvidia-gpu/nvidia-driver/production/nvidia-driver.yaml b/nvidia-gpu/nvidia-driver/production/nvidia-driver.yaml new file mode 100644 index 00000000..c1421693 --- /dev/null +++ b/nvidia-gpu/nvidia-driver/production/nvidia-driver.yaml @@ -0,0 +1,50 @@ +name: nvidia-driver +container: + entrypoint: /nvidia-driver + mounts: + # device files + - source: /dev + destination: /dev + type: bind + options: + - rshared + - rbind + - rw + # glibc dynamic linker + - source: /lib64 + destination: /lib64 + type: bind + options: + - bind + - ro + # glibc root, which includes the nvidia driver userspace components + - source: /usr/local/glibc + destination: /usr/local/glibc + type: bind + options: + - bind + - ro + # nvidia driver destination + - source: /run/nvidia + destination: /run/nvidia + type: bind + options: + - rshared + - rbind + - rw + # service state file + - source: /var/run + destination: /var/run + type: bind + options: + - rshared + - rbind + - rw + security: + rootfsPropagation: shared +depends: + - service: cri + # we need to depend on udevd so that the nvidia device files are created + - service: udevd + - path: /sys/bus/pci/drivers/nvidia +restart: always diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml b/nvidia-gpu/nvidia-driver/production/pkg.yaml similarity index 50% rename from nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml rename to nvidia-gpu/nvidia-driver/production/pkg.yaml index 24f95bae..d101ffb0 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml +++ b/nvidia-gpu/nvidia-driver/production/pkg.yaml @@ -1,23 +1,20 @@ -name: nonfree-kmod-nvidia-production +name: nvidia-driver-production variant: scratch shell: /toolchain/bin/bash dependencies: - - stage: base -# The pkgs version for a particular release of Talos as defined in -# https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs - - image: "{{ .BUILD_ARG_PKGS_PREFIX 
}}/nonfree-kmod-nvidia-production-pkg:{{ .BUILD_ARG_PKGS }}" + - stage: base + - stage: nvidia-driver-common + - stage: nvidia-driver-service + - stage: nvidia-driver-userspace-production + - stage: nvidia-open-gpu-kernel-modules-production steps: - prepare: - | sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml - - install: + install: - | - mkdir -p /rootfs/lib/modules \ - /rootfs/usr/local/lib/modprobe.d - - cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf - - cp -R /lib/modules/* /rootfs/lib/modules + mkdir -p /rootfs/usr/local/etc/containers + cp /pkg/nvidia-driver.yaml /rootfs/usr/local/etc/containers/ test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/nvidia-modules/production/vars.yaml b/nvidia-gpu/nvidia-driver/production/vars.yaml similarity index 100% rename from nvidia-gpu/nvidia-modules/production/vars.yaml rename to nvidia-gpu/nvidia-driver/production/vars.yaml diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.mod b/nvidia-gpu/nvidia-driver/service/go.mod similarity index 55% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.mod rename to nvidia-gpu/nvidia-driver/service/go.mod index 366caaa2..154e8140 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/go.mod +++ b/nvidia-gpu/nvidia-driver/service/go.mod @@ -1,4 +1,4 @@ -module nvidia-persistenced-wrapper +module nvidia-driver go 1.22 diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/go.sum b/nvidia-gpu/nvidia-driver/service/go.sum similarity index 100% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper/go.sum rename to nvidia-gpu/nvidia-driver/service/go.sum diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/main.go b/nvidia-gpu/nvidia-driver/service/main.go similarity index 60% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/main.go rename to 
nvidia-gpu/nvidia-driver/service/main.go index 6179e234..64e35097 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper/main.go +++ b/nvidia-gpu/nvidia-driver/service/main.go @@ -11,7 +11,9 @@ import ( "os" "os/exec" "os/signal" + "path/filepath" "strconv" + "syscall" "golang.org/x/sys/unix" ) @@ -19,49 +21,66 @@ import ( const ( stateFolder = "/var/run/nvidia-persistenced" pidFile = stateFolder + "/" + "nvidia-persistenced.pid" + + driverSource = "/usr/local/glibc" + driverTarget = "/run/nvidia/driver" ) func main() { + bindMountDriver() + execPersistenced() +} + +func bindMountDriver() { + err := os.MkdirAll(driverTarget, 0755) + if err != nil { + log.Fatalf("nvidia-driver: failed mkdir: %v", err) + } + err = syscall.Mount(driverSource, driverTarget, "", syscall.MS_BIND|syscall.MS_REC, "") + if err != nil { + log.Fatalf("nvidia-driver: failed mount BIND: %v", err) + } +} + +func execPersistenced() { // ref: https://docs.nvidia.com/deploy/driver-persistence/index.html // first check if the pid file exists, // then check if the process is running, // if running try to kill it, then start the new process if _, err := os.Stat(pidFile); err != nil { if !errors.Is(err, os.ErrNotExist) { - log.Fatalf("nvidia-persistenced-wrapper: failed to stat pid file: %s%v\n", pidFile, err) + log.Fatalf("nvidia-driver: failed to stat pid file %s: %v\n", pidFile, err) } } else { pid, err := getProcessId() if err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error reading pid file: %s%v\n", pidFile, err) + log.Fatalf("nvidia-driver: error reading pid file %s: %v\n", pidFile, err) } if err := killProcess(pid); err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error killing process: %d%v\n", pid, err) + log.Fatalf("nvidia-driver: error killing process %d: %v\n", pid, err) } // now we can remove the state directory if err := os.RemoveAll(stateFolder); err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error removing state directory: %s%v\n", stateFolder, 
err) + log.Fatalf("nvidia-driver: error removing state directory %s: %v\n", stateFolder, err) } } - cmd := exec.Command("/usr/local/bin/nvidia-persistenced") - + cmd := exec.Command(filepath.Join(driverTarget, "/usr/bin/nvidia-persistenced")) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr - if err := cmd.Start(); err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error starting nvidia-persistenced: %v\n", err) + log.Fatalf("nvidia-driver: error starting nvidia-persistenced: %v\n", err) } ch := make(chan os.Signal, 1) signal.Notify(ch, unix.SIGINT, unix.SIGTERM) if err := cmd.Process.Signal(<-ch); err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error sending signal to nvidia-persistenced: %v\n", err) + log.Fatalf("nvidia-driver: error sending signal to nvidia-persistenced: %v\n", err) } if _, err := cmd.Process.Wait(); err != nil { - log.Fatalf("nvidia-persistenced-wrapper: error waiting for nvidia-persistenced to exit: %v\n", err) + log.Fatalf("nvidia-driver: error waiting for nvidia-persistenced to exit: %v\n", err) } } diff --git a/nvidia-gpu/nvidia-driver/service/pkg.yaml b/nvidia-gpu/nvidia-driver/service/pkg.yaml new file mode 100644 index 00000000..770f13cb --- /dev/null +++ b/nvidia-gpu/nvidia-driver/service/pkg.yaml @@ -0,0 +1,21 @@ +name: nvidia-driver-service +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base +steps: + - cachePaths: + - /.cache/go-build + - /go/pkg + build: + - | + export PATH=${PATH}:${TOOLCHAIN}/go/bin + cp -r /pkg/* . 
+ CGO_ENABLED=0 go build -ldflags "-s -w" -trimpath -o nvidia-driver main.go + install: + - | + mkdir -p /rootfs/usr/local/lib/containers/nvidia-driver + cp nvidia-driver /rootfs/usr/local/lib/containers/nvidia-driver/ +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-driver/userspace/lts/pkg.yaml b/nvidia-gpu/nvidia-driver/userspace/lts/pkg.yaml new file mode 100644 index 00000000..1a82ebdc --- /dev/null +++ b/nvidia-gpu/nvidia-driver/userspace/lts/pkg.yaml @@ -0,0 +1,68 @@ +name: nvidia-driver-userspace-lts +variant: scratch +shell: /bin/bash +install: + - bash +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} +steps: + - sources: + # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://us.download.nvidia.com/tesla/{{ .NVIDIA_DRIVER_LTS_VERSION }}/NVIDIA-Linux-aarch64-{{ .NVIDIA_DRIVER_LTS_VERSION }}.run + destination: nvidia.run + sha256: af3f72f5e4906805987844636b87ad1132650d05116272824c76dcc3f816d8e9 + sha512: bb305f1703557461b0a0a29066c304658d9684841104c6f4d9ff44f9db90fee14ae619cd2fe3242823a5fe3a69b168b8174b163740014b15cdef36db88ba2d96 + # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://us.download.nvidia.com/tesla/{{ .NVIDIA_DRIVER_LTS_VERSION }}/NVIDIA-Linux-x86_64-{{ .NVIDIA_DRIVER_LTS_VERSION }}.run + destination: nvidia.run + sha256: c7bb0a0569c5347845479ed4e3e4d885c6ee3b8adf068c3401cdf754d5ba3d3b + sha512: 424950ef303ea39499e96f8c90c1e0c83aee12309779d4f335769ef554ad4f7c38e98f69c64b408adc85a7cf51ea600d85222792402b9c6b7941f1af066d2a33 + # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + prepare: + - | + # the nvidia installer validates these packages are installed + ln -s /bin/true /bin/modprobe + ln -s /bin/true /bin/rmmod + ln -s /bin/true /bin/lsmod + ln -s /bin/true /bin/depmod + + chmod +x nvidia.run + ./nvidia.run 
--extract-only --target nvidia + install: + - | + ./nvidia/nvidia-installer \ + --silent \ + --x-prefix=/rootfs/usr/local/glibc/usr \ + --opengl-prefix=/rootfs/usr/local/glibc/usr \ + --utility-prefix=/rootfs/usr/local/glibc/usr \ + --xdg-data-dir=/rootfs/usr/local/glibc/usr/share \ + --documentation-prefix=/rootfs/usr/local/glibc/usr \ + --application-profile-path=/rootfs/usr/local/glibc/usr/share/nvidia \ + --log-file-name=/tmp/nvidia-installer.log \ + --no-rpms \ + --no-backup \ + --no-recursion \ + --no-kernel-modules \ + --no-x-check \ + --no-nouveau-check \ + --no-distro-scripts \ + --no-wine-files \ + --no-kernel-module-source \ + --no-check-for-alternate-installs \ + --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_LTS_VERSION }} \ + --no-systemd +finalize: + - from: /rootfs + to: /rootfs + - from: /etc/OpenCL + to: /rootfs/usr/local/glibc/etc/OpenCL + - from: /etc/vulkan + to: /rootfs/usr/local/glibc/etc/vulkan + - from: /usr/lib/nvidia + to: /rootfs/usr/local/glibc/usr/lib/nvidia + - from: /usr/share/egl + to: /rootfs/usr/local/glibc/usr/share/egl + - from: /usr/share/glvnd + to: /rootfs/usr/local/glibc/usr/share/glvnd + - from: /usr/share/nvidia + to: /rootfs/usr/local/glibc/usr/share/nvidia diff --git a/nvidia-gpu/nvidia-driver/userspace/production/pkg.yaml b/nvidia-gpu/nvidia-driver/userspace/production/pkg.yaml new file mode 100644 index 00000000..e2dcb56b --- /dev/null +++ b/nvidia-gpu/nvidia-driver/userspace/production/pkg.yaml @@ -0,0 +1,68 @@ +name: nvidia-driver-userspace-production +variant: scratch +shell: /bin/bash +install: + - bash +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} +steps: + - sources: + # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://us.download.nvidia.com/tesla/{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}/NVIDIA-Linux-aarch64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}.run + 
destination: nvidia.run + sha256: b896b76ae465307afc5b269c40bd8ccb279e6ea7d3ecae95534a91ecb1971572 + sha512: 79b956ad890a096bfb00c9dd996cba0673200b1d61f702ea6c5c64ca3fe2cefdd61e2bc844fdb7b4668c2796af5399be51e6f511565c3799cf731de2a7e9efaa + # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://us.download.nvidia.com/tesla/{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}/NVIDIA-Linux-x86_64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}.run + destination: nvidia.run + sha256: 51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733 + sha512: b8c2cdc918ec74b44517fc181f9eb08ea44d0d9a53f221c0aa243e34872203721a9a7fb27628d35e3028a6aa68917abd2962cc13d5d4b09e92866e14678567a4 + # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + prepare: + - | + # the nvidia installer validates these packages are installed + ln -s /bin/true /bin/modprobe + ln -s /bin/true /bin/rmmod + ln -s /bin/true /bin/lsmod + ln -s /bin/true /bin/depmod + + chmod +x nvidia.run + ./nvidia.run --extract-only --target nvidia + install: + - | + ./nvidia/nvidia-installer \ + --silent \ + --x-prefix=/rootfs/usr/local/glibc/usr \ + --opengl-prefix=/rootfs/usr/local/glibc/usr \ + --utility-prefix=/rootfs/usr/local/glibc/usr \ + --xdg-data-dir=/rootfs/usr/local/glibc/usr/share \ + --documentation-prefix=/rootfs/usr/local/glibc/usr \ + --application-profile-path=/rootfs/usr/local/glibc/usr/share/nvidia \ + --log-file-name=/tmp/nvidia-installer.log \ + --no-rpms \ + --no-backup \ + --no-recursion \ + --no-kernel-modules \ + --no-x-check \ + --no-nouveau-check \ + --no-distro-scripts \ + --no-wine-files \ + --no-kernel-module-source \ + --no-check-for-alternate-installs \ + --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }} \ + --no-systemd +finalize: + - from: /rootfs + to: /rootfs + - from: /etc/OpenCL + to: /rootfs/usr/local/glibc/etc/OpenCL + - from: /etc/vulkan + 
to: /rootfs/usr/local/glibc/etc/vulkan + - from: /usr/lib/nvidia + to: /rootfs/usr/local/glibc/usr/lib/nvidia + - from: /usr/share/egl + to: /rootfs/usr/local/glibc/usr/share/egl + - from: /usr/share/glvnd + to: /rootfs/usr/local/glibc/usr/share/glvnd + - from: /usr/share/nvidia + to: /rootfs/usr/local/glibc/usr/share/nvidia diff --git a/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml index b1ba3303..a4bb0e2c 100644 --- a/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml @@ -4,7 +4,7 @@ metadata: version: "$VERSION" author: Sidero Labs description: | - This system extension provides the Nvidia fabricmanager for GPU's that need NVLink support. + This system extension provides the NVIDIA fabricmanager for GPUs that need NVLink support. compatibility: talos: version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml b/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml index 61ccc162..13a3fb55 100644 --- a/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml @@ -4,7 +4,7 @@ metadata: version: "$VERSION" author: Sidero Labs description: | - This system extension provides the Nvidia fabricmanager for GPU's that need NVLink support. + This system extension provides the NVIDIA fabricmanager for GPUs that need NVLink support.
compatibility: talos: version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf b/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf deleted file mode 100644 index 62b5f931..00000000 --- a/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf +++ /dev/null @@ -1,4 +0,0 @@ -blacklist nvidia -blacklist nvidia_uvm -blacklist nvidia_drm -blacklist nvidia_modeset diff --git a/nvidia-gpu/nvidia-modules/lts/manifest.yaml b/nvidia-gpu/nvidia-modules/lts/manifest.yaml deleted file mode 100644 index 77874823..00000000 --- a/nvidia-gpu/nvidia-modules/lts/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nvidia-open-gpu-kernel-modules-lts - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia open source driver kernel modules built against a specific Talos version. - compatibility: - talos: - version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-modules/lts/pkg.yaml b/nvidia-gpu/nvidia-modules/lts/pkg.yaml deleted file mode 100644 index 6c29c2bc..00000000 --- a/nvidia-gpu/nvidia-modules/lts/pkg.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: nvidia-open-gpu-kernel-modules-lts -variant: scratch -shell: /toolchain/bin/bash -dependencies: - - stage: base -# The pkgs version for a particular release of Talos as defined in -# https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs - - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-lts-pkg:{{ .BUILD_ARG_PKGS }}" -steps: - - prepare: - - | - sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml - - install: - - | - mkdir -p /rootfs/lib/modules \ - /rootfs/usr/local/lib/modprobe.d - - cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf - - cp -R /lib/modules/* /rootfs/lib/modules - test: - - | - mkdir -p /extensions-validator-rootfs - cp -r /rootfs/ /extensions-validator-rootfs/rootfs - cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml - /extensions-validator validate 
--rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" -finalize: - - from: /rootfs - to: /rootfs - - from: /pkg/manifest.yaml - to: / diff --git a/nvidia-gpu/nvidia-modules/production/files/nvidia.conf b/nvidia-gpu/nvidia-modules/production/files/nvidia.conf deleted file mode 100644 index 62b5f931..00000000 --- a/nvidia-gpu/nvidia-modules/production/files/nvidia.conf +++ /dev/null @@ -1,4 +0,0 @@ -blacklist nvidia -blacklist nvidia_uvm -blacklist nvidia_drm -blacklist nvidia_modeset diff --git a/nvidia-gpu/nvidia-modules/production/manifest.yaml b/nvidia-gpu/nvidia-modules/production/manifest.yaml deleted file mode 100644 index 7398546b..00000000 --- a/nvidia-gpu/nvidia-modules/production/manifest.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: v1alpha1 -metadata: - name: nvidia-open-gpu-kernel-modules-production - version: "$VERSION" - author: Sidero Labs - description: | - This system extension provides nvidia open source driver kernel modules built against a specific Talos version. 
- compatibility: - talos: - version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-modules/production/pkg.yaml b/nvidia-gpu/nvidia-modules/production/pkg.yaml deleted file mode 100644 index 64c374dc..00000000 --- a/nvidia-gpu/nvidia-modules/production/pkg.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: nvidia-open-gpu-kernel-modules-production -variant: scratch -shell: /toolchain/bin/bash -dependencies: - - stage: base -# The pkgs version for a particular release of Talos as defined in -# https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs - - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-production-pkg:{{ .BUILD_ARG_PKGS }}" -steps: - - prepare: - - | - sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml - - install: - - | - mkdir -p /rootfs/lib/modules \ - /rootfs/usr/local/lib/modprobe.d - - cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf - - cp -R /lib/modules/* /rootfs/lib/modules - test: - - | - mkdir -p /extensions-validator-rootfs - cp -r /rootfs/ /extensions-validator-rootfs/rootfs - cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml - /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" -finalize: - - from: /rootfs - to: /rootfs - - from: /pkg/manifest.yaml - to: / diff --git a/nvidia-gpu/vars.yaml b/nvidia-gpu/vars.yaml index 6b701c52..6dfa237d 100644 --- a/nvidia-gpu/vars.yaml +++ b/nvidia-gpu/vars.yaml @@ -3,19 +3,5 @@ NVIDIA_DRIVER_LTS_VERSION: 535.183.06 # renovate: datasource=github-releases extractVersion=^\d+\.(?<version>\d+\.\d+)$ depName=nvidia/open-gpu-kernel-modules NVIDIA_DRIVER_PRODUCTION_VERSION: 550.90.07 -# renovate: datasource=github-releases depName=nvidia/nvidia-container-toolkit -CONTAINER_TOOLKIT_VERSION: v1.16.1 -CONTAINER_TOOLKIT_REF: a470818ba7d9166be282cd0039dd2fc9b0a34d73 -# renovate: datasource=github-releases depName=nvidia/libnvidia-container -LIBNVIDIA_CONTAINER_VERSION: v1.16.1 -LIBNVIDIA_CONTAINER_REF:
4c2494f16573b585788a42e9c7bee76ecd48c73d # renovate: datasource=docker versioning=docker depName=cgr.dev/chainguard/wolfi-base WOLFI_BASE_REF: sha256:72c8bfed3266b2780243b144dc5151150015baf5a739edbbde53d154574f1607 -# renovate: datasource=git-tags extractVersion=^glibc-(?<version>.*)$ depName=https://sourceware.org/git/glibc.git -GLIBC_VERSION: 2.40 -# renovate: datasource=github-tags extractVersion=^v(?<version>.*)$ depName=seccomp/libseccomp -LIBSECCOMP_VERSION: 2.5.5 -# renovate: datasource=git-tags extractVersion=^libcap-(?<version>.*)$ depName=git://git.kernel.org/pub/scm/libs/libcap/libcap.git -LIBCAP_VERSION: 2.70 -# renovate: datasource=git-tags extractVersion=^elfutils-(?<version>.*)$ depName=git://sourceware.org/git/elfutils.git -ELFUTILS_VERSION: 0.191