From 1610c54145c916f83e729847417f583681f1867f Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 4 Mar 2024 18:44:52 +0100
Subject: [PATCH 1/7] Create script for nvidia-container-toolkit

---
 create_nvidia_runtime_sysext.sh | 182 ++++++++++++++++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100755 create_nvidia_runtime_sysext.sh

diff --git a/create_nvidia_runtime_sysext.sh b/create_nvidia_runtime_sysext.sh
new file mode 100755
index 0000000..b5f074a
--- /dev/null
+++ b/create_nvidia_runtime_sysext.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export ARCH="${ARCH-x86-64}"
+SCRIPTFOLDER="$(dirname "$(readlink -f "$0")")"
+
+if [ $# -lt 2 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
+  echo "Usage: $0 VERSION SYSEXTNAME"
+  echo "The script will build nvidia-container-toolkit on ubuntu 18 and package it into a syseext."
+  echo "A temporary directory named SYSEXTNAME in the current folder will be created and deleted again."
+  echo "All files in the sysext image will be owned by root."
+  echo "To use arm64 pass 'ARCH=arm64' as environment variable (current value is '${ARCH}')."
+  "${SCRIPTFOLDER}"/bake.sh --help
+  exit 1
+fi
+
+# Default should be: v1.14.3
+VERSION="$1"
+SYSEXTNAME="$2"
+
+# The github release uses different arch identifiers, we map them here
+# and rely on bake.sh to map them back to what systemd expects
+if [ "${ARCH}" = "amd64" ] || [ "${ARCH}" = "x86-64" ]; then
+  ARCH="amd64"
+elif [ "${ARCH}" = "arm64" ]; then
+  ARCH="arch64"
+fi
+
+git clone -b ${VERSION} --depth 1 https://github.com/NVIDIA/libnvidia-container || true
+git clone -b ${VERSION} --depth 1 https://github.com/NVIDIA/nvidia-container-toolkit || true
+
+make -C libnvidia-container ubuntu18.04-${ARCH}
+make -C nvidia-container-toolkit ubuntu18.04-${ARCH}
+
+rm -rf "${SYSEXTNAME}"
+mkdir -p "${SYSEXTNAME}"
+for deb in libnvidia-container/dist/ubuntu18.04/${ARCH}/libnvidia-container{1_*,-tools_}*.deb; do
+  dpkg-deb -x $deb "${SYSEXTNAME}"/
+done
+for deb in nvidia-container-toolkit/dist/ubuntu18.04/${ARCH}/nvidia-container-toolkit*.deb; do
+  dpkg-deb -x $deb "${SYSEXTNAME}"/
+done
+rm -rf "${SYSEXTNAME}"/usr/share
+mv "${SYSEXTNAME}"/usr/lib/*-linux-gnu "${SYSEXTNAME}"/usr/lib64
+mkdir -p "${SYSEXTNAME}"/usr/local
+ln -s /opt/nvidia "${SYSEXTNAME}"/usr/local/nvidia
+ln -s /opt/bin/nvidia-smi "${SYSEXTNAME}"/usr/bin/nvidia-smi
+
+mkdir -p "${SYSEXTNAME}"/usr/lib/systemd/system/docker.service.d
+cat <<EOF >"${SYSEXTNAME}"/usr/lib/systemd/system/docker.service.d/10-nvidia.conf
+[Unit]
+After=nvidia.service
+
+[Service]
+Environment=DOCKER_OPTS=--add-runtime=nvidia=nvidia-container-runtime
+EOF
+
+mkdir -p "${SYSEXTNAME}"/usr/lib/systemd/system/containerd.service.d
+cat <<EOF >"${SYSEXTNAME}"/usr/lib/systemd/system/containerd.service.d/10-nvidia.conf
+[Unit]
+After=nvidia.service
+
+[Service]
+ExecStart=
+ExecStart=/usr/bin/containerd --config /etc/containerd/config.toml
+EOF
+
+mkdir -p "${SYSEXTNAME}"/usr/lib/systemd/system/nvidia.service.d
+cat <<EOF >"${SYSEXTNAME}"/usr/lib/systemd/system/nvidia.service.d/10-persistenced.conf
+[Service]
+ExecStartPost=-/opt/bin/nvidia-persistenced
+ExecStartPost=-/bin/sh -c "chcon -R -t container_file_t /dev/nvidia*"
+ExecStartPost=mkdir -p /run/extensions
+ExecStartPost=ln -s /opt/nvidia/current /run/extensions/nvidia-driver
+ExecStartPost=systemctl restart systemd-sysext
+EOF
+
+
+mkdir -p "${SYSEXTNAME}"/usr/lib/tmpfiles.d/
+cat <<EOF >"${SYSEXTNAME}"/usr/lib/tmpfiles.d/10-nvidia.conf
+C /etc/containerd/config.toml - - - - /usr/share/flatcar/etc/containerd/config.toml
+C /etc/nvidia-container-runtime/config.toml - - - - /usr/share/flatcar/etc/nvidia-container-runtime/config.toml
+EOF
+
+mkdir -p "${SYSEXTNAME}"/usr/share/flatcar/etc/nvidia-container-runtime/
+cat <<EOF >"${SYSEXTNAME}"/usr/share/flatcar/etc/nvidia-container-runtime/config.toml
+#accept-nvidia-visible-devices-as-volume-mounts = false
+#accept-nvidia-visible-devices-envvar-when-unprivileged = true
+disable-require = false
+supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
+#swarm-resource = "DOCKER_RESOURCE_GPU"
+
+[nvidia-container-cli]
+#debug = "/var/log/nvidia-container-toolkit.log"
+environment = []
+#ldcache = "/etc/ld.so.cache"
+ldconfig = "@/sbin/ldconfig"
+load-kmods = true
+#no-cgroups = false
+#path = "/usr/bin/nvidia-container-cli"
+#root = "/run/nvidia/driver"
+#user = "root:video"
+
+[nvidia-container-runtime]
+#debug = "/var/log/nvidia-container-runtime.log"
+log-level = "info"
+mode = "auto"
+runtimes = ["docker-runc", "runc", "crun"]
+
+[nvidia-container-runtime.modes]
+
+[nvidia-container-runtime.modes.cdi]
+annotation-prefixes = ["cdi.k8s.io/"]
+default-kind = "nvidia.com/gpu"
+spec-dirs = ["/etc/cdi", "/var/run/cdi"]
+
+[nvidia-container-runtime.modes.csv]
+mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
+
+[nvidia-container-runtime-hook]
+path = "nvidia-container-runtime-hook"
+skip-mode-detection = false
+
+[nvidia-ctk]
+path = "nvidia-ctk"
+EOF
+
+mkdir -p "${SYSEXTNAME}"/usr/share/flatcar/etc/containerd/
+cat <<EOF >"${SYSEXTNAME}"/usr/share/flatcar/etc/containerd/config.toml
+version = 2
+
+# persistent data location
+root = "/var/lib/containerd"
+# runtime state information
+state = "/run/containerd"
+# set containerd as a subreaper on linux when it is not running as PID 1
+subreaper = true
+# set containerd's OOM score
+oom_score = -999
+disabled_plugins = []
+
+# grpc configuration
+[grpc]
+address = "/run/containerd/containerd.sock"
+# socket uid
+uid = 0
+# socket gid
+gid = 0
+
+[plugins."io.containerd.runtime.v1.linux"]
+# shim binary name/path
+shim = "containerd-shim"
+# runtime binary name/path
+runtime = "runc"
+# do not use a shim when starting containers, saves on memory but
+# live restore is not supported
+no_shim = false
+
+[plugins."io.containerd.grpc.v1.cri"]
+# enable SELinux labeling
+enable_selinux = true
+
+[plugins."io.containerd.grpc.v1.cri".containerd]
+default_runtime_name = "nvidia"
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+# setting runc.options unsets parent settings
+runtime_type = "io.containerd.runc.v2"
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+SystemdCgroup = true
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+runtime_type = "io.containerd.runc.v2"
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
+BinaryName = "/usr/bin/nvidia-container-runtime"
+SystemdCgroup = true
+EOF
+
+mkdir -p "${SYSEXTNAME}"/usr/bin
+"${SCRIPTFOLDER}"/bake.sh "${SYSEXTNAME}"
+rm -rf "${SYSEXTNAME}"
From a4ed33a5e31acd6e01ed17b36e86da38faceaca5 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 4 Mar 2024 18:45:23 +0100
Subject: [PATCH 2/7] Add nvidia_runtime to release builds

---
 release_build_versions.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/release_build_versions.txt b/release_build_versions.txt
index e014f33..8ff92b5 100644
--- a/release_build_versions.txt
+++ b/release_build_versions.txt
@@ -28,3 +28,5 @@ crio-1.28.4
 k3s-v1.29.2+k3s1
 
 rke2-v1.29.2+rke2r1
+
+nvidia_runtime-v1.14.3
From e119f055b23bfc182bbec55152b3d7dd8ad1fc5e Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 5 Aug 2024 14:34:49 +0200
Subject: [PATCH 3/7] nvidia-runtime: Fix symlink creation and rmmod on service stop

Signed-off-by: Jeremi Piotrowski
---
 create_nvidia_runtime_sysext.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/create_nvidia_runtime_sysext.sh b/create_nvidia_runtime_sysext.sh
index b5f074a..882f5ff 100755
--- a/create_nvidia_runtime_sysext.sh
+++ b/create_nvidia_runtime_sysext.sh
@@ -71,8 +71,9 @@ cat <<EOF >"${SYSEXTNAME}"/usr/lib/systemd/system/nvidia.service.d/10-persistenc
 ExecStartPost=-/opt/bin/nvidia-persistenced
 ExecStartPost=-/bin/sh -c "chcon -R -t container_file_t /dev/nvidia*"
 ExecStartPost=mkdir -p /run/extensions
-ExecStartPost=ln -s /opt/nvidia/current /run/extensions/nvidia-driver
+ExecStartPost=ln -sf /opt/nvidia/current /run/extensions/nvidia-driver
 ExecStartPost=systemctl restart systemd-sysext
+ExecStopPost=rmmod nvidia_uvm nvidia_modeset nvidia
 EOF
 
 
From 9bf460b6255561d6a6ece95f5c70a6320422f325 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 5 Aug 2024 14:35:22 +0200
Subject: [PATCH 4/7] nvidia: Switch default version to v1.16.1

---
 release_build_versions.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/release_build_versions.txt b/release_build_versions.txt
index 8ff92b5..7b9cce5 100644
--- a/release_build_versions.txt
+++ b/release_build_versions.txt
@@ -29,4 +29,4 @@ k3s-v1.29.2+k3s1
 
 rke2-v1.29.2+rke2r1
 
-nvidia_runtime-v1.14.3
+nvidia_runtime-v1.16.1
From 427605ce0108f7f87e0b3310ba4bb7cdf078988a Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 5 Aug 2024 15:17:05 +0200
Subject: [PATCH 5/7] nvidia-runtime: Fix arm64 build

Signed-off-by: Jeremi Piotrowski
---
 create_nvidia_runtime_sysext.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/create_nvidia_runtime_sysext.sh b/create_nvidia_runtime_sysext.sh
index 882f5ff..a5219cc 100755
--- a/create_nvidia_runtime_sysext.sh
+++ b/create_nvidia_runtime_sysext.sh
@@ -23,7 +23,7 @@ SYSEXTNAME="$2"
 if [ "${ARCH}" = "amd64" ] || [ "${ARCH}" = "x86-64" ]; then
   ARCH="amd64"
 elif [ "${ARCH}" = "arm64" ]; then
-  ARCH="arch64"
+  ARCH="arm64"
 fi
 
 git clone -b ${VERSION} --depth 1 https://github.com/NVIDIA/libnvidia-container || true
From 9577961e28894356c0ed791e217ee395d56d1c00 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Mon, 5 Aug 2024 17:04:30 +0200
Subject: [PATCH 6/7] nvidia-runtime: Remove sysext before reinstalling

If the nvidia-driver sysext is merged while running setup-nvidia, then
running nvidia-smi might pick up an incompatible library version. This
happens when changing the driver version.

Signed-off-by: Jeremi Piotrowski
---
 create_nvidia_runtime_sysext.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/create_nvidia_runtime_sysext.sh b/create_nvidia_runtime_sysext.sh
index a5219cc..9675cec 100755
--- a/create_nvidia_runtime_sysext.sh
+++ b/create_nvidia_runtime_sysext.sh
@@ -68,6 +68,7 @@ EOF
 mkdir -p "${SYSEXTNAME}"/usr/lib/systemd/system/nvidia.service.d
 cat <<EOF >"${SYSEXTNAME}"/usr/lib/systemd/system/nvidia.service.d/10-persistenced.conf
 [Service]
+ExecStartPre=-/bin/sh -c "rm /run/extensions/nvidia-driver && systemctl restart systemd-sysext"
 ExecStartPost=-/opt/bin/nvidia-persistenced
 ExecStartPost=-/bin/sh -c "chcon -R -t container_file_t /dev/nvidia*"
 ExecStartPost=mkdir -p /run/extensions
From 75ab117d1de6141553cad53494d3fbf324baf14c Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski
Date: Tue, 3 Sep 2024 11:43:12 +0200
Subject: [PATCH 7/7] nvidia-runtime: Address review

- fix typo
- switch to ubuntu 20.04
- add extension to README
---
 README.md                       |  1 +
 create_nvidia_runtime_sysext.sh | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7291b32..6d3feb3 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,7 @@ For extensions that are not part of the GitHub Release or which you want to cust
 | `kubernetes` | released |
 | `docker` | released (includes containerd) |
 | `docker_compose` | released |
+| `nvidia-runtime` | released |
 | `wasmtime` | released |
 | `wasmcloud` | released |
 | `tailscale` | released |
diff --git a/create_nvidia_runtime_sysext.sh b/create_nvidia_runtime_sysext.sh
index 9675cec..94645c2 100755
--- a/create_nvidia_runtime_sysext.sh
+++ b/create_nvidia_runtime_sysext.sh
@@ -6,7 +6,7 @@ SCRIPTFOLDER="$(dirname "$(readlink -f "$0")")"
 
 if [ $# -lt 2 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
   echo "Usage: $0 VERSION SYSEXTNAME"
-  echo "The script will build nvidia-container-toolkit on ubuntu 18 and package it into a syseext."
+  echo "The script will build nvidia-container-toolkit on ubuntu 20 and package it into a sysext."
   echo "A temporary directory named SYSEXTNAME in the current folder will be created and deleted again."
   echo "All files in the sysext image will be owned by root."
   echo "To use arm64 pass 'ARCH=arm64' as environment variable (current value is '${ARCH}')."
@@ -29,15 +29,15 @@ fi
 git clone -b ${VERSION} --depth 1 https://github.com/NVIDIA/libnvidia-container || true
 git clone -b ${VERSION} --depth 1 https://github.com/NVIDIA/nvidia-container-toolkit || true
 
-make -C libnvidia-container ubuntu18.04-${ARCH}
-make -C nvidia-container-toolkit ubuntu18.04-${ARCH}
+make -C libnvidia-container ubuntu20.04-${ARCH}
+make -C nvidia-container-toolkit ubuntu20.04-${ARCH}
 
 rm -rf "${SYSEXTNAME}"
 mkdir -p "${SYSEXTNAME}"
-for deb in libnvidia-container/dist/ubuntu18.04/${ARCH}/libnvidia-container{1_*,-tools_}*.deb; do
+for deb in libnvidia-container/dist/ubuntu20.04/${ARCH}/libnvidia-container{1_*,-tools_}*.deb; do
   dpkg-deb -x $deb "${SYSEXTNAME}"/
 done
-for deb in nvidia-container-toolkit/dist/ubuntu18.04/${ARCH}/nvidia-container-toolkit*.deb; do
+for deb in nvidia-container-toolkit/dist/ubuntu20.04/${ARCH}/nvidia-container-toolkit*.deb; do
   dpkg-deb -x $deb "${SYSEXTNAME}"/
 done
 rm -rf "${SYSEXTNAME}"/usr/share