From 1d87773837fcb0d098143d707bdc569149694657 Mon Sep 17 00:00:00 2001
From: Luis Capelo
Date: Wed, 3 Jul 2024 11:36:31 -0700
Subject: [PATCH] nvproxy: add missing UVM_UNMAP_EXTERNAL ioctl

Distributed training isn't working with PyTorch on certain A100 nodes.

Adds the missing `UVM_UNMAP_EXTERNAL` ioctl, allowing certain NCCL
operations to succeed when using
[`torch.distributed`](https://pytorch.org/docs/stable/distributed.html)
and fixing distributed training.

## Reproduction

This affects numerous A100 40GB and 80GB instances in our fleet. The
reproduction requires 4 A100 GPUs, either 40GB or 80GB.

- **NVIDIA Driver Version**: 550.54.15
- **CUDA Version**: 12.4
- **NVIDIA device**: NVIDIA A100 80GB PCIe

### Steps

1. **Install gvisor**

   ```bash
   ARCH=$(uname -m)  # x86_64 or aarch64
   URL="https://storage.googleapis.com/gvisor/releases/master/latest/${ARCH}"
   wget -nc "${URL}/runsc" "${URL}/runsc.sha512"
   chmod +x runsc
   sudo cp runsc /usr/local/bin/runsc
   sudo /usr/local/bin/runsc install
   sudo systemctl reload docker
   ```

2. **Add GPU-enabling gvisor options**

   In `/etc/docker/daemon.json`:

   ```json
   {
     "runtimes": {
       "nvidia": {
         "path": "nvidia-container-runtime",
         "runtimeArgs": []
       },
       "runsc": {
         "path": "/usr/local/bin/runsc",
         "runtimeArgs": ["--nvproxy", "--nvproxy-docker", "-debug-log=/tmp/runsc/", "-debug", "-strace"]
       }
     }
   }
   ```

   Reload configs with `sudo systemctl reload docker`.

3. **Run reproduction NCCL test**

   This test creates one main process and N peer processes. Each peer
   process sends a torch `Tensor` to the main process using NCCL.

   ```Dockerfile
   # Dockerfile
   FROM python:3.9.15-slim-bullseye
   RUN pip install torch numpy
   COPY <
   ```
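   The test script embedded via the `COPY` heredoc is cut off above. As a
   stand-in, the following is a minimal hypothetical sketch of such a test
   (not the original script), assuming a single node, rank 3 as the
   receiving main process, and `torch.multiprocessing.spawn` launching one
   process per GPU:

   ```python
   # repro.py: hypothetical sketch of the NCCL send/recv test.
   import os

   import torch
   import torch.distributed as dist
   import torch.multiprocessing as mp

   WORLD_SIZE = 4  # one process per A100 GPU


   def run(rank: int, world_size: int) -> None:
       # Rendezvous for the NCCL process group (single-node assumption).
       os.environ["MASTER_ADDR"] = "127.0.0.1"
       os.environ["MASTER_PORT"] = "29500"
       dist.init_process_group("nccl", rank=rank, world_size=world_size)
       torch.cuda.set_device(rank)

       main_rank = world_size - 1
       if rank == main_rank:
           # Main process: receive one tensor from every peer over NCCL.
           for peer in range(world_size - 1):
               t = torch.zeros(5, device=f"cuda:{rank}")
               dist.recv(t, src=peer)
               print(f"[RANK {rank}] received tensor from rank={peer}: {t}")
           print("PASS: NCCL working.")
       else:
           # Peer process: send a tensor of ones to the main process.
           t = torch.ones(5, device=f"cuda:{rank}")
           print(f"[RANK {rank}] sending tensor: {t}")
           dist.send(t, dst=main_rank)

       dist.destroy_process_group()


   if __name__ == "__main__":
       mp.spawn(run, args=(WORLD_SIZE,), nprocs=WORLD_SIZE)
   ```

   Before the fix, the first send/recv pair stalls while NCCL bootstraps
   its communicator (the `broadcastUniqueNCCLID` and `TCPStore::doWait`
   frames in the trace below) and eventually times out.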
### Failure

Before the fix, the test times out while the ranks exchange the NCCL
unique ID through the `TCPStore` during communicator setup:

```
...
frame #1: <unknown function> + 0x5b3a23e (0x7edd8d73a23e in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7edd8d734c87 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7edd8d734f82 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7edd8d735fd1 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #8: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7edd54da9189 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #9: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7edd54db0610 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #10: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7edd54dcf978 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0x5adc309 (0x7edd8d6dc309 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x5ae6f10 (0x7edd8d6e6f10 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x5ae6fa5 (0x7edd8d6e6fa5 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x5124446 (0x7edd8cd24446 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x1acf4b8 (0x7edd896cf4b8 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x5aee004 (0x7edd8d6ee004 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #17: <unknown function> + 0x5af36b5 (0x7edd8d6f36b5 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #18: <unknown function> + 0xd2fe8e (0x7edda032fe8e in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0x47f074 (0x7edd9fa7f074 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #35: <unknown function> + 0x29d90 (0x7edda2029d90 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #36: __libc_start_main + 0x80 (0x7edda2029e40 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #37: <unknown function> + 0x108e (0x55f950b0c08e in /usr/local/bin/python)
. This may indicate a possible application crash on rank 0 or a network set up issue.
...
```

### Fix

gvisor debug logs show:

```
W0702 20:36:17.577055 445833 uvm.go:148] [ 22: 84] nvproxy: unknown uvm ioctl 66 = 0x42
```

That unknown uvm ioctl, 66 = 0x42, is `UVM_UNMAP_EXTERNAL`; I've
implemented it in this PR. This is the output after the fix:

```
[RANK 2] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:2')
[RANK 0] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:0')
[RANK 1] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:1')
[RANK 3] received tensor from rank=0: tensor([1., 1., 1., 1., 1.], device='cuda:3')
[RANK 3] received tensor from rank=1: tensor([1., 1., 1., 1., 1.], device='cuda:3')
[RANK 3] received tensor from rank=2: tensor([1., 1., 1., 1., 1.], device='cuda:3')
PASS: NCCL working.
```

FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/10610 from luiscape:master ee88734c0493d24bb9278cf9f473050bb22e0865
PiperOrigin-RevId: 649146570
---
 pkg/abi/nvgpu/uvm.go                          | 10 ++++++++++
 pkg/sentry/devices/nvproxy/seccomp_filters.go |  4 ++++
 pkg/sentry/devices/nvproxy/version.go         |  1 +
 3 files changed, 15 insertions(+)

diff --git a/pkg/abi/nvgpu/uvm.go b/pkg/abi/nvgpu/uvm.go
index 3a61a9f441..cc01d24c63 100644
--- a/pkg/abi/nvgpu/uvm.go
+++ b/pkg/abi/nvgpu/uvm.go
@@ -39,6 +39,7 @@ const (
 	UVM_TOOLS_READ_PROCESS_MEMORY      = 62
 	UVM_TOOLS_WRITE_PROCESS_MEMORY     = 63
 	UVM_MAP_DYNAMIC_PARALLELISM_REGION = 65
+	UVM_UNMAP_EXTERNAL                 = 66
 	UVM_ALLOC_SEMAPHORE_POOL           = 68
 	UVM_VALIDATE_VA_RANGE              = 72
 	UVM_CREATE_EXTERNAL_RANGE          = 73
@@ -288,6 +289,15 @@ type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS struct {
 	Pad0 [4]byte
 }
 
+// +marshal
+type UVM_UNMAP_EXTERNAL_PARAMS struct {
+	Base     uint64
+	Length   uint64
+	GPUUUID  NvUUID
+	RMStatus uint32
+	Pad0     [4]byte
+}
+
 // +marshal
 type UVM_ALLOC_SEMAPHORE_POOL_PARAMS struct {
 	Base uint64
diff --git a/pkg/sentry/devices/nvproxy/seccomp_filters.go b/pkg/sentry/devices/nvproxy/seccomp_filters.go
index a497ba97ab..eca9f716ac 100644
--- a/pkg/sentry/devices/nvproxy/seccomp_filters.go
+++ b/pkg/sentry/devices/nvproxy/seccomp_filters.go
@@ -182,6 +182,10 @@ func Filters() seccomp.SyscallRules {
 		seccomp.PerArg{
 			seccomp.NonNegativeFD{},
 			seccomp.EqualTo(nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION),
 		},
+		seccomp.PerArg{
+			seccomp.NonNegativeFD{},
+			seccomp.EqualTo(nvgpu.UVM_UNMAP_EXTERNAL),
+		},
 		seccomp.PerArg{
 			seccomp.NonNegativeFD{},
 			seccomp.EqualTo(nvgpu.UVM_ALLOC_SEMAPHORE_POOL),
diff --git a/pkg/sentry/devices/nvproxy/version.go b/pkg/sentry/devices/nvproxy/version.go
index f0131d9bdd..98e4b925b7 100644
--- a/pkg/sentry/devices/nvproxy/version.go
+++ b/pkg/sentry/devices/nvproxy/version.go
@@ -192,6 +192,7 @@ func Init() {
 		nvgpu.UVM_DISABLE_READ_DUPLICATION:       uvmIoctlSimple[nvgpu.UVM_DISABLE_READ_DUPLICATION_PARAMS],
 		nvgpu.UVM_MIGRATE_RANGE_GROUP:            uvmIoctlSimple[nvgpu.UVM_MIGRATE_RANGE_GROUP_PARAMS],
 		nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION: uvmIoctlSimple[nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS],
+		nvgpu.UVM_UNMAP_EXTERNAL:                 uvmIoctlSimple[nvgpu.UVM_UNMAP_EXTERNAL_PARAMS],
 		nvgpu.UVM_ALLOC_SEMAPHORE_POOL:           uvmIoctlSimple[nvgpu.UVM_ALLOC_SEMAPHORE_POOL_PARAMS],
 		nvgpu.UVM_VALIDATE_VA_RANGE:              uvmIoctlSimple[nvgpu.UVM_VALIDATE_VA_RANGE_PARAMS],
 		nvgpu.UVM_CREATE_EXTERNAL_RANGE:          uvmIoctlSimple[nvgpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS],