From 1d87773837fcb0d098143d707bdc569149694657 Mon Sep 17 00:00:00 2001
From: Luis Capelo
Date: Wed, 3 Jul 2024 11:36:31 -0700
Subject: [PATCH] nvproxy: add missing UVM_UNMAP_EXTERNAL ioctl

Distributed training isn't working with PyTorch on certain A100 nodes.

Adds the missing `UVM_UNMAP_EXTERNAL` ioctl, allowing certain NCCL
operations to succeed when using
[`torch.distributed`](https://pytorch.org/docs/stable/distributed.html)
and fixing distributed training.

## Reproduction

This affects numerous A100 40GB and 80GB instances in our fleet. The
reproduction requires 4 A100 GPUs, either 40GB or 80GB.

- **NVIDIA Driver Version**: 550.54.15
- **CUDA Version**: 12.4
- **NVIDIA device**: NVIDIA A100 80GB PCIe

### Steps

1. **Install gvisor**

   ```bash
   ARCH=$(uname -m)  # x86_64 or aarch64
   URL="https://storage.googleapis.com/gvisor/releases/master/latest/${ARCH}"
   wget -nc "${URL}/runsc" "${URL}/runsc.sha512"
   chmod +x runsc
   sudo cp runsc /usr/local/bin/runsc
   sudo /usr/local/bin/runsc install
   sudo systemctl reload docker
   ```

2. **Add GPU-enabling gvisor options**

   In `/etc/docker/daemon.json`:

   ```json
   {
     "runtimes": {
       "nvidia": {
         "path": "nvidia-container-runtime",
         "runtimeArgs": []
       },
       "runsc": {
         "path": "/usr/local/bin/runsc",
         "runtimeArgs": ["--nvproxy", "--nvproxy-docker", "-debug-log=/tmp/runsc/", "-debug", "-strace"]
       }
     }
   }
   ```

   Reload configs with `sudo systemctl reload docker`.

3. **Run reproduction NCCL test**

   This test creates one main process and N peer processes. Each peer
   process sends a torch `Tensor` to the main process using NCCL.

   ```Dockerfile
   # Dockerfile
   FROM python:3.9.15-slim-bullseye
   RUN pip install torch numpy
   COPY <
   ```
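   The test script embedded via the `COPY` heredoc is cut off above. As a
   stand-in, the following is a minimal hypothetical sketch of such a test
   (not the original script), assuming a single node, rank 3 as the
   receiving main process, and `torch.multiprocessing.spawn` launching one
   process per GPU:

   ```python
   # repro.py: hypothetical sketch of the NCCL send/recv test.
   import os

   import torch
   import torch.distributed as dist
   import torch.multiprocessing as mp

   WORLD_SIZE = 4  # one process per A100 GPU


   def run(rank: int, world_size: int) -> None:
       # Rendezvous for the NCCL process group (single-node assumption).
       os.environ["MASTER_ADDR"] = "127.0.0.1"
       os.environ["MASTER_PORT"] = "29500"
       dist.init_process_group("nccl", rank=rank, world_size=world_size)
       torch.cuda.set_device(rank)

       main_rank = world_size - 1
       if rank == main_rank:
           # Main process: receive one tensor from every peer over NCCL.
           for peer in range(world_size - 1):
               t = torch.zeros(5, device=f"cuda:{rank}")
               dist.recv(t, src=peer)
               print(f"[RANK {rank}] received tensor from rank={peer}: {t}")
           print("PASS: NCCL working.")
       else:
           # Peer process: send a tensor of ones to the main process.
           t = torch.ones(5, device=f"cuda:{rank}")
           print(f"[RANK {rank}] sending tensor: {t}")
           dist.send(t, dst=main_rank)

       dist.destroy_process_group()


   if __name__ == "__main__":
       mp.spawn(run, args=(WORLD_SIZE,), nprocs=WORLD_SIZE)
   ```

   Before the fix, the first send/recv pair stalls while NCCL bootstraps
   its communicator (the `broadcastUniqueNCCLID` and `TCPStore::doWait`
   frames in the trace below) and eventually times out.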
### Failure

Before the fix, the test times out while the ranks exchange the NCCL
unique ID through the `TCPStore` during communicator setup:

```
...
frame #1: <unknown function> + 0x5b3a23e (0x7edd8d73a23e in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7edd8d734c87 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7edd8d734f82 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7edd8d735fd1 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7edd8d6ea371 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #8: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7edd54da9189 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #9: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7edd54db0610 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #10: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7edd54dcf978 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0x5adc309 (0x7edd8d6dc309 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x5ae6f10 (0x7edd8d6e6f10 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x5ae6fa5 (0x7edd8d6e6fa5 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x5124446 (0x7edd8cd24446 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x1acf4b8 (0x7edd896cf4b8 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x5aee004 (0x7edd8d6ee004 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #17: <unknown function> + 0x5af36b5 (0x7edd8d6f36b5 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #18: <unknown function> + 0xd2fe8e (0x7edda032fe8e in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0x47f074 (0x7edd9fa7f074 in /usr/local/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #35: <unknown function> + 0x29d90 (0x7edda2029d90 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #36: __libc_start_main + 0x80 (0x7edda2029e40 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #37: <unknown function> + 0x108e (0x55f950b0c08e in /usr/local/bin/python)
. This may indicate a possible application crash on rank 0 or a network set up issue.
...
```

### Fix

gvisor debug logs show:

```
W0702 20:36:17.577055 445833 uvm.go:148] [ 22: 84] nvproxy: unknown uvm ioctl 66 = 0x42
```

That unknown uvm ioctl, 66 = 0x42, is `UVM_UNMAP_EXTERNAL`; I've
implemented it in this PR. This is the output after the fix:

```
[RANK 2] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:2')
[RANK 0] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:0')
[RANK 1] sending tensor: tensor([1., 1., 1., 1., 1.], device='cuda:1')
[RANK 3] received tensor from rank=0: tensor([1., 1., 1., 1., 1.], device='cuda:3')
[RANK 3] received tensor from rank=1: tensor([1., 1., 1., 1., 1.], device='cuda:3')
[RANK 3] received tensor from rank=2: tensor([1., 1., 1., 1., 1.], device='cuda:3')
PASS: NCCL working.
```

FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/10610 from luiscape:master ee88734c0493d24bb9278cf9f473050bb22e0865
PiperOrigin-RevId: 649146570
---
 pkg/abi/nvgpu/uvm.go                          | 10 ++++++++++
 pkg/sentry/devices/nvproxy/seccomp_filters.go |  4 ++++
 pkg/sentry/devices/nvproxy/version.go         |  1 +
 3 files changed, 15 insertions(+)

diff --git a/pkg/abi/nvgpu/uvm.go b/pkg/abi/nvgpu/uvm.go
index 3a61a9f441..cc01d24c63 100644
--- a/pkg/abi/nvgpu/uvm.go
+++ b/pkg/abi/nvgpu/uvm.go
@@ -39,6 +39,7 @@ const (
 	UVM_TOOLS_READ_PROCESS_MEMORY      = 62
 	UVM_TOOLS_WRITE_PROCESS_MEMORY     = 63
 	UVM_MAP_DYNAMIC_PARALLELISM_REGION = 65
+	UVM_UNMAP_EXTERNAL                 = 66
 	UVM_ALLOC_SEMAPHORE_POOL           = 68
 	UVM_VALIDATE_VA_RANGE              = 72
 	UVM_CREATE_EXTERNAL_RANGE          = 73
@@ -288,6 +289,15 @@ type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS struct {
 	Pad0 [4]byte
 }
 
+// +marshal
+type UVM_UNMAP_EXTERNAL_PARAMS struct {
+	Base     uint64
+	Length   uint64
+	GPUUUID  NvUUID
+	RMStatus uint32
+	Pad0     [4]byte
+}
+
 // +marshal
 type UVM_ALLOC_SEMAPHORE_POOL_PARAMS struct {
 	Base uint64
diff --git a/pkg/sentry/devices/nvproxy/seccomp_filters.go b/pkg/sentry/devices/nvproxy/seccomp_filters.go
index a497ba97ab..eca9f716ac 100644
--- a/pkg/sentry/devices/nvproxy/seccomp_filters.go
+++ b/pkg/sentry/devices/nvproxy/seccomp_filters.go
@@ -182,6 +182,10 @@ func Filters() seccomp.SyscallRules {
 		seccomp.PerArg{
 			seccomp.NonNegativeFD{},
 			seccomp.EqualTo(nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION),
 		},
+		seccomp.PerArg{
+			seccomp.NonNegativeFD{},
+			seccomp.EqualTo(nvgpu.UVM_UNMAP_EXTERNAL),
+		},
 		seccomp.PerArg{
 			seccomp.NonNegativeFD{},
 			seccomp.EqualTo(nvgpu.UVM_ALLOC_SEMAPHORE_POOL),
diff --git a/pkg/sentry/devices/nvproxy/version.go b/pkg/sentry/devices/nvproxy/version.go
index f0131d9bdd..98e4b925b7 100644
--- a/pkg/sentry/devices/nvproxy/version.go
+++ b/pkg/sentry/devices/nvproxy/version.go
@@ -192,6 +192,7 @@ func Init() {
 		nvgpu.UVM_DISABLE_READ_DUPLICATION:       uvmIoctlSimple[nvgpu.UVM_DISABLE_READ_DUPLICATION_PARAMS],
 		nvgpu.UVM_MIGRATE_RANGE_GROUP:            uvmIoctlSimple[nvgpu.UVM_MIGRATE_RANGE_GROUP_PARAMS],
 		nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION: uvmIoctlSimple[nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS],
+		nvgpu.UVM_UNMAP_EXTERNAL:                 uvmIoctlSimple[nvgpu.UVM_UNMAP_EXTERNAL_PARAMS],
 		nvgpu.UVM_ALLOC_SEMAPHORE_POOL:           uvmIoctlSimple[nvgpu.UVM_ALLOC_SEMAPHORE_POOL_PARAMS],
 		nvgpu.UVM_VALIDATE_VA_RANGE:              uvmIoctlSimple[nvgpu.UVM_VALIDATE_VA_RANGE_PARAMS],
 		nvgpu.UVM_CREATE_EXTERNAL_RANGE:          uvmIoctlSimple[nvgpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS],