From ad8b0d642bff8364da7455e308f48fc5e1337e1b Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 14 Feb 2022 19:50:23 -0800
Subject: [PATCH 1/2] add a network debug script and document it

---
 docs/source/debugging.mdx                     | 29 ++++++
 .../distributed/torch-distributed-gpu-test.py | 92 +++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100755 scripts/distributed/torch-distributed-gpu-test.py

diff --git a/docs/source/debugging.mdx b/docs/source/debugging.mdx
index edb3a6ece90592..63ade1f79689c7 100644
--- a/docs/source/debugging.mdx
+++ b/docs/source/debugging.mdx
@@ -12,6 +12,35 @@ specific language governing permissions and limitations under the License.
 # Debugging
 
+## Multi-GPU Network Issues Debug
+
+When training or inferencing with `DistributedDataParallel` and multiple GPUs, if you run into issues of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues.
+
+```
+wget https://raw.githubusercontent.com/huggingface/transformers/master/scripts/distributed/torch-distributed-gpu-test.py
+```
+
+For example, to test how 2 GPUs interact, do:
+
+```
+python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+```
+If both processes can talk to each other and allocate GPU memory, each will print an OK status.
+
+For more GPUs or nodes, adjust the arguments in the script.
+
+You will find a lot more details inside the diagnostics script, and even a recipe for how to run it in a SLURM environment.
+
+An additional level of debugging is to add the `NCCL_DEBUG=INFO` environment variable as follows:
+
+```
+NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+```
+
+This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. Or, if you're not sure how to interpret the output, you can share the log file in an Issue.
+
+
+
 ## Underflow and Overflow Detection
 
diff --git a/scripts/distributed/torch-distributed-gpu-test.py b/scripts/distributed/torch-distributed-gpu-test.py
new file mode 100755
index 00000000000000..22a99d570e4f85
--- /dev/null
+++ b/scripts/distributed/torch-distributed-gpu-test.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+#
+# This is a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
+# many nodes) can talk to each other via nccl and allocate gpu memory.
+#
+# To run it, first adjust the number of processes and nodes:
+#
+# python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+#
+# You may need to add --master_addr $MASTER_ADDR --master_port $MASTER_PORT if using a custom addr:port
+#
+# You can also use the rdzv API: --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d
+#
+# use torch.distributed.launch instead of torch.distributed.run for torch < 1.9
+#
+# If the script hangs in the `barrier` calls, you have some network issues; you may try to debug this with:
+#
+# NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+#
+# which should tell you what's going on behind the scenes.
+#
+#
+# This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
+# runs on 2 nodes of 4 gpus per node:
+#
+# #SBATCH --job-name=test-nodes        # name
+# #SBATCH --nodes=2                    # nodes
+# #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+# #SBATCH --cpus-per-task=10          # number of cores per task
+# #SBATCH --gres=gpu:4                # number of gpus
+# #SBATCH --time 0:05:00              # maximum execution time (HH:MM:SS)
+# #SBATCH --output=%x-%j.out          # output file name
+#
+# GPUS_PER_NODE=4
+# MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+# MASTER_PORT=6000
+#
+# srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
+#  --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
+#  --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
+#  torch-distributed-gpu-test.py'
+#
+
+import fcntl
+import os
+import socket
+
+import torch
+import torch.distributed as dist
+
+
+def printflock(*msgs):
+    """solves the multi-process interleaved print problem"""
+    with open(__file__, "r") as fh:
+        fcntl.flock(fh, fcntl.LOCK_EX)
+        try:
+            print(*msgs)
+        finally:
+            fcntl.flock(fh, fcntl.LOCK_UN)
+
+
+local_rank = int(os.environ["LOCAL_RANK"])
+torch.cuda.set_device(local_rank)
+device = torch.device("cuda", local_rank)
+hostname = socket.gethostname()
+
+gpu = f"[{hostname}-{local_rank}]"
+
+try:
+    # test distributed
+    dist.init_process_group("nccl")
+    dist.all_reduce(torch.ones(1).to(device), op=dist.ReduceOp.SUM)
+    dist.barrier()
+
+    # test cuda is available and can allocate memory
+    torch.cuda.is_available()
+    torch.ones(1).cuda(local_rank)
+
+    # global rank
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    printflock(f"{gpu} is OK (global rank: {rank}/{world_size})")
+
+    dist.barrier()
+    if rank == 0:
+        printflock(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
+
+except Exception:
+    printflock(f"{gpu} is broken")
+    raise

From 2d7100b75161680a36b0564ccdc28c291a8d4a2f Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 14 Feb 2022 19:52:04 -0800
Subject: [PATCH 2/2] doc

---
 docs/source/debugging.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/debugging.mdx b/docs/source/debugging.mdx
index 63ade1f79689c7..daf45784369c4e 100644
--- a/docs/source/debugging.mdx
+++ b/docs/source/debugging.mdx
@@ -16,13 +16,13 @@ specific language governing permissions and limitations under the License.
 When training or inferencing with `DistributedDataParallel` and multiple GPUs, if you run into issues of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues.
 
-```
+```bash
 wget https://raw.githubusercontent.com/huggingface/transformers/master/scripts/distributed/torch-distributed-gpu-test.py
 ```
 
 For example, to test how 2 GPUs interact, do:
 
-```
+```bash
 python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 ```
 If both processes can talk to each other and allocate GPU memory, each will print an OK status.
 
@@ -33,7 +33,7 @@ You will find a lot more details inside the diagnostics script, and even a recipe
 An additional level of debugging is to add the `NCCL_DEBUG=INFO` environment variable as follows:
 
-```
+```bash
 NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 ```
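
Beyond the 1-node, 2-GPU example in the documentation above, the script's header also lists the multi-node knobs of `torch.distributed.run` (`--master_addr`/`--master_port`, or the rdzv API). As a minimal non-SLURM sketch, assuming 2 nodes with 4 GPUs each and a placeholder first-node hostname `node0.example.com` (both the hostname and port 6000 are illustrative values, not part of the patch), a manual launch could look like this, run once per node:

```bash
# Illustrative values only: MASTER_ADDR must point at the first node,
# MASTER_PORT can be any free port on that node.
MASTER_ADDR=node0.example.com
MASTER_PORT=6000

# On the first node use --node_rank 0; on the second node use --node_rank 1.
python -m torch.distributed.run \
    --nproc_per_node 4 --nnodes 2 --node_rank 0 \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
    torch-distributed-gpu-test.py
```

If the network is healthy, each of the 8 processes should print its `is OK` line; if the run hangs in the `barrier` calls, rerun with `NCCL_DEBUG=INFO` prepended, as described above.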