From 5d2599b1108e0dd1365e608c17760f52d4e6d41b Mon Sep 17 00:00:00 2001 From: Andy li Date: Mon, 6 Nov 2023 23:05:50 +0800 Subject: [PATCH] update readme (#14) * update readme * catch up to latest nccl executor --- .gitmodules | 1 + README.md | 117 ++++++--------------------------- docs/performance-nd-h100-v5.md | 94 +------------------------- executor/msccl-executor-nccl | 2 +- 4 files changed, 24 insertions(+), 190 deletions(-) diff --git a/.gitmodules b/.gitmodules index 34bd5f1..717fe35 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,3 +9,4 @@ [submodule "scheduler/msccl-scheduler"] path = scheduler/msccl-scheduler url = https://github.com/Azure/msccl-scheduler + branch = main \ No newline at end of file diff --git a/README.md b/README.md index 87d4266..b6c96ef 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,10 @@ MSCCL vision is to provide a unified, efficient, and scalable framework for exec - MSCCL test toolkit([msccl-tests-nccl](https://github.com/Azure/msccl-tests-nccl)): These tests check both the performance and the correctness of MSCCL operations. ## Performance -For reference, FP16 All-Reduce and All-Gather algorithms were tested and compared on ND H100 v5 VM, using msccl-tests-nccl. +For reference, FP16 All-Gather algorithms were tested and compared on ND H100 v5 VM, using msccl-tests-nccl. - @@ -27,216 +26,128 @@ For reference, FP16 All-Reduce and All-Gather algorithms were tested and compare - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -248,13 +159,13 @@ For reference, FP16 All-Reduce and All-Gather algorithms were tested and compare In order to use MSCCL, you may follow these steps to use two different MSCCL algorithms for AllReduce on Azure NDv4 which has 8xA100 GPUs: -Follow below steps to download the source code of msccl and related submodules +#####1. 
Follow the steps below to download the source code of msccl and its related submodules: ```sh $ git clone https://github.com/Azure/msccl.git --recurse-submodules ``` -Steps to install MSCCL executor: +##### 2. Steps to install MSCCL executor: ```sh $ git clone https://github.com/Azure/msccl.git --recurse-submodules @@ -264,7 +175,7 @@ $ cd ../ $ cd ../ ``` -Then, follow these steps to install msccl-tests-nccl for performance evaluation: +##### 3. Follow these steps to install msccl-tests-nccl for performance evaluation: ```sh $ cd tests/msccl-tests-nccl/ @@ -273,7 +184,20 @@ $ cd ../ $ cd ../ ``` -Next install [MSCCL toolkit](https://github.com/microsoft/msccl-tools) to compile a few custom algorithms: +##### 4. Apply the MSCCL algorithm when using the MSCCL executor +###### - For NDv5, an optimized algorithm is already available; you can use the MSCCL scheduler to apply it directly to the executor. The steps below build and install the scheduler: +```sh +$ sudo apt-get install libcurl4-openssl-dev nlohmann-json3-dev + +for nccl: +$ CXX=/path/to/nvcc BIN_HOME=/path/to/nccl/binary SRC_HOME=/path/to/nccl/source make +for rccl: +$ CXX=/path/to/nvcc BIN_HOME=/path/to/nccl/binary SRC_HOME=/path/to/nccl/source make PLATFORM=RCCL + +$ make install +``` + +###### - To customize the MSCCL algorithm for your system, you can install [MSCCL toolkit](https://github.com/microsoft/msccl-tools) to compile a few custom algorithms: ```sh $ git clone https://github.com/microsoft/msccl-tools.git @@ -286,11 +210,12 @@ $ cd ../ The compiler's generated code is an XML file (`test.xml`) that is fed to MSCCL runtime. 
To evaluate its performance, copy the `test.xml` to the msccl/exector/msccl-executor-nccl/build/lib/msccl-algorithms/ and execute the following command line on an Azure NDv4 node or any 8xA100 system: +###### Below is the command to run the test using msccl-executor-nccl: ```sh $ mpirun -np 8 -x LD_LIBRARY_PATH=msccl/exector/msccl-executor-nccl/build/lib/:$LD_LIBRARY_PATH -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=INIT,ENV tests/msccl-tests-nccl/build/all_reduce_perf -b 128 -e 32MB -f 2 -g 1 -c 1 -n 100 -w 100 -G 100 -z 0 ``` - -If everything is installed correctly, you should see the following output in log: + +###### If everything is installed correctly, you should see the following output in the log: ```sh [0] NCCL INFO Connected 1 MSCCL algorithms diff --git a/docs/performance-nd-h100-v5.md b/docs/performance-nd-h100-v5.md index fabe763..63df428 100644 --- a/docs/performance-nd-h100-v5.md +++ b/docs/performance-nd-h100-v5.md @@ -8,16 +8,12 @@ All results are from ND H100 v5. MSCCL executor version is [commit 6eacec0](http ```bash mpirun --allow-run-as-root --tag-output -map-by ppr:8:node --bind-to numa -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x PATH -x LD_PRELOAD=/path/to/msccl-executor-nccl/build/lib/libnccl.so -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_TOPO_FILE=/path/to/ndv5-topo.xml -x NCCL_DEBUG=WARN -x NCCL_MIN_NCHANNELS=32 /path/to/msccl-tests-nccl/build/all_gather_perf -b 1 -e 1G -f 2 -g 1 -c 1 -w 20 -n 1000 -d half -G 1 ``` -**- all-reduce** -```bash -mpirun --allow-run-as-root --tag-output -map-by ppr:8:node --bind-to numa -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x PATH -x LD_PRELOAD=/path/to/msccl-executor-nccl/build/lib/libnccl.so -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_TOPO_FILE=/path/to/ndv5-topo.xml -x NCCL_DEBUG=WARN -x NCCL_MIN_NCHANNELS=32 /path/to/msccl-tests-nccl/build/all_reduce_perf -b 1 -e 1G -f 2 
-g 1 -c 1 -w 20 -n 1000 -d half -G 1 -``` + ### 2. Performance Results: **- 1 node, 8 gpus/node**
| Message Size (FP16) | All-Reduce NCCL (us) | All-Reduce MSCCL (us) | All-Reduce MSCCL Speedup | All-Gather NCCL (us) | All-Gather MSCCL (us) | All-Gather MSCCL Speedup |
|---|---|---|---|---|---|---|
| 1KB | 13.12 | 5.84 | 2.25x | 9.54 | 5.65 | 1.69x |
| 2KB | 14.39 | 5.9 | 2.44x | 9.8 | 5.7 | 1.72x |
| 4KB | 15.28 | 5.83 | 2.62x | 9.78 | 5.43 | 1.80x |
| 8KB | 15.69 | 5.87 | 2.67x | 9.78 | 5.47 | 1.81x |
| 16KB | 16.64 | 5.94 | 2.80x | 10.29 | 5.53 | 1.86x |
| 32KB | 19.3 | 6.14 | 3.14x | 12.49 | 5.75 | 2.17x |
| 64KB | 20 | 6.47 | 3.09x | 12.87 | 5.95 | 2.16x |
| 128KB | 20.42 | 7.57 | 2.70x | 13.16 | 6.38 | 2.06x |
| 256KB | 20.5 | 9.39 | 2.18x | 13.23 | 7.26 | 1.82x |
| 512KB | 29.89 | 12.58 | 2.38x | 13.39 | 8.71 | 1.54x |
| 1MB | 31.94 | 18.21 | 1.75x | 18.33 | 12.3 | 1.49x |
| 2MB | 37.95 | 24.47 | 1.55x | 23.18 | 17.75 | 1.31x |
| 4MB | 49.28 | 38.23 | 1.29x | 33.66 | 23.37 | 1.44x |
| 8MB | 77.01 | 74.06 | 1.04x | 44.7 | 38.54 | 1.16x |
| 16MB | 116 | 115.7 | 1.00x | 67.19 | 67.16 | 1.00x |
| 32MB | 187.2 | 186.5 | 1.00x | 104.7 | 98.4 | 1.06x |
| 64MB | 317.4 | 315.7 | 1.01x | 192.4 | 181.9 | 1.06x |
| 128MB | 572.5 | 570.4 | 1.00x | 368.3 | 348.4 | 1.06x |
| 256MB | 1079 | 1075.6 | 1.00x | 699.5 | 680.7 | 1.03x |
| 512MB | 2071.1 | 2067.9 | 1.00x | 1358.6 | 1339.3 | 1.01x |
| 1GB | 4028.7 | 4026.8 | 1.00x | 2663.8 | 2633 | |
- @@ -25,216 +21,128 @@ mpirun --allow-run-as-root --tag-output -map-by ppr:8:node --bind-to numa -mca p - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/executor/msccl-executor-nccl b/executor/msccl-executor-nccl index 38bc944..bb86f77 160000 --- a/executor/msccl-executor-nccl +++ b/executor/msccl-executor-nccl @@ -1 +1 @@ -Subproject commit 38bc944823bb0bd69bee246b09525341468836db +Subproject commit bb86f7728001de537dd3186201e75ce00feae913
| Message Size (FP16) | All-Reduce NCCL (us) | All-Reduce MSCCL (us) | All-Reduce MSCCL Speedup | All-Gather NCCL (us) | All-Gather MSCCL (us) | All-Gather MSCCL Speedup |
|---|---|---|---|---|---|---|
| 1KB | 13.12 | 5.84 | 2.25x | 9.54 | 5.65 | 1.69x |
| 2KB | 14.39 | 5.9 | 2.44x | 9.8 | 5.7 | 1.72x |
| 4KB | 15.28 | 5.83 | 2.62x | 9.78 | 5.43 | 1.80x |
| 8KB | 15.69 | 5.87 | 2.67x | 9.78 | 5.47 | 1.81x |
| 16KB | 16.64 | 5.94 | 2.80x | 10.29 | 5.53 | 1.86x |
| 32KB | 19.3 | 6.14 | 3.14x | 12.49 | 5.75 | 2.17x |
| 64KB | 20 | 6.47 | 3.09x | 12.87 | 5.95 | 2.16x |
| 128KB | 20.42 | 7.57 | 2.70x | 13.16 | 6.38 | 2.06x |
| 256KB | 20.5 | 9.39 | 2.18x | 13.23 | 7.26 | 1.82x |
| 512KB | 29.89 | 12.58 | 2.38x | 13.39 | 8.71 | 1.54x |
| 1MB | 31.94 | 18.21 | 1.75x | 18.33 | 12.3 | 1.49x |
| 2MB | 37.95 | 24.47 | 1.55x | 23.18 | 17.75 | 1.31x |
| 4MB | 49.28 | 38.23 | 1.29x | 33.66 | 23.37 | 1.44x |
| 8MB | 77.01 | 74.06 | 1.04x | 44.7 | 38.54 | 1.16x |
| 16MB | 116 | 115.7 | 1.00x | 67.19 | 67.16 | 1.00x |
| 32MB | 187.2 | 186.5 | 1.00x | 104.7 | 98.4 | 1.06x |
| 64MB | 317.4 | 315.7 | 1.01x | 192.4 | 181.9 | 1.06x |
| 128MB | 572.5 | 570.4 | 1.00x | 368.3 | 348.4 | 1.06x |
| 256MB | 1079 | 1075.6 | 1.00x | 699.5 | 680.7 | 1.03x |
| 512MB | 2071.1 | 2067.9 | 1.00x | 1358.6 | 1339.3 | 1.01x |
| 1GB | 4028.7 | 4026.8 | 1.00x | 2663.8 | 2633 | |