Xccl all #7

Open
wants to merge 109 commits into base: main
Changes from all commits
109 commits
c0e1dc1
Happy Init
Chao1Han Aug 29, 2024
93a4bdb
update
Chao1Han Aug 30, 2024
ba6c4b7
update
Chao1Han Aug 30, 2024
68a6aee
update
Chao1Han Aug 30, 2024
31eeee9
register
Chao1Han Sep 2, 2024
b977abc
update
Chao1Han Sep 2, 2024
486b61a
update
Chao1Han Sep 3, 2024
6844932
fix typo and register frontend
Chao1Han Sep 3, 2024
7f6f8b9
update
Chao1Han Sep 3, 2024
be68320
update
Chao1Han Sep 3, 2024
2e21d4f
update
Chao1Han Sep 3, 2024
c9ef78f
update
Chao1Han Sep 4, 2024
076db36
update
Chao1Han Sep 4, 2024
8d739ac
update
Chao1Han Sep 4, 2024
fb9746b
register again
Chao1Han Sep 5, 2024
4f73180
update
Chao1Han Sep 5, 2024
7c2f018
update
Chao1Han Sep 6, 2024
229a80a
refine cmake
Chao1Han Sep 9, 2024
746007b
register all dist op and enable getXcclReduceOp
Chao1Han Sep 9, 2024
5195f52
update
Chao1Han Sep 9, 2024
227e98d
update
Chao1Han Sep 10, 2024
2eb0446
update
Chao1Han Sep 10, 2024
0f61762
update flag
Chao1Han Sep 10, 2024
df81919
update
Chao1Han Sep 11, 2024
b0c0592
update
Chao1Han Sep 11, 2024
366d208
rm redundance code
Chao1Han Sep 11, 2024
3530e43
enable timeout
Chao1Han Sep 11, 2024
485ae8b
add oneccl env
Chao1Han Sep 12, 2024
0cfd224
update
Chao1Han Sep 12, 2024
b99fd8c
Add simple test
Chao1Han Sep 12, 2024
dc41d6a
update
Chao1Han Sep 12, 2024
4c3f49f
enable coalese
Chao1Han Sep 10, 2024
afa2adc
Support broadcast
Chao1Han Sep 11, 2024
8efb5d0
update
Chao1Han Sep 12, 2024
e85c268
update
Chao1Han Sep 12, 2024
7488dbd
update
Chao1Han Sep 12, 2024
e5d6f37
update
Chao1Han Sep 12, 2024
0da5e77
add allgather
Chao1Han Sep 12, 2024
0ad5677
support allgather_into_tensor_coalesced
Chao1Han Sep 13, 2024
009e334
support reduce_scatter
Chao1Han Sep 13, 2024
ecbd989
refine test cases
Chao1Han Sep 13, 2024
a23ffb2
update ut
Chao1Han Sep 13, 2024
1d02dfe
add mpi check
Chao1Han Sep 13, 2024
c485bd8
update datatype map
Chao1Han Sep 13, 2024
2d1ae87
update
Chao1Han Sep 13, 2024
04226de
Merge branch 'xccl' into xccl-group
Chao1Han Sep 13, 2024
6184261
update
Chao1Han Sep 13, 2024
91d26d9
update
Chao1Han Sep 13, 2024
2a83d68
update
Chao1Han Sep 14, 2024
7f62b86
update
Chao1Han Sep 14, 2024
c48f5eb
Support reduce_scatter_base
Chao1Han Sep 14, 2024
9b17dc4
Support reduce_scatter_tensor_coalesced
Chao1Han Sep 14, 2024
6cb3227
support barrier
Chao1Han Sep 14, 2024
e59f051
Merge branch 'xccl' into xccl-group
Chao1Han Sep 14, 2024
d858c81
update
Chao1Han Sep 14, 2024
fea20f5
update
Chao1Han Sep 14, 2024
e0e27f3
update
Chao1Han Sep 14, 2024
029026d
add ut
Chao1Han Sep 18, 2024
682f40f
Support all2all_base
Chao1Han Sep 18, 2024
2694617
update
Chao1Han Sep 18, 2024
612df42
support all2all
Chao1Han Sep 18, 2024
001dac2
use lintrunner format code
Chao1Han Sep 19, 2024
f13b449
rm allgatherv align with nccl
Chao1Han Sep 19, 2024
af29d96
Support reduce
Chao1Han Sep 19, 2024
20b1188
Support gather
Chao1Han Sep 19, 2024
1463eca
Support scatter
Chao1Han Sep 19, 2024
156c2ac
update
Chao1Han Sep 19, 2024
79fd17e
Merge branch 'xccl' into xccl-group
Chao1Han Sep 20, 2024
652da01
Xccl process group for Pytorch
Chao1Han Aug 29, 2024
0cb0016
Merge remote-tracking branch 'upstream/main' into xccl-bak
Chao1Han Sep 20, 2024
a71d69a
Align latest
Chao1Han Sep 20, 2024
a1c2d6b
Merge branch 'xccl-bak' into xccl-group
Chao1Han Sep 20, 2024
ea7a4c9
Merge branch 'xccl-group' into xccl-p2p
Chao1Han Sep 20, 2024
4bf448d
update
Chao1Han Sep 20, 2024
1f83fbf
update
Chao1Han Sep 20, 2024
4f4ecf4
update
Chao1Han Sep 20, 2024
53142c2
Merge branch 'xccl-group' into xccl-p2p
Chao1Han Sep 20, 2024
b6bc4a8
update
Chao1Han Sep 20, 2024
1fbb7ed
support p2p
Chao1Han Sep 23, 2024
88bea25
refine findccl code
Chao1Han Sep 29, 2024
f6ea934
Add comments for build xccl
Chao1Han Sep 30, 2024
31d092d
minor fix
Chao1Han Oct 9, 2024
cbea299
rm duplicate code and refine cmake
Chao1Han Oct 9, 2024
ef261c6
update cmake
Chao1Han Oct 10, 2024
6c648cd
hidden xccl specific
Chao1Han Sep 24, 2024
e621fe6
fix ci fail
Chao1Han Oct 11, 2024
4a45d29
Merge branch 'xccl-bak' into xccl-group
Chao1Han Oct 11, 2024
7bee4d9
Merge branch 'xccl-group' into xccl-p2p
Chao1Han Oct 11, 2024
f85a845
rm ccl attr
Chao1Han Oct 12, 2024
56a5e7f
Refine specific code
Chao1Han Oct 17, 2024
a062f9f
accept comments
Chao1Han Oct 17, 2024
86b66c3
refine code
Chao1Han Oct 21, 2024
1e68c30
Merge branch 'xccl-bak' into xccl-p2p
Chao1Han Oct 21, 2024
d9ce636
align to latest
Chao1Han Oct 21, 2024
385c218
refine code
Chao1Han Oct 21, 2024
ea4fe8c
Merge branch 'xccl-bak' into xccl-p2p
Chao1Han Oct 21, 2024
e36a99c
update
Chao1Han Oct 21, 2024
5096354
update
Chao1Han Oct 21, 2024
9e6448b
add RECORD_PARAM_COMMS_DATA
Chao1Han Oct 22, 2024
8d9c24e
update
Chao1Han Oct 25, 2024
e808b6c
update
Chao1Han Oct 25, 2024
193d946
update
Chao1Han Oct 28, 2024
eb447f2
fix all_gather_v bug
Chao1Han Oct 31, 2024
55150c8
Merge remote-tracking branch 'origin/main' into xccl-p2p
Chao1Han Nov 4, 2024
20b60b1
correct get kvs
Chao1Han Nov 11, 2024
b442419
update kvs key
Chao1Han Nov 12, 2024
0aedc00
Merge remote-tracking branch 'origin/main' into xccl-p2p
Chao1Han Nov 14, 2024
65e0d9d
WA AVG reduction
Chao1Han Nov 14, 2024
3e97e67
update test case
Chao1Han Nov 15, 2024
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -276,6 +276,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
@@ -352,6 +354,8 @@ cmake_dependent_option(
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(
USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(
USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF)
cmake_dependent_option(
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
cmake_dependent_option(
4 changes: 4 additions & 0 deletions build_variables.bzl
@@ -704,6 +704,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s
"torch/csrc/cuda/nccl.cpp",
]

libtorch_xpu_distributed_extra_sources = [
"torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp",
]

torch_cpp_srcs = [
"torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA
"torch/csrc/api/src/data/datasets/mnist.cpp",
14 changes: 14 additions & 0 deletions caffe2/CMakeLists.txt
@@ -1049,6 +1049,13 @@ elseif(USE_CUDA)
endif()

if(USE_XPU)
# If the SYCL runtime and the oneCCL runtime are both installed on the system,
# the build flags USE_XPU=ON, USE_XCCL=ON and USE_C10D_XCCL=ON are set by default
# and the XCCL backend is built into libtorch_xpu;
# set `USE_XCCL=OFF` manually to disable building the XCCL backend.
if(USE_XCCL)
append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS)
endif()
list(APPEND Caffe2_XPU_SRCS ${GENERATED_CXX_TORCH_XPU})
add_library(torch_xpu ${Caffe2_XPU_SRCS})
torch_compile_options(torch_xpu) # see cmake/public/utils.cmake
@@ -1118,6 +1125,10 @@ if(USE_XPU)
include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})

endif()
if(USE_XCCL)
target_link_libraries(torch_xpu PRIVATE torch::xccl)
target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
endif()
endif()

if(NOT MSVC AND USE_XNNPACK)
@@ -1404,6 +1415,9 @@ if(USE_DISTRIBUTED)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_XPU AND USE_C10D_XCCL)
target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
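Not part of the patch: a minimal sketch of how the build options above could be sanity-checked at runtime, assuming a finished build of this branch on an XPU machine. dist.is_xccl_available() is the frontend query exercised by the test changes later in this diff; expecting USE_XCCL to show up in torch.__config__.show() is an assumption based on the macros.h.in entry below.

import torch
import torch.distributed as dist

# Build settings are recorded via caffe2/core/macros.h.in; USE_XCCL appearing
# in this dump is an assumption, since that map feeds the config output.
print(torch.__config__.show())

# Frontend availability query registered for the XCCL backend.
print("XCCL available:", dist.is_xccl_available())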
1 change: 1 addition & 0 deletions caffe2/core/macros.h.in
@@ -45,6 +45,7 @@
{"USE_CUDNN", "${USE_CUDNN}"}, \
{"CUDNN_VERSION", "${CUDNN_VERSION}"}, \
{"USE_NCCL", "${USE_NCCL}"}, \
{"USE_XCCL", "${USE_XCCL}"}, \
{"USE_MPI", "${USE_MPI}"}, \
{"USE_GFLAGS", "${USE_GFLAGS}"}, \
{"USE_GLOG", "${USE_GLOG}"}, \
18 changes: 18 additions & 0 deletions cmake/Dependencies.cmake
@@ -1123,6 +1123,24 @@ if(USE_CUDA)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

# ---[ XCCL
if(USE_XCCL)
if(NOT USE_XPU)
message(WARNING
"Not using XPU, so disabling USE_XCCL. Suppress this warning with "
"-DUSE_XCCL=OFF.")
caffe2_update_option(USE_XCCL OFF)
elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux")
message(WARNING "USE_XCCL is currently only supported under Linux.")
caffe2_update_option(USE_XCCL OFF)
else()
include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake)
if(NOT XCCL_FOUND)
caffe2_update_option(USE_XCCL OFF)
endif()
endif()
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
15 changes: 15 additions & 0 deletions cmake/External/xccl.cmake
@@ -0,0 +1,15 @@
if(NOT __XCCL_INCLUDED)
set(__XCCL_INCLUDED TRUE)

# XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
find_package(XCCL REQUIRED)
if(XCCL_FOUND)
add_library(torch::xccl INTERFACE IMPORTED)
set_property(
TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${XCCL_INCLUDE_DIR})
set_property(
TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
${XCCL_LIBRARY})
endif()
endif()
69 changes: 69 additions & 0 deletions cmake/Modules/FindXCCL.cmake
@@ -0,0 +1,69 @@
# This will define the following variables:
# XCCL_FOUND : True if the system has the XCCL library.
# XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
# XCCL_LIBRARY_DIR : The path to the XCCL library directory.
# XCCL_LIBRARY : The XCCL library full name.

include(FindPackageHandleStandardArgs)

set(XCCL_ROOT "/opt/intel/oneapi/ccl/latest")
if (NOT EXISTS "${XCCL_ROOT}")
message(STATUS "Default OneCCL not found, using current environment OneAPI")
set(XCCL_ROOT $ENV{ONEAPI_ROOT}/ccl/latest)
endif()

string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound)
if(nocclfound)
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "oneCCL library not found!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
return()
endif()

# Find the include directory under XCCL_ROOT.
find_file(
XCCL_INCLUDE_DIR
NAMES include
HINTS ${XCCL_ROOT}
NO_DEFAULT_PATH
)

# Find include/oneapi path from include path.
find_file(
XCCL_INCLUDE_ONEAPI_DIR
NAMES oneapi
HINTS ${XCCL_ROOT}/include/
NO_DEFAULT_PATH
)

list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})

# Find the library directory under XCCL_ROOT.
find_file(
XCCL_LIBRARY_DIR
NAMES lib
HINTS ${XCCL_ROOT}
NO_DEFAULT_PATH
)

# Find XCCL library fullname.
find_library(
XCCL_LIBRARY
NAMES ccl
HINTS ${XCCL_LIBRARY_DIR}
NO_DEFAULT_PATH
)

if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "oneCCL library not found!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
return()
endif()

find_package_handle_standard_args(
XCCL
FOUND_VAR XCCL_FOUND
REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}"
)
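As a side note on what this finder expects to exist on disk, here is a small pre-build check that mirrors its search order; the script is purely illustrative, not part of the PR, and the paths are copied from the defaults above.

import os

# Same search order as FindXCCL.cmake: the fixed oneAPI install path first,
# then $ONEAPI_ROOT/ccl/latest taken from the environment.
root = "/opt/intel/oneapi/ccl/latest"
if not os.path.isdir(root):
    root = os.path.join(os.environ.get("ONEAPI_ROOT", ""), "ccl", "latest")

# The finder needs include/, include/oneapi/ and lib/ (with libccl inside lib/).
for sub in ("include", os.path.join("include", "oneapi"), "lib"):
    path = os.path.join(root, sub)
    print(path, "->", "found" if os.path.isdir(path) else "missing")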
6 changes: 6 additions & 0 deletions cmake/Summary.cmake
@@ -159,6 +159,12 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
message(STATUS " USE_XCCL : ${USE_XCCL}")
if(${USE_XCCL})
message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}")
message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}")
message(STATUS " XCCL library : ${XCCL_LIBRARY}")
endif()
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
4 changes: 4 additions & 0 deletions setup.py
@@ -658,6 +658,10 @@ def run(self):
report("-- Building NCCL library")
else:
report("-- Not using NCCL")
if cmake_cache_vars["USE_XCCL"]:
report("-- Building XCCL library")
else:
report("-- Not using XCCL")
if cmake_cache_vars["USE_DISTRIBUTED"]:
if IS_WINDOWS:
report("-- Building without distributed package")
15 changes: 10 additions & 5 deletions test/distributed/test_c10d_common.py
@@ -31,6 +31,7 @@
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
skip_if_lt_x_gpu,
get_device_count,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@@ -60,14 +61,15 @@
torch.backends.cuda.matmul.allow_tf32 = False


def gpus_for_rank(world_size):
def gpus_for_rank(world_size, backend):
"""Multigpu tests are designed to simulate the multi nodes with multi
GPUs on each node. Nccl backend requires equal #GPUs in each process.
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = get_device_count(backend)
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
@@ -828,7 +830,7 @@ def update_parameters(model):
def _gpu_model_with_ddp_comm_hook(
self, process_group, hook=None, gradient_as_bucket_view=False, state=None
):
device_id = gpus_for_rank(self.world_size)[self.rank][0]
device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0]
gpu_model = DistributedDataParallel(
ModuleForDdpCommHook().to(device_id),
device_ids=[device_id],
@@ -845,7 +847,7 @@ def _gpu_model_with_builtin_ddp_comm_hook(
def _gpu_model_with_builtin_ddp_comm_hook(
self, process_group, hook=None, gradient_as_bucket_view=False
):
device_id = gpus_for_rank(self.world_size)[self.rank][0]
device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0]
gpu_model = DistributedDataParallel(
ModuleForDdpCommHook().to(device_id),
device_ids=[device_id],
@@ -1834,6 +1836,9 @@ def test_init_process_group_for_all_backends(self):
elif backend == dist.Backend.UCC:
if not dist.is_ucc_available():
continue
elif backend == dist.Backend.XCCL:
if not dist.is_xccl_available():
continue
# Multi-threaded PG is defined as a pure python class.
# Its pg.name() does not go through Pybind, so its backend name
# is still "threaded" instead of "custom".
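To round off the diff, a minimal end-to-end sketch of what the new backend enables, assuming a build with USE_XCCL=ON, one XPU device per rank, and the usual env:// rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) set by the launcher. The backend name "xccl" and is_xccl_available() come from this PR; everything else is standard torch.distributed usage, not a verified recipe.

import os
import torch
import torch.distributed as dist

# Illustrative only: one rank of a multi-process XPU job.
if dist.is_xccl_available():
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    dist.init_process_group("xccl", rank=rank, world_size=world_size)

    device = torch.device(f"xpu:{rank % torch.xpu.device_count()}")
    t = torch.ones(4, device=device)
    dist.all_reduce(t)  # summed across ranks through the XCCL process group
    print(rank, t)

    dist.destroy_process_group()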