From 7eca42484ce2afc04cedd50658abed1c0485a8dc Mon Sep 17 00:00:00 2001 From: pengwa Date: Fri, 3 Feb 2023 20:11:50 +0800 Subject: [PATCH] link mpi when either use_mpi or use_nccl enabled (#14467) ### Only link mpi when either use_mpi or use_nccl enabled To fix the issue https://github.com/microsoft/onnxruntime/issues/14278. After talking with @askhade, we think that if users want to enable NCCL/MPI but MPI is not found, it should be a failure instead of a warning. So this PR made the change. As a result, to make CIs pass, we would need to disable NCCL/MPI explicitly in the build command. This PR takes an alternative approach: since NCCL and MPI are not used by customers, NCCL is disabled by default if "--disable_nccl" is not specified, and MPI is disabled by default if "--use_mpi" is not specified. ### Motivation and Context --- cmake/CMakeLists.txt | 31 ++++++++++++++++++------------- tools/ci_build/build.py | 6 ++++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5c088aa8cddc4..1ff5760422177 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1347,19 +1347,22 @@ if (onnxruntime_ENABLE_TRAINING) find_package(MPI) - if (MPI_CXX_FOUND) - message( STATUS "MPI Version: ${MPI_CXX_VERSION}") - message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) - mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) - else () - set(onnxruntime_USE_NCCL OFF) - set(onnxruntime_USE_MPI OFF) - message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." 
) + if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL) + if (MPI_CXX_FOUND) + message( STATUS "MPI Version: ${MPI_CXX_VERSION}") + message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) + mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) + else () + message( + FATAL_ERROR + "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." + ) + endif() endif() # Find NCCL and MPI - if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND) + if (onnxruntime_USE_NCCL) if (onnxruntime_USE_CUDA) set(NCCL_LIBNAME "nccl") elseif (onnxruntime_USE_ROCM) @@ -1417,13 +1420,15 @@ if (onnxruntime_ENABLE_TRAINING) add_definitions(-DORT_USE_NCCL=1) message( STATUS "NCCL is enabled in Linux GPU Build." ) else () - set(onnxruntime_USE_NCCL OFF) - message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." ) + message( + FATAL_ERROR + "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." 
+ ) endif() endif() endif() - if (onnxruntime_USE_MPI AND MPI_CXX_FOUND) + if (onnxruntime_USE_MPI) add_definitions(-DUSE_MPI=1) endif() diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index d552fb71b6547..f421800523667 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -192,10 +192,12 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.") parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.") - parser.add_argument("--disable_nccl", action="store_true", help="Disable Nccl.") + parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.") parser.add_argument("--mpi_home", help="Path to MPI installation dir") parser.add_argument("--nccl_home", help="Path to NCCL installation dir") - parser.add_argument("--use_mpi", nargs="?", default=True, const=True, type=_str_to_bool) + parser.add_argument( + "--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default." + ) # enable ONNX tests parser.add_argument(