Skip to content

Commit

Permalink
Merge pull request #820 from afender/opg_env
Browse files Browse the repository at this point in the history
[REVIEW] OPG infra and all-gather smoke test
  • Loading branch information
BradReesWork authored Apr 10, 2020
2 parents 7b236d1 + 95af9e1 commit 8acb743
Show file tree
Hide file tree
Showing 13 changed files with 253 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- PR #803 Enable Ninja build
- PR #804 Cythonize in parallel
- PR #807 Updating the Python docs
- PR #820 OPG infra and all-gather smoke test

## Bug Fixes
- PR #763 Update RAPIDS conda dependencies to v0.14
Expand Down
1 change: 1 addition & 0 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ conda install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaul
distributed>=2.12.0 \
dask-cudf=${MINOR_VERSION} \
dask-cuda=${MINOR_VERSION} \
nccl>=2.5 \
libcypher-parser \
ipython=7.3* \
jupyterlab
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cugraph_dev_cuda10.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
- distributed>=2.12.0
- dask-cuda=0.14*
- dask-cudf=0.14*
- nccl>=2.5
- scipy
- networkx
- python-louvain
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cugraph_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- distributed>=2.12.0
- dask-cuda=0.14*
- dask-cudf=0.14*
- nccl>=2.5
- scipy
- networkx
- python-louvain
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cugraph_dev_cuda10.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- distributed>=2.12.0
- dask-cuda=0.14*
- dask-cudf=0.14*
- nccl>=2.5
- scipy
- networkx
- python-louvain
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/cugraph/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ requirements:
- python x.x
- libcugraph={{ version }}
- cudf={{ minor_version }}
- nccl>=2.5

#test:
# commands:
Expand Down
2 changes: 2 additions & 0 deletions conda/recipes/libcugraph/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ requirements:
- cudatoolkit {{ cuda_version }}.*
- boost-cpp>=1.66
- libcypher-parser
- nccl>=2.5
run:
- libcudf={{ minor_version }}
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
- nccl>=2.5

#test:
# commands:
Expand Down
21 changes: 19 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ set(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")
option(BUILD_TESTS "Configure CMake to build tests"
ON)

# Optional MPI support, disabled by default. When BUILD_MPI is ON, MPI must be
# found (configuration fails otherwise) and the MPI compile/link flags reported
# by find_package(MPI) are appended to the global C, C++ and executable-linker
# flags.
option(BUILD_MPI "Build with MPI" OFF)
if (BUILD_MPI)
find_package(MPI REQUIRED)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_C_COMPILE_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_CXX_COMPILE_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_CXX_LINK_FLAGS}")
endif(BUILD_MPI)
###################################################################################################
# - cmake modules ---------------------------------------------------------------------------------

Expand Down Expand Up @@ -302,6 +309,15 @@ add_dependencies(gunrock cugunrock)

set_property(TARGET gunrock PROPERTY IMPORTED_LOCATION ${CUGUNROCK_DIR}/lib/libgunrock.a)

# - NCCL
# Locate NCCL with find_package(NCCL) unless the caller supplied NCCL_PATH, in
# which case the header directory and the shared library are taken directly
# from that prefix (no existence check is performed on either path).
if(NOT NCCL_PATH)
  find_package(NCCL REQUIRED)
else()
  # STATUS supplies the "-- " prefix itself and keeps the note on stdout with
  # the rest of the configure log instead of emitting it as a notice on stderr.
  message(STATUS "Manually set NCCL PATH to ${NCCL_PATH}")
  set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include)
  set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so)
endif(NOT NCCL_PATH)

###################################################################################################
# - library targets -------------------------------------------------------------------------------

Expand Down Expand Up @@ -383,7 +399,6 @@ add_dependencies(cugraph cugunrock)

###################################################################################################
# - include paths ---------------------------------------------------------------------------------

target_include_directories(cugraph
PRIVATE
"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
Expand All @@ -399,6 +414,8 @@ target_include_directories(cugraph
"${CUHORNET_INCLUDE_DIR}/primitives"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CUGUNROCK_DIR}/include"
"${NCCL_INCLUDE_DIRS}"
"${MPI_CXX_INCLUDE_PATH}"
PUBLIC
"${CMAKE_CURRENT_SOURCE_DIR}/include"
)
Expand All @@ -407,7 +424,7 @@ target_include_directories(cugraph
# - link libraries --------------------------------------------------------------------------------

target_link_libraries(cugraph PRIVATE
${CUDF_LIBRARY} ${RMM_LIBRARY} gunrock ${NVSTRINGS_LIBRARY} cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY})
${CUDF_LIBRARY} ${RMM_LIBRARY} gunrock ${NVSTRINGS_LIBRARY} cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARIES})
if(OpenMP_CXX_FOUND)
target_link_libraries(cugraph PRIVATE
###################################################################################################
Expand Down
116 changes: 116 additions & 0 deletions cpp/cmake/Modules/FindNCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Based on FindPNG.cmake from cmake 3.14.3

#[=======================================================================[.rst:
FindNCCL
--------
Find libnccl, the NVIDIA Collective Communication Library. A hint to find NCCL
can be provided by setting NCCL_INSTALL_DIR.
Imported targets
^^^^^^^^^^^^^^^^
This module defines the following :prop_tgt:`IMPORTED` target:
``NCCL::NCCL``
The libnccl library, if found.
Result variables
^^^^^^^^^^^^^^^^
This module will set the following variables in your project:
``NCCL_INCLUDE_DIRS``
where to find nccl.h , etc.
``NCCL_LIBRARIES``
the libraries to link against to use NCCL.
``NCCL_FOUND``
If false, do not try to use NCCL.
``NCCL_VERSION_STRING``
the version of the NCCL library found
#]=======================================================================]

# Locate the directory containing nccl.h, using NCCL_INSTALL_DIR as a hint.
find_path(NCCL_NCCL_INCLUDE_DIR nccl.h HINTS ${NCCL_INSTALL_DIR} PATH_SUFFIXES include)

#TODO: Does this need to support finding the static library?

# Candidate library names: the unversioned names first, then one pair of
# names per entry in _NCCL_VERSION_SUFFIXES (currently only "2").
list(APPEND NCCL_NAMES nccl libnccl)
set(_NCCL_VERSION_SUFFIXES 2)

foreach(v IN LISTS _NCCL_VERSION_SUFFIXES)
list(APPEND NCCL_NAMES nccl${v} libnccl${v})
endforeach()
unset(_NCCL_VERSION_SUFFIXES)
# For compatibility with versions prior to this multi-config search, honor
# any NCCL_LIBRARY that is already specified and skip the search.
if(NOT NCCL_LIBRARY)
# Only a release library is searched for; select_library_configurations()
# then derives the NCCL_LIBRARY result from NCCL_LIBRARY_RELEASE.
find_library(NCCL_LIBRARY_RELEASE NAMES ${NCCL_NAMES} HINTS ${NCCL_INSTALL_DIR} PATH_SUFFIXES lib)
include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
select_library_configurations(NCCL)
mark_as_advanced(NCCL_LIBRARY_RELEASE)
endif()
# The name list is only needed for the search above.
unset(NCCL_NAMES)

# Set by select_library_configurations(), but we want the one from
# find_package_handle_standard_args() below.
unset(NCCL_FOUND)

# Publish the result variables and define the NCCL::NCCL imported target once
# both the library and the header directory are known.
if (NCCL_LIBRARY AND NCCL_NCCL_INCLUDE_DIR)
  set(NCCL_INCLUDE_DIRS ${NCCL_NCCL_INCLUDE_DIR})
  # NCCL_LIBRARIES is the documented result variable consumers link against.
  # select_library_configurations() is only invoked when the library search
  # runs; a caller-supplied NCCL_LIBRARY skips that search, so the variable
  # must be set explicitly here to be available on both paths.
  set(NCCL_LIBRARIES ${NCCL_LIBRARY})

  if(NOT TARGET NCCL::NCCL)
    add_library(NCCL::NCCL UNKNOWN IMPORTED)
    set_target_properties(NCCL::NCCL PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIRS}")
    # Only attach a location when the library path names an existing file.
    if(EXISTS "${NCCL_LIBRARY}")
      set_target_properties(NCCL::NCCL PROPERTIES
        IMPORTED_LINK_INTERFACE_LANGUAGES "C"
        IMPORTED_LOCATION "${NCCL_LIBRARY}")
    endif()
  endif()
endif ()

# Build NCCL_VERSION_STRING as "<major>.<minor>.<patch><suffix>" from the
# NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH and NCCL_SUFFIX #define lines of nccl.h.
# Each component is extracted by matching its #define line and keeping only
# the captured value.
if (NCCL_NCCL_INCLUDE_DIR AND EXISTS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h")
file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_major_version_str REGEX "^#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+")
string(REGEX REPLACE "^#define[ \t]+NCCL_MAJOR[ \t]+([0-9]+)" "\\1" nccl_major_version_str "${nccl_major_version_str}")

file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_minor_version_str REGEX "^#define[ \t]+NCCL_MINOR[ \t]+[0-9]+")
string(REGEX REPLACE "^#define[ \t]+NCCL_MINOR[ \t]+([0-9]+)" "\\1" nccl_minor_version_str "${nccl_minor_version_str}")

file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_patch_version_str REGEX "^#define[ \t]+NCCL_PATCH[ \t]+[0-9]+")
string(REGEX REPLACE "^#define[ \t]+NCCL_PATCH[ \t]+([0-9]+)" "\\1" nccl_patch_version_str "${nccl_patch_version_str}")

# The suffix is a quoted string in the header; only the text between the
# quotes is kept.
file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_suffix_version_str REGEX "^#define[ \t]+NCCL_SUFFIX[ \t]+\".*\"")
string(REGEX REPLACE "^#define[ \t]+NCCL_SUFFIX[ \t]+\"(.*)\"" "\\1" nccl_suffix_version_str "${nccl_suffix_version_str}")

set(NCCL_VERSION_STRING "${nccl_major_version_str}.${nccl_minor_version_str}.${nccl_patch_version_str}${nccl_suffix_version_str}")

# Drop the temporaries so they do not remain set after this module returns.
unset(nccl_major_version_str)
unset(nccl_minor_version_str)
unset(nccl_patch_version_str)
unset(nccl_suffix_version_str)
endif ()

# Let the standard helper decide the final outcome of the search: both the
# library and the include directory are required, and NCCL_VERSION_STRING is
# passed as the version variable.
include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
find_package_handle_standard_args(NCCL
REQUIRED_VARS NCCL_LIBRARY NCCL_NCCL_INCLUDE_DIR
VERSION_VAR NCCL_VERSION_STRING)

# Mark the include directory and library variables as advanced.
mark_as_advanced(NCCL_NCCL_INCLUDE_DIR NCCL_LIBRARY)
16 changes: 16 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS)
target_link_libraries(${CMAKE_TEST_NAME}
PRIVATE
gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${RMM_LIBRARY} ${CMAKE_EXTRA_LIBS} cudart cuda)
if (BUILD_MPI)
include_directories(include ${MPI_CXX_INCLUDE_PATH} ${NCCL_INCLUDE_DIRS})
target_link_libraries(${CMAKE_TEST_NAME} PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIBRARIES} )
target_compile_options(${CMAKE_TEST_NAME} PUBLIC ${MPI_C_COMPILE_FLAGS})
endif(BUILD_MPI)

if(OpenMP_CXX_FOUND)
target_link_libraries(${CMAKE_TEST_NAME} PRIVATE
###################################################################################################
Expand Down Expand Up @@ -271,6 +277,16 @@ set(FIND_MATCHES_TEST_SRC

ConfigureTest(FIND_MATCHES_TEST "${FIND_MATCHES_TEST_SRC}" "")

###################################################################################################
#-NCCL tests ---------------------------------------------------------------------

# The NCCL test is registered only when the build is configured with
# BUILD_MPI; otherwise no NCCL_TEST target is created.
if (BUILD_MPI)
set(NCCL_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/nccl/nccl_test.cu")

ConfigureTest(NCCL_TEST "${NCCL_TEST_SRC}" "")
endif(BUILD_MPI)

###################################################################################################
### enable testing ################################################################################
###################################################################################################
Expand Down
75 changes: 75 additions & 0 deletions cpp/tests/nccl/nccl_test.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include "gtest/gtest.h"
#include <cugraph.h>
#include "test_utils.h"
#include <string.h>
#include <mpi.h>
#include <nccl.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

// Smoke test for the NCCL all-gather collective across MPI ranks.
//
// Every rank fills a send buffer of `size` floats with its own rank id. After
// the all-gather, each rank's receive buffer must hold, for every rank k in
// [0, p), `size` copies of k, laid out contiguously in ascending rank order.
// The gathered values are verified on every rank and printed by rank 0.
TEST(allgather, success)
{
  int p = 1, r = 0, dev = 0, dev_count = 0;
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
  CUDA_RT_CALL(cudaGetDeviceCount(&dev_count));
  // The modulo below is undefined for a zero device count.
  ASSERT_GT(dev_count, 0);

  // shortcut for device ID here
  // may need something smarter later
  dev = r % dev_count;
  // cudaSetDevice must happen before ncclCommInitRank
  CUDA_RT_CALL(cudaSetDevice(dev));

  // print info
  printf("# Rank %2d - Pid %6d - device %2d\n",
         r, getpid(), dev);

  // NCCL init: rank 0 creates the unique id and broadcasts it over MPI so that
  // every rank joins the same NCCL communicator.
  ncclUniqueId id;
  ncclComm_t comm;
  if (r == 0) NCCLCHECK(ncclGetUniqueId(&id));
  MPICHECK(MPI_Bcast(&id, static_cast<int>(sizeof(id)), MPI_BYTE, 0, MPI_COMM_WORLD));
  NCCLCHECK(ncclCommInitRank(&comm, p, id, r));
  MPICHECK(MPI_Barrier(MPI_COMM_WORLD));

  // allocate device buffers
  const int size = 3;
  float* sendbuff = nullptr;
  float* recvbuff = nullptr;
  CUDA_RT_CALL(cudaMalloc(&sendbuff, size * sizeof(float)));
  CUDA_RT_CALL(cudaMalloc(&recvbuff, size * p * sizeof(float)));

  // init values: the send buffer carries this rank's id, the receive buffer a
  // sentinel (-1) that no rank ever sends.
  thrust::fill(thrust::device_pointer_cast(sendbuff),
               thrust::device_pointer_cast(sendbuff + size), static_cast<float>(r));
  thrust::fill(thrust::device_pointer_cast(recvbuff),
               thrust::device_pointer_cast(recvbuff + size * p), -1.0f);

  // ncclAllGather
  NCCLCHECK(ncclAllGather(sendbuff, recvbuff, size, ncclFloat, comm, cudaStreamDefault));
  // The collective is only enqueued on the stream; wait for it to finish
  // before the host inspects the result.
  CUDA_RT_CALL(cudaStreamSynchronize(cudaStreamDefault));

  // Verify the gathered data: element i must equal the id of the rank that
  // owns slot i, i.e. i / size.
  thrust::device_ptr<float> dev_ptr(recvbuff);
  for (int i = 0; i < size * p; ++i) {
    const float gathered = dev_ptr[i];
    EXPECT_FLOAT_EQ(gathered, static_cast<float>(i / size))
      << "rank " << r << ": unexpected value at index " << i;
  }

  // expect each rankid printed size times in ascending order
  if (r == 0) {
    std::cout.precision(15);
    thrust::copy(dev_ptr, dev_ptr + size * p, std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;
  }

  // free device buffers
  CUDA_RT_CALL(cudaFree(sendbuff));
  CUDA_RT_CALL(cudaFree(recvbuff));

  // finalizing NCCL
  NCCLCHECK(ncclCommDestroy(comm));
}

int main( int argc, char** argv )
{
testing::InitGoogleTest(&argc,argv);
MPI_Init(&argc, &argv);
rmmInitialize(nullptr);
int rc = RUN_ALL_TESTS();
rmmFinalize();
MPI_Finalize();
return rc;
}
18 changes: 18 additions & 0 deletions cpp/tests/test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,24 @@ extern "C" {
}
#endif

// Evaluates an NCCL call once; on any result other than ncclSuccess, prints the
// failing file/line together with NCCL's error string and records a fatal
// GoogleTest failure (FAIL() returns from the enclosing function).
//
// Wrapped in do { } while (0) so the macro expands to a single statement and
// `if (cond) NCCLCHECK(x); else ...` parses as intended. Must be used inside a
// function returning void, as required by FAIL().
#define NCCLCHECK(cmd) do {                                   \
    const ncclResult_t nccl_status = (cmd);                   \
    if (nccl_status != ncclSuccess) {                         \
      printf("NCCL failure %s:%d '%s'\n",                     \
             __FILE__,__LINE__,ncclGetErrorString(nccl_status)); \
      FAIL();                                                 \
    }                                                         \
  } while (0)

// Evaluates an MPI call once; on any result other than MPI_SUCCESS, prints the
// failing file/line together with the numeric MPI error code and records a
// fatal GoogleTest failure (FAIL() returns from the enclosing function).
//
// Wrapped in do { } while (0) so the macro expands to a single statement and
// `if (cond) MPICHECK(x); else ...` parses as intended. The local uses a
// macro-specific name so that an argument expression mentioning a caller
// variable (for example one named `e`) is not silently shadowed. Must be used
// inside a function returning void, as required by FAIL().
#define MPICHECK(cmd) do {                                    \
    const int mpicheck_status_ = (cmd);                       \
    if (mpicheck_status_ != MPI_SUCCESS) {                    \
      printf("Failed: MPI error %s:%d '%d'\n",                \
             __FILE__,__LINE__, mpicheck_status_);            \
      FAIL();                                                 \
    }                                                         \
  } while (0)

std::function<void(gdf_column*)> gdf_col_deleter = [](gdf_column* col){
if (col) {
col->size = 0;
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
cuda_include_dir],
library_dirs=[get_python_lib()],
runtime_library_dirs=[conda_lib_dir],
libraries=['cugraph', 'cudf'],
libraries=['cugraph', 'cudf', 'nccl'],
language='c++',
extra_compile_args=['-std=c++14'])
]
Expand Down

0 comments on commit 8acb743

Please sign in to comment.