diff --git a/CMakeLists.txt b/CMakeLists.txt index cde3903d1a59..61cde7f20c87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,8 @@ if (USE_CUDA) message(STATUS "CUDA GEN_CODE: ${GEN_CODE}") endif (USE_CUDA) +find_package(Threads REQUIRED) + if (USE_OPENMP) if (APPLE) # Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating @@ -128,7 +130,7 @@ foreach(lib rabit rabit_base rabit_empty rabit_mock rabit_mock_static) # Explicitly link dmlc to rabit, so that configured header (build_config.h) # from dmlc is correctly applied to rabit. if (TARGET ${lib}) - target_link_libraries(${lib} dmlc) + target_link_libraries(${lib} dmlc ${CMAKE_THREAD_LIBS_INIT}) endif (TARGET ${lib}) endforeach() @@ -138,6 +140,7 @@ if (R_LIB) endif (R_LIB) # core xgboost +list(APPEND LINKED_LIBRARIES_PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) add_subdirectory(${xgboost_SOURCE_DIR}/plugin) add_subdirectory(${xgboost_SOURCE_DIR}/src) target_link_libraries(objxgboost PUBLIC dmlc) diff --git a/Jenkinsfile b/Jenkinsfile index 8bb979ad266b..f3733d9cb281 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -63,6 +63,7 @@ pipeline { parallel ([ 'build-cpu': { BuildCPU() }, 'build-cpu-rabit-mock': { BuildCPUMock() }, + 'build-cpu-non-omp': { BuildCPUNonOmp() }, 'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') }, 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, @@ -215,12 +216,28 @@ def BuildCPUMock() { sh """ ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh """ - echo 'Stashing rabit C++ test executable (xgboost)...' + echo 'Stashing rabit C++ test executable (xgboost)...' stash name: 'xgboost_rabit_tests', includes: 'xgboost' deleteDir() } } +def BuildCPUNonOmp() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Build CPU without OpenMP" + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_OPENMP=OFF + """ + echo "Running Non-OpenMP C++ test..." + sh """ + ${dockerRun} ${container_type} ${docker_binary} build/testxgboost + """ + deleteDir() + } +} def BuildCUDA(args) { node('linux && cpu') { @@ -259,7 +276,7 @@ def BuildJVMPackages(args) { ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_packages.sh ${args.spark_version} """ echo 'Stashing XGBoost4J JAR...' - stash name: 'xgboost4j_jar', includes: 'jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar' + stash name: 'xgboost4j_jar', includes: "jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar" deleteDir() } } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 88b62088d9c9..8329835d6f0f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,12 +88,15 @@ if (XGBOOST_BUILTIN_PREFETCH_PRESENT) -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1) endif (XGBOOST_BUILTIN_PREFETCH_PRESENT) +find_package(Threads REQUIRED) +list(APPEND SRC_LIBS Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) + if (USE_OPENMP OR USE_CUDA) # CUDA requires OpenMP find_package(OpenMP REQUIRED) list(APPEND SRC_LIBS OpenMP::OpenMP_CXX) - set(LINKED_LIBRARIES_PRIVATE "${LINKED_LIBRARIES_PRIVATE};${SRC_LIBS}" PARENT_SCOPE) target_link_libraries(objxgboost PRIVATE OpenMP::OpenMP_CXX) endif (USE_OPENMP OR USE_CUDA) +set(LINKED_LIBRARIES_PRIVATE "${LINKED_LIBRARIES_PRIVATE};${SRC_LIBS}" PARENT_SCOPE) # For MSVC: Call msvc_use_static_runtime() once again to completely # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462 diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index e179b0505cf7..edac92d06908 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -227,8 +227,7 @@ class ColumnMatrix { /* missing values make sense only for column with type kDenseColumn, and if no missing values were observed it could be handled much faster. */ if (noMissingValues) { - const int32_t nthread = omp_get_max_threads(); // NOLINT - #pragma omp parallel for num_threads(nthread) +#pragma omp parallel for num_threads(omp_get_max_threads()) for (omp_ulong rid = 0; rid < nrow; ++rid) { const size_t ibegin = rid*nfeature; const size_t iend = (rid+1)*nfeature; diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 3f32173b0e21..3f0519b054c6 100755 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -110,10 +110,12 @@ class BlockedSpace2d { // Wrapper to implement nested parallelism with simple omp parallel for template -void ParallelFor2d(const BlockedSpace2d& space, const int nthreads, Func func) { +void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) { const size_t num_blocks_in_space = space.Size(); + nthreads = std::min(nthreads, omp_get_max_threads()); + nthreads = std::max(nthreads, 1); - #pragma omp parallel num_threads(nthreads) +#pragma omp parallel num_threads(nthreads) { size_t tid = omp_get_thread_num(); size_t chunck_size = num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads); diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index f6888a4cae30..b55c764d2139 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -189,12 +189,7 @@ struct EvalAuc : public Metric { rec[j - gptr[group_id]] = {h_preds[j], j}; } - if (omp_in_parallel()) { - std::stable_sort(rec.begin(), rec.end(), common::CmpFirst); - } else { - XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst); - } - + XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst); // calculate AUC double sum_pospair = 0.0; double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0; @@ -557,11 +552,7 @@ struct EvalAucPR : public Metric { continue; } - if (omp_in_parallel()) { - std::stable_sort(rec.begin(), rec.end(), common::CmpFirst); - } else { - XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst); - } + XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst); // calculate AUC double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0; diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index f773aadc0109..98a81ea7f432 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -69,8 +69,8 @@ set_target_properties( target_link_libraries(testxgboost PRIVATE ${GTEST_LIBRARIES} - ${LINKED_LIBRARIES_PRIVATE} - OpenMP::OpenMP_CXX) + ${LINKED_LIBRARIES_PRIVATE}) + target_compile_definitions(testxgboost PRIVATE ${XGBOOST_DEFINITIONS}) set_output_directory(testxgboost ${xgboost_BINARY_DIR}) diff --git a/tests/cpp/common/test_threading_utils.cc b/tests/cpp/common/test_threading_utils.cc index 392e4ba98cfe..e7baa05f5956 100755 --- a/tests/cpp/common/test_threading_utils.cc +++ b/tests/cpp/common/test_threading_utils.cc @@ -38,7 +38,10 @@ TEST(ParallelFor2d, Test) { return kDim2; }, kGrainSize); - ParallelFor2d(space, 4, [&](size_t i, Range1d r) { + auto old = omp_get_max_threads(); + omp_set_num_threads(4); + + ParallelFor2d(space, omp_get_max_threads(), [&](size_t i, Range1d r) { for (auto j = r.begin(); j < r.end(); ++j) { matrix[i*kDim2 + j] += 1; } @@ -47,12 +50,17 @@ TEST(ParallelFor2d, Test) { for (size_t i = 0; i < kDim1 * kDim2; i++) { ASSERT_EQ(matrix[i], 1); } + + omp_set_num_threads(old); } TEST(ParallelFor2dNonUniform, Test) { constexpr size_t kDim1 = 5; constexpr size_t kGrainSize = 256; + auto old = omp_get_max_threads(); + omp_set_num_threads(4); + // here are quite non-uniform distribution in space // but ParallelFor2d should split them by blocks with max size = kGrainSize // and process in balanced manner (optimal performance) @@ -66,7 +74,7 @@ TEST(ParallelFor2dNonUniform, Test) { working_space[i].resize(dim2[i], 0); } - ParallelFor2d(space, 4, [&](size_t i, Range1d r) { + ParallelFor2d(space, omp_get_max_threads(), [&](size_t i, Range1d r) { for (auto j = r.begin(); j < r.end(); ++j) { working_space[i][j] += 1; } @@ -77,6 +85,8 @@ TEST(ParallelFor2dNonUniform, Test) { ASSERT_EQ(working_space[i][j], 1); } } + + omp_set_num_threads(old); } } // namespace common diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 8385de54e059..951b8b66d649 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -301,9 +301,12 @@ std::unique_ptr CreateSparsePageDMatrix( batch_count++; row_count += batch.Size(); } +#if defined(_OPENMP) EXPECT_GE(batch_count, 2); EXPECT_EQ(row_count, dmat->Info().num_row_); - +#else +#warning "External memory doesn't work with Non-OpenMP build " +#endif // defined(_OPENMP) return dmat; }