Add NVHPC CI and fix issues #1331

Merged
merged 15 commits on Jun 8, 2023
105 changes: 57 additions & 48 deletions .gitlab-ci.yml
@@ -98,53 +98,18 @@ build/cuda92/nompi/gcc/all/release/shared:
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Release"
RUN_EXAMPLES: "ON"
CUDA_ARCH: 61

# cuda 10.0 and friends
# Make sure that our jobs run when using self-installed
# third-party HWLOC.
build/cuda100/mvapich2/gcc/all/debug/shared:
extends:
- .build_template
- .default_variables
- .quick_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
variables:
BUILD_MPI: "ON"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
CUDA_ARCH: 35

# cuda 10.1 and friends
# Build CUDA NVIDIA without omp
# Make sure that our jobs run when HWLOC is
# forcibly switched off
build/cuda100/nompi/clang/all/release/static:
build/cuda101/nompi/clang/cuda_wo_omp/release/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_HWLOC: "OFF"
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"
CUDA_ARCH: 35

# Build CUDA NVIDIA without omp
build/cuda100/nompi/clang/cuda_wo_omp/release/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
- .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
@@ -154,19 +119,21 @@ build/cuda100/nompi/clang/cuda_wo_omp/release/shared:
BUILD_TYPE: "Release"
CUDA_ARCH: 35

# cuda 10.1 and friends
build/cuda101/nompi/gcc/all/debug/shared:
# Job with example runs.
build/cuda101/openmpi/gcc/all/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "ON"
MPI_AS_ROOT: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
RUN_EXAMPLES: "ON"
CUDA_ARCH: 35

build/cuda101/nompi/clang/all/release/static:
@@ -279,7 +246,7 @@ build/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
BUILD_OMP: "ON"
@@ -297,7 +264,7 @@ test/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .horeka_test_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
@@ -345,7 +312,7 @@ build/cuda110/nompi/intel/cuda/debug/static:
extends:
- .build_template
- .default_variables
- .quick_test_condition
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
C_COMPILER: "icc"
@@ -364,7 +331,7 @@ test/cuda110/nompi/intel/cuda/debug/static:
extends:
- .horeka_test_template
- .default_variables
- .quick_test_condition
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}"
@@ -394,6 +361,48 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
CUDA_ARCH: 61


# nvhpc and friends
build/nvhpc233/cuda120/nompi/nvcpp/release/static:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16
variables:
C_COMPILER: "nvc"
CXX_COMPILER: "nvc++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "OFF"
BUILD_TYPE: "Release"
FAST_TESTS: "ON"
BUILD_SHARED_LIBS: "OFF"
# disable common gflags warnings
CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
# disable spurious unused argument warning
EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
CUDA_ARCH: 61

build/nvhpc227/cuda117/nompi/nvcpp/debug/shared:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14
variables:
C_COMPILER: "nvc"
CXX_COMPILER: "nvc++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "OFF"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
# disable common gflags warnings
CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
# disable spurious unused argument warning
EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
CUDA_ARCH: 61

# ROCm 4.5 and friends
build/amd/nompi/gcc/rocm45/release/shared:
extends:
@@ -593,7 +602,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static:
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/dpcpp/cpu/release/static:
build/dpcpp/2022-1/cpu/release/static:
extends:
- .build_and_test_template
- .default_variables
21 changes: 13 additions & 8 deletions .gitlab/image.yml
@@ -30,13 +30,6 @@
- private_ci
- nvidia-gpu

.use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018:
image: ginkgohub/cuda:100-mvapich2-gnu7-llvm60-intel2018
tags:
- private_ci
- controller
- cpu

.use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
tags:
@@ -69,6 +62,18 @@
- private_ci
- nvidia-gpu

.use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16:
image: ginkgohub/nvhpc:233-cuda120-openmpi-gnu12-llvm16
tags:
- private_ci
- nvidia-gpu

.use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14:
image: ginkgohub/nvhpc:227-cuda117-openmpi-gnu11-llvm14
tags:
- private_ci
- nvidia-gpu

.use_gko-rocm45-nompi-gnu8-llvm8:
image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8
tags:
@@ -84,7 +89,7 @@
- gpu

.use_gko-oneapi-cpu:
image: ginkgohub/oneapi:latest
image: ginkgohub/oneapi:2022.1
tags:
- private_ci
- fairrs
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -121,9 +121,11 @@ endif()

# For now, the PGI/NVHPC nvc++ compiler doesn't seem to support
# `#pragma omp declare reduction`
if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI|NVHPC")
message(STATUS "OpenMP: Switching to OFF because PGI/NVHPC nvc++ compiler lacks important features.")
set(GINKGO_BUILD_OMP OFF)
#
# With optimization level -O2, the math doesn't follow the IEEE standard,
# so we enable IEEE compliance again (-Kieee).
if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Kieee")
endif()

set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined")
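
A note on the pragma named in the comment above: the following is a minimal sketch (illustrative only, not code from this repository) of an OpenMP `declare reduction`, the construct that nvc++ fails to compile and that previously forced the OpenMP backend off for this compiler.

```cpp
#include <complex>
#include <cstddef>
#include <vector>

// Custom OpenMP reduction over std::complex<double>; nvc++ rejects this
// declarative directive, while gcc and clang accept it.
#pragma omp declare reduction(cplx_add : std::complex<double> : omp_out += omp_in)

std::complex<double> sum(const std::vector<std::complex<double>>& v)
{
    std::complex<double> result{};
// Use the reduction declared above in a parallel loop.
#pragma omp parallel for reduction(cplx_add : result)
    for (std::size_t i = 0; i < v.size(); ++i) {
        result += v[i];
    }
    return result;
}
```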
5 changes: 4 additions & 1 deletion core/test/utils/matrix_generator_test.cpp
@@ -171,7 +171,10 @@ TYPED_TEST(MatrixGenerator, OutputHasCorrectSize)
TYPED_TEST(MatrixGenerator, OutputHasCorrectNonzeroAverageAndDeviation)
{
using T = typename TestFixture::value_type;
// the nonzeros only needs to check the real part
// this test only covers integer distributions, so only check real types
if (gko::is_complex<T>()) {
GTEST_SKIP();
}
this->template check_average_and_deviation<T>(
begin(this->nnz_per_row_sample), end(this->nnz_per_row_sample), 50.0,
5.0, [](T val) { return gko::real(val); });
5 changes: 3 additions & 2 deletions cuda/base/pointer_mode_guard.hpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


namespace gko {
@@ -79,7 +80,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE);
} else {
GKO_ASSERT_NO_CUBLAS_ERRORS(
@@ -126,7 +127,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE);
} else {
GKO_ASSERT_NO_CUSPARSE_ERRORS(
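The switch from `std::uncaught_exception()` to `xstd::uncaught_exception()` is needed because the former was deprecated in C++17 and removed in C++20. The header `std_extensions.hpp` is not part of this diff; a hypothetical sketch of such a shim could look like this.

```cpp
#include <exception>

namespace gko {
namespace xstd {

// Portability shim (sketch, assumed interface): report whether an
// exception is currently propagating, across C++ standard versions.
inline bool uncaught_exception() noexcept
{
#if __cplusplus >= 201703L
    // C++17 replaced the bool query with a count of in-flight exceptions.
    return std::uncaught_exceptions() > 0;
#else
    return std::uncaught_exception();
#endif
}

}  // namespace xstd
}  // namespace gko
```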
5 changes: 2 additions & 3 deletions cuda/matrix/csr_kernels.cu
@@ -407,10 +407,9 @@ bool try_general_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
cusparse::destroy(vecc);
} else {
#if CUDA_VERSION >= 11060
if (b->get_size()[1] == 1 && exec->get_major_version() >= 7) {
if (b->get_size()[1] == 1) {
// cusparseSpMM seems to take the single strided vector as column
// major without considering stride and row major (SM >= 70 and
// cuda 11.6)
// major without considering stride and row major (cuda 11.6)
return false;
}
#endif // CUDA_VERSION >= 11060
26 changes: 18 additions & 8 deletions cuda/test/solver/lower_trs_kernels.cpp
@@ -174,21 +174,31 @@ TEST_F(LowerTrs, CudaMultipleRhsApplySyncfreeIsEquivalentToRef)
TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef)
{
initialize_data(50, 3);
#if CUDA_VERSION >= 11031
#if CUDA_VERSION < 12000
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
auto in_stride = 4;
auto out_stride = 4;
#else
// The cuSPARSE 12 solver is even worse: It only works if the stride is
// equal to the number of columns.
auto in_stride = 3;
auto out_stride = 3;
#endif
#else
auto in_stride = 4;
auto out_stride = 5;
#endif
auto lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref);
auto d_lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(exec);
auto solver = lower_trs_factory->generate(csr_mtx);
auto d_solver = d_lower_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(exec, b->get_size(), 4);
auto db2_strided = Mtx::create(exec, b->get_size(), in_stride);
d_b2->convert_to(db2_strided);
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
#if CUDA_VERSION >= 11031
auto dx_strided = Mtx::create(exec, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(exec, x->get_size(), 5);
#endif
auto dx_strided = Mtx::create(exec, x->get_size(), out_stride);

solver->apply(b2, x);
d_solver->apply(db2_strided, dx_strided);
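
The version-specific stride rules in the comments above can be summarized in one place; the helper below is an illustrative sketch (`pick_strides` is hypothetical, not part of the test): CUDA 12 requires the stride to equal the number of right-hand sides, CUDA >= 11.3.1 requires matching input and output strides, and older versions accept differing strides.

```cpp
#include <utility>

// Sketch: choose (input stride, output stride) for a multi-rhs triangular
// solve given the cuSPARSE SpSM limitations described in the test above.
std::pair<int, int> pick_strides(int num_rhs, int cuda_version)
{
    if (cuda_version >= 12000) {
        // stride must equal the number of columns (right-hand sides)
        return {num_rhs, num_rhs};
    }
    if (cuda_version >= 11031) {
        // any padding works, but input and output strides must match
        return {num_rhs + 1, num_rhs + 1};
    }
    // older versions handle mismatched strides correctly
    return {num_rhs + 1, num_rhs + 2};
}
```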
26 changes: 18 additions & 8 deletions cuda/test/solver/upper_trs_kernels.cpp
@@ -174,21 +174,31 @@ TEST_F(UpperTrs, CudaMultipleRhsApplySyncfreeIsEquivalentToRef)
TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef)
{
initialize_data(50, 3);
#if CUDA_VERSION >= 11031
#if CUDA_VERSION < 12000
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
auto in_stride = 4;
auto out_stride = 4;
#else
// The cuSPARSE 12 solver is even worse: It only works if the stride is
// equal to the number of columns.
auto in_stride = 3;
auto out_stride = 3;
#endif
#else
auto in_stride = 4;
auto out_stride = 5;
#endif
auto upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref);
auto d_upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(exec);
auto solver = upper_trs_factory->generate(csr_mtx);
auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(exec, b->get_size(), 4);
auto db2_strided = Mtx::create(exec, b->get_size(), in_stride);
d_b2->convert_to(db2_strided);
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
#if CUDA_VERSION >= 11030
auto dx_strided = Mtx::create(exec, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(exec, x->get_size(), 5);
#endif
auto dx_strided = Mtx::create(exec, x->get_size(), out_stride);

solver->apply(b2, x);
d_solver->apply(db2_strided, dx_strided);
5 changes: 3 additions & 2 deletions hip/base/pointer_mode_guard.hip.hpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


namespace gko {
@@ -81,7 +82,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(l_handle),
HIPBLAS_POINTER_MODE_DEVICE);
} else {
@@ -131,7 +132,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
hipsparseSetPointerMode(
reinterpret_cast<hipsparseHandle_t>(l_handle),
HIPSPARSE_POINTER_MODE_DEVICE);