Add NVHPC CI and fix issues #1331

Merged
merged 15 commits on Jun 8, 2023
105 changes: 57 additions & 48 deletions .gitlab-ci.yml
@@ -98,53 +98,18 @@ build/cuda92/nompi/gcc/all/release/shared:
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Release"
RUN_EXAMPLES: "ON"
CUDA_ARCH: 61

# cuda 10.0 and friends
# Make sure that our jobs run when using self-installed
# third-party HWLOC.
build/cuda100/mvapich2/gcc/all/debug/shared:
extends:
- .build_template
- .default_variables
- .quick_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
variables:
BUILD_MPI: "ON"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
CUDA_ARCH: 35

# cuda 10.1 and friends
# Build CUDA NVIDIA without omp
# Make sure that our jobs run when HWLOC is
# forcibly switched off
build/cuda100/nompi/clang/all/release/static:
build/cuda101/nompi/clang/cuda_wo_omp/release/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_HWLOC: "OFF"
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"
CUDA_ARCH: 35

# Build CUDA NVIDIA without omp
build/cuda100/nompi/clang/cuda_wo_omp/release/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018
- .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
@@ -154,19 +119,21 @@ build/cuda100/nompi/clang/cuda_wo_omp/release/shared:
BUILD_TYPE: "Release"
CUDA_ARCH: 35

# cuda 10.1 and friends
build/cuda101/nompi/gcc/all/debug/shared:
# Job with example runs.
build/cuda101/openmpi/gcc/all/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "ON"
MPI_AS_ROOT: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
RUN_EXAMPLES: "ON"
CUDA_ARCH: 35

build/cuda101/nompi/clang/all/release/static:
@@ -279,7 +246,7 @@ build/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
BUILD_OMP: "ON"
@@ -297,7 +264,7 @@ test/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .horeka_test_template
- .default_variables
- .full_test_condition
- .quick_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
@@ -345,7 +312,7 @@ build/cuda110/nompi/intel/cuda/debug/static:
extends:
- .build_template
- .default_variables
- .quick_test_condition
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
C_COMPILER: "icc"
@@ -364,7 +331,7 @@ test/cuda110/nompi/intel/cuda/debug/static:
extends:
- .horeka_test_template
- .default_variables
- .quick_test_condition
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}"
@@ -394,6 +361,48 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
CUDA_ARCH: 61


# nvhpc and friends
build/nvhpc233/cuda120/nompi/nvcpp/release/static:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16
variables:
C_COMPILER: "nvc"
CXX_COMPILER: "nvc++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "OFF"
BUILD_TYPE: "Release"
FAST_TESTS: "ON"
BUILD_SHARED_LIBS: "OFF"
# disable common gflags warnings
CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
# disable spurious unused argument warning
EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
CUDA_ARCH: 61

build/nvhpc227/cuda117/nompi/nvcpp/debug/shared:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14
variables:
C_COMPILER: "nvc"
CXX_COMPILER: "nvc++"
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "OFF"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
# disable common gflags warnings
CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
# disable spurious unused argument warning
EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
CUDA_ARCH: 61

# ROCm 4.5 and friends
build/amd/nompi/gcc/rocm45/release/shared:
extends:
@@ -593,7 +602,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static:
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/dpcpp/cpu/release/static:
build/dpcpp/2022-1/cpu/release/static:
extends:
- .build_and_test_template
- .default_variables
21 changes: 13 additions & 8 deletions .gitlab/image.yml
@@ -30,13 +30,6 @@
- private_ci
- nvidia-gpu

.use_gko-cuda100-mvapich2-gnu7-llvm60-intel2018:
image: ginkgohub/cuda:100-mvapich2-gnu7-llvm60-intel2018
tags:
- private_ci
- controller
- cpu

.use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
tags:
@@ -69,6 +62,18 @@
- private_ci
- nvidia-gpu

.use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16:
image: ginkgohub/nvhpc:233-cuda120-openmpi-gnu12-llvm16
tags:
- private_ci
- nvidia-gpu

.use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14:
image: ginkgohub/nvhpc:227-cuda117-openmpi-gnu11-llvm14
tags:
- private_ci
- nvidia-gpu

.use_gko-rocm45-nompi-gnu8-llvm8:
image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8
tags:
@@ -84,7 +89,7 @@
- gpu

.use_gko-oneapi-cpu:
image: ginkgohub/oneapi:latest
image: ginkgohub/oneapi:2022.1
tags:
- private_ci
- fairrs
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -121,9 +121,11 @@ endif()

# For now, the PGI/NVHPC nvc++ compiler doesn't seem to support
# `#pragma omp declare reduction`
if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI|NVHPC")
message(STATUS "OpenMP: Switching to OFF because PGI/NVHPC nvc++ compiler lacks important features.")
set(GINKGO_BUILD_OMP OFF)
#
# With optimization level -O2, the math doesn't follow the IEEE standard,
# so we enable IEEE compliance again (-Kieee).
if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Kieee")
endif()

set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined")
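
A note on the pragma named in the comment above: the following is a minimal sketch (illustrative only, not code from this repository) of an OpenMP `declare reduction`, the construct that nvc++ fails to compile and that previously forced the OpenMP backend off for this compiler.

```cpp
#include <complex>
#include <cstddef>
#include <vector>

// Custom OpenMP reduction over std::complex<double>; nvc++ rejects this
// declarative directive, while gcc and clang accept it.
#pragma omp declare reduction(cplx_add : std::complex<double> : omp_out += omp_in)

std::complex<double> sum(const std::vector<std::complex<double>>& v)
{
    std::complex<double> result{};
// Use the reduction declared above in a parallel loop.
#pragma omp parallel for reduction(cplx_add : result)
    for (std::size_t i = 0; i < v.size(); ++i) {
        result += v[i];
    }
    return result;
}
```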
5 changes: 4 additions & 1 deletion core/test/utils/matrix_generator_test.cpp
@@ -171,7 +171,10 @@ TYPED_TEST(MatrixGenerator, OutputHasCorrectSize)
TYPED_TEST(MatrixGenerator, OutputHasCorrectNonzeroAverageAndDeviation)
{
using T = typename TestFixture::value_type;
// the nonzeros only needs to check the real part
// this test only covers integer distributions, so only check real types
if (gko::is_complex<T>()) {
GTEST_SKIP();
}
this->template check_average_and_deviation<T>(
begin(this->nnz_per_row_sample), end(this->nnz_per_row_sample), 50.0,
5.0, [](T val) { return gko::real(val); });
5 changes: 3 additions & 2 deletions cuda/base/pointer_mode_guard.hpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


namespace gko {
@@ -79,7 +80,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE);
} else {
GKO_ASSERT_NO_CUBLAS_ERRORS(
@@ -126,7 +127,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE);
} else {
GKO_ASSERT_NO_CUSPARSE_ERRORS(
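The switch from `std::uncaught_exception()` to `xstd::uncaught_exception()` is needed because the former was deprecated in C++17 and removed in C++20. The header `std_extensions.hpp` is not part of this diff; a hypothetical sketch of such a shim could look like this.

```cpp
#include <exception>

namespace gko {
namespace xstd {

// Portability shim (sketch, assumed interface): report whether an
// exception is currently propagating, across C++ standard versions.
inline bool uncaught_exception() noexcept
{
#if __cplusplus >= 201703L
    // C++17 replaced the bool query with a count of in-flight exceptions.
    return std::uncaught_exceptions() > 0;
#else
    return std::uncaught_exception();
#endif
}

}  // namespace xstd
}  // namespace gko
```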
5 changes: 2 additions & 3 deletions cuda/matrix/csr_kernels.cu
@@ -407,10 +407,9 @@ bool try_general_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
cusparse::destroy(vecc);
} else {
#if CUDA_VERSION >= 11060
if (b->get_size()[1] == 1 && exec->get_major_version() >= 7) {
if (b->get_size()[1] == 1) {
// cusparseSpMM seems to take the single strided vector as column
// major without considering stride and row major (SM >= 70 and
// cuda 11.6)
// major without considering stride and row major (cuda 11.6)
return false;
}
#endif // CUDA_VERSION >= 11060
26 changes: 18 additions & 8 deletions cuda/test/solver/lower_trs_kernels.cpp
@@ -174,21 +174,31 @@ TEST_F(LowerTrs, CudaMultipleRhsApplySyncfreeIsEquivalentToRef)
TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef)
{
initialize_data(50, 3);
#if CUDA_VERSION >= 11031
#if CUDA_VERSION < 12000
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
auto in_stride = 4;
auto out_stride = 4;
#else
// The cuSPARSE 12 solver is even worse: It only works if the stride is
// equal to the number of columns.
auto in_stride = 3;
auto out_stride = 3;
#endif
#else
auto in_stride = 4;
auto out_stride = 5;
#endif
auto lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref);
auto d_lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(exec);
auto solver = lower_trs_factory->generate(csr_mtx);
auto d_solver = d_lower_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(exec, b->get_size(), 4);
auto db2_strided = Mtx::create(exec, b->get_size(), in_stride);
d_b2->convert_to(db2_strided);
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
#if CUDA_VERSION >= 11031
auto dx_strided = Mtx::create(exec, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(exec, x->get_size(), 5);
#endif
auto dx_strided = Mtx::create(exec, x->get_size(), out_stride);

solver->apply(b2, x);
d_solver->apply(db2_strided, dx_strided);
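
The version-specific stride rules in the comments above can be summarized in one place; the helper below is an illustrative sketch (`pick_strides` is hypothetical, not part of the test): CUDA 12 requires the stride to equal the number of right-hand sides, CUDA >= 11.3.1 requires matching input and output strides, and older versions accept differing strides.

```cpp
#include <utility>

// Sketch: choose (input stride, output stride) for a multi-rhs triangular
// solve given the cuSPARSE SpSM limitations described in the test above.
std::pair<int, int> pick_strides(int num_rhs, int cuda_version)
{
    if (cuda_version >= 12000) {
        // stride must equal the number of columns (right-hand sides)
        return {num_rhs, num_rhs};
    }
    if (cuda_version >= 11031) {
        // any padding works, but input and output strides must match
        return {num_rhs + 1, num_rhs + 1};
    }
    // older versions handle mismatched strides correctly
    return {num_rhs + 1, num_rhs + 2};
}
```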
26 changes: 18 additions & 8 deletions cuda/test/solver/upper_trs_kernels.cpp
@@ -174,21 +174,31 @@ TEST_F(UpperTrs, CudaMultipleRhsApplySyncfreeIsEquivalentToRef)
TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef)
{
initialize_data(50, 3);
#if CUDA_VERSION >= 11031
#if CUDA_VERSION < 12000
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
auto in_stride = 4;
auto out_stride = 4;
#else
// The cuSPARSE 12 solver is even worse: It only works if the stride is
// equal to the number of columns.
auto in_stride = 3;
auto out_stride = 3;
#endif
#else
auto in_stride = 4;
auto out_stride = 5;
#endif
auto upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref);
auto d_upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(exec);
auto solver = upper_trs_factory->generate(csr_mtx);
auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(exec, b->get_size(), 4);
auto db2_strided = Mtx::create(exec, b->get_size(), in_stride);
d_b2->convert_to(db2_strided);
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
#if CUDA_VERSION >= 11030
auto dx_strided = Mtx::create(exec, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(exec, x->get_size(), 5);
#endif
auto dx_strided = Mtx::create(exec, x->get_size(), out_stride);

solver->apply(b2, x);
d_solver->apply(db2_strided, dx_strided);
5 changes: 3 additions & 2 deletions hip/base/pointer_mode_guard.hip.hpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/std_extensions.hpp>


namespace gko {
@@ -81,7 +82,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
hipblasSetPointerMode(reinterpret_cast<hipblasHandle_t>(l_handle),
HIPBLAS_POINTER_MODE_DEVICE);
} else {
@@ -131,7 +132,7 @@ class pointer_mode_guard {
~pointer_mode_guard() noexcept(false)
{
/* Ignore the error during stack unwinding for this call */
if (std::uncaught_exception()) {
if (xstd::uncaught_exception()) {
hipsparseSetPointerMode(
reinterpret_cast<hipsparseHandle_t>(l_handle),
HIPSPARSE_POINTER_MODE_DEVICE);