Get sparse tests enabled and working for SYCL

Just needs #1225 to allow querying free/total memory. This will make spgemm, spgemm-jacobi and block GS tests pass. replaceSumIntoLonger still has a runtime error (too many registers used by arrays?), so that is disabled for now.
kokkos · Jan 18, 2022 · dcba5d0 · dcba5d0
1 parent 3f9c011
commit dcba5d0
Show file tree

Hide file tree

Showing 9 changed files with 75 additions and 35 deletions.
diff --git a/example/wiki/sparse/CMakeLists.txt b/example/wiki/sparse/CMakeLists.txt
@@ -25,13 +25,10 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
   SOURCES KokkosSparse_wiki_spadd.cpp
   )
 
-# FIXME_SYCL SYCL does not support querying free/total memory
-IF (NOT KOKKOS_ENABLE_SYCL)
 KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
   wiki_spgemm
   SOURCES KokkosSparse_wiki_spgemm.cpp
   )
-ENDIF()
 
 KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
   wiki_gauss_seidel

diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp
@@ -205,16 +205,42 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
 }
 #endif
 
+template <typename ExecSpace>
+inline int kk_get_max_vector_size() {
+  return Kokkos::TeamPolicy<ExecSpace>::vector_length_max();
+}
+
+#ifdef KOKKOS_ENABLE_SYCL
+template <>
+inline int kk_get_max_vector_size<Kokkos::Experimental::SYCL>() {
+  // FIXME SYCL: hardcoding to 8 is a workaround that seems to work for all
+  // kernels. Wait for max subgroup size query to be fixed in SYCL and/or
+  // Kokkos. Then TeamPolicy::vector_length_max() can be used for all backends.
+  return 8;
+}
+#endif
+
 inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz,
                                         const ExecSpaceType exec_space) {
   int suggested_vector_size_ = 1;
+  int max_vector_size        = 1;
+  switch (exec_space) {
+    case Exec_CUDA: max_vector_size = 32; break;
+    case Exec_HIP: max_vector_size = 64; break;
+    case Exec_SYCL:
+      // FIXME SYCL: same as above - 8 is a workaround
+      max_vector_size = 8;
+      break;
+    default:;
+  }
   switch (exec_space) {
     default: break;
     case Exec_SERIAL:
     case Exec_OMP:
     case Exec_THREADS: break;
     case Exec_CUDA:
     case Exec_HIP:
+    case Exec_SYCL:
       if (nr > 0) suggested_vector_size_ = nnz / double(nr) + 0.5;
       if (suggested_vector_size_ < 3) {
         suggested_vector_size_ = 2;
@@ -224,23 +250,22 @@ inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz,
         suggested_vector_size_ = 8;
       } else if (suggested_vector_size_ <= 24) {
         suggested_vector_size_ = 16;
+      } else if (suggested_vector_size_ <= 48) {
+        suggested_vector_size_ = 32;
       } else {
-        if (exec_space == Exec_CUDA || suggested_vector_size_ <= 48) {
-          // use full CUDA warp, or half a HIP wavefront
-          suggested_vector_size_ = 32;
-        } else {
-          // use full HIP wavefront
-          suggested_vector_size_ = 64;
-        }
+        suggested_vector_size_ = 64;
       }
+      if (suggested_vector_size_ > max_vector_size)
+        suggested_vector_size_ = max_vector_size;
       break;
   }
   return suggested_vector_size_;
 }
 
 inline int kk_get_suggested_team_size(const int vector_size,
                                       const ExecSpaceType exec_space) {
-  if (exec_space == Exec_CUDA || exec_space == Exec_HIP) {
+  if (exec_space == Exec_CUDA || exec_space == Exec_HIP ||
+      exec_space == Exec_SYCL) {
     // TODO: where this is used, tune the target value for
     // threads per block (but 256 is probably OK for CUDA and HIP)
     return 256 / vector_size;

diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp
@@ -461,8 +461,9 @@ void runSortedCountEntries(
     size_type pot_est_nnz = 1;
     while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2;
     // Estimate max number of uncompressed entries in each row of C
-    int vector_length     = 1;
-    int vector_length_max = TeamPol::vector_length_max();
+    int vector_length = 1;
+    int vector_length_max =
+        KokkosKernels::Impl::kk_get_max_vector_size<execution_space>();
     while (vector_length * 2 <= vector_length_max &&
            (size_type)vector_length * 2 <= pot_est_nnz) {
       vector_length *= 2;

diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp
@@ -568,6 +568,16 @@ class SPGEMMHandle {
 #endif
     }
 #endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+    if (std::is_same<Kokkos::Experimental::SYCL, ExecutionSpace>::value) {
+      this->algorithm_type = SPGEMM_KK;
+#ifdef VERBOSE
+      std::cout << "SYCL Execution Space, Default Algorithm: SPGEMM_KK"
+                << std::endl;
+#endif
+    }
+#endif
   }
 
   void set_compression(bool compress_second_matrix_) {

diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
@@ -1361,26 +1361,27 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   // choose parameters
   if (this->spgemm_algorithm == SPGEMM_KK ||
       SPGEMM_KK_LP == this->spgemm_algorithm) {
-    if (KokkosKernels::Impl::kk_is_gpu_exec_space<
-            typename HandleType::HandleExecSpace>()) {
+    if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
       // then chose the best method and parameters.
       size_type average_row_nnz = 0;
       size_t average_row_flops  = 0;
       if (this->a_row_cnt > 0) {
         average_row_nnz   = overall_nnz / this->a_row_cnt;
         average_row_flops = original_overall_flops / this->a_row_cnt;
       }
+      int vector_length_max =
+          KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
       // if we have very low flops per row, or our maximum number of nnz is
       // prett small, then we do row-base algorithm.
       if (SPGEMM_KK_LP != this->spgemm_algorithm &&
-          (average_row_nnz < 32 || average_row_flops < 256)) {
+          (average_row_nnz < vector_length_max || average_row_flops < 256)) {
         algorithm_to_run = SPGEMM_KK_MEMORY;
         // if (average_row_nnz / double (thread_shmem_key_size) > 1.5)
         while (average_row_nnz > size_type(thread_shmem_key_size) &&
-               suggested_vector_size < 32) {
+               suggested_vector_size < vector_length_max) {
           suggested_vector_size = suggested_vector_size * 2;
           suggested_vector_size =
-              KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
+              KOKKOSKERNELS_MACRO_MIN(vector_length_max, suggested_vector_size);
           suggested_team_size =
               this->handle->get_suggested_team_size(suggested_vector_size);
           thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8;
@@ -1412,7 +1413,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
         int team_cuckoo_key_size = 1;
         while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
           team_cuckoo_key_size = team_cuckoo_key_size * 2;
-        suggested_vector_size = 32;
+        suggested_vector_size = vector_length_max;
         suggested_team_size =
             this->handle->get_suggested_team_size(suggested_vector_size);
         algorithm_to_run = SPGEMM_KK_MEMORY_BIGSPREADTEAM;

diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
@@ -1456,6 +1456,8 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   nnz_lno_t brows = b_rowmap_end.extent(0) - 1;
   size_type bnnz  = entriesb_.extent(0);
 
+  int max_vector_size =
+      KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
   int suggested_vector_size =
       this->handle->get_suggested_vector_size(brows, bnnz);
 
@@ -1481,7 +1483,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
 #ifdef FIRSTPARAMS
       size_t estimate_max_nnz = maxNumRoughNonzeros / estimate_compress;
 #else
-      // THIS IS BETTER PARAMAMETER SELECTION.
+      // THIS IS BETTER PARAMETER SELECTION.
       size_t original_overall_flops =
           this->handle->get_spgemm_handle()->original_overall_flops;
       size_t estimate_max_nnz =
@@ -1508,7 +1510,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
           suggested_vector_size = suggested_vector_size * 2;
         }
         suggested_vector_size =
-            KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
+            KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
         suggested_team_size =
             this->handle->get_suggested_team_size(suggested_vector_size);
       }
@@ -1767,6 +1769,8 @@ void KokkosSPGEMM<
                 << " bnnz:" << bnnz << std::endl;
     }
   }
+  int max_vector_size =
+      KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
   int suggested_vector_size =
       this->handle->get_suggested_vector_size(brows, compressed_b_size);
 
@@ -1822,7 +1826,7 @@ void KokkosSPGEMM<
           suggested_vector_size = suggested_vector_size * 2;
         }
         suggested_vector_size =
-            KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
+            KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
         suggested_team_size =
             this->handle->get_suggested_team_size(suggested_vector_size);
       }
@@ -1836,10 +1840,10 @@ void KokkosSPGEMM<
           shmem_key_size + (shmem_key_size - thread_shmem_hash_size) / 3;
       shmem_key_size = (shmem_key_size >> 1) << 1;
       while (estimate_max_nnz > size_t(shmem_key_size) &&
-             suggested_vector_size < 32) {
+             suggested_vector_size < max_vector_size) {
         suggested_vector_size = suggested_vector_size * 2;
         suggested_vector_size =
-            KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
+            KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
         suggested_team_size =
             this->handle->get_suggested_team_size(suggested_vector_size);
         thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8;

diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt
@@ -133,14 +133,13 @@ IF (KOKKOS_ENABLE_SYCL)
     COMPONENTS batched
   )
 
-  # FIXME_SYCL
-  #KOKKOSKERNELS_ADD_UNIT_TEST(
-  #  sparse_sycl
-  #  SOURCES
-  #    Test_Main.cpp
-  #    sycl/Test_SYCL_Sparse.cpp
-  #  COMPONENTS sparse
-  #)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    sparse_sycl
+    SOURCES
+      Test_Main.cpp
+      sycl/Test_SYCL_Sparse.cpp
+    COMPONENTS sparse
+  )
 
   KOKKOSKERNELS_ADD_UNIT_TEST(
     graph_sycl

diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
@@ -516,6 +516,8 @@ void test_replaceSumIntoLonger() {
     test_replaceSumIntoLonger<SCALAR, ORDINAL, OFFSET, DEVICE>();                     \
   }
 
+// FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name"
+#ifndef KOKKOS_ENABLE_SYCL
 #if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
      defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
      defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
@@ -643,5 +645,6 @@ EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
 EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
 #endif
+#endif
 
 #undef EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp
@@ -343,13 +343,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
       else
         res = run_spgemm<crsMat_t, device>(A, B, spgemm_algorithm, output_mat);
     } catch (const char *message) {
-      EXPECT_TRUE(is_expected_to_fail) << algo;
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
       failed = true;
     } catch (std::string message) {
-      EXPECT_TRUE(is_expected_to_fail) << algo;
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
       failed = true;
     } catch (std::exception &e) {
-      EXPECT_TRUE(is_expected_to_fail) << algo;
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what();
       failed = true;
     }
     EXPECT_TRUE((failed == is_expected_to_fail));