Skip to content

Commit

Permalink
Get sparse tests enabled and working for SYCL
Browse files Browse the repository at this point in the history
Just needs #1225 to allow querying free/total memory. This will make spgemm,
spgemm-jacobi and block GS tests pass. replaceSumIntoLonger still has a runtime error
(too many registers used by arrays?), so that is disabled for now.
  • Loading branch information
brian-kelley committed Jan 18, 2022
1 parent 3f9c011 commit dcba5d0
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 35 deletions.
3 changes: 0 additions & 3 deletions example/wiki/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,10 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
SOURCES KokkosSparse_wiki_spadd.cpp
)

# FIXME_SYCL SYCL does not support querying free/total memory
IF (NOT KOKKOS_ENABLE_SYCL)
KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
wiki_spgemm
SOURCES KokkosSparse_wiki_spgemm.cpp
)
ENDIF()

KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST(
wiki_gauss_seidel
Expand Down
41 changes: 33 additions & 8 deletions src/common/KokkosKernels_ExecSpaceUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,16 +205,42 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
}
#endif

template <typename ExecSpace>
inline int kk_get_max_vector_size() {
return Kokkos::TeamPolicy<ExecSpace>::vector_length_max();
}

#ifdef KOKKOS_ENABLE_SYCL
template <>
inline int kk_get_max_vector_size<Kokkos::Experimental::SYCL>() {
// FIXME SYCL: hardcoding to 8 is a workaround that seems to work for all
// kernels. Wait for max subgroup size query to be fixed in SYCL and/or
// Kokkos. Then TeamPolicy::vector_length_max() can be used for all backends.
return 8;
}
#endif

inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz,
const ExecSpaceType exec_space) {
int suggested_vector_size_ = 1;
int max_vector_size = 1;
switch (exec_space) {
case Exec_CUDA: max_vector_size = 32; break;
case Exec_HIP: max_vector_size = 64; break;
case Exec_SYCL:
// FIXME SYCL: same as above - 8 is a workaround
max_vector_size = 8;
break;
default:;
}
switch (exec_space) {
default: break;
case Exec_SERIAL:
case Exec_OMP:
case Exec_THREADS: break;
case Exec_CUDA:
case Exec_HIP:
case Exec_SYCL:
if (nr > 0) suggested_vector_size_ = nnz / double(nr) + 0.5;
if (suggested_vector_size_ < 3) {
suggested_vector_size_ = 2;
Expand All @@ -224,23 +250,22 @@ inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz,
suggested_vector_size_ = 8;
} else if (suggested_vector_size_ <= 24) {
suggested_vector_size_ = 16;
} else if (suggested_vector_size_ <= 48) {
suggested_vector_size_ = 32;
} else {
if (exec_space == Exec_CUDA || suggested_vector_size_ <= 48) {
// use full CUDA warp, or half a HIP wavefront
suggested_vector_size_ = 32;
} else {
// use full HIP wavefront
suggested_vector_size_ = 64;
}
suggested_vector_size_ = 64;
}
if (suggested_vector_size_ > max_vector_size)
suggested_vector_size_ = max_vector_size;
break;
}
return suggested_vector_size_;
}

inline int kk_get_suggested_team_size(const int vector_size,
const ExecSpaceType exec_space) {
if (exec_space == Exec_CUDA || exec_space == Exec_HIP) {
if (exec_space == Exec_CUDA || exec_space == Exec_HIP ||
exec_space == Exec_SYCL) {
// TODO: where this is used, tune the target value for
// threads per block (but 256 is probably OK for CUDA and HIP)
return 256 / vector_size;
Expand Down
5 changes: 3 additions & 2 deletions src/sparse/KokkosSparse_spadd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,9 @@ void runSortedCountEntries(
size_type pot_est_nnz = 1;
while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2;
// Estimate max number of uncompressed entries in each row of C
int vector_length = 1;
int vector_length_max = TeamPol::vector_length_max();
int vector_length = 1;
int vector_length_max =
KokkosKernels::Impl::kk_get_max_vector_size<execution_space>();
while (vector_length * 2 <= vector_length_max &&
(size_type)vector_length * 2 <= pot_est_nnz) {
vector_length *= 2;
Expand Down
10 changes: 10 additions & 0 deletions src/sparse/KokkosSparse_spgemm_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,16 @@ class SPGEMMHandle {
#endif
}
#endif

#if defined(KOKKOS_ENABLE_SYCL)
if (std::is_same<Kokkos::Experimental::SYCL, ExecutionSpace>::value) {
this->algorithm_type = SPGEMM_KK;
#ifdef VERBOSE
std::cout << "SYCL Execution Space, Default Algorithm: SPGEMM_KK"
<< std::endl;
#endif
}
#endif
}

void set_compression(bool compress_second_matrix_) {
Expand Down
13 changes: 7 additions & 6 deletions src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1361,26 +1361,27 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
// choose parameters
if (this->spgemm_algorithm == SPGEMM_KK ||
SPGEMM_KK_LP == this->spgemm_algorithm) {
if (KokkosKernels::Impl::kk_is_gpu_exec_space<
typename HandleType::HandleExecSpace>()) {
if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
// then chose the best method and parameters.
size_type average_row_nnz = 0;
size_t average_row_flops = 0;
if (this->a_row_cnt > 0) {
average_row_nnz = overall_nnz / this->a_row_cnt;
average_row_flops = original_overall_flops / this->a_row_cnt;
}
int vector_length_max =
KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
// if we have very low flops per row, or our maximum number of nnz is
// prett small, then we do row-base algorithm.
if (SPGEMM_KK_LP != this->spgemm_algorithm &&
(average_row_nnz < 32 || average_row_flops < 256)) {
(average_row_nnz < vector_length_max || average_row_flops < 256)) {
algorithm_to_run = SPGEMM_KK_MEMORY;
// if (average_row_nnz / double (thread_shmem_key_size) > 1.5)
while (average_row_nnz > size_type(thread_shmem_key_size) &&
suggested_vector_size < 32) {
suggested_vector_size < vector_length_max) {
suggested_vector_size = suggested_vector_size * 2;
suggested_vector_size =
KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
KOKKOSKERNELS_MACRO_MIN(vector_length_max, suggested_vector_size);
suggested_team_size =
this->handle->get_suggested_team_size(suggested_vector_size);
thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8;
Expand Down Expand Up @@ -1412,7 +1413,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
int team_cuckoo_key_size = 1;
while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
team_cuckoo_key_size = team_cuckoo_key_size * 2;
suggested_vector_size = 32;
suggested_vector_size = vector_length_max;
suggested_team_size =
this->handle->get_suggested_team_size(suggested_vector_size);
algorithm_to_run = SPGEMM_KK_MEMORY_BIGSPREADTEAM;
Expand Down
14 changes: 9 additions & 5 deletions src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1456,6 +1456,8 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
nnz_lno_t brows = b_rowmap_end.extent(0) - 1;
size_type bnnz = entriesb_.extent(0);

int max_vector_size =
KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
int suggested_vector_size =
this->handle->get_suggested_vector_size(brows, bnnz);

Expand All @@ -1481,7 +1483,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
#ifdef FIRSTPARAMS
size_t estimate_max_nnz = maxNumRoughNonzeros / estimate_compress;
#else
// THIS IS BETTER PARAMAMETER SELECTION.
// THIS IS BETTER PARAMETER SELECTION.
size_t original_overall_flops =
this->handle->get_spgemm_handle()->original_overall_flops;
size_t estimate_max_nnz =
Expand All @@ -1508,7 +1510,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
suggested_vector_size = suggested_vector_size * 2;
}
suggested_vector_size =
KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
suggested_team_size =
this->handle->get_suggested_team_size(suggested_vector_size);
}
Expand Down Expand Up @@ -1767,6 +1769,8 @@ void KokkosSPGEMM<
<< " bnnz:" << bnnz << std::endl;
}
}
int max_vector_size =
KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
int suggested_vector_size =
this->handle->get_suggested_vector_size(brows, compressed_b_size);

Expand Down Expand Up @@ -1822,7 +1826,7 @@ void KokkosSPGEMM<
suggested_vector_size = suggested_vector_size * 2;
}
suggested_vector_size =
KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
suggested_team_size =
this->handle->get_suggested_team_size(suggested_vector_size);
}
Expand All @@ -1836,10 +1840,10 @@ void KokkosSPGEMM<
shmem_key_size + (shmem_key_size - thread_shmem_hash_size) / 3;
shmem_key_size = (shmem_key_size >> 1) << 1;
while (estimate_max_nnz > size_t(shmem_key_size) &&
suggested_vector_size < 32) {
suggested_vector_size < max_vector_size) {
suggested_vector_size = suggested_vector_size * 2;
suggested_vector_size =
KOKKOSKERNELS_MACRO_MIN(32, suggested_vector_size);
KOKKOSKERNELS_MACRO_MIN(max_vector_size, suggested_vector_size);
suggested_team_size =
this->handle->get_suggested_team_size(suggested_vector_size);
thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8;
Expand Down
15 changes: 7 additions & 8 deletions unit_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,13 @@ IF (KOKKOS_ENABLE_SYCL)
COMPONENTS batched
)

# FIXME_SYCL
#KOKKOSKERNELS_ADD_UNIT_TEST(
# sparse_sycl
# SOURCES
# Test_Main.cpp
# sycl/Test_SYCL_Sparse.cpp
# COMPONENTS sparse
#)
KOKKOSKERNELS_ADD_UNIT_TEST(
sparse_sycl
SOURCES
Test_Main.cpp
sycl/Test_SYCL_Sparse.cpp
COMPONENTS sparse
)

KOKKOSKERNELS_ADD_UNIT_TEST(
graph_sycl
Expand Down
3 changes: 3 additions & 0 deletions unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,8 @@ void test_replaceSumIntoLonger() {
test_replaceSumIntoLonger<SCALAR, ORDINAL, OFFSET, DEVICE>(); \
}

// FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name"
#ifndef KOKKOS_ENABLE_SYCL
#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \
defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
Expand Down Expand Up @@ -643,5 +645,6 @@ EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
#endif
#endif

#undef EXECUTE_TEST
6 changes: 3 additions & 3 deletions unit_test/sparse/Test_Sparse_spgemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,13 +343,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
else
res = run_spgemm<crsMat_t, device>(A, B, spgemm_algorithm, output_mat);
} catch (const char *message) {
EXPECT_TRUE(is_expected_to_fail) << algo;
EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
failed = true;
} catch (std::string message) {
EXPECT_TRUE(is_expected_to_fail) << algo;
EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
failed = true;
} catch (std::exception &e) {
EXPECT_TRUE(is_expected_to_fail) << algo;
EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what();
failed = true;
}
EXPECT_TRUE((failed == is_expected_to_fail));
Expand Down

0 comments on commit dcba5d0

Please sign in to comment.