Skip to content

Commit

Permalink
KokkosKernels: spiluk patch to improve numeric phase
Browse files Browse the repository at this point in the history
Patched from kokkos/kokkos-kernels#1390
Authored by Vinh Dang @vqd8a
  • Loading branch information
ndellingwood committed Apr 26, 2022
1 parent 97353f2 commit 6318ff0
Show file tree
Hide file tree
Showing 3 changed files with 243 additions and 58 deletions.
50 changes: 40 additions & 10 deletions packages/kokkos-kernels/src/sparse/KokkosSparse_spiluk_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,17 @@ class SPILUKHandle {
nnz_lno_view_t level_idx; // the list of rows in each level
nnz_lno_view_t
level_ptr; // the starting index (into the view level_idx) of each level
nnz_lno_view_t level_nchunks; // number of chunks of rows at each level
nnz_lno_view_t
level_nrowsperchunk; // maximum number of rows among chunks at each level

size_type nrows;
size_type nlevel;
size_type nlevels;
size_type nnzL;
size_type nnzU;
size_type level_maxrows; // maximum number of rows of levels
size_type level_maxrows; // max. number of rows among levels
size_type
level_maxrowsperchunk; // max.number of rows among chunks among levels

bool symbolic_complete;

Expand All @@ -121,11 +126,14 @@ class SPILUKHandle {
: level_list(),
level_idx(),
level_ptr(),
level_nchunks(),
level_nrowsperchunk(),
nrows(nrows_),
nlevel(0),
nlevels(0),
nnzL(nnzL_),
nnzU(nnzU_),
level_maxrows(0),
level_maxrowsperchunk(0),
symbolic_complete(symbolic_complete_),
algm(choice),
team_size(-1),
Expand All @@ -138,9 +146,11 @@ class SPILUKHandle {
set_nnzL(nnzL_);
set_nnzU(nnzU_);
set_level_maxrows(0);
level_list = nnz_row_view_t("level_list", nrows_),
level_idx = nnz_lno_view_t("level_idx", nrows_),
level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1),
set_level_maxrowsperchunk(0);
level_list = nnz_row_view_t("level_list", nrows_),
level_idx = nnz_lno_view_t("level_idx", nrows_),
level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1),
level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(),
reset_symbolic_complete();
}

Expand All @@ -159,6 +169,20 @@ class SPILUKHandle {
KOKKOS_INLINE_FUNCTION
nnz_lno_view_t get_level_ptr() const { return level_ptr; }

// Returns the level_nchunks view, which holds the number of chunks of rows at
// each level.
KOKKOS_INLINE_FUNCTION
nnz_lno_view_t get_level_nchunks() const { return level_nchunks; }

// Replaces level_nchunks with a newly constructed view labeled
// "level_nchunks", passing nlevels_ as its size argument.
// NOTE(review): the handle constructor leaves level_nchunks as an empty,
// default-constructed view, so presumably this must be called (once the number
// of levels is known) before the view is read -- confirm against the caller.
void alloc_level_nchunks(const size_type nlevels_) {
level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_);
}

// Returns the level_nrowsperchunk view, which holds the maximum number of rows
// among chunks at each level.
KOKKOS_INLINE_FUNCTION
nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; }

// Replaces level_nrowsperchunk with a newly constructed view labeled
// "level_nrowsperchunk", passing nlevels_ as its size argument.
// NOTE(review): the handle constructor leaves level_nrowsperchunk as an empty,
// default-constructed view, so presumably this must be called (once the number
// of levels is known) before the view is read -- confirm against the caller.
void alloc_level_nrowsperchunk(const size_type nlevels_) {
level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_);
}

// Returns the stored row count (nrows), which the constructor initializes from
// its nrows_ argument.
KOKKOS_INLINE_FUNCTION
size_type get_nrows() const { return nrows; }

Expand All @@ -185,10 +209,18 @@ class SPILUKHandle {
this->level_maxrows = level_maxrows_;
}

// Returns level_maxrowsperchunk: the maximum number of rows among chunks,
// taken over all levels. iluk_numeric uses this value as the first extent of
// its "iw" work view when the algorithm is SEQLVLSCHD_TP1.
KOKKOS_INLINE_FUNCTION
size_type get_level_maxrowsperchunk() const { return level_maxrowsperchunk; }

// Stores level_maxrowsperchunk_ as the handle's level_maxrowsperchunk (the
// maximum number of rows among chunks, taken over all levels). The constructor
// calls this with 0.
KOKKOS_INLINE_FUNCTION
void set_level_maxrowsperchunk(const size_type level_maxrowsperchunk_) {
this->level_maxrowsperchunk = level_maxrowsperchunk_;
}

// Returns the symbolic_complete flag (toggled by set_symbolic_complete() and
// reset_symbolic_complete()).
bool is_symbolic_complete() const { return symbolic_complete; }

size_type get_num_levels() const { return nlevel; }
void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; }
// Returns the stored number of levels (nlevels).
size_type get_num_levels() const { return nlevels; }
// Stores nlevels_ as the handle's number of levels (nlevels).
void set_num_levels(size_type nlevels_) { this->nlevels = nlevels_; }

// Sets the symbolic_complete flag to true.
void set_symbolic_complete() { this->symbolic_complete = true; }
// Sets the symbolic_complete flag to false. The handle constructor calls this
// at the end of its body.
void reset_symbolic_complete() { this->symbolic_complete = false; }
Expand All @@ -202,11 +234,9 @@ class SPILUKHandle {
void print_algorithm() {
if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP)
std::cout << "SEQLVLSCHD_RP" << std::endl;
;

if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1)
std::cout << "SEQLVLSCHD_TP1" << std::endl;
;

/*
if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,9 +323,9 @@ struct ILUKLvlSchedTP1NumericFunctor {
if (ipos != -1) {
auto lxu = -U_values(kk) * fact;
if (col < rowid)
L_values(ipos) += lxu;
Kokkos::atomic_add(&L_values(ipos), lxu);
else
U_values(ipos) += lxu;
Kokkos::atomic_add(&U_values(ipos), lxu);
}
}); // end for kk

Expand Down Expand Up @@ -383,28 +383,51 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
using size_type = typename IlukHandle::size_type;
using nnz_lno_t = typename IlukHandle::nnz_lno_t;
using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t;
using WorkViewType =
Kokkos::View<nnz_lno_t **, Kokkos::Device<execution_space, memory_space>>;
using LevelHostViewType = Kokkos::View<nnz_lno_t *, Kokkos::HostSpace>;

size_type nlevels = thandle.get_num_levels();
size_type nrows = thandle.get_nrows();

// Keep this as host View, create device version and copy to back to host
HandleDeviceEntriesType level_ptr = thandle.get_level_ptr();
// Keep these as host View, create device version and copy back to host
HandleDeviceEntriesType level_ptr = thandle.get_level_ptr();
HandleDeviceEntriesType level_idx = thandle.get_level_idx();
HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks();
HandleDeviceEntriesType level_nrowsperchunk =
thandle.get_level_nrowsperchunk();

// Make level_ptr_h a separate allocation, since it will be accessed on host
// between kernel launches. If a mirror were used and level_ptr is in UVM
// space, a fence would be required before each access since UVM views can
// share pages.
Kokkos::View<nnz_lno_t *, Kokkos::HostSpace> level_ptr_h(
LevelHostViewType level_ptr_h, level_nchunks_h, level_nrowsperchunk_h;
WorkViewType iw;

level_ptr_h = LevelHostViewType(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"),
level_ptr.extent(0));
Kokkos::deep_copy(level_ptr_h, level_ptr);

HandleDeviceEntriesType level_idx = thandle.get_level_idx();

using WorkViewType =
Kokkos::View<nnz_lno_t **, Kokkos::Device<execution_space, memory_space>>;

WorkViewType iw("iw", thandle.get_level_maxrows(), nrows);
Kokkos::deep_copy(iw, nnz_lno_t(-1));
if (thandle.get_algorithm() ==
KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) {
level_nchunks_h = LevelHostViewType(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"),
level_nchunks.extent(0));
level_nrowsperchunk_h =
LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing,
"Host level nrowsperchunk"),
level_nrowsperchunk.extent(0));
Kokkos::deep_copy(level_nchunks_h, level_nchunks);
Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk);
iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"),
thandle.get_level_maxrowsperchunk(), nrows);
Kokkos::deep_copy(iw, nnz_lno_t(-1));
} else {
iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"),
thandle.get_level_maxrows(), nrows);
Kokkos::deep_copy(iw, nnz_lno_t(-1));
}

// Main loop must be performed sequentially. Question: try out CUDA graphs to
// reduce kernel launch overhead?
Expand All @@ -429,20 +452,33 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
using policy_type = Kokkos::TeamPolicy<execution_space>;
int team_size = thandle.get_team_size();

ILUKLvlSchedTP1NumericFunctor<
ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType,
LValuesType, URowMapType, UEntriesType, UValuesType,
HandleDeviceEntriesType, WorkViewType, nnz_lno_t>
tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values,
U_row_map, U_entries, U_values, level_idx, iw, lev_start);
if (team_size == -1)
Kokkos::parallel_for("parfor_l_team",
policy_type(lev_end - lev_start, Kokkos::AUTO),
tstf);
else
Kokkos::parallel_for("parfor_l_team",
policy_type(lev_end - lev_start, team_size),
tstf);
nnz_lno_t lvl_rowid_start = 0;
nnz_lno_t lvl_nrows_chunk;
for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) {
if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) >
(lev_end - lev_start))
lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start;
else
lvl_nrows_chunk = level_nrowsperchunk_h(lvl);

ILUKLvlSchedTP1NumericFunctor<
ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType,
LValuesType, URowMapType, UEntriesType, UValuesType,
HandleDeviceEntriesType, WorkViewType, nnz_lno_t>
tstf(A_row_map, A_entries, A_values, L_row_map, L_entries,
L_values, U_row_map, U_entries, U_values, level_idx, iw,
lev_start + lvl_rowid_start);

if (team_size == -1)
Kokkos::parallel_for("parfor_l_team",
policy_type(lvl_nrows_chunk, Kokkos::AUTO),
tstf);
else
Kokkos::parallel_for("parfor_l_team",
policy_type(lvl_nrows_chunk, team_size), tstf);

lvl_rowid_start += lvl_nrows_chunk;
}
}
// /*
// // TP2 algorithm has issues with some offset-ordinal combo to be
Expand Down
Loading

0 comments on commit 6318ff0

Please sign in to comment.