Skip to content

Commit

Permalink
Move config to the first argument
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed May 25, 2021
1 parent 7805dff commit 8608a7d
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 37 deletions.
11 changes: 6 additions & 5 deletions dpcpp/components/prefix_sum.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ constexpr auto block_cfg_list =

GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum)
GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum,
BlockCfg, block_cfg_list)
block_cfg_list)

GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum,
finalize_prefix_sum)
GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum,
BlockCfg, block_cfg_list)
block_cfg_list)


template <typename IndexType>
Expand All @@ -81,13 +81,14 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts,
auto num_blocks = ceildiv(num_entries, wg_size);
Array<IndexType> block_sum_array(exec, num_blocks - 1);
auto block_sums = block_sum_array.get_data();
start_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), cfg,
start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(),
num_entries, counts, block_sums);
    // add the total sum of the previous block only when the number of blocks
    // is larger than 1.
if (num_blocks > 1) {
finalize_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(),
cfg, num_entries, counts, block_sums);
finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0,
exec->get_queue(), num_entries, counts,
block_sums);
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions dpcpp/components/reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config,
reduce_add_array);

GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config,
KCFG_1D, kcfg_1d_list);
kcfg_1d_list);


/**
Expand Down Expand Up @@ -277,15 +277,15 @@ ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,

block_results.resize_and_reset(grid_dim);

reduce_add_array_call(grid_dim, wg_size, 0, exec->get_queue(), cfg,
reduce_add_array_call(cfg, grid_dim, wg_size, 0, exec->get_queue(),
size, source, block_results.get_data());

block_results_val = block_results.get_const_data();
}

auto d_result = Array<ValueType>(exec, 1);

reduce_add_array_call(1, wg_size, 0, exec->get_queue(), cfg, grid_dim,
reduce_add_array_call(cfg, 1, wg_size, 0, exec->get_queue(), grid_dim,
block_results_val, d_result.get_data());
answer = exec->copy_val_to_host(d_result.get_const_data());
return answer;
Expand Down
52 changes: 23 additions & 29 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,6 @@ void compute_partial_reduce(
}
}

// GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_reduce_config,
// compute_partial_reduce);
// GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_reduce_call,
// compute_partial_reduce_config,
// KCFG_1D, kcfg_1d_list);

template <ConfigSetType cfg = KCFG_1D::encode(256, 32), typename ValueType,
typename CallableReduce, typename CallableFinalize>
Expand Down Expand Up @@ -267,7 +262,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot,
compute_partial_dot)
GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot,
KCFG_1D, kcfg_1d_list)
kcfg_1d_list)


template <ConfigSetType cfg = KCFG_1D::encode(256, 32), typename ValueType>
Expand Down Expand Up @@ -309,7 +304,7 @@ void finalize_dot_computation(dim3 grid, dim3 block,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation,
finalize_dot_computation)
GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call,
finalize_dot_computation, KCFG_1D, kcfg_1d_list)
finalize_dot_computation, kcfg_1d_list)


template <ConfigSetType cfg = KCFG_1D::encode(256, 32), typename ValueType>
Expand Down Expand Up @@ -353,7 +348,7 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2,
compute_partial_norm2)
GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call,
compute_partial_norm2, KCFG_1D, kcfg_1d_list)
compute_partial_norm2, kcfg_1d_list)


template <ConfigSetType cfg = KCFG_1D::encode(256, 32), typename ValueType>
Expand Down Expand Up @@ -395,8 +390,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation,
finalize_norm2_computation)
GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call,
finalize_norm2_computation, KCFG_1D,
kcfg_1d_list)
finalize_norm2_computation, kcfg_1d_list)


template <typename ValueType, typename IndexType>
Expand Down Expand Up @@ -452,7 +446,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride,
GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row)
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row)
GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row,
KCFG_1D, kcfg_1d_list)
kcfg_1d_list)


template <typename ValueType, typename IndexType>
Expand Down Expand Up @@ -552,7 +546,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths)
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths,
calculate_slice_lengths)
GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call,
calculate_slice_lengths, KCFG_1D, kcfg_1d_list)
calculate_slice_lengths, kcfg_1d_list)


template <typename ValueType, typename IndexType>
Expand Down Expand Up @@ -627,7 +621,7 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory,
}

GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz);
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, KCFG_1D,
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz,
kcfg_1d_list)

template <ConfigSetType cfg>
Expand Down Expand Up @@ -666,7 +660,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice,
reduce_max_nnz_per_slice)
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call,
reduce_max_nnz_per_slice, KCFG_1D, kcfg_1d_list)
reduce_max_nnz_per_slice, kcfg_1d_list)


template <ConfigSetType cfg>
Expand Down Expand Up @@ -708,7 +702,7 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory,
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols,
reduce_total_cols);
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols,
KCFG_1D, kcfg_1d_list)
kcfg_1d_list)


template <typename IndexType, typename ValueType>
Expand Down Expand Up @@ -1109,11 +1103,11 @@ void compute_dot(std::shared_ptr<const DpcppExecutor> exec,
// TODO: write a kernel which does this more efficiently
for (size_type col = 0; col < x->get_size()[1]; ++col) {
kernel::compute_partial_dot_call(
grid_dim, block_dim, 0, exec->get_queue(), cfg,
cfg, grid_dim, block_dim, 0, exec->get_queue(),
x->get_size()[0], x->get_const_values() + col, x->get_stride(),
y->get_const_values() + col, y->get_stride(), work.get_data());
kernel::finalize_dot_computation_call(
1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x,
cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
work.get_const_data(), result->get_values() + col);
}
}
Expand Down Expand Up @@ -1166,11 +1160,11 @@ void compute_norm2(std::shared_ptr<const DpcppExecutor> exec,
// TODO: write a kernel which does this more efficiently
for (size_type col = 0; col < x->get_size()[1]; ++col) {
kernel::compute_partial_norm2_call(
grid_dim, block_dim, 0, exec->get_queue(), cfg,
cfg, grid_dim, block_dim, 0, exec->get_queue(),
x->get_size()[0], x->get_const_values() + col, x->get_stride(),
work.get_data());
kernel::finalize_norm2_computation_call(
1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x,
cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
work.get_const_data(), result->get_values() + col);
}
}
Expand Down Expand Up @@ -1237,9 +1231,9 @@ void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
const auto rows_per_block = ceildiv(wg_size, sg_size);
const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block);

kernel::count_nnz_per_row_call(grid_dim_nnz, wg_size, 0, exec->get_queue(),
cfg, num_rows, num_cols, stride,
source->get_const_values(), row_ptrs);
kernel::count_nnz_per_row_call(
cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols,
stride, source->get_const_values(), row_ptrs);

components::prefix_sum(exec, row_ptrs, num_rows + 1);

Expand Down Expand Up @@ -1330,7 +1324,7 @@ void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,

if (grid_dim > 0) {
kernel::calculate_slice_lengths_call(
grid_dim, sg_size, 0, exec->get_queue(), cfg, num_rows, slice_size,
cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size,
slice_num, stride_factor, nnz_per_row.get_const_data(),
slice_lengths, slice_sets);
}
Expand Down Expand Up @@ -1398,13 +1392,13 @@ void calculate_max_nnz_per_row(std::shared_ptr<const DpcppExecutor> exec,
auto block_results = Array<size_type>(exec, grid_dim);

kernel::reduce_max_nnz_call(
grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg,
cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(),
num_rows, nnz_per_row.get_const_data(), block_results.get_data());

auto d_result = Array<size_type>(exec, 1);

kernel::reduce_max_nnz_call(
1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg,
cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(),
grid_dim, block_results.get_const_data(), d_result.get_data());

*result = exec->copy_val_to_host(d_result.get_const_data());
Expand Down Expand Up @@ -1434,7 +1428,7 @@ void calculate_nonzeros_per_row(std::shared_ptr<const DpcppExecutor> exec,
const dim3 grid_size(grid_x, 1, 1);
if (grid_x > 0) {
kernel::count_nnz_per_row_call(
grid_size, block_size, 0, exec->get_queue(), cfg,
cfg, grid_size, block_size, 0, exec->get_queue(),
source->get_size()[0], source->get_size()[1], source->get_stride(),
source->get_const_values(), result->get_data());
}
Expand Down Expand Up @@ -1478,22 +1472,22 @@ void calculate_total_cols(std::shared_ptr<const DpcppExecutor> exec,
auto grid_dim = ceildiv(slice_num * sg_size, wg_size);

kernel::reduce_max_nnz_per_slice_call(
grid_dim, wg_size, 0, exec->get_queue(), cfg, num_rows, slice_size,
cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size,
stride_factor, nnz_per_row.get_const_data(),
max_nnz_per_slice.get_data());

grid_dim = ceildiv(slice_num, wg_size);
auto block_results = Array<size_type>(exec, grid_dim);

kernel::reduce_total_cols_call(
grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg,
cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(),
slice_num, max_nnz_per_slice.get_const_data(),
block_results.get_data());

auto d_result = Array<size_type>(exec, 1);

kernel::reduce_total_cols_call(
1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg,
cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(),
grid_dim, block_results.get_const_data(), d_result.get_data());

*result = exec->copy_val_to_host(d_result.get_const_data());
Expand Down

0 comments on commit 8608a7d

Please sign in to comment.