Change norm calculation for GPUs to NonComplex
All norm computations now produce non-complex (real-valued) results.
Thomas Grützmacher committed Jun 17, 2020
1 parent c0615c9 commit 335967b
Showing 15 changed files with 228 additions and 165 deletions.
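
As a rough illustration of what the "NonComplex" result type means for the interfaces changed below (a sketch using a hand-rolled trait, not the Ginkgo implementation): the norm of a vector of std::complex<double> is now stored as a plain double, because remove_complex<T> strips the complex wrapper and leaves real types unchanged.

// Illustrative sketch only -- simplified, not code from this commit.
#include <complex>
#include <type_traits>

template <typename T>
struct remove_complex_impl {
    using type = T;
};
template <typename T>
struct remove_complex_impl<std::complex<T>> {
    using type = T;
};
template <typename T>
using remove_complex = typename remove_complex_impl<T>::type;

// A norm result for complex data is now real-valued:
static_assert(std::is_same<remove_complex<std::complex<double>>, double>::value,
              "complex value types map to their real counterpart");
static_assert(std::is_same<remove_complex<float>, float>::value,
              "real value types are unchanged");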
90 changes: 67 additions & 23 deletions common/matrix/dense_kernels.hpp.inc
@@ -73,11 +73,12 @@ __global__ __launch_bounds__(block_size) void add_scaled(
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
template <size_type block_size, typename OutType, typename CallableGetValue,
typename CallableReduce>
__device__ void compute_partial_reduce(size_type num_rows,
OutType *__restrict__ work,
CallableGetValue get_value,
CallableReduce reduce_op)
{
constexpr auto warps_per_block = block_size / config::warp_size;

@@ -86,52 +87,95 @@ __global__ __launch_bounds__(block_size) void compute_partial_dot(
const auto global_id =
thread::get_thread_id<config::warp_size, warps_per_block>();

auto tmp = zero<ValueType>();
auto tmp = zero<OutType>();
for (auto i = global_id; i < num_rows; i += block_size * num_blocks) {
tmp += x[i * stride_x] * y[i * stride_y];
tmp = reduce_op(tmp, get_value(i));
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
__shared__ UninitializedArray<OutType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce(group::this_thread_block(), static_cast<OutType *>(tmp_work),
reduce_op);

if (local_id == 0) {
work[thread::get_block_id()] = tmp_work[0];
}
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
template <size_type block_size, typename ValueType, typename CallableReduce,
typename CallableFinalize>
__device__ void finalize_reduce_computation(size_type size,
const ValueType *work,
ValueType *result,
CallableReduce reduce_op,
CallableFinalize finalize_op)
{
const auto local_id = thread::get_local_thread_id<config::warp_size>();

ValueType tmp = zero<ValueType>();
for (auto i = local_id; i < size; i += block_size) {
tmp += work[i];
tmp = reduce_op(tmp, work[i]);
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce_op);

if (local_id == 0) {
*result = tmp_work[0];
*result = finalize_op(tmp_work[0]);
}
}


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void compute_sqrt(
size_type num_cols, ValueType *__restrict__ work)
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
work[tidx] = sqrt(abs(work[tidx]));
}
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x, y, stride_y](size_type i) {
return x[i * stride_x] * conj(y[i * stride_y]);
},
[](const ValueType &x, const ValueType &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return x; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_norm2(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
remove_complex<ValueType> *__restrict__ work)
{
using norm_type = remove_complex<ValueType>;
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); },
[](const norm_type &x, const norm_type &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_norm2_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return sqrt(x); });
}
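
Taken together, the two norm kernels above compute the Euclidean norm ||x||_2 = sqrt(sum_i |x_i|^2): compute_partial_norm2 accumulates squared_norm(x[i]) into one real-valued partial sum per block, and finalize_norm2_computation adds the partial sums and applies sqrt. A sequential reference of the same computation might look like this (a sketch only; squared_norm(z) of a complex z is re(z)^2 + im(z)^2):

// Sequential reference for the two-phase norm2 reduction (illustrative sketch).
#include <cmath>
#include <complex>
#include <vector>

template <typename T>
T squared_norm_ref(const std::complex<T> &z)
{
    // |z|^2 = re^2 + im^2 -- already a real number
    return z.real() * z.real() + z.imag() * z.imag();
}

template <typename T>
T norm2_ref(const std::vector<std::complex<T>> &x)
{
    T sum = T{0};
    for (const auto &xi : x) {
        sum += squared_norm_ref(xi);  // role of compute_partial_norm2
    }
    return std::sqrt(sum);  // role of finalize_norm2_computation
}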


35 changes: 7 additions & 28 deletions common/solver/gmres_kernels.hpp.inc
@@ -65,35 +65,12 @@ __global__ __launch_bounds__(block_size) void initialize_1_kernel(
}


// Must be called with at least `num_rows * stride_krylov` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_1_kernel(
size_type num_rows, size_type num_rhs, size_type krylov_dim,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_nc)
{
const auto global_id = thread::get_thread_id_flat();
const auto row_idx = global_id / stride_krylov;
const auto col_idx = global_id % stride_krylov;
const auto krylov_bases_nrows = (krylov_dim + 1) * num_rows;
if (row_idx < krylov_bases_nrows && col_idx < num_rhs) {
krylov_bases[row_idx * stride_krylov + col_idx] = zero<ValueType>();
}

if (row_idx < krylov_dim + 1 && col_idx < num_rhs) {
residual_norm_collection[row_idx * stride_residual_nc + col_idx] =
zero<ValueType>();
}
}


// Must be called with at least `num_rows * num_rhs` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
size_type num_rows, size_type num_rhs,
const ValueType *__restrict__ residual, size_type stride_residual,
const ValueType *__restrict__ residual_norm,
const remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
size_type *__restrict__ final_iter_nums)
@@ -293,8 +270,10 @@ template <typename ValueType>
__device__ void calculate_residual_norm_kernel(
size_type col_idx, size_type num_cols, size_type iter,
const ValueType &register_sin, const ValueType &register_cos,
ValueType *residual_norm, ValueType *residual_norm_collection,
size_type stride_residual_norm_collection, const ValueType *b_norm)
remove_complex<ValueType> *residual_norm,
ValueType *residual_norm_collection,
size_type stride_residual_norm_collection,
const remove_complex<ValueType> *b_norm)
{
const auto this_rnc =
residual_norm_collection[iter * stride_residual_norm_collection +
@@ -315,10 +294,10 @@ __global__ __launch_bounds__(block_size) void givens_rotation_kernel(
ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
ValueType *__restrict__ givens_sin, size_type stride_sin,
ValueType *__restrict__ givens_cos, size_type stride_cos,
ValueType *__restrict__ residual_norm,
remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_norm_collection,
const ValueType *__restrict__ b_norm,
const remove_complex<ValueType> *__restrict__ b_norm,
const stopping_status *__restrict__ stop_status)
{
const auto col_idx = thread::get_thread_id_flat();
22 changes: 3 additions & 19 deletions cuda/base/cublas_bindings.hpp
@@ -214,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
#undef GKO_BIND_CUBLAS_DOT


#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
{ \
cudaMemset(result, 0, sizeof(ValueType)); \
GKO_ASSERT_NO_CUBLAS_ERRORS( \
CublasName(handle, n, as_culibs_type(x), incx, \
reinterpret_cast<remove_complex<ValueType> *>( \
as_culibs_type(result)))); \
} \
static_assert(true, \
"This assert is used to counter the false positive extra " \
"semi-colon warnings")


#define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
int incx, remove_complex<ValueType> *result) \
{ \
GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x), \
incx, as_culibs_type(result))); \
@@ -243,12 +228,11 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);

GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2);
GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<double>, cublasDznrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<double>, cublasDznrm2);
template <typename ValueType>
GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);

#undef GKO_BIND_CUBLAS_COMPLEX_NORM2
#undef GKO_BIND_CUBLAS_NORM2
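
With the complex-specific macro removed, the same GKO_BIND_CUBLAS_NORM2 body now serves the complex bindings as well. Hand-expanding it for std::complex<float> (for reference only; the types follow directly from the macro body shown above) gives a result pointer of type remove_complex<std::complex<float>> *, i.e. float *, which is exactly what cublasScnrm2 writes, so the earlier cudaMemset workaround is no longer needed:

// Hand-expanded sketch of GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2):
inline void norm2(cublasHandle_t handle, int n, const std::complex<float> *x,
                  int incx, float *result)  // float == remove_complex<std::complex<float>>
{
    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasScnrm2(handle, n, as_culibs_type(x),
                                             incx, as_culibs_type(result)));
}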


29 changes: 22 additions & 7 deletions cuda/matrix/dense_kernels.cu
@@ -213,7 +213,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
template <typename ValueType>
void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *x,
matrix::Dense<ValueType> *result)
matrix::Dense<remove_complex<ValueType>> *result)
{
if (cublas::is_supported<ValueType>::value) {
for (size_type col = 0; col < x->get_size()[1]; ++col) {
@@ -222,12 +222,27 @@ void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
result->get_values() + col);
}
} else {
compute_dot(exec, x, x, result);
const dim3 block_size(default_block_size, 1, 1);
const dim3 grid_size(ceildiv(result->get_size()[1], block_size.x), 1,
1);
kernel::compute_sqrt<<<grid_size, block_size, 0, 0>>>(
result->get_size()[1], as_cuda_type(result->get_values()));
using norm_type = remove_complex<ValueType>;
// TODO: these are tuning parameters obtained experimentally, once
// we decide how to handle this uniformly, they should be modified
// appropriately
constexpr auto work_per_thread = 32;
constexpr auto block_size = 1024;

constexpr auto work_per_block = work_per_thread * block_size;
const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
const dim3 block_dim{config::warp_size, 1,
block_size / config::warp_size};
Array<norm_type> work(exec, grid_dim.x);
// TODO: write a kernel which does this more efficiently
for (size_type col = 0; col < x->get_size()[1]; ++col) {
kernel::compute_partial_norm2<block_size><<<grid_dim, block_dim>>>(
x->get_size()[0], as_cuda_type(x->get_const_values() + col),
x->get_stride(), as_cuda_type(work.get_data()));
kernel::finalize_norm2_computation<block_size><<<1, block_dim>>>(
grid_dim.x, as_cuda_type(work.get_const_data()),
as_cuda_type(result->get_values() + col));
}
}
}
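
To get a feel for the launch configuration chosen above (the numbers follow from the constants in the snippet; config::warp_size is assumed to be 32): each block reduces work_per_block = 32 * 1024 = 32768 rows, so a column of one million rows produces ceildiv(1000000, 32768) = 31 block-level partial sums, which the single-block finalize_norm2_computation kernel then combines.

// Sketch of the launch-size arithmetic (assumes a warp size of 32).
#include <cstdio>

int main()
{
    const long long warp_size = 32;
    const long long work_per_thread = 32;
    const long long block_size = 1024;
    const long long work_per_block = work_per_thread * block_size;  // 32768 rows per block

    const long long num_rows = 1000000;  // example column length
    const long long grid_x =
        (num_rows + work_per_block - 1) / work_per_block;  // ceildiv -> 31
    std::printf("grid = %lld block(s), block dim = (%lld, 1, %lld)\n", grid_x,
                warp_size, block_size / warp_size);
}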

16 changes: 8 additions & 8 deletions cuda/solver/gmres_kernels.cu
@@ -78,7 +78,7 @@ constexpr int default_dot_size = default_dot_dim * default_dot_dim;
template <typename ValueType>
void initialize_1(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *b,
matrix::Dense<ValueType> *b_norm,
matrix::Dense<remove_complex<ValueType>> *b_norm,
matrix::Dense<ValueType> *residual,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
@@ -106,7 +106,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
template <typename ValueType>
void initialize_2(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *residual,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
matrix::Dense<ValueType> *krylov_bases,
Array<size_type> *final_iter_nums, size_type krylov_dim)
@@ -212,10 +212,10 @@ void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
matrix::Dense<ValueType> *hessenberg_iter,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
const matrix::Dense<ValueType> *b_norm, size_type iter,
const Array<stopping_status> *stop_status)
const matrix::Dense<remove_complex<ValueType>> *b_norm,
size_type iter, const Array<stopping_status> *stop_status)
{
// TODO: tune block_size for optimal performance
constexpr auto block_size = default_block_size;
@@ -241,12 +241,12 @@ template <typename ValueType>
void step_1(std::shared_ptr<const CudaExecutor> exec, size_type num_rows,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
matrix::Dense<ValueType> *krylov_bases,
matrix::Dense<ValueType> *hessenberg_iter,
const matrix::Dense<ValueType> *b_norm, size_type iter,
Array<size_type> *final_iter_nums,
const matrix::Dense<remove_complex<ValueType>> *b_norm,
size_type iter, Array<size_type> *final_iter_nums,
const Array<stopping_status> *stop_status)
{
increase_final_iteration_numbers_kernel<<<
13 changes: 8 additions & 5 deletions cuda/stop/residual_norm_reduction_kernels.cu
@@ -60,15 +60,15 @@ constexpr int default_block_size = 512;
template <typename ValueType>
__global__
__launch_bounds__(default_block_size) void residual_norm_reduction_kernel(
size_type num_cols, remove_complex<ValueType> rel_residual_goal,
size_type num_cols, ValueType rel_residual_goal,
const ValueType *__restrict__ tau,
const ValueType *__restrict__ orig_tau, uint8 stoppingId,
bool setFinalized, stopping_status *__restrict__ stop_status,
bool *__restrict__ device_storage)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
if (abs(tau[tidx]) < rel_residual_goal * abs(orig_tau[tidx])) {
if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) {
stop_status[tidx].converge(stoppingId, setFinalized);
device_storage[1] = true;
}
@@ -93,12 +93,14 @@ template <typename ValueType>
void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *tau,
const matrix::Dense<ValueType> *orig_tau,
remove_complex<ValueType> rel_residual_goal,
uint8 stoppingId, bool setFinalized,
ValueType rel_residual_goal, uint8 stoppingId,
bool setFinalized,
Array<stopping_status> *stop_status,
Array<bool> *device_storage, bool *all_converged,
bool *one_changed)
{
static_assert(is_complex_s<ValueType>::value == false,
"ValueType must not be complex in this function!");
init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data()));

const dim3 block_size(default_block_size, 1, 1);
@@ -116,7 +118,8 @@ void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
*one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);


} // namespace residual_norm_reduction
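
Because tau and orig_tau now hold real-valued norms (which the static_assert above enforces), the per-column convergence test in the kernel is a plain comparison with no abs() involved. A host-side sketch of the same criterion (illustrative only, not part of the commit):

// Per-column check: ||r|| < rel_residual_goal * ||r_0||, all quantities real.
template <typename RealType>
bool column_converged(RealType tau, RealType orig_tau, RealType rel_residual_goal)
{
    return tau < rel_residual_goal * orig_tau;
}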