Change norm calculation for GPUs to NonComplex
All norm computations now produce non-complex (real-valued) results.
Thomas Grützmacher committed Jun 17, 2020
1 parent c0615c9 commit 335967b
Showing 15 changed files with 228 additions and 165 deletions.
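
As a rough illustration of what the "NonComplex" result type means for the interfaces changed below (a sketch using a hand-rolled trait, not the Ginkgo implementation): the norm of a vector of std::complex<double> is now stored as a plain double, because remove_complex<T> strips the complex wrapper and leaves real types unchanged.

// Illustrative sketch only -- simplified, not code from this commit.
#include <complex>
#include <type_traits>

template <typename T>
struct remove_complex_impl {
    using type = T;
};
template <typename T>
struct remove_complex_impl<std::complex<T>> {
    using type = T;
};
template <typename T>
using remove_complex = typename remove_complex_impl<T>::type;

// A norm result for complex data is now real-valued:
static_assert(std::is_same<remove_complex<std::complex<double>>, double>::value,
              "complex value types map to their real counterpart");
static_assert(std::is_same<remove_complex<float>, float>::value,
              "real value types are unchanged");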
90 changes: 67 additions & 23 deletions common/matrix/dense_kernels.hpp.inc
@@ -73,11 +73,12 @@ __global__ __launch_bounds__(block_size) void add_scaled(
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
template <size_type block_size, typename OutType, typename CallableGetValue,
typename CallableReduce>
__device__ void compute_partial_reduce(size_type num_rows,
OutType *__restrict__ work,
CallableGetValue get_value,
CallableReduce reduce_op)
{
constexpr auto warps_per_block = block_size / config::warp_size;

@@ -86,52 +87,95 @@ __global__ __launch_bounds__(block_size) void compute_partial_dot(
const auto global_id =
thread::get_thread_id<config::warp_size, warps_per_block>();

auto tmp = zero<ValueType>();
auto tmp = zero<OutType>();
for (auto i = global_id; i < num_rows; i += block_size * num_blocks) {
tmp += x[i * stride_x] * y[i * stride_y];
tmp = reduce_op(tmp, get_value(i));
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
__shared__ UninitializedArray<OutType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce(group::this_thread_block(), static_cast<OutType *>(tmp_work),
reduce_op);

if (local_id == 0) {
work[thread::get_block_id()] = tmp_work[0];
}
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
template <size_type block_size, typename ValueType, typename CallableReduce,
typename CallableFinalize>
__device__ void finalize_reduce_computation(size_type size,
const ValueType *work,
ValueType *result,
CallableReduce reduce_op,
CallableFinalize finalize_op)
{
const auto local_id = thread::get_local_thread_id<config::warp_size>();

ValueType tmp = zero<ValueType>();
for (auto i = local_id; i < size; i += block_size) {
tmp += work[i];
tmp = reduce_op(tmp, work[i]);
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce_op);

if (local_id == 0) {
*result = tmp_work[0];
*result = finalize_op(tmp_work[0]);
}
}


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void compute_sqrt(
size_type num_cols, ValueType *__restrict__ work)
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
work[tidx] = sqrt(abs(work[tidx]));
}
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x, y, stride_y](size_type i) {
return x[i * stride_x] * conj(y[i * stride_y]);
},
[](const ValueType &x, const ValueType &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return x; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_norm2(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
remove_complex<ValueType> *__restrict__ work)
{
using norm_type = remove_complex<ValueType>;
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); },
[](const norm_type &x, const norm_type &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_norm2_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return sqrt(x); });
}
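
Taken together, the two norm kernels above compute the Euclidean norm ||x||_2 = sqrt(sum_i |x_i|^2): compute_partial_norm2 accumulates squared_norm(x[i]) into one real-valued partial sum per block, and finalize_norm2_computation adds the partial sums and applies sqrt. A sequential reference of the same computation might look like this (a sketch only; squared_norm(z) of a complex z is re(z)^2 + im(z)^2):

// Sequential reference for the two-phase norm2 reduction (illustrative sketch).
#include <cmath>
#include <complex>
#include <vector>

template <typename T>
T squared_norm_ref(const std::complex<T> &z)
{
    // |z|^2 = re^2 + im^2 -- already a real number
    return z.real() * z.real() + z.imag() * z.imag();
}

template <typename T>
T norm2_ref(const std::vector<std::complex<T>> &x)
{
    T sum = T{0};
    for (const auto &xi : x) {
        sum += squared_norm_ref(xi);  // role of compute_partial_norm2
    }
    return std::sqrt(sum);  // role of finalize_norm2_computation
}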


35 changes: 7 additions & 28 deletions common/solver/gmres_kernels.hpp.inc
@@ -65,35 +65,12 @@ __global__ __launch_bounds__(block_size) void initialize_1_kernel(
}


// Must be called with at least `num_rows * stride_krylov` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_1_kernel(
size_type num_rows, size_type num_rhs, size_type krylov_dim,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_nc)
{
const auto global_id = thread::get_thread_id_flat();
const auto row_idx = global_id / stride_krylov;
const auto col_idx = global_id % stride_krylov;
const auto krylov_bases_nrows = (krylov_dim + 1) * num_rows;
if (row_idx < krylov_bases_nrows && col_idx < num_rhs) {
krylov_bases[row_idx * stride_krylov + col_idx] = zero<ValueType>();
}

if (row_idx < krylov_dim + 1 && col_idx < num_rhs) {
residual_norm_collection[row_idx * stride_residual_nc + col_idx] =
zero<ValueType>();
}
}


// Must be called with at least `num_rows * num_rhs` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
size_type num_rows, size_type num_rhs,
const ValueType *__restrict__ residual, size_type stride_residual,
const ValueType *__restrict__ residual_norm,
const remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
size_type *__restrict__ final_iter_nums)
@@ -293,8 +270,10 @@ template <typename ValueType>
__device__ void calculate_residual_norm_kernel(
size_type col_idx, size_type num_cols, size_type iter,
const ValueType &register_sin, const ValueType &register_cos,
ValueType *residual_norm, ValueType *residual_norm_collection,
size_type stride_residual_norm_collection, const ValueType *b_norm)
remove_complex<ValueType> *residual_norm,
ValueType *residual_norm_collection,
size_type stride_residual_norm_collection,
const remove_complex<ValueType> *b_norm)
{
const auto this_rnc =
residual_norm_collection[iter * stride_residual_norm_collection +
@@ -315,10 +294,10 @@ __global__ __launch_bounds__(block_size) void givens_rotation_kernel(
ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
ValueType *__restrict__ givens_sin, size_type stride_sin,
ValueType *__restrict__ givens_cos, size_type stride_cos,
ValueType *__restrict__ residual_norm,
remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_norm_collection,
const ValueType *__restrict__ b_norm,
const remove_complex<ValueType> *__restrict__ b_norm,
const stopping_status *__restrict__ stop_status)
{
const auto col_idx = thread::get_thread_id_flat();
22 changes: 3 additions & 19 deletions cuda/base/cublas_bindings.hpp
@@ -214,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
#undef GKO_BIND_CUBLAS_DOT


#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
{ \
cudaMemset(result, 0, sizeof(ValueType)); \
GKO_ASSERT_NO_CUBLAS_ERRORS( \
CublasName(handle, n, as_culibs_type(x), incx, \
reinterpret_cast<remove_complex<ValueType> *>( \
as_culibs_type(result)))); \
} \
static_assert(true, \
"This assert is used to counter the false positive extra " \
"semi-colon warnings")


#define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
int incx, remove_complex<ValueType> *result) \
{ \
GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x), \
incx, as_culibs_type(result))); \
@@ -243,12 +228,11 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);

GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2);
GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<double>, cublasDznrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<double>, cublasDznrm2);
template <typename ValueType>
GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);

#undef GKO_BIND_CUBLAS_COMPLEX_NORM2
#undef GKO_BIND_CUBLAS_NORM2
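
With the complex-specific macro removed, the same GKO_BIND_CUBLAS_NORM2 body now serves the complex bindings as well. Hand-expanding it for std::complex<float> (for reference only; the types follow directly from the macro body shown above) gives a result pointer of type remove_complex<std::complex<float>> *, i.e. float *, which is exactly what cublasScnrm2 writes, so the earlier cudaMemset workaround is no longer needed:

// Hand-expanded sketch of GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2):
inline void norm2(cublasHandle_t handle, int n, const std::complex<float> *x,
                  int incx, float *result)  // float == remove_complex<std::complex<float>>
{
    GKO_ASSERT_NO_CUBLAS_ERRORS(cublasScnrm2(handle, n, as_culibs_type(x),
                                             incx, as_culibs_type(result)));
}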


29 changes: 22 additions & 7 deletions cuda/matrix/dense_kernels.cu
@@ -213,7 +213,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
template <typename ValueType>
void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *x,
matrix::Dense<ValueType> *result)
matrix::Dense<remove_complex<ValueType>> *result)
{
if (cublas::is_supported<ValueType>::value) {
for (size_type col = 0; col < x->get_size()[1]; ++col) {
@@ -222,12 +222,27 @@ void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
result->get_values() + col);
}
} else {
compute_dot(exec, x, x, result);
const dim3 block_size(default_block_size, 1, 1);
const dim3 grid_size(ceildiv(result->get_size()[1], block_size.x), 1,
1);
kernel::compute_sqrt<<<grid_size, block_size, 0, 0>>>(
result->get_size()[1], as_cuda_type(result->get_values()));
using norm_type = remove_complex<ValueType>;
// TODO: these are tuning parameters obtained experimentally, once
// we decide how to handle this uniformly, they should be modified
// appropriately
constexpr auto work_per_thread = 32;
constexpr auto block_size = 1024;

constexpr auto work_per_block = work_per_thread * block_size;
const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
const dim3 block_dim{config::warp_size, 1,
block_size / config::warp_size};
Array<norm_type> work(exec, grid_dim.x);
// TODO: write a kernel which does this more efficiently
for (size_type col = 0; col < x->get_size()[1]; ++col) {
kernel::compute_partial_norm2<block_size><<<grid_dim, block_dim>>>(
x->get_size()[0], as_cuda_type(x->get_const_values() + col),
x->get_stride(), as_cuda_type(work.get_data()));
kernel::finalize_norm2_computation<block_size><<<1, block_dim>>>(
grid_dim.x, as_cuda_type(work.get_const_data()),
as_cuda_type(result->get_values() + col));
}
}
}
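
To get a feel for the launch configuration chosen above (the numbers follow from the constants in the snippet; config::warp_size is assumed to be 32): each block reduces work_per_block = 32 * 1024 = 32768 rows, so a column of one million rows produces ceildiv(1000000, 32768) = 31 block-level partial sums, which the single-block finalize_norm2_computation kernel then combines.

// Sketch of the launch-size arithmetic (assumes a warp size of 32).
#include <cstdio>

int main()
{
    const long long warp_size = 32;
    const long long work_per_thread = 32;
    const long long block_size = 1024;
    const long long work_per_block = work_per_thread * block_size;  // 32768 rows per block

    const long long num_rows = 1000000;  // example column length
    const long long grid_x =
        (num_rows + work_per_block - 1) / work_per_block;  // ceildiv -> 31
    std::printf("grid = %lld block(s), block dim = (%lld, 1, %lld)\n", grid_x,
                warp_size, block_size / warp_size);
}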

16 changes: 8 additions & 8 deletions cuda/solver/gmres_kernels.cu
@@ -78,7 +78,7 @@ constexpr int default_dot_size = default_dot_dim * default_dot_dim;
template <typename ValueType>
void initialize_1(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *b,
matrix::Dense<ValueType> *b_norm,
matrix::Dense<remove_complex<ValueType>> *b_norm,
matrix::Dense<ValueType> *residual,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
@@ -106,7 +106,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL);
template <typename ValueType>
void initialize_2(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *residual,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
matrix::Dense<ValueType> *krylov_bases,
Array<size_type> *final_iter_nums, size_type krylov_dim)
@@ -212,10 +212,10 @@ void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
matrix::Dense<ValueType> *hessenberg_iter,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
const matrix::Dense<ValueType> *b_norm, size_type iter,
const Array<stopping_status> *stop_status)
const matrix::Dense<remove_complex<ValueType>> *b_norm,
size_type iter, const Array<stopping_status> *stop_status)
{
// TODO: tune block_size for optimal performance
constexpr auto block_size = default_block_size;
@@ -241,12 +241,12 @@ template <typename ValueType>
void step_1(std::shared_ptr<const CudaExecutor> exec, size_type num_rows,
matrix::Dense<ValueType> *givens_sin,
matrix::Dense<ValueType> *givens_cos,
matrix::Dense<ValueType> *residual_norm,
matrix::Dense<remove_complex<ValueType>> *residual_norm,
matrix::Dense<ValueType> *residual_norm_collection,
matrix::Dense<ValueType> *krylov_bases,
matrix::Dense<ValueType> *hessenberg_iter,
const matrix::Dense<ValueType> *b_norm, size_type iter,
Array<size_type> *final_iter_nums,
const matrix::Dense<remove_complex<ValueType>> *b_norm,
size_type iter, Array<size_type> *final_iter_nums,
const Array<stopping_status> *stop_status)
{
increase_final_iteration_numbers_kernel<<<
13 changes: 8 additions & 5 deletions cuda/stop/residual_norm_reduction_kernels.cu
@@ -60,15 +60,15 @@ constexpr int default_block_size = 512;
template <typename ValueType>
__global__
__launch_bounds__(default_block_size) void residual_norm_reduction_kernel(
size_type num_cols, remove_complex<ValueType> rel_residual_goal,
size_type num_cols, ValueType rel_residual_goal,
const ValueType *__restrict__ tau,
const ValueType *__restrict__ orig_tau, uint8 stoppingId,
bool setFinalized, stopping_status *__restrict__ stop_status,
bool *__restrict__ device_storage)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
if (abs(tau[tidx]) < rel_residual_goal * abs(orig_tau[tidx])) {
if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) {
stop_status[tidx].converge(stoppingId, setFinalized);
device_storage[1] = true;
}
@@ -93,12 +93,14 @@ template <typename ValueType>
void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *tau,
const matrix::Dense<ValueType> *orig_tau,
remove_complex<ValueType> rel_residual_goal,
uint8 stoppingId, bool setFinalized,
ValueType rel_residual_goal, uint8 stoppingId,
bool setFinalized,
Array<stopping_status> *stop_status,
Array<bool> *device_storage, bool *all_converged,
bool *one_changed)
{
static_assert(is_complex_s<ValueType>::value == false,
"ValueType must not be complex in this function!");
init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data()));

const dim3 block_size(default_block_size, 1, 1);
@@ -116,7 +118,8 @@ void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
*one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);


} // namespace residual_norm_reduction
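
Because tau and orig_tau now hold real-valued norms (which the static_assert above enforces), the per-column convergence test in the kernel is a plain comparison with no abs() involved. A host-side sketch of the same criterion (illustrative only, not part of the commit):

// Per-column check: ||r|| < rel_residual_goal * ||r_0||, all quantities real.
template <typename RealType>
bool column_converged(RealType tau, RealType orig_tau, RealType rel_residual_goal)
{
    return tau < rel_residual_goal * orig_tau;
}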