Update with the new upstream changes.

ginkgo-project · Sep 6, 2019 · ff77fb1 · ff77fb1
1 parent 1e53dbe
commit ff77fb1
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 53 deletions.
diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp
@@ -48,7 +48,7 @@ namespace kernels {
 namespace upper_trs {
 
 
-#define GKO_DECLARE_UPPER_TRS_CHECK_SHOULD_PERFORM_TRANSPOSE_KERNEL()          \
+#define GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL()                \
     void should_perform_transpose(std::shared_ptr<const DefaultExecutor> exec, \
                                   bool &do_transpose)
 
@@ -73,12 +73,12 @@ namespace upper_trs {
                const matrix::Dense<_vtype> *b, matrix::Dense<_vtype> *x)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                               \
-    GKO_DECLARE_UPPER_TRS_CHECK_SHOULD_PERFORM_TRANSPOSE_KERNEL(); \
-    GKO_DECLARE_UPPER_TRS_INIT_STRUCT_KERNEL();                    \
-    template <typename ValueType, typename IndexType>              \
-    GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(ValueType, IndexType);      \
-    template <typename ValueType, typename IndexType>              \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                          \
+    GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL();  \
+    GKO_DECLARE_UPPER_TRS_INIT_STRUCT_KERNEL();               \
+    template <typename ValueType, typename IndexType>         \
+    GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>         \
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(ValueType, IndexType)
 
 

diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
@@ -64,28 +64,99 @@ namespace upper_trs {
 
 
 void should_perform_transpose(std::shared_ptr<const CudaExecutor> exec,
-                              bool &do_transpose) GKO_NOT_IMPLEMENTED;
+                              bool &do_transpose)
+{
+    // Sets `do_transpose` from the CUDA toolkit version seen at compile time:
+    // `true` only for toolkits older than 9.2, `false` in every other case.
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+
+    do_transpose = true;
+
+
+#else
+    // Also reached when CUDA_VERSION is undefined, so the out-parameter is
+    // never left unassigned.
+    do_transpose = false;
+#endif
+}
 
 
 void init_struct(std::shared_ptr<const CudaExecutor> exec,
-                 std::shared_ptr<gko::solver::SolveStruct> &solve_struct)
+                 std::shared_ptr<solver::SolveStruct> &solve_struct)
 {
-    const auto id = exec->get_device_id();
-    device_guard g(id);
-    solve_struct = std::shared_ptr<gko::solver::SolveStruct>(
-        kernels::cuda::cusparse::init_trs_solve_struct(),
-        [id](gko::solver::SolveStruct *solve_struct_) {
-            device_guard g(id);
-            kernels::cuda::cusparse::clear_trs_solve_struct(solve_struct_);
-        });
+    // Replace whatever the caller held with a default-constructed structure.
+    solve_struct.reset(new solver::SolveStruct());
 }
 
 
 template <typename ValueType, typename IndexType>
 void generate(std::shared_ptr<const CudaExecutor> exec,
               const matrix::Csr<ValueType, IndexType> *matrix,
-              solver::SolveStruct *solve_struct,
-              const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
+              solver::SolveStruct *solve_struct, const gko::size_type num_rhs)
+{
+    // Marks the factor descriptor as upper triangular and runs the cuSPARSE
+    // triangular-solve analysis for `matrix`, recording it in `solve_struct`.
+    if (cusparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_cusparse_handle();
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode(
+            solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER));
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+
+
+        ValueType one = 1.0;
+
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
+        cusparse::buffer_size_ext(
+            handle, solve_struct->algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+            matrix->get_num_stored_elements(), &one, solve_struct->factor_descr,
+            matrix->get_const_values(), matrix->get_const_row_ptrs(),
+            matrix->get_const_col_idxs(), nullptr, num_rhs,
+            solve_struct->solve_info, solve_struct->policy,
+            &solve_struct->factor_work_size);
+
+        // (Re)allocate the workspace. The old pointer is cleared once freed so
+        // that a throwing allocation cannot lead to a second cudaFree on it.
+        if (solve_struct->factor_work_vec != nullptr) {
+            GKO_ASSERT_NO_CUDA_ERRORS(cudaFree(solve_struct->factor_work_vec));
+            solve_struct->factor_work_vec = nullptr;
+        }
+        solve_struct->factor_work_vec =
+            exec->alloc<void *>(solve_struct->factor_work_size);
+
+        cusparse::csrsm2_analysis(
+            handle, solve_struct->algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+            matrix->get_num_stored_elements(), &one, solve_struct->factor_descr,
+            matrix->get_const_values(), matrix->get_const_row_ptrs(),
+            matrix->get_const_col_idxs(), nullptr, num_rhs,
+            solve_struct->solve_info, solve_struct->policy,
+            solve_struct->factor_work_vec);
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE));
+
+
+#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
+        cusparse::csrsm_analysis(
+            handle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0],
+            matrix->get_num_stored_elements(), solve_struct->factor_descr,
+            matrix->get_const_values(), matrix->get_const_row_ptrs(),
+            matrix->get_const_col_idxs(), solve_struct->solve_info);
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE));
+
+
+#endif
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);

diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp
@@ -33,14 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/upper_trs.hpp>
 
 
-#include <gtest/gtest.h>
-
-
 #include <memory>
 #include <random>
 
 
 #include <cuda.h>
+#include <gtest/gtest.h>
 
 
 #include <ginkgo/core/base/exception.hpp>
@@ -60,6 +58,7 @@ class UpperTrs : public ::testing::Test {
 protected:
     using CsrMtx = gko::matrix::Csr<double, gko::int32>;
     using Mtx = gko::matrix::Dense<>;
+
     UpperTrs() : rand_engine(30) {}
 
     void SetUp()
@@ -92,7 +91,32 @@ class UpperTrs : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void initialize_data(int m, int n)
+    {
+        mtx = gen_u_mtx(m, m);  // square m x m system matrix
+        b = gen_mtx(m, n);      // m x n right-hand side
+        x = gen_mtx(m, n);      // m x n solution vector(s)
+        csr_mtx = CsrMtx::create(ref);  // operands on the reference executor
+        mtx->convert_to(csr_mtx.get());
+        b2 = Mtx::create(ref);
+        b2->copy_from(b.get());
+        d_csr_mtx = CsrMtx::create(cuda);  // the same data on the CUDA executor
+        d_csr_mtx->copy_from(csr_mtx.get());
+        d_x = Mtx::create(cuda);
+        d_x->copy_from(x.get());
+        d_b2 = Mtx::create(cuda);
+        d_b2->copy_from(b.get());
+    }
 
+    std::shared_ptr<Mtx> b;
+    std::shared_ptr<Mtx> b2;
+    std::shared_ptr<Mtx> x;
+    std::shared_ptr<Mtx> mtx;
+    std::shared_ptr<CsrMtx> csr_mtx;
+    std::shared_ptr<Mtx> d_b;
+    std::shared_ptr<Mtx> d_b2;
+    std::shared_ptr<Mtx> d_x;
+    std::shared_ptr<CsrMtx> d_csr_mtx;
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
     std::ranlux48 rand_engine;
@@ -103,65 +127,49 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect)
 {
     bool trans_flag = true;
     bool expected_flag = false;
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
-    expected_flag = false;
-#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+    // Only toolkits older than CUDA 9.2 are expected to request a transpose.
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
+
+
     expected_flag = true;
+
+
 #endif
-    gko::kernels::cuda::upper_trs::perform_transpose(cuda, trans_flag);
+
+    // Query the kernel; it reports its decision through trans_flag.
+    gko::kernels::cuda::upper_trs::should_perform_transpose(cuda, trans_flag);
 
     ASSERT_EQ(expected_flag, trans_flag);
 }
 
 
 TEST_F(UpperTrs, CudaSingleRhsApplyIsEquivalentToRef)
 {
-    std::shared_ptr<Mtx> mtx = gen_u_mtx(50, 50);
-    std::shared_ptr<Mtx> b = gen_mtx(50, 1);
-    std::shared_ptr<Mtx> x = gen_mtx(50, 1);
-    std::shared_ptr<CsrMtx> csr_mtx = CsrMtx::create(ref);
-    mtx->convert_to(csr_mtx.get());
-    std::shared_ptr<CsrMtx> d_csr_mtx = CsrMtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_x->copy_from(x.get());
-    d_csr_mtx->copy_from(csr_mtx.get());
-    std::shared_ptr<Mtx> b2 = Mtx::create(ref);
-    std::shared_ptr<Mtx> d_b2 = Mtx::create(cuda);
-    d_b2->copy_from(b.get());
-    b2->copy_from(b.get());
-
+    initialize_data(50, 1);
     auto upper_trs_factory = gko::solver::UpperTrs<>::build().on(ref);
     auto d_upper_trs_factory = gko::solver::UpperTrs<>::build().on(cuda);
     auto solver = upper_trs_factory->generate(csr_mtx);
     auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);
+    // Apply both solvers to copies of the same right-hand side.
     solver->apply(b2.get(), x.get());
     d_solver->apply(d_b2.get(), d_x.get());
+    // The CUDA result has to agree with the reference result.
     GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
 }
 
 
 TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef)
 {
-    std::shared_ptr<Mtx> mtx = gen_u_mtx(50, 50);
-    std::shared_ptr<Mtx> b = gen_mtx(50, 3);
-    std::shared_ptr<Mtx> x = gen_mtx(50, 3);
-    std::shared_ptr<CsrMtx> csr_mtx = CsrMtx::create(ref);
-    mtx->convert_to(csr_mtx.get());
-    std::shared_ptr<CsrMtx> d_csr_mtx = CsrMtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_x->copy_from(x.get());
-    d_csr_mtx->copy_from(csr_mtx.get());
-    std::shared_ptr<Mtx> b2 = Mtx::create(ref);
-    std::shared_ptr<Mtx> d_b2 = Mtx::create(cuda);
-    d_b2->copy_from(b.get());
-    b2->copy_from(b.get());
+    initialize_data(50, 3);
 
     auto upper_trs_factory =
         gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref);
     auto d_upper_trs_factory =
         gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(cuda);
     auto solver = upper_trs_factory->generate(csr_mtx);
     auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);
+
     solver->apply(b2.get(), x.get());
     d_solver->apply(d_b2.get(), d_x.get());