diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 7e4faf80d08..bcdf0bb53e2 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -116,6 +116,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) namespace gko { +using thrust::sqrt; // It is required by NVHPC 23.3, isnan is undefined when NVHPC are only as host // compiler. #if defined(__CUDACC__) || defined(GKO_COMPILING_HIP) @@ -156,6 +157,18 @@ __device__ __forceinline__ __half sqrt(const __half& val) } +// using overload here. Otherwise, compiler still think the is_finite +// specialization is still __host__ __device__ function. +__device__ __forceinline__ bool is_finite(const __half& value) +{ + return abs(value) < device_numeric_limits<__half>::inf(); +} + +__device__ __forceinline__ bool is_finite(const thrust::complex<__half>& value) +{ + return is_finite(value.real()) && is_finite(value.imag()); +} + #endif diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp index 87e2fefd823..40de74279de 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.cpp +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -84,9 +84,13 @@ __global__ __launch_bounds__(default_block_size) void ic_sweep( } auto to_write = row == col - ? sqrt(a_val - sum) + ? gko::sqrt(a_val - sum) : (a_val - sum) / load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); + // if (row == col && row < 30) { + // printf("%d: %lf\n", row, static_cast(real(to_write))); + // } if (is_finite(to_write)) { + printf("write?!!\n"); store_relaxed(l_vals + l_nz, to_write); } } diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp index a22bb85275a..403979b0dd3 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -66,11 +66,13 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors( auto to_write = sum / load_relaxed(u_values + (u_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { + printf("write!\n"); store_relaxed(l_values + (l_idx - 1), to_write); } } else { auto to_write = sum; if (is_finite(to_write)) { + printf("write!\n"); store_relaxed(u_values + (u_idx - 1), to_write); } } diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index bb53a454e21..793d4f3ee61 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -41,7 +41,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution<>(0, 10.0), rand_engine, ref); + std::normal_distribution<>(0, 5.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); dmtx_l_ani_init = Csr::create(exec); @@ -108,16 +108,22 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; - SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); auto dmtx_l_coo = gko::clone(this->exec, mtx_l_coo); - + auto mtx_init = this->dmtx_l_ani_init->clone(this->ref); gko::kernels::reference::par_ic_factorization::compute_factor( this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get()); gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor( this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get()); - GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4); + GKO_EXPECT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4); + + // gko::kernels::reference::par_ic_factorization::compute_factor( + // this->ref, 1, mtx_l_coo.get(), mtx_init.get()); + // gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor( + // this->exec, 2000, dmtx_l_coo.get(), this->dmtx_l_ani_init.get()); + // GKO_EXPECT_MTX_NEAR(this->mtx_l_ani_init, mtx_init, 1e-4); + GKO_EXPECT_MTX_NEAR(this->dmtx_l_ani_init, mtx_init, 0); } diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 945f874ef26..a18658cc0fe 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -125,7 +125,7 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; - SKIP_IF_HALF(value_type); + // SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 216a4f597cb..76106c32f0f 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -238,7 +238,7 @@ TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; - SKIP_IF_HALF(value_type); + // SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -257,12 +257,12 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; - SKIP_IF_HALF(value_type); + // SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; std::unique_ptr du_mtx{}; - gko::size_type iterations{200}; + gko::size_type iterations{500}; this->compute_lu(l_mtx, u_mtx, dl_mtx, du_mtx, iterations);