Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix triangular solvers on Windows CUDA #1665

Merged
merged 9 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -486,8 +486,7 @@ build/windows-cuda/release/shared:
- mkdir install
- cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
- cmake --build build --config Release -j16
# we disable these tests until the triangular solver issues are resolved
# - ctest --test-dir build -C Release --no-tests=error --output-on-failure
- ctest --test-dir build -C Release --no-tests=error --output-on-failure
- $env:PATH+=";$pwd/install/bin"
- cmake --install build --config Release
- cmake --build build --target test_install --config Release
Expand Down
7 changes: 7 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
ginkgo_compile_features(cusparse_linops_${type})
endfunction()

function(ginkgo_benchmark_hipsparse_linops type def)
Expand All @@ -31,13 +32,15 @@ function(ginkgo_benchmark_hipsparse_linops type def)
target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
ginkgo_compile_features(hipsparse_linops_${type})
endfunction()

function(ginkgo_benchmark_onemkl_linops type def)
add_library(onemkl_linops_${type} utils/dpcpp_linops.dp.cpp)
# make the dependency public to catch issues
target_compile_definitions(onemkl_linops_${type} PUBLIC ${def})
target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP)
ginkgo_compile_features(onemkl_linops_${type})
endfunction()


Expand Down Expand Up @@ -116,6 +119,7 @@ if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
add_library(cuda_timer utils/cuda_timer.cpp)
target_link_libraries(cuda_timer ginkgo CUDA::cudart)
ginkgo_compile_features(cuda_timer)
endif()
if (GINKGO_BUILD_HIP)
ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
Expand All @@ -125,6 +129,7 @@ if (GINKGO_BUILD_HIP)
set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
add_library(hip_timer utils/hip_timer.hip.cpp)
target_link_libraries(hip_timer ginkgo)
ginkgo_compile_features(hip_timer)
endif()

if (GINKGO_BUILD_SYCL)
Expand All @@ -136,11 +141,13 @@ if (GINKGO_BUILD_SYCL)
target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
target_link_libraries(dpcpp_timer ginkgo)
ginkgo_compile_features(dpcpp_timer)
endif()

if (GINKGO_BUILD_MPI)
add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp)
target_link_libraries(mpi_timer ginkgo)
ginkgo_compile_features(mpi_timer)
endif()

add_subdirectory(blas)
Expand Down
9 changes: 6 additions & 3 deletions benchmark/test/test_framework.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,12 @@ def sanitize_json_text(input: str) -> List[str]:
and pretty-printed to replace the original JSON input.
"""

result = json.dumps(sanitize_json(json.loads(input)), indent=4)
# json.dumps doesn't add a trailing newline
return result.splitlines() + [""]
try:
result = json.dumps(sanitize_json(json.loads(input)), indent=4)
# json.dumps doesn't add a trailing newline
return result.splitlines() + [""]
except Exception as e:
return f"Error: {str(e)}"


def sanitize_text(
Expand Down
43 changes: 38 additions & 5 deletions common/cuda_hip/components/uninitialized_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#include <ginkgo/core/base/types.hpp>

#include "common/cuda_hip/base/thrust.hpp"


namespace gko {
namespace kernels {
Expand All @@ -34,7 +36,7 @@ class uninitialized_array {
*/
constexpr GKO_ATTRIBUTES operator const ValueType*() const noexcept
{
return &(*this)[0];
return data_;
}

/**
Expand All @@ -43,7 +45,7 @@ class uninitialized_array {
*
* @return the non-const pointer to the first entry of the array.
*/
GKO_ATTRIBUTES operator ValueType*() noexcept { return &(*this)[0]; }
GKO_ATTRIBUTES operator ValueType*() noexcept { return data_; }

/**
* constexpr array access operator.
Expand All @@ -56,7 +58,7 @@ class uninitialized_array {
constexpr GKO_ATTRIBUTES const ValueType& operator[](
size_type pos) const noexcept
{
return reinterpret_cast<const ValueType*>(data_)[pos];
return data_[pos];
}

/**
Expand All @@ -69,11 +71,42 @@ class uninitialized_array {
*/
GKO_ATTRIBUTES ValueType& operator[](size_type pos) noexcept
{
return reinterpret_cast<ValueType*>(data_)[pos];
return data_[pos];
}

private:
ValueType data_[size];
};


template <typename ValueType, size_type size>
class uninitialized_array<thrust::complex<ValueType>, size> {
public:
constexpr GKO_ATTRIBUTES operator const thrust::complex<ValueType>*()
const noexcept
{
return &(*this)[0];
}

GKO_ATTRIBUTES operator thrust::complex<ValueType>*() noexcept
{
return &(*this)[0];
}

constexpr GKO_ATTRIBUTES const thrust::complex<ValueType>& operator[](
size_type pos) const noexcept
{
return reinterpret_cast<const thrust::complex<ValueType>*>(data_)[pos];
}

GKO_ATTRIBUTES thrust::complex<ValueType>& operator[](
size_type pos) noexcept
{
return reinterpret_cast<thrust::complex<ValueType>*>(data_)[pos];
}

private:
unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
ValueType data_[2 * size];
};


Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/matrix/csr_kernels.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ __device__ void merge_path_reduce(const IndexType nwarps,
}
}
}
__shared__ uninitialized_array<IndexType, spmv_block_size> tmp_ind;
__shared__ IndexType tmp_ind[spmv_block_size];
upsj marked this conversation as resolved.
Show resolved Hide resolved
__shared__ uninitialized_array<arithmetic_type, spmv_block_size> tmp_val;
tmp_val[threadIdx.x] = value;
tmp_ind[threadIdx.x] = row;
Expand Down
60 changes: 54 additions & 6 deletions cuda/solver/common_trs_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#define GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_


#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
Expand Down Expand Up @@ -342,6 +343,52 @@ constexpr int default_block_size = 512;
constexpr int fallback_block_size = 32;


/** Returns an unsigned type matching the size of the given float type. */
template <typename T>
struct float_to_unsigned_impl {};

template <>
struct float_to_unsigned_impl<double> {
using type = uint64;
};

template <>
struct float_to_unsigned_impl<float> {
using type = uint32;
};


/**
* Checks if a floating point number representation matches the representation
* of the quiet NaN with value gko::nan() exactly.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
is_nan_exact(const T& value)
{
using type = typename float_to_unsigned_impl<T>::type;
type value_bytes{};
type nan_bytes{};
auto nan_value = nan<T>();
using std::memcpy;
memcpy(&value_bytes, &value, sizeof(value));
memcpy(&nan_bytes, &nan_value, sizeof(value));
return value_bytes == nan_bytes;
}


/**
* Checks if any component of the complex value matches the quiet NaN with
* value gko::nan() exactly.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
is_nan_exact(const T& value)
{
return is_nan_exact(value.real()) || is_nan_exact(value.imag());
}


template <bool is_upper, typename ValueType, typename IndexType>
__global__ void sptrsv_naive_caching_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
Expand Down Expand Up @@ -399,11 +446,12 @@ __global__ void sptrsv_naive_caching_kernel(
ValueType val{};
if (shmem_possible) {
const auto dependency_shid = dependency_gid % default_block_size;
while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) {
while (is_nan_exact(
val = load_relaxed_shared(x_s + dependency_shid))) {
}
} else {
while (
is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) {
while (is_nan_exact(
val = load_relaxed(x + dependency * x_stride + rhs))) {
}
}

Expand All @@ -418,7 +466,7 @@ __global__ void sptrsv_naive_caching_kernel(
store_relaxed(x + row * x_stride + rhs, r);

// This check to ensure no infinite loops happen.
if (is_nan(r)) {
if (is_nan_exact(r)) {
store_relaxed_shared(x_s + self_shid, zero<ValueType>());
store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
Expand Down Expand Up @@ -460,7 +508,7 @@ __global__ void sptrsv_naive_legacy_kernel(
auto col = colidxs[j];
while (j != row_end) {
auto x_val = load_relaxed(x + col * x_stride + rhs);
while (!is_nan(x_val)) {
while (!is_nan_exact(x_val)) {
sum += vals[j] * x_val;
j += row_step;
col = colidxs[j];
Expand All @@ -478,7 +526,7 @@ __global__ void sptrsv_naive_legacy_kernel(
// after we encountered the diagonal, we are done
// this also skips entries outside the triangle
j = row_end;
if (is_nan(r)) {
if (is_nan_exact(r)) {
store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
}
Expand Down
15 changes: 11 additions & 4 deletions include/ginkgo/core/base/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1220,10 +1220,14 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b)
* @return `true` if the value is NaN.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
is_nan(const T& value)
GKO_DEPRECATED(
"is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
"removed in a future release, without replacement")
upsj marked this conversation as resolved.
Show resolved Hide resolved
GKO_INLINE GKO_ATTRIBUTES
std::enable_if_t<!is_complex_s<T>::value, bool> is_nan(const T& value)
{
return std::isnan(value);
using std::isnan;
return isnan(value);
Comment on lines -1226 to +1230
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is std::isnan(value) different from the updated one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so, but this has a better chance of using the CUDA isnan function.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unfortunately it still doesn't work with MSVC, which means I'll have to file a bug report some time ;)

}


Expand All @@ -1237,10 +1241,13 @@ is_nan(const T& value)
* @return `true` if any component of the given value is NaN.
*/
template <typename T>
GKO_DEPRECATED(
"is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
"removed in a future release, without replacement")
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
const T& value)
{
return std::isnan(value.real()) || std::isnan(value.imag());
return is_nan(value.real()) || is_nan(value.imag());
}


Expand Down
2 changes: 1 addition & 1 deletion test/solver/gcr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ TEST_F(Gcr, GcrApplyOneRHSIsEquivalentToRef)
exec_solver->apply(d_b.get(), d_x.get());

GKO_ASSERT_MTX_NEAR(d_b, b, 0);
GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e2);
GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
}


Expand Down
4 changes: 2 additions & 2 deletions test/solver/lower_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(LowerTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down Expand Up @@ -417,7 +417,7 @@ TEST_F(LowerTrs, ClassicalApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down
2 changes: 1 addition & 1 deletion test/solver/upper_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(UpperTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down
Loading