diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h
index d580751fe4448c..86651fc8f142ed 100644
--- a/paddle/phi/backends/dynload/cusolver.h
+++ b/paddle/phi/backends/dynload/cusolver.h
@@ -46,8 +46,10 @@ extern void *cusolver_dso_handle;
   __macro(cusolverDnSetStream);         \
   __macro(cusolverDnSpotrf_bufferSize); \
   __macro(cusolverDnDpotrf_bufferSize); \
+  __macro(cusolverDnXpotrf_bufferSize); \
   __macro(cusolverDnSpotrf);            \
   __macro(cusolverDnDpotrf);            \
+  __macro(cusolverDnXpotrf);            \
   __macro(cusolverDnSpotrs);            \
   __macro(cusolverDnDpotrs);            \
   __macro(cusolverDnCpotrs);            \
@@ -120,6 +122,8 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #if CUDA_VERSION >= 9020
 #define CUSOLVER_ROUTINE_EACH_R2(__macro) \
   __macro(cusolverDnCreateSyevjInfo);     \
+  __macro(cusolverDnCreateParams);        \
+  __macro(cusolverDnDestroyParams);       \
   __macro(cusolverDnSsyevj_bufferSize);   \
   __macro(cusolverDnDsyevj_bufferSize);   \
   __macro(cusolverDnCheevj_bufferSize);   \
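
Note: the new symbols registered above (cusolverDnXpotrf_bufferSize, cusolverDnXpotrf, and the cusolverDnCreateParams/cusolverDnDestroyParams pair) belong to cuSOLVER's 64-bit "X" API, which takes int64_t dimensions and splits scratch space into a device buffer and a host buffer. A minimal standalone sketch of the call sequence against the raw cuSOLVER API, illustrative only (error handling omitted; the function name potrf64_sketch is made up for this note):

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Factorizes an n x n double matrix in place on the device.
void potrf64_sketch(double* d_A, int64_t n, int64_t lda) {
  cusolverDnHandle_t handle;
  cusolverDnCreate(&handle);
  cusolverDnParams_t params;
  cusolverDnCreateParams(&params);  // opaque params object used by the X API

  // 1. Query workspace sizes; unlike the legacy cusolverDn<S|D>potrf,
  //    the X API needs both a device and a host scratch buffer.
  size_t device_bytes = 0, host_bytes = 0;
  cusolverDnXpotrf_bufferSize(handle, params, CUBLAS_FILL_MODE_LOWER, n,
                              CUDA_R_64F, d_A, lda, CUDA_R_64F,
                              &device_bytes, &host_bytes);

  void* d_work = nullptr;
  cudaMalloc(&d_work, device_bytes);
  std::vector<char> h_work(host_bytes);
  int* d_info = nullptr;
  cudaMalloc(&d_info, sizeof(int));

  // 2. In-place Cholesky factorization; n and lda are int64_t, so the
  //    dimensions are no longer capped at 2^31 - 1.
  cusolverDnXpotrf(handle, params, CUBLAS_FILL_MODE_LOWER, n,
                   CUDA_R_64F, d_A, lda, CUDA_R_64F,
                   d_work, device_bytes, h_work.data(), host_bytes, d_info);

  // 3. d_info receives the LAPACK-style status; clean up afterwards.
  cudaFree(d_info);
  cudaFree(d_work);
  cusolverDnDestroyParams(params);
  cusolverDnDestroy(handle);
}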
diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu
index 8a831d2ee299a8..40cf55017bf0fe 100644
--- a/paddle/phi/kernels/gpu/cholesky_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu
@@ -92,6 +92,60 @@ struct MatrixBandPartFunctor {
 
 FUNC_WITH_TYPES(POTRF_INSTANCE);
 
+#if CUDA_VERSION >= 11040
+#define POTRF64_INSTANCE(T, C)                                             \
+  void Potrf64(const GPUContext& dev_ctx,                                  \
+               cublasFillMode_t uplo,                                      \
+               int64_t n,                                                  \
+               T* A,                                                       \
+               int64_t lda,                                                \
+               int* info) {                                                \
+    auto handle = dev_ctx.cusolver_dn_handle();                            \
+    cusolverDnParams_t params;                                             \
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateParams(&params));  \
+    size_t workspace_device_size = 0;                                      \
+    size_t workspace_host_size = 0;                                        \
+    cudaDataType_t data_type =                                             \
+        std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_64F;           \
+    PADDLE_ENFORCE_GPU_SUCCESS(                                            \
+        dynload::cusolverDnXpotrf_bufferSize(handle,                       \
+                                             params,                       \
+                                             uplo,                         \
+                                             n,                            \
+                                             data_type,                    \
+                                             A,                            \
+                                             lda,                          \
+                                             data_type,                    \
+                                             &workspace_device_size,       \
+                                             &workspace_host_size));       \
+    auto workspace_device = phi::memory_utils::Alloc(                      \
+        dev_ctx.GetPlace(),                                                \
+        workspace_device_size,                                             \
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));   \
+    auto workspace_host = phi::memory_utils::Alloc(                        \
+        phi::CPUPlace(),                                                   \
+        workspace_host_size,                                               \
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));   \
+    PADDLE_ENFORCE_GPU_SUCCESS(                                            \
+        dynload::cusolverDnXpotrf(handle,                                  \
+                                  params,                                  \
+                                  uplo,                                    \
+                                  n,                                       \
+                                  data_type,                               \
+                                  A,                                       \
+                                  lda,                                     \
+                                  data_type,                               \
+                                  workspace_device->ptr(),                 \
+                                  workspace_device_size,                   \
+                                  workspace_host->ptr(),                   \
+                                  workspace_host_size,                     \
+                                  info));                                  \
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroyParams(params));  \
+  }
+
+FUNC_WITH_TYPES(POTRF64_INSTANCE);
+#endif
+
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
 #define POTRF_BATCH_INSTANCE(T, C)                                           \
   void PotrfBatched(const GPUContext& dev_ctx,                               \
@@ -114,7 +168,7 @@ void CholeskyKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     bool upper,
                     DenseTensor* out) {
-  if (out->numel() == 0) {
+  if (x.numel() == 0) {
     dev_ctx.template Alloc<T>(out);
     return;
   }
@@ -125,7 +179,7 @@ void CholeskyKernel(const Context& dev_ctx,
     batch_count *= dims[i];
   }
   int m = dims[dims.size() - 1];
-  int tensor_size = batch_count * m * m;
+  int64_t tensor_size = batch_count * static_cast<int64_t>(m) * m;
 
   const auto* x_data = x.data<T>();
   auto* out_data = dev_ctx.template Alloc<T>(out);
@@ -135,22 +189,16 @@ void CholeskyKernel(const Context& dev_ctx,
       upper ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
   // potrf is in-place, thus copy the triangular part of the input matrices
   // to the output and set the other triangular part to 0 first
+  phi::funcs::ForRange<Context> for_range(dev_ctx, tensor_size);
+  // Pre-processing
   if (upper) {
-    MatrixBandPartFunctor<T> matrix_band_part_functor(m,
-                                                      m,
-                                                      /* num_lower_diags */ 0,
-                                                      /* num_upper_diags */ m,
-                                                      x_data,
-                                                      out_data);
+    MatrixBandPartFunctor<T> matrix_band_part_functor(
+        m, m, 0, -1, x_data, out_data);
     for_range(matrix_band_part_functor);
   } else {
-    MatrixBandPartFunctor<T> matrix_band_part_functor(m,
-                                                      m,
-                                                      /* num_lower_diags */ m,
-                                                      /* num_upper_diags */ 0,
-                                                      x_data,
-                                                      out_data);
+    MatrixBandPartFunctor<T> matrix_band_part_functor(
+        m, m, -1, 0, x_data, out_data);
     for_range(matrix_band_part_functor);
   }
 
@@ -164,7 +212,7 @@ void CholeskyKernel(const Context& dev_ctx,
   if (batch_count > 1) {
     std::vector<T*> output_ptrs;
     for (int i = 0; i < batch_count; i++) {
-      output_ptrs.emplace_back(out_data + i * m * m);
+      output_ptrs.emplace_back(out_data + static_cast<int64_t>(i) * m * m);
     }
     thrust::device_vector<T*> dev_output_ptrs(output_ptrs.begin(),
                                               output_ptrs.end());
@@ -178,28 +226,28 @@ void CholeskyKernel(const Context& dev_ctx,
     // TODO(guosheng): There seems to be a bug in cusolver potrfBatched and
     // need to clear the upper triangle of the output. Remove this workaround
     // once the bug is fixed.
+
     if (!upper) {
-      MatrixBandPartFunctor<T> matrix_band_part_functor(m,
-                                                        m,
-                                                        /* num_lower_diags */ m,
-                                                        /* num_upper_diags */ 0,
-                                                        out_data,
-                                                        out_data);
+      MatrixBandPartFunctor<T> matrix_band_part_functor(
+          m, m, -1, 0, out_data, out_data);
       for_range(matrix_band_part_functor);
     }
   } else {
 #endif
     for (int i = 0; i < batch_count; i++) {
-      Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i);
+      int64_t offset = static_cast<int64_t>(i) * m * m;
+#if CUDA_VERSION >= 11040
+      Potrf64(dev_ctx, uplo, m, out_data + offset, m, info_ptr + i);
+#else
+      Potrf(dev_ctx, uplo, m, out_data + offset, m, info_ptr + i);
+#endif
     }
-
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
   }
 #endif
 
   // check the info
-  std::vector<int> error_info;  // only for checking positive matrix
+  std::vector<int> error_info;
   error_info.resize(batch_count);
-
   memory_utils::Copy(CPUPlace(),
                      error_info.data(),
                      dev_ctx.GetPlace(),
@@ -208,13 +256,37 @@ void CholeskyKernel(const Context& dev_ctx,
                      dev_ctx.stream());
 
   for (int i = 0; i < batch_count; ++i) {
-    PADDLE_ENFORCE_EQ(error_info[i],
-                      0,
-                      errors::PreconditionNotMet(
-                          "For batch [%d]: U(%d, %d) is zero, singular U.",
-                          i,
-                          error_info[i],
-                          error_info[i]));
+    const int info = error_info[i];
+    if (info == 0) {
+      continue;
+    }
+    if (info < 0) {
+      PADDLE_ENFORCE_EQ(
+          info,
+          0,
+          errors::InvalidArgument("Cholesky kernel failed for batch %d: "
+                                  "The %d-th argument was invalid, please "
+                                  "check the kernel implementation.",
+                                  i,
+                                  -info));
+    }
+    PADDLE_ENFORCE_EQ(
+        info,
+        0,
+        errors::PreconditionNotMet(
+            "Cholesky decomposition failed for batch %d: "
+            "The leading minor of order %d is not positive definite.",
+            i,
+            info));
+  }
+
+  // Post-processing to clear the other triangle
+  if (upper) {
+    MatrixBandPartFunctor<T> band_part_post(m, m, 0, -1, out_data, out_data);
+    for_range(band_part_post);
+  } else {
+    MatrixBandPartFunctor<T> band_part_post(m, m, -1, 0, out_data, out_data);
+    for_range(band_part_post);
   }
 }
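
Note on the pre- and post-processing hunks above: the band arguments change from (0, m)/(m, 0) to (0, -1)/(-1, 0). This reads naturally if MatrixBandPartFunctor follows the usual matrix_band_part convention, where a negative diagonal count means "keep everything on that side"; that convention is an assumption of this note, since the functor itself is defined earlier in cholesky_kernel.cu. A sketch of the assumed predicate:

// Keep element (row, col) if it lies within the band. A negative
// num_lower/num_upper means the corresponding triangle is unbounded.
bool in_band(int row, int col, int num_lower, int num_upper) {
  bool lower_ok = (num_lower < 0) || (row - col <= num_lower);
  bool upper_ok = (num_upper < 0) || (col - row <= num_upper);
  return lower_ok && upper_ok;
}

Under this convention (0, -1) keeps the upper triangle (row <= col) and (-1, 0) keeps the lower triangle. Passing m instead of -1 also covers every diagonal of an m x m matrix, but -1 states the intent directly.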
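The rewritten info check decodes the status the way LAPACK's xPOTRF defines it, which cuSOLVER follows: info == 0 is success, info == -i means the i-th argument was invalid, and info == i > 0 means the leading minor of order i is not positive definite. A standalone restatement of that decoding, sketch only:

#include <cstdio>

// LAPACK/cuSOLVER potrf status decoding assumed by the new messages.
void report_potrf_info(int batch, int info) {
  if (info == 0) {
    return;  // factorization succeeded
  } else if (info < 0) {
    std::printf("batch %d: argument %d of the call was invalid\n",
                batch, -info);
  } else {
    std::printf("batch %d: leading minor of order %d is not positive "
                "definite\n", batch, info);
  }
}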
diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
index 5d146dae8d53e2..f4dd48013d4b04 100644
--- a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
@@ -254,7 +254,7 @@ void CholeskyGradKernel(const Context& dev_ctx,
     batch_count *= dims[i];
   }
   auto m = dims[dims.size() - 1];
-  int tensor_size = batch_count * m * m;
+  int64_t tensor_size = static_cast<int64_t>(batch_count) * m * m;
 
   std::vector<int> axis(dims.size() - 2);
   std::iota(axis.begin(), axis.end(), 0);
@@ -304,6 +304,7 @@ void CholeskyGradKernel(const Context& dev_ctx,
     for_range(eye_functor);
     // TODO(guosheng): use trsmBatched for GPU
     for (int i = 0; i < batch_count; i++) {
+      int64_t offset = static_cast<int64_t>(i) * m * m;
       blas.TRSM(/*side*/ CblasLeft,
                 /*uplo*/ CblasLower,
                 /*trans*/ CblasNoTrans,
@@ -311,9 +312,9 @@ void CholeskyGradKernel(const Context& dev_ctx,
                 /*m*/ m,
                 /*n*/ m,
                 /*alpha*/ T(1),
-                l_data + i * m * m,
+                l_data + offset,
                 /*lda*/ m,
-                identity_data + i * m * m,
+                identity_data + offset,
                 /*ldb*/ m);
     }
     DenseTensor& l_inverse = identity;
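
Both kernels also widen tensor_size and the per-batch pointer offsets from int to int64_t before doing pointer arithmetic. The point is that batch_count * m * m is otherwise evaluated entirely in 32-bit int arithmetic, which silently overflows (undefined behavior) well before the tensors become impractically large. A tiny standalone illustration with hypothetical sizes:

#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  int batch_count = 4, m = 46341;  // hypothetical sizes; 46341^2 > INT_MAX
  // Widen the first operand so the whole product is computed in 64 bits,
  // exactly as the patch does for tensor_size and the batch offsets.
  int64_t elems = static_cast<int64_t>(batch_count) * m * m;
  std::printf("elements = %lld, fits in int? %s\n",
              static_cast<long long>(elems),
              elems <= INT_MAX ? "yes" : "no");
  // Prints: elements = 8589953124, fits in int? no
  // The unwidened `int tensor_size = batch_count * m * m;` would be
  // signed overflow here.
  return 0;
}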