diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 27c53cf81..91efb88a7 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -84,7 +84,8 @@ if(HIOP_USE_GINKGO) endif(HIOP_USE_GINKGO) if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA) - add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} bash -c "$" "500" "-cusolver" "-inertiafree" "-selfcheck ") + add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_glu") + add_test(NAME NlpSparseRaja2_2 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_rf") endif() add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$" "500" "-selfcheck") diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index 9617a72d8..a2455ebe1 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -18,8 +18,8 @@ static bool parse_arguments(int argc, size_type& n, bool& self_check, bool& inertia_free, - bool& use_cusolver, - bool& use_resolve, + bool& use_resolve_cuda_glu, + bool& use_resolve_cuda_rf, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -27,11 +27,11 @@ static bool parse_arguments(int argc, self_check = false; n = 3; inertia_free = false; - use_cusolver = false; - use_resolve = false; + use_resolve_cuda_glu = false; + use_resolve_cuda_rf = false; use_ginkgo = false; use_ginkgo_cuda = false; - use_ginkgo_cuda = false; + use_ginkgo_hip = false; switch(argc) { case 1: //no arguments @@ -43,8 +43,10 @@ static bool parse_arguments(int argc, self_check = true; } else if(std::string(argv[4]) == "-inertiafree") { inertia_free = true; - } else if(std::string(argv[4]) == "-cusolver") { - use_cusolver = true; + } else if(std::string(argv[4]) == "-resolve_cuda_glu") { + use_resolve_cuda_glu = true; + } else if(std::string(argv[4]) == "-resolve_cuda_rf") { + use_resolve_cuda_rf = true; } else if(std::string(argv[4]) == "-ginkgo"){ use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda"){ @@ -66,8 +68,10 @@ static bool parse_arguments(int argc, self_check = true; } else if(std::string(argv[3]) == "-inertiafree") { inertia_free = true; - } else if(std::string(argv[3]) == "-cusolver") { - use_cusolver = true; + } else if(std::string(argv[3]) == "-resolve_cuda_glu") { + use_resolve_cuda_glu = true; + } else if(std::string(argv[3]) == "-resolve_cuda_rf") { + use_resolve_cuda_rf = true; } else if(std::string(argv[3]) == "-ginkgo"){ use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda"){ @@ -89,8 +93,10 @@ static bool parse_arguments(int argc, self_check = true; } else if(std::string(argv[2]) == "-inertiafree") { inertia_free = true; - } else if(std::string(argv[2]) == "-cusolver") { - use_cusolver = true; + } else if(std::string(argv[2]) == "-resolve_cuda_glu") { + use_resolve_cuda_glu = true; + } else if(std::string(argv[2]) == "-resolve_cuda_rf") { + use_resolve_cuda_rf = true; } else if(std::string(argv[2]) == "-ginkgo"){ use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda"){ @@ -112,8 +118,10 @@ static bool parse_arguments(int argc, self_check = true; } else if(std::string(argv[1]) == "-inertiafree") { inertia_free = true; - } else if(std::string(argv[1]) == "-cusolver") { - use_cusolver = true; + } else if(std::string(argv[1]) == "-resolve_cuda_glu") { + use_resolve_cuda_glu = true; + } else if(std::string(argv[1]) == "-resolve_cuda_rf") { + use_resolve_cuda_rf = true; } else if(std::string(argv[1]) == "-ginkgo"){ use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda"){ @@ -134,29 +142,33 @@ static bool parse_arguments(int argc, return false; // 4 or more arguments } -// If CUDA is not available, de-select cuSOLVER +// Currently only CUDA backend for ReSolve is available. Unselect ReSolve if CUDA is not enabled #ifndef HIOP_USE_CUDA - if(use_cusolver) { + if(use_resolve_cuda_glu) { printf("HiOp built without CUDA support. "); - printf("Using default instead of cuSOLVER ...\n"); - use_cusolver = false; + printf("Using default instead of ReSolve ...\n"); + use_resolve_cuda_glu = false; } -#endif - -// Use cuSOLVER's LU factorization, if it was configured -#ifdef HIOP_USE_RESOLVE - if(use_cusolver) { - use_resolve = true; + if(use_resolve_cuda_rf) { + printf("HiOp built without CUDA support. "); + printf("Using default instead of ReSolve ...\n"); + use_resolve_cuda_rf = false; } #endif - // If cuSOLVER was selected, but inertia free approach was not, add inertia-free - if(use_cusolver && !(inertia_free)) { + // If ReSolve was selected, but inertia free approach was not, add inertia-free + if((use_resolve_cuda_glu || use_resolve_cuda_rf) && !(inertia_free)) { inertia_free = true; - printf("LU solver from cuSOLVER library requires inertia free approach. "); + printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } + if(use_resolve_cuda_glu && use_resolve_cuda_rf) { + use_resolve_cuda_rf = false; + printf("You can select either GLU or Rf refactorization with ReSolve, not both. "); + printf("Using default GLU refactorization ...\n"); + } + // If Ginkgo is not available, de-select it. #ifndef HIOP_USE_GINKGO if(use_ginkgo) { @@ -185,7 +197,8 @@ static void usage(const char* exeName) printf(" '-inertiafree': indicate if inertia free approach should be used [optional]\n"); printf(" '-selfcheck': compares the optimal objective with a previously saved value for the " "problem specified by 'problem_size'. [optional]\n"); - printf(" '-cusolver': use cuSOLVER linear solver [optional]\n"); + printf(" '-use_resolve_cuda_glu': use ReSolve linear solver with KLU factorization and cusolverGLU refactorization [optional]\n"); + printf(" '-use_resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization [optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -215,12 +228,12 @@ int main(int argc, char **argv) bool selfCheck = false; size_type n = 50; bool inertia_free = false; - bool use_cusolver = false; - bool use_resolve = false; - bool use_ginkgo = false; + bool use_resolve_cuda_glu = false; + bool use_resolve_cuda_rf = false; + bool use_ginkgo = false; bool use_ginkgo_cuda = false; - bool use_ginkgo_hip = false; - if(!parse_arguments(argc, argv, n, selfCheck, inertia_free, use_cusolver, use_resolve, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { + bool use_ginkgo_hip = false; + if(!parse_arguments(argc, argv, n, selfCheck, inertia_free, use_resolve_cuda_glu, use_resolve_cuda_rf, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { usage(argv[0]); #ifdef HIOP_USE_MPI MPI_Finalize(); @@ -243,6 +256,12 @@ int main(int argc, char **argv) // only support cusolverLU right now, 2023.02.28 //lsq initialization of the duals fails for this example since the Jacobian is rank deficient //use zero initialization + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + if(use_resolve_cuda_rf) { + nlp.options->SetStringValue("resolve_refactorization", "rf"); + nlp.options->SetIntegerValue("ir_inner_maxit", 20); + nlp.options->SetIntegerValue("ir_outer_maxit", 0); + } nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("fact_acceptor", "inertia_free"); diff --git a/src/LinAlg/ReSolve/MatrixCsr.cpp b/src/LinAlg/ReSolve/MatrixCsr.cpp index 2898d3a28..1a5d9a1ac 100644 --- a/src/LinAlg/ReSolve/MatrixCsr.cpp +++ b/src/LinAlg/ReSolve/MatrixCsr.cpp @@ -78,13 +78,7 @@ namespace ReSolve { if(n_ == 0) return; - cudaFree(irows_); - cudaFree(jcols_); - cudaFree(vals_); - - delete [] irows_host_; - delete [] jcols_host_; - delete [] vals_host_ ; + clear_data(); } void MatrixCsr::allocate_size(int n) @@ -103,6 +97,28 @@ namespace ReSolve { vals_host_ = new double[nnz_]{0}; } + void MatrixCsr::clear_data() + { + checkCudaErrors(cudaFree(irows_)); + checkCudaErrors(cudaFree(jcols_)); + checkCudaErrors(cudaFree(vals_)); + + irows_ = nullptr; + jcols_ = nullptr; + vals_ = nullptr; + + delete [] irows_host_; + delete [] jcols_host_; + delete [] vals_host_ ; + + irows_host_ = nullptr; + jcols_host_ = nullptr; + vals_host_ = nullptr; + + n_ = 0; + nnz_ = 0; + } + void MatrixCsr::update_from_host_mirror() { checkCudaErrors(cudaMemcpy(irows_, irows_host_, sizeof(int) * (n_+1), cudaMemcpyHostToDevice)); diff --git a/src/LinAlg/ReSolve/MatrixCsr.hpp b/src/LinAlg/ReSolve/MatrixCsr.hpp index abe50e2d8..019e834c3 100644 --- a/src/LinAlg/ReSolve/MatrixCsr.hpp +++ b/src/LinAlg/ReSolve/MatrixCsr.hpp @@ -9,6 +9,7 @@ class MatrixCsr ~MatrixCsr(); void allocate_size(int n); void allocate_nnz(int nnz); + void clear_data(); int* get_irows() { diff --git a/src/LinAlg/ReSolve/RefactorizationSolver.cpp b/src/LinAlg/ReSolve/RefactorizationSolver.cpp index a3fca40b1..8d70e8936 100644 --- a/src/LinAlg/ReSolve/RefactorizationSolver.cpp +++ b/src/LinAlg/ReSolve/RefactorizationSolver.cpp @@ -57,9 +57,6 @@ #include "IterativeRefinement.hpp" #include "RefactorizationSolver.hpp" -#include "hiop_blasdefs.hpp" -// #include "KrylovSolverKernels.h" - #include "klu.h" #include "cusparse_v2.h" #include @@ -86,6 +83,9 @@ namespace ReSolve { cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + // Allocate host mirror for the solution vector + hostx_ = new double[n_]; + // Allocate solution and rhs vectors checkCudaErrors(cudaMalloc(&devx_, n_ * sizeof(double))); checkCudaErrors(cudaMalloc(&devr_, n_ * sizeof(double))); @@ -104,6 +104,9 @@ namespace ReSolve { cublasDestroy(handle_cublas_); cusparseDestroyMatDescr(descr_A_); + // Delete host mirror for the solution vector + delete [] hostx_; + // Delete residual and solution vectors cudaFree(devr_); cudaFree(devx_); @@ -201,27 +204,24 @@ namespace ReSolve { int RefactorizationSolver::refactorize() { - // TODO: This memcpy should not be in this function. - checkCudaErrors(cudaMemcpy(mat_A_csr_->get_vals(), mat_A_csr_->get_vals_host(), sizeof(double) * nnz_, cudaMemcpyHostToDevice)); - if(refact_ == "glu") { sp_status_ = cusolverSpDgluReset(handle_cusolver_, n_, /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), //da_, - mat_A_csr_->get_irows(), //dia_, - mat_A_csr_->get_jcols(), //dja_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), info_M_); sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); } else { if(refact_ == "rf") { sp_status_ = cusolverRfResetValues(n_, nnz_, - mat_A_csr_->get_irows(), //dia_, - mat_A_csr_->get_jcols(), //dja_, - mat_A_csr_->get_vals(), //da_, + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), + mat_A_csr_->get_vals(), d_P_, d_Q_, handle_rf_); @@ -232,83 +232,120 @@ namespace ReSolve { return 0; } - bool RefactorizationSolver::triangular_solve(double* dx, const double* rhs, double tol) + bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string memspace) { - if(refact_ == "glu") { + if(refact_ == "glu") + { + double* devx = nullptr; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + devx = dx; + } else { + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + devx = devx_; + } sp_status_ = cusolverSpDgluSolve(handle_cusolver_, n_, /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), //da_, - mat_A_csr_->get_irows(), //dia_, - mat_A_csr_->get_jcols(), //dja_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), devr_,/* right hand side */ - devx_,/* left hand side */ + devx,/* left hand side, local pointer */ &ite_refine_succ_, &r_nrminf_, info_M_, d_work_); - - if(sp_status_ == 0) { - checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); - } else { - // TODO: Implement ReSolve logging system to output these messages + if(sp_status_ != 0 && !silent_output_) { std::cout << "GLU solve failed with status: " << sp_status_ << "\n"; return false; } - } else { - if(refact_ == "rf") { - if(!is_first_solve_) { - sp_status_ = cusolverRfSolve(handle_rf_, - d_P_, - d_Q_, - 1, - d_T_, - n_, - devr_, - n_); - - if(sp_status_ == 0) { - if(use_ir_ == "yes") { - // Set tolerance based on barrier parameter mu - ir_->set_tol(tol); - - checkCudaErrors(cudaMemcpy(devx_, rhs, sizeof(double) * n_, cudaMemcpyHostToDevice)); - - ir_->fgmres(devr_, devx_); - if(!silent_output_ && (ir_->getFinalResidalNorm() > tol*ir_->getBNorm())) { - std::cout << "[Warning] Iterative refinement did not converge!\n"; - std::cout << "\t Iterative refinement tolerance " << tol << "\n"; - std::cout << "\t Relative solution error " << ir_->getFinalResidalNorm()/ir_->getBNorm() << "\n"; - std::cout << "\t fgmres: init residual norm: " << ir_->getInitialResidalNorm() << "\n" - << "\t final residual norm: " << ir_->getFinalResidalNorm() << "\n" - << "\t number of iterations: " << ir_->getFinalNumberOfIterations() << "\n"; - } - - } - checkCudaErrors(cudaMemcpy(dx, devr_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); - - - } else { - // TODO: Implement ReSolve logging system to output these messages - std::cout << "Rf solve failed with status: " << sp_status_ << "\n"; - return false; - } + if(memspace == "device") { + // do nothing + } else { + checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + } + return true; + } + + if(refact_ == "rf") + { + // First solve is performed on CPU + if(is_first_solve_) + { + double* hostx = nullptr; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(hostx_, dx, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + hostx = hostx_; + } else { + hostx = dx; + } + int ok = klu_solve(Symbolic_, Numeric_, n_, 1, hostx, &Common_); // replace dx with hostx + klu_free_numeric(&Numeric_, &Common_); + klu_free_symbolic(&Symbolic_, &Common_); + is_first_solve_ = false; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(dx, hostx, sizeof(double) * n_, cudaMemcpyHostToDevice)); } else { - memcpy(dx, rhs, sizeof(double) * n_); - int ok = klu_solve(Symbolic_, Numeric_, n_, 1, dx, &Common_); - klu_free_numeric(&Numeric_, &Common_); - klu_free_symbolic(&Symbolic_, &Common_); - is_first_solve_ = false; + // do nothing + } + return true; + } + + double* devx = nullptr; + if(memspace == "device") { + devx = dx; + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + } else { + checkCudaErrors(cudaMemcpy(devx_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(devr_, devx_, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + devx = devx_; + } + + // Each next solve is performed on GPU + sp_status_ = cusolverRfSolve(handle_rf_, + d_P_, + d_Q_, + 1, + d_T_, + n_, + devx, // replace devx_ with local pointer devx + n_); + if(sp_status_ != 0) { + if(!silent_output_) + std::cout << "Rf solve failed with status: " << sp_status_ << "\n"; + return false; + } + + if(use_ir_ == "yes") { + // Set tolerance based on barrier parameter mu + ir_->set_tol(tol); + + ir_->fgmres(devx, devr_); // replace devx_ with local pointer devx + if(!silent_output_ && (ir_->getFinalResidalNorm() > tol*ir_->getBNorm())) { + std::cout << "[Warning] Iterative refinement did not converge!\n"; + std::cout << "\t Iterative refinement tolerance " << tol << "\n"; + std::cout << "\t Relative solution error " << ir_->getFinalResidalNorm()/ir_->getBNorm() << "\n"; + std::cout << "\t fgmres: init residual norm: " << ir_->getInitialResidalNorm() << "\n" + << "\t final residual norm: " << ir_->getFinalResidalNorm() << "\n" + << "\t number of iterations: " << ir_->getFinalNumberOfIterations() << "\n"; } + + } + if(memspace == "device") { + // do nothing } else { - // TODO: Implement ReSolve logging system to output these messages - std::cout << "Unknown refactorization, exiting\n"; - assert(false && "Only GLU and cuSolverRf are available refactorizations."); + checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); } + return true; + } + + if(!silent_output_) { + std::cout << "Unknown refactorization " << refact_ << ", exiting\n"; } - return true; + return false; } // helper private function needed for format conversion @@ -380,7 +417,6 @@ namespace ReSolve { int RefactorizationSolver::initializeCusolverGLU() { - // nlp_->log->printf(hovScalars, "CUSOLVER: Glu \n"); cusparseCreateMatDescr(&descr_M_); cusparseSetMatType(descr_M_, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr_M_, CUSPARSE_INDEX_BASE_ZERO); @@ -396,7 +432,6 @@ namespace ReSolve { int RefactorizationSolver::initializeCusolverRf() { - // nlp_->log->printf(hovScalars, "CUSOLVER: Rf \n"); cusolverRfCreate(&handle_rf_); checkCudaErrors(cusolverRfSetAlgs(handle_rf_, @@ -503,9 +538,9 @@ namespace ReSolve { /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), //da_, - mat_A_csr_->get_irows(), //dia_, - mat_A_csr_->get_jcols(), //dja_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), info_M_); assert(CUSOLVER_STATUS_SUCCESS == sp_status_); @@ -729,8 +764,8 @@ namespace ReSolve { // KS: might later become part of src/Utils, putting it here for now template void RefactorizationSolver::resolveCheckCudaError(T result, - const char* const file, - int const line) + const char* const file, + int const line) { if(result) { fprintf(stdout, diff --git a/src/LinAlg/ReSolve/RefactorizationSolver.hpp b/src/LinAlg/ReSolve/RefactorizationSolver.hpp index 68e3513b3..f88736a21 100644 --- a/src/LinAlg/ReSolve/RefactorizationSolver.hpp +++ b/src/LinAlg/ReSolve/RefactorizationSolver.hpp @@ -175,7 +175,7 @@ class RefactorizationSolver * @param tol * @return bool */ - bool triangular_solve(double* dx, const double* rhs, double tol); + bool triangular_solve(double* dx, double tol, std::string memspace); private: @@ -225,14 +225,17 @@ class RefactorizationSolver int* mia_ = nullptr; int* mja_ = nullptr; + /* CPU data */ + double* hostx_ = nullptr; + /* for GPU data */ - double* devx_; - double* devr_; + double* devx_ = nullptr; + double* devr_ = nullptr; /* needed for cuSolverRf */ - int* d_P_; - int* d_Q_; // permutation matrices - double* d_T_; + int* d_P_ = nullptr; + int* d_Q_ = nullptr; // permutation matrices + double* d_T_ = nullptr; /** * @brief Function that computes M = (L-I) + U diff --git a/src/LinAlg/hiopLinSolverSparseReSolve.cpp b/src/LinAlg/hiopLinSolverSparseReSolve.cpp index 22b48d6d4..5ae3c6d0d 100644 --- a/src/LinAlg/hiopLinSolverSparseReSolve.cpp +++ b/src/LinAlg/hiopLinSolverSparseReSolve.cpp @@ -49,6 +49,7 @@ * @file hiopLinSolverSparseReSolve.cpp * * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL * */ @@ -66,21 +67,72 @@ #define checkCudaErrors(val) hiopCheckCudaError((val), __FILE__, __LINE__) + +/** + * @brief Map elements of one array to the other + * + * for(int k = 0; k < nnz_; k++) { + * vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; + * } + * + */ +template +__global__ void +mapArraysKernel(T* dst, const T* src, const I* mapidx, I n) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if (tid < n) + { + dst[tid] = src[ mapidx[tid] ]; + } +} + +/** + * @brief Map elements of one array to the other + * + * for(int i = 0; i < n_; i++) { + * if(index_convert_extra_Diag2CSR_host_[i] != -1) + * vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + * } + * + */ +template +__global__ void +addToArrayKernel(T* dst, const T* src, const I* mapidx, I n, I nnz) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if (tid < n) + { + if(mapidx[tid] != -1) + dst[ mapidx[tid] ] += src[nnz - n + tid]; + } +} + + namespace hiop { hiopLinSolverSymSparseReSolve::hiopLinSolverSymSparseReSolve(const int& n, const int& nnz, hiopNlpFormulation* nlp) : hiopLinSolverSymSparse(n, nnz, nlp), - index_covert_CSR2Triplet_{ nullptr }, - index_covert_extra_Diag2CSR_{ nullptr }, + index_convert_CSR2Triplet_host_{ nullptr }, + index_convert_extra_Diag2CSR_host_{ nullptr }, + index_convert_CSR2Triplet_device_{ nullptr }, + index_convert_extra_Diag2CSR_device_{ nullptr }, n_{ n }, nnz_{ 0 }, factorizationSetupSucc_{ 0 }, is_first_call_{ true } { + // Create ReSolve solver and allocate rhs temporary storage solver_ = new ReSolve::RefactorizationSolver(n); - rhs_ = new double[n]{ 0 }; + + // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format + if(nlp_->options->GetString("mem_space") == "device") { + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } // Set verbosity of ReSolve based on HiOp verbosity if(nlp_->options->GetInteger("verbosity_level") >= 3) { @@ -101,7 +153,7 @@ namespace hiop ordering = 1; } solver_->ordering() = ordering; - std::cout << "Ordering: " << solver_->ordering() << "\n"; + nlp_->log->printf(hovSummary, "Ordering: %d\n", solver_->ordering()); // Select factorization std::string fact; @@ -113,7 +165,7 @@ namespace hiop fact = "klu"; } solver_->fact() = fact; - std::cout << "Factorization: " << solver_->fact() << "\n"; + nlp_->log->printf(hovSummary, "Factorization: %s\n", solver_->fact().c_str()); // Select refactorization std::string refact; @@ -125,7 +177,7 @@ namespace hiop refact = "glu"; } solver_->refact() = refact; - std::cout << "Refactorization: " << solver_->refact() << "\n"; + nlp_->log->printf(hovSummary, "Refactorization: %s\n", solver_->refact().c_str()); // by default, dont use iterative refinement std::string use_ir; @@ -206,17 +258,23 @@ namespace hiop } } solver_->use_ir() = use_ir; - std::cout << "Use IR: " << solver_->use_ir() << "\n"; + nlp_->log->printf(hovSummary, "Use IR: %s\n", solver_->use_ir().c_str()); } // constructor hiopLinSolverSymSparseReSolve::~hiopLinSolverSymSparseReSolve() { delete solver_; - delete [] rhs_; + + // If memory space is device, delete allocated host mirrors + if(nlp_->options->GetString("mem_space") == "device") { + delete M_host_; + } // Delete CSR <--> triplet mappings - delete[] index_covert_CSR2Triplet_; - delete[] index_covert_extra_Diag2CSR_; + delete[] index_convert_CSR2Triplet_host_; + delete[] index_convert_extra_Diag2CSR_host_; + checkCudaErrors(cudaFree(index_convert_CSR2Triplet_device_)); + checkCudaErrors(cudaFree(index_convert_extra_Diag2CSR_device_)); } int hiopLinSolverSymSparseReSolve::matrixChanged() @@ -263,11 +321,10 @@ namespace hiop // Set IR tolerance double ir_tol = nlp_->options->GetNumeric("ir_inner_tol"); + std::string mem_space = nlp_->options->GetString("mem_space"); double* dx = x.local_data(); - memcpy(rhs_, dx, n_*sizeof(double)); - checkCudaErrors(cudaMemcpy(solver_->devr(), rhs_, sizeof(double) * n_, cudaMemcpyHostToDevice)); - bool retval = solver_->triangular_solve(dx, rhs_, ir_tol); + bool retval = solver_->triangular_solve(dx, ir_tol, mem_space); if(!retval) { nlp_->log->printf(hovError, // catastrophic failure "ReSolve triangular solver failed\n"); @@ -282,6 +339,14 @@ namespace hiop assert(n_ == M_->n() && M_->n() == M_->m()); assert(n_ > 0); + // If the matrix is on device, copy it to the host mirror + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + checkCudaErrors(cudaMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(M_host_->i_row(), M_->i_row(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(M_host_->j_col(), M_->j_col(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + } + // Transfer triplet to CSR form // Allocate row pointers and compute number of nonzeros. @@ -312,20 +377,44 @@ namespace hiop is_first_call_ = false; } + /// nnz_ is number of nonzeros in CSR matrix + /// M_->numberOfNonzeros() is number of zeros in symmetric triplet matrix void hiopLinSolverSymSparseReSolve::update_matrix_values() { - double* vals = solver_->mat_A_csr()->get_vals_host(); - // update matrix - for(int k = 0; k < nnz_; k++) { - vals[k] = M_->M()[index_covert_CSR2Triplet_[k]]; - } - for(int i = 0; i < n_; i++) { - if(index_covert_extra_Diag2CSR_[i] != -1) - vals[index_covert_extra_Diag2CSR_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + + double* csr_vals = solver_->mat_A_csr()->get_vals(); + double* coo_vals = M_->M(); + int coo_nnz = M_->numberOfNonzeros(); + + const int blocksize = 512; + int gridsize = (nnz_ + blocksize - 1) / blocksize; + mapArraysKernel<<< gridsize, blocksize >>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); + + gridsize = (n_ + blocksize - 1) / blocksize; + addToArrayKernel<<< gridsize, blocksize>>>(csr_vals, coo_vals, index_convert_extra_Diag2CSR_device_, n_, coo_nnz); + + // If factorization was not successful, we need a copy of values on the host + if(factorizationSetupSucc_ == 0) + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals_host(), solver_->mat_A_csr()->get_vals(), sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); + + } else { + // KKT matrix is on the host + double* vals = solver_->mat_A_csr()->get_vals_host(); + // update matrix + for(int k = 0; k < nnz_; k++) { + vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; + } + for(int i = 0; i < n_; i++) { + if(index_convert_extra_Diag2CSR_host_[i] != -1) + vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + } + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals(), solver_->mat_A_csr()->get_vals_host(), sizeof(double) * nnz_, cudaMemcpyHostToDevice)); } } - + /// @pre Data is either on the host or the host mirror is synced with the device void hiopLinSolverSymSparseReSolve::compute_nnz() { // @@ -333,12 +422,24 @@ namespace hiop // int* row_ptr = solver_->mat_A_csr()->get_irows_host(); + // If the data is on device, fetch it from the host mirror + hiopMatrixSparse* M_host = nullptr; + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "host" || mem_space == "default") { + M_host = M_; + } else if(mem_space == "device") { + M_host = M_host_; + } else { + nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + } + + // off-diagonal part row_ptr[0] = 0; - for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) { - if(M_->i_row()[k] != M_->j_col()[k]) { - row_ptr[M_->i_row()[k] + 1]++; - row_ptr[M_->j_col()[k] + 1]++; + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + if(M_host->i_row()[k] != M_host->j_col()[k]) { + row_ptr[M_host->i_row()[k] + 1]++; + row_ptr[M_host->j_col()[k] + 1]++; nnz_ += 2; } } @@ -354,8 +455,20 @@ namespace hiop assert(nnz_ == row_ptr[n_]); } + /// @pre Data is either on the host or the host mirror is synced with the device void hiopLinSolverSymSparseReSolve::set_csr_indices_values() { + // If the data is on device, fetch it from the host mirror + hiopMatrixSparse* M_host = nullptr; + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "host" || mem_space == "default") { + M_host = M_; + } else if(mem_space == "device") { + M_host = M_host_; + } else { + nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + } + // // set correct col index and value // @@ -363,40 +476,42 @@ namespace hiop int* col_idx = solver_->mat_A_csr()->get_jcols_host(); double* vals = solver_->mat_A_csr()->get_vals_host(); - index_covert_CSR2Triplet_ = new int[nnz_]; - index_covert_extra_Diag2CSR_ = new int[n_]; + index_convert_CSR2Triplet_host_ = new int[nnz_]; + index_convert_extra_Diag2CSR_host_ = new int[n_]; + checkCudaErrors(cudaMalloc(&index_convert_CSR2Triplet_device_, nnz_ * sizeof(int))); + checkCudaErrors(cudaMalloc(&index_convert_extra_Diag2CSR_device_, n_ * sizeof(int))); int* nnz_each_row_tmp = new int[n_]{ 0 }; int total_nnz_tmp{ 0 }, nnz_tmp{ 0 }, rowID_tmp, colID_tmp; for(int k = 0; k < n_; k++) { - index_covert_extra_Diag2CSR_[k] = -1; + index_convert_extra_Diag2CSR_host_[k] = -1; } - for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) { - rowID_tmp = M_->i_row()[k]; - colID_tmp = M_->j_col()[k]; + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + rowID_tmp = M_host->i_row()[k]; + colID_tmp = M_host->j_col()[k]; if(rowID_tmp == colID_tmp) { nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; col_idx[nnz_tmp] = colID_tmp; - vals[nnz_tmp] = M_->M()[k]; - index_covert_CSR2Triplet_[nnz_tmp] = k; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; - vals[nnz_tmp] += M_->M()[M_->numberOfNonzeros() - n_ + rowID_tmp]; - index_covert_extra_Diag2CSR_[rowID_tmp] = nnz_tmp; + vals[nnz_tmp] += M_host->M()[M_host->numberOfNonzeros() - n_ + rowID_tmp]; + index_convert_extra_Diag2CSR_host_[rowID_tmp] = nnz_tmp; nnz_each_row_tmp[rowID_tmp]++; total_nnz_tmp++; } else { nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; col_idx[nnz_tmp] = colID_tmp; - vals[nnz_tmp] = M_->M()[k]; - index_covert_CSR2Triplet_[nnz_tmp] = k; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; nnz_tmp = nnz_each_row_tmp[colID_tmp] + row_ptr[colID_tmp]; col_idx[nnz_tmp] = rowID_tmp; - vals[nnz_tmp] = M_->M()[k]; - index_covert_CSR2Triplet_[nnz_tmp] = k; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; nnz_each_row_tmp[rowID_tmp]++; nnz_each_row_tmp[colID_tmp]++; @@ -409,8 +524,8 @@ namespace hiop assert(nnz_each_row_tmp[i] == row_ptr[i + 1] - row_ptr[i] - 1); nnz_tmp = nnz_each_row_tmp[i] + row_ptr[i]; col_idx[nnz_tmp] = i; - vals[nnz_tmp] = M_->M()[M_->numberOfNonzeros() - n_ + i]; - index_covert_CSR2Triplet_[nnz_tmp] = M_->numberOfNonzeros() - n_ + i; + vals[nnz_tmp] = M_host->M()[M_host->numberOfNonzeros() - n_ + i]; + index_convert_CSR2Triplet_host_[nnz_tmp] = M_host->numberOfNonzeros() - n_ + i; total_nnz_tmp += 1; std::vector ind_temp(row_ptr[i + 1] - row_ptr[i]); @@ -422,10 +537,12 @@ namespace hiop ); reorder(vals + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); - reorder(index_covert_CSR2Triplet_ + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); + reorder(index_convert_CSR2Triplet_host_ + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); } } + checkCudaErrors(cudaMemcpy(index_convert_CSR2Triplet_device_, index_convert_CSR2Triplet_host_, nnz_ * sizeof(int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(index_convert_extra_Diag2CSR_device_, index_convert_extra_Diag2CSR_host_, n_ * sizeof(int), cudaMemcpyHostToDevice)); delete[] nnz_each_row_tmp; } @@ -447,88 +564,5 @@ namespace hiop } - - hiopLinSolverSymSparseReSolveGPU::hiopLinSolverSymSparseReSolveGPU(const int& n, - const int& nnz, - hiopNlpFormulation* nlp) - : hiopLinSolverSymSparseReSolve(n, nnz, nlp), - rhs_host_{nullptr}, - M_host_{nullptr} - { - rhs_host_ = LinearAlgebraFactory::create_vector("default", n); - M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); - } - - hiopLinSolverSymSparseReSolveGPU::~hiopLinSolverSymSparseReSolveGPU() - { - delete rhs_host_; - delete M_host_; - } - - int hiopLinSolverSymSparseReSolveGPU::matrixChanged() - { - size_type nnz = M_->numberOfNonzeros(); - double* mval_dev = M_->M(); - double* mval_host= M_host_->M(); - - index_type* irow_dev = M_->i_row(); - index_type* irow_host= M_host_->i_row(); - - index_type* jcol_dev = M_->j_col(); - index_type* jcol_host= M_host_->j_col(); - - if("device" == nlp_->options->GetString("mem_space")) { - checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); - } else { - checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyHostToHost)); - checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); - checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); - } - - hiopMatrixSparse* swap_ptr = M_; - M_ = M_host_; - M_host_ = swap_ptr; - - int vret = hiopLinSolverSymSparseReSolve::matrixChanged(); - - swap_ptr = M_; - M_ = M_host_; - M_host_ = swap_ptr; - - return vret; - } - - bool hiopLinSolverSymSparseReSolveGPU::solve(hiopVector& x) - { - double* mval_dev = x.local_data(); - double* mval_host= rhs_host_->local_data(); - - if("device" == nlp_->options->GetString("mem_space")) { - checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost)); - } else { - checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyHostToHost)); - } - - hiopMatrixSparse* swap_ptr = M_; - M_ = M_host_; - M_host_ = swap_ptr; - - bool bret = hiopLinSolverSymSparseReSolve::solve(*rhs_host_); - - swap_ptr = M_; - M_ = M_host_; - M_host_ = swap_ptr; - - if("device" == nlp_->options->GetString("mem_space")) { - checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice)); - } else { - checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToHost)); - } - - return bret; - } - } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseReSolve.hpp b/src/LinAlg/hiopLinSolverSparseReSolve.hpp index 0e08b1ad1..bc33e9835 100644 --- a/src/LinAlg/hiopLinSolverSparseReSolve.hpp +++ b/src/LinAlg/hiopLinSolverSparseReSolve.hpp @@ -50,6 +50,7 @@ * @file hiopLinSolverSparseReSolve.hpp * * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL * */ @@ -112,20 +113,24 @@ class hiopLinSolverSymSparseReSolve : public hiopLinSolverSymSparse protected: ReSolve::RefactorizationSolver* solver_; - /* Right-hand side vector */ - double* rhs_; - int m_; ///< number of rows of the whole matrix int n_; ///< number of cols of the whole matrix int nnz_; ///< number of nonzeros in the matrix - int* index_covert_CSR2Triplet_; - int* index_covert_extra_Diag2CSR_; + // Mapping on the host + int* index_convert_CSR2Triplet_host_; + int* index_convert_extra_Diag2CSR_host_; + + // Mapping on the device + int* index_convert_CSR2Triplet_device_; + int* index_convert_extra_Diag2CSR_device_; // Algorithm control flags int factorizationSetupSucc_; bool is_first_call_; + hiopMatrixSparse* M_host_{ nullptr }; ///< Host mirror for the KKT matrix + /* private function: creates a cuSolver data structure from KLU data * structures. */ @@ -152,28 +157,6 @@ class hiopLinSolverSymSparseReSolve : public hiopLinSolverSymSparse template void hiopCheckCudaError(T result, const char* const file, int const line); }; -class hiopLinSolverSymSparseReSolveGPU : public hiopLinSolverSymSparseReSolve -{ -public: - hiopLinSolverSymSparseReSolveGPU(const int& n, const int& nnz, hiopNlpFormulation* nlp); - virtual ~hiopLinSolverSymSparseReSolveGPU(); - - virtual int matrixChanged(); - virtual bool solve(hiopVector& x_); - - /** Multiple rhs not supported yet */ - virtual bool - solve(hiopMatrix& /* x */) - { - assert(false && "not yet supported"); - return false; - } - -private: - hiopVector* rhs_host_; - hiopMatrixSparse* M_host_; -}; - } // namespace hiop diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index 5b2c79d88..5ff1e20b7 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -335,7 +335,7 @@ namespace hiop if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "resolve") { #if defined(HIOP_USE_RESOLVE) - linSys_ = new hiopLinSolverSymSparseReSolveGPU(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); linsol_actual = "ReSolve"; auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { @@ -695,7 +695,7 @@ namespace hiop if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) actual_lin_solver = "ReSolve"; - linSys_ = new hiopLinSolverSymSparseReSolveGPU(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { nlp_->log->printf(hovError, @@ -766,7 +766,7 @@ namespace hiop if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) - linSys_ = new hiopLinSolverSymSparseReSolveGPU(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc ReSolve size %d (%d cons) (gpu)\n", n,