diff --git a/CMakeLists.txt b/CMakeLists.txt index 5715e6e2..d3b566ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,13 @@ option(RESOLVE_USE_GPU "Use GPU device for computations" ON) option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON) set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") +if(RESOLVE_USE_CUDA) + set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE) +else() + set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE) +endif() + + set(CMAKE_MACOSX_RPATH 1) # set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") # https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath @@ -130,10 +137,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfigVersion.cmake" DESTINATION share/resolve/cmake) -# Add examples (for now only CUDA examples are available) -if(RESOLVE_USE_CUDA) - add_subdirectory(examples) -endif(RESOLVE_USE_CUDA) +# Add usage examples +add_subdirectory(examples) # Add tests add_subdirectory(tests) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 73dce68e..f2d323f4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,30 +14,40 @@ target_link_libraries(klu_klu.exe PRIVATE ReSolve) add_executable(klu_klu_standalone.exe r_KLU_KLU_standalone.cpp) target_link_libraries(klu_klu_standalone.exe PRIVATE ReSolve) -# Build example with KLU factorization and GLU refactorization -add_executable(klu_glu.exe r_KLU_GLU.cpp) -target_link_libraries(klu_glu.exe PRIVATE ReSolve) +# Create CUDA examples +if(RESOLVE_USE_CUDA) + + # Build example with KLU factorization and GLU refactorization + add_executable(klu_glu.exe r_KLU_GLU.cpp) + target_link_libraries(klu_glu.exe PRIVATE ReSolve) -# Build example with KLU factorization and Rf refactorization -add_executable(klu_rf.exe r_KLU_rf.cpp) -target_link_libraries(klu_rf.exe PRIVATE ReSolve) + # Build example with KLU 
factorization and Rf refactorization + add_executable(klu_rf.exe r_KLU_rf.cpp) + target_link_libraries(klu_rf.exe PRIVATE ReSolve) -# Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement -add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp) -target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve) + # Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement + add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp) + target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve) -# Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED -add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp) -target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve) + # Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED + add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp) + target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve) -# Build example where matrix data is updated -add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp) -target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve) + # Build example where matrix data is updated + add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp) + target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve) + +endif(RESOLVE_USE_CUDA) # Install all examples in bin directory -set(installable_executables klu_klu.exe klu_klu_standalone.exe klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) +set(installable_executables klu_klu.exe klu_klu_standalone.exe) + +if(RESOLVE_USE_CUDA) + set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) +endif(RESOLVE_USE_CUDA) + install(TARGETS ${installable_executables} RUNTIME 
DESTINATION bin) diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp index a4c059a9..e2cbfde4 100644 --- a/examples/r_KLU_GLU.cpp +++ b/examples/r_KLU_GLU.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -145,10 +146,12 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } // for (int i = 0; i < numSystems; ++i) diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp index 96c77591..7d1bb141 100644 --- a/examples/r_KLU_GLU_matrix_values_update.cpp +++ b/examples/r_KLU_GLU_matrix_values_update.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include // this updates the matrix values to simulate what CFD/optimization software does. 
@@ -155,10 +156,12 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index 63aad4b5..8b0ea59a 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -9,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -35,10 +37,9 @@ int main(int argc, char *argv[]) ReSolve::matrix::Coo* A_coo; ReSolve::matrix::Csr* A; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); real_type* rhs; real_type* x; @@ -96,7 +97,11 @@ int main(int argc, char *argv[]) ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); } - std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<getNumRows() + << " x " << A->getNumColumns() + << ", nnz: " << A->getNnz() + << ", symmetric? " << A->symmetric() + << ", Expanded? 
" << A->expanded() << std::endl; mat_file.close(); rhs_file.close(); @@ -106,8 +111,8 @@ int main(int argc, char *argv[]) vec_rhs->update(rhs, "cpu", "cpu"); vec_rhs->setDataUpdated("cpu"); } else { - matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, "cpu", "cpu"); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); - - matrix_handler->setValuesChanged(true); - - matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); - real_type* test = vec_r->getData("cpu"); - (void) test; // TODO: Do we need `test` variable in this example? + vec_r->update(rhs, "cpu", "cpu"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + matrix_handler->setValuesChanged(true, "cpu"); + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n"; } //now DELETE diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp index fd84efe0..77e5b97a 100644 --- a/examples/r_KLU_KLU_standalone.cpp +++ b/examples/r_KLU_KLU_standalone.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -9,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -31,10 +33,9 @@ int main(int argc, char *argv[]) ReSolve::matrix::Coo* A_coo; ReSolve::matrix::Csr* A; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new 
ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); real_type* rhs; real_type* x; @@ -95,15 +96,15 @@ int main(int argc, char *argv[]) std::cout << "KLU factorization status: " << status << std::endl; status = KLU->solve(vec_rhs, vec_x); std::cout << "KLU solve status: " << status << std::endl; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cpu"); - matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); - real_type* test = vec_r->getData("cpu"); - (void) test; // TODO: Do we need `test` variable in this example? + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n"; diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index 9e166d34..01fa0f3c 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -10,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -158,11 +159,13 @@ int main(int argc, char *argv[] ) } vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } // for (int i = 0; i < numSystems; ++i) diff --git 
a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index db816206..ee674869 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -11,7 +12,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -125,7 +126,7 @@ int main(int argc, char *argv[]) real_type norm_b; if (i < 2){ KLU->setup(A); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = KLU->analyze(); std::cout<<"KLU analysis status: "<factorize(); @@ -135,7 +136,7 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); if (i == 1) { @@ -165,18 +166,25 @@ int main(int argc, char *argv[]) norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - //matrix_handler->setValuesChanged(true); + //matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); FGMRES->setupPreconditioner("CuSolverRf", Rf); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; vec_rhs->update(rhs, "cpu", "cuda"); FGMRES->solve(vec_rhs, vec_x); - printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter()); + std::cout << "FGMRES: init nrm: " + << std::scientific << 
std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; } } // for (int i = 0; i < numSystems; ++i) diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index d0ca3a56..6a520a7a 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -11,7 +13,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -126,7 +128,7 @@ int main(int argc, char *argv[]) real_type norm_b; if (i < 2){ KLU->setup(A); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = KLU->analyze(); std::cout<<"KLU analysis status: "<factorize(); @@ -136,9 +138,11 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); + std::cout << "\t 2-Norm of the residual : " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; if (i == 1) { ReSolve::matrix::Csc* L_csc = (ReSolve::matrix::Csc*) KLU->getLFactor(); ReSolve::matrix::Csc* U_csc = (ReSolve::matrix::Csc*) KLU->getUFactor(); @@ -147,7 +151,9 @@ int main(int argc, char *argv[]) matrix_handler->csc2csr(L_csc,L, "cuda"); matrix_handler->csc2csr(U_csc,U, "cuda"); - if (L == nullptr) {printf("ERROR");} + if (L == nullptr) { + std::cout << "ERROR\n"; + } index_type* P = KLU->getPOrdering(); index_type* Q = KLU->getQOrdering(); Rf->setup(A, L, U, P, Q); @@ 
-163,14 +169,17 @@ int main(int argc, char *argv[]) if ((i % 2 == 0)) { status = Rf->refactorize(); - std::cout<<"CUSOLVER RF, using REAL refactorization, refactorization status: "<update(rhs, "cpu", "cuda"); status = Rf->solve(vec_rhs, vec_x); FGMRES->setupPreconditioner("CuSolverRf", Rf); } - //if (i%2!=0) vec_x->setToZero("cuda"); + //if (i%2!=0) vec_x->setToZero("cuda"); real_type norm_x = vector_handler->dot(vec_x, vec_x, "cuda"); - printf("Norm of x(before solve): %16.16e \n", sqrt(norm_x)); + std::cout << "Norm of x (before solve): " + << std::scientific << std::setprecision(16) + << sqrt(norm_x) << "\n"; std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); @@ -178,20 +187,31 @@ int main(int argc, char *argv[]) norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); - printf("\t 2-Norm of the RIGHT HAND SIDE: %16.16e\n", norm_b); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; + std::cout << "\t 2-Norm of the RIGHT HAND SIDE: " + << std::scientific << std::setprecision(16) + << norm_b << "\n"; vec_rhs->update(rhs, "cpu", "cuda"); FGMRES->solve(vec_rhs, vec_x); - printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter()); + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; norm_x = vector_handler->dot(vec_x, vec_x, 
"cuda"); - printf("Norm of x (after IR): %16.16e \n", sqrt(norm_x)); + std::cout << "Norm of x (after IR): " + << std::scientific << std::setprecision(16) + << sqrt(norm_x) << "\n"; } diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 675fc11f..8dbcc467 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -8,21 +8,24 @@ add_subdirectory(utilities) +# C++ files set(ReSolve_SRC LinSolver.cpp LinSolverDirectKLU.cpp - LinAlgWorkspace.cpp +) + +# C++ code that links to CUDA SDK libraries +set(ReSolve_CUDASDK_SRC LinSolverIterativeFGMRES.cpp GramSchmidt.cpp LinSolverDirectCuSolverGLU.cpp LinSolverDirectCuSolverRf.cpp ) - +# Header files to be installed set(ReSolve_HEADER_INSTALL Common.hpp cusolver_defs.hpp - LinAlgWorkspace.hpp LinSolver.hpp LinSolverDirectCuSolverGLU.hpp LinSolverDirectCuSolverRf.hpp @@ -39,11 +42,15 @@ if(NOT RESOLVE_USE_GPU) add_subdirectory(cpu) endif() -# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) +# If CUDA support is enabled, create CUDA backend +# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) if(RESOLVE_USE_CUDA) add_subdirectory(cuda) endif() +# Now, build workspaces +add_subdirectory(workspace) + # Next build vector and matrix objects that may use this backend. 
add_subdirectory(vector) add_subdirectory(matrix) @@ -66,19 +73,24 @@ set(ReSolve_Targets_List resolve_vector resolve_logger resolve_tpl + resolve_workspace ) -if(RESOLVE_USE_GPU) - if(RESOLVE_USE_CUDA) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) - endif() -else(RESOLVE_USE_GPU) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) -endif(RESOLVE_USE_GPU) +# If CUDA support is enabled add CUDA SDK specific code and dependencies +if(RESOLVE_USE_CUDA) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC}) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) +endif() +# If no GPU support is enabled, link to dummy device backend +if(NOT RESOLVE_USE_GPU) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) +endif() +# Set installable targets install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets) +# Create ReSolve library add_library(ReSolve SHARED ${ReSolve_SRC}) target_include_directories(ReSolve INTERFACE @@ -89,10 +101,12 @@ target_include_directories(ReSolve INTERFACE # TODO: Make this PRIVATE dependency (requires refactoring ReSolve code) target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List}) +# Install targets install(TARGETS ReSolve EXPORT ReSolveTargets ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) + # install include headers install(FILES ${ReSolve_HEADER_INSTALL} DESTINATION include/resolve) diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index 63413c5b..75039ff4 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -3,11 +3,12 @@ #include #include +#include #include "LinSolverDirectCuSolverGLU.hpp" namespace ReSolve { - LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace) + LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace) { this->workspace_ = workspace; } @@ -25,7 +26,7 @@ namespace ReSolve { int 
error_sum = 0; - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; //get the handle handle_cusolversp_ = workspaceCUDA->getCusolverSpHandle(); A_ = (matrix::Csr*) A; diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp index d76e1921..a48c8cba 100644 --- a/resolve/LinSolverDirectCuSolverGLU.hpp +++ b/resolve/LinSolverDirectCuSolverGLU.hpp @@ -1,6 +1,5 @@ #pragma once #include "Common.hpp" -#include #include "LinSolver.hpp" #include "cusolver_defs.hpp" #include @@ -19,12 +18,15 @@ namespace ReSolve class Sparse; } + // Forward declaration of ReSolve handlers workspace + class LinAlgWorkspace; + class LinSolverDirectCuSolverGLU : public LinSolverDirect { using vector_type = vector::Vector; public: - LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace); + LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace); ~LinSolverDirectCuSolverGLU(); int refactorize(); @@ -38,7 +40,7 @@ namespace ReSolve //note: we need cuSolver handle, we can copy it from the workspace to avoid double allocation cusparseMatDescr_t descr_M_; //this is NOT sparse matrix descriptor cusparseMatDescr_t descr_A_; //this is NOT sparse matrix descriptor - LinAlgWorkspace *workspace_;// so we can copy cusparse handle + LinAlgWorkspaceCUDA* workspace_;// so we can copy cusparse handle cusolverSpHandle_t handle_cusolversp_; cusolverStatus_t status_cusolver_; cusparseStatus_t status_cusparse_; diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index 2a6da732..fa63f2d5 100644 --- a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -66,12 +66,12 @@ namespace ReSolve { if (d_V_ != nullptr) { // cudaFree(d_V_); - delete [] d_V_; + delete d_V_; } if (d_Z_ != nullptr) { // cudaFree(d_Z_); - delete [] d_Z_; + delete d_Z_; } } @@ -308,7 +308,7 @@ namespace ReSolve int LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* 
new_matrix) { A_ = new_matrix; - matrix_handler_->setValuesChanged(true); + matrix_handler_->setValuesChanged(true, "cuda"); return 0; } diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 77c2a7af..00f3d653 100644 --- a/resolve/MemoryUtils.hpp +++ b/resolve/MemoryUtils.hpp @@ -48,14 +48,16 @@ namespace ReSolve } // namespace ReSolve -// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager. #ifdef RESOLVE_USE_GPU +// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager. #if defined RESOLVE_USE_CUDA #include using MemoryHandler = ReSolve::MemoryUtils; #elif defined RESOLVE_USE_HIP #error HIP support requested, but not available! Probably a bug in CMake configuration. +#else +#error Unrecognized device, probably bug in CMake configuration #endif #else diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt index cb84dc55..7105655c 100644 --- a/resolve/cpu/CMakeLists.txt +++ b/resolve/cpu/CMakeLists.txt @@ -8,6 +8,7 @@ set(ReSolve_CPU_SRC MemoryUtils.cpp + cpuVectorKernels.cpp ) set(ReSolve_CPU_HEADER_INSTALL @@ -16,6 +17,8 @@ set(ReSolve_CPU_HEADER_INSTALL # First create dummy backend add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC}) +target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger) + target_include_directories(resolve_backend_cpu INTERFACE $ $ diff --git a/resolve/cpu/cpuVectorKernels.cpp b/resolve/cpu/cpuVectorKernels.cpp new file mode 100644 index 00000000..26a20413 --- /dev/null +++ b/resolve/cpu/cpuVectorKernels.cpp @@ -0,0 +1,15 @@ +#include +#include + + +namespace ReSolve { namespace vector { + + +void set_array_const(index_type n, real_type val, real_type* arr) +{ + for(index_type i = 0; i < n; ++i) { + arr[i] = val; + } +} + +}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt index d3ead313..f97267bc 100644 --- a/resolve/cuda/CMakeLists.txt +++ 
b/resolve/cuda/CMakeLists.txt @@ -8,19 +8,24 @@ set(ReSolve_CUDA_SRC cudaKernels.cu + cudaVectorKernels.cu MemoryUtils.cu ) set(ReSolve_CUDA_HEADER_INSTALL cudaKernels.h + cudaVectorKernels.h CudaMemory.hpp cuda_check_errors.hpp ) set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA) -# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) +# First create CUDA backend +# (this should really be CUDA _API_ backend, +# separate backend will be needed for CUDA SDK) add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC}) +target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger) target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda) target_include_directories(resolve_backend_cuda INTERFACE $ diff --git a/resolve/vector/cudaVectorKernels.cu b/resolve/cuda/cudaVectorKernels.cu similarity index 92% rename from resolve/vector/cudaVectorKernels.cu rename to resolve/cuda/cudaVectorKernels.cu index ea54a880..a1c53198 100644 --- a/resolve/vector/cudaVectorKernels.cu +++ b/resolve/cuda/cudaVectorKernels.cu @@ -1,5 +1,5 @@ #include "cudaVectorKernels.h" -#include "VectorKernels.hpp" +#include namespace ReSolve { namespace vector { diff --git a/resolve/vector/cudaVectorKernels.h b/resolve/cuda/cudaVectorKernels.h similarity index 100% rename from resolve/vector/cudaVectorKernels.h rename to resolve/cuda/cudaVectorKernels.h diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 89ac925d..554c0ba7 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -1,4 +1,12 @@ +#[[ +@brief Build ReSolve matrix module + +@author Slaven Peles + +]] + +# C++ code set(Matrix_SRC io.cpp Sparse.cpp @@ -6,9 +14,15 @@ set(Matrix_SRC Csc.cpp Coo.cpp MatrixHandler.cpp + MatrixHandlerCpu.cpp ) +# C++ code that depends on CUDA SDK libraries +set(Matrix_CUDASDK_SRC + MatrixHandlerCuda.cpp +) +# Header files to be installed set(Matrix_HEADER_INSTALL 
io.hpp Sparse.hpp @@ -18,13 +32,24 @@ set(Matrix_HEADER_INSTALL MatrixHandler.hpp ) +# Add CUDA matrix handler if CUDA support is enabled +if(RESOLVE_USE_CUDA) + set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC}) +endif() + + # Build shared library ReSolve::matrix add_library(resolve_matrix SHARED ${Matrix_SRC}) +target_link_libraries(resolve_matrix PRIVATE resolve_logger resolve_vector) +# Link to CUDA ReSolve backend if CUDA is support enabled if (RESOLVE_USE_CUDA) - target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) -else() - target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) +endif() + +# Link to dummy device backend if GPU support is not enabled +if (NOT RESOLVE_USE_GPU) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) endif() diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 19119731..8bf4302c 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -1,82 +1,98 @@ #include #include -#include #include #include #include #include +#include +#include #include "MatrixHandler.hpp" +#include "MatrixHandlerCpu.hpp" + +#ifdef RESOLVE_USE_CUDA +#include "MatrixHandlerCuda.hpp" +#endif namespace ReSolve { // Create a shortcut name for Logger static class using out = io::Logger; - //helper class - indexPlusValue::indexPlusValue() - { - idx_ = 0; - value_ = 0.0; - } - - - indexPlusValue::~indexPlusValue() - { - } - - void indexPlusValue::setIdx(index_type new_idx) - { - idx_ = new_idx; - } - - void indexPlusValue::setValue(real_type new_value) - { - value_ = new_value; - } - - index_type indexPlusValue::getIdx() - { - return idx_; - } - - real_type indexPlusValue::getValue() - { - return value_; - } - //end of helper class - + /** + * @brief Default constructor + * + * @post Instantiates CPU and CUDA matrix handlers, but does not + * create a workspace. 
+ * + * @todo There is little utility for the default constructor. Rethink its purpose. + * Consider making it private method. + */ MatrixHandler::MatrixHandler() { - this->new_matrix_ = true; - this->values_changed_ = true; + new_matrix_ = true; + cpuImpl_ = new MatrixHandlerCpu(); } + /** + * @brief Destructor + * + */ MatrixHandler::~MatrixHandler() { + if (isCpuEnabled_) delete cpuImpl_; + if (isCudaEnabled_) delete cudaImpl_; } - MatrixHandler::MatrixHandler(LinAlgWorkspace* new_workspace) + /** + * @brief Constructor taking pointer to the workspace as its parameter. + * + * @note The CPU implementation currently does not require a workspace. + * The workspace pointer parameter is provided for forward compatibility. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceCpu* new_workspace) { - workspace_ = new_workspace; + cpuImpl_ = new MatrixHandlerCpu(new_workspace); + isCpuEnabled_ = true; + isCudaEnabled_ = false; } - bool MatrixHandler::getValuesChanged() +#ifdef RESOLVE_USE_CUDA + /** + * @brief Constructor taking pointer to the CUDA workspace as its parameter. + * + * @post A CPU implementation instance is created because it is cheap and + * it does not require a workspace. + * + * @post A CUDA implementation instance is created with supplied workspace. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceCUDA* new_workspace) { - return this->values_changed_; + cpuImpl_ = new MatrixHandlerCpu(); + cudaImpl_ = new MatrixHandlerCuda(new_workspace); + isCpuEnabled_ = true; + isCudaEnabled_ = true; } +#endif - void MatrixHandler::setValuesChanged(bool toWhat) + void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace) { - this->values_changed_ = toWhat; + if (memspace == "cpu") { + cpuImpl_->setValuesChanged(isValuesChanged); + } else if (memspace == "cuda") { + cudaImpl_->setValuesChanged(isValuesChanged); + } else { + out::error() << "Unsupported device " << memspace << "\n"; + } } + /** + * @brief Converts COO to CSR matrix format. 
+ * + * Conversion takes place on CPU, and then CSR matrix is copied to `memspace`. + */ int MatrixHandler::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace) { - //this happens on the CPU not on the GPU - //but will return whatever memspace requested. - //count nnzs first - index_type nnz_unpacked = 0; index_type nnz = A_coo->getNnz(); index_type n = A_coo->getNumRows(); @@ -125,7 +141,7 @@ namespace ReSolve { index_type* nnz_shifts = new index_type[n]; std::fill_n(nnz_shifts, n , 0); - indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; + IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; csr_ia[0] = 0; @@ -229,189 +245,48 @@ namespace ReSolve { return 0; } - int MatrixHandler::matvec(matrix::Sparse* Ageneric, + /** + * @brief Matrix vector product: result = alpha * A * x + beta * result + * + * @param[in] A - Sparse matrix + * @param[in] vec_x - Vector multiplied by the matrix + * @param[out] vec_result - Vector where the result is stored + * @param[in] alpha - scalar parameter + * @param[in] beta - scalar parameter + * @param[in] matrixFormat - Only CSR format is supported at this time + * @param[in] memspace - Device where the product is computed + * @return result := alpha * A * x + beta * result + */ + int MatrixHandler::matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, const real_type* alpha, const real_type* beta, std::string matrixFormat, - std::string memspace) + std::string memspace) { - using namespace constants; - int error_sum = 0; - if (matrixFormat == "csr"){ - matrix::Csr* A = (matrix::Csr*) Ageneric; - cusparseStatus_t status; - //result = alpha *A*x + beta * result - if (memspace == "cuda" ){ - // std::cout << "Matvec on NVIDIA GPU ...\n"; - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX(); - //printf("is vec_x NULL? %d\n", vec_x->getData("cuda") == nullptr); - //printf("is vec_result NULL? 
%d\n", vec_result->getData("cuda") == nullptr); - cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F); - - - cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY(); - cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F); - - cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor(); - - void* buffer_spmv = workspaceCUDA->getSpmvBuffer(); - cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle(); - if (values_changed_){ - status = cusparseCreateCsr(&matA, - A->getNumRows(), - A->getNumColumns(), - A->getNnzExpanded(), - A->getRowData("cuda"), - A->getColData("cuda"), - A->getValues("cuda"), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); - error_sum += status; - values_changed_ = false; - } - if (!workspaceCUDA->matvecSetup()){ - //setup first, allocate, etc. - size_t bufferSize = 0; - - status = cusparseSpMV_bufferSize(handle_cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &MINUSONE, - matA, - vecx, - &ONE, - vecAx, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSize); - error_sum += status; - mem_.deviceSynchronize(); - mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize); - workspaceCUDA->setSpmvMatrixDescriptor(matA); - workspaceCUDA->setSpmvBuffer(buffer_spmv); - - workspaceCUDA->matvecSetupDone(); - } - - status = cusparseSpMV(handle_cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - alpha, - matA, - vecx, - beta, - vecAx, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - buffer_spmv); - error_sum += status; - mem_.deviceSynchronize(); - if (status) - out::error() << "Matvec status: " << status - << "Last error code: " << mem_.getLastDeviceError() << std::endl; - vec_result->setDataUpdated("cuda"); - - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecAx); - return error_sum; - } else { - if (memspace == "cpu"){ - //cpu csr matvec - index_type* ia = A->getRowData("cpu"); - index_type* ja = A->getColData("cpu"); - real_type* a = 
A->getValues("cpu"); - - real_type* x_data = vec_x->getData("cpu"); - real_type* result_data = vec_result->getData("cpu"); - real_type sum; - real_type y, t, c; - //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference - for (int i = 0; i < A->getNumRows(); ++i) { - sum = 0.0; - c = 0.0; - for (int j = ia[i]; j < ia[i+1]; ++j) { - y = ( a[j] * x_data[ja[j]]) - c; - t = sum + y; - c = (t - sum) - y; - sum = t; - // sum += ( a[j] * x_data[ja[j]]); - } - sum *= (*alpha); - result_data[i] = result_data[i]*(*beta) + sum; - } - vec_result->setDataUpdated("cpu"); - return 0; - } else { - out::error() << "Not implemented (yet)" << std::endl; - - return 1; - } - } + if (memspace == "cuda" ) { + return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); + } else if (memspace == "cpu") { + return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else { - - out::error() << "Not implemented (yet), only csr matvec is implemented" << std::endl; - return 1; + out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl; + return 1; } } int MatrixHandler::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace) { - //it ONLY WORKS WITH CUDA - index_type error_sum = 0; if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - - A_csr->allocateMatrixData("cuda"); - index_type n = A_csc->getNumRows(); - index_type m = A_csc->getNumRows(); - index_type nnz = A_csc->getNnz(); - size_t bufferSize; - void* d_work; - cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &bufferSize); - error_sum += status; - 
mem_.allocateBufferOnDevice(&d_work, bufferSize); - status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - d_work); - error_sum += status; - return error_sum; - mem_.deleteOnDevice(d_work); - } else { - out::error() << "Not implemented (yet)" << std::endl; + return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "cpu") { + out::warning() << "Using untested csc2csr on CPU ..." << std::endl; + return cpuImpl_->csc2csr(A_csc, A_csr); + } else { + out::error() << "csc2csr not implemented for " << memspace << " device." << std::endl; return -1; - } - - + } } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 366b5422..398a8039 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -1,12 +1,8 @@ -// this class encapsulates various matrix manipulation operations, commonly required by linear solvers: -// this includes -// (1) Matrix format conversion: coo2csr, csr2csc -// (2) Matrix vector product (SpMV) -// (3) Matrix 1-norm #pragma once #include #include + namespace ReSolve { namespace vector @@ -20,44 +16,41 @@ namespace ReSolve class Csc; class Csr; } - class LinAlgWorkspace; + class LinAlgWorkspaceCpu; + class LinAlgWorkspaceCUDA; + class MatrixHandlerImpl; } namespace ReSolve { - //helper class - class indexPlusValue - { - public: - indexPlusValue(); - ~indexPlusValue(); - void setIdx (index_type new_idx); - void setValue (real_type new_value); - - index_type getIdx(); - real_type getValue(); - - bool operator < (const indexPlusValue& str) const - { - return (idx_ < str.idx_); - } - - private: - index_type idx_; - real_type value_; - }; - + /** + * @brief this class 
encapsulates various matrix manipulation operations, + * commonly required by linear solvers. + * + * This includes: + * - Matrix format conversion: coo2csr, csr2csc + * - Matrix vector product (SpMV) + * - Matrix 1-norm + * + * The class uses pointer to implementation (PIMPL) idiom to create + * multiple matrix operation implementations running on CUDA and HIP devices + * as well as on CPU. + * + * @author Kasia Swirydowicz + * @author Slaven Peles + */ class MatrixHandler { using vector_type = vector::Vector; public: MatrixHandler(); - MatrixHandler(LinAlgWorkspace* workspace); + MatrixHandler(LinAlgWorkspaceCpu* workspace); + MatrixHandler(LinAlgWorkspaceCUDA* workspace); ~MatrixHandler(); - int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer) + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace); /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped @@ -68,16 +61,18 @@ namespace ReSolve { const real_type* beta, std::string matrix_type, std::string memspace); - void Matrix1Norm(matrix::Sparse *A, real_type* norm); - bool getValuesChanged(); - void setValuesChanged(bool toWhat); + int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool toWhat, std::string memspace); private: - LinAlgWorkspace* workspace_{nullptr}; - bool new_matrix_{true}; ///< if the structure changed, you need a new handler. - bool values_changed_{true}; ///< needed for matvec + bool new_matrix_{true}; ///< if the structure changed, you need a new handler. 
+ + MemoryHandler mem_; ///< Device memory manager object + MatrixHandlerImpl* cpuImpl_{nullptr}; ///< Pointer to CPU implementation + MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation - MemoryHandler mem_; ///< Device memory manager object + bool isCpuEnabled_{false}; ///< true if CPU implementation is instantiated + bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated }; } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp new file mode 100644 index 00000000..2c434dcb --- /dev/null +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -0,0 +1,168 @@ +#include +#include + +#include +#include +#include +#include +#include +#include "MatrixHandlerCpu.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerCpu::MatrixHandlerCpu() + { + } + + MatrixHandlerCpu::~MatrixHandlerCpu() + { + } + + MatrixHandlerCpu::MatrixHandlerCpu(LinAlgWorkspaceCpu* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerCpu::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + /** + * @brief result := alpha * A * x + beta * result + */ + int MatrixHandlerCpu::matvec(matrix::Sparse* Ageneric, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + // int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = (matrix::Csr*) Ageneric; + index_type* ia = A->getRowData("cpu"); + index_type* ja = A->getColData("cpu"); + real_type* a = A->getValues("cpu"); + + real_type* x_data = vec_x->getData("cpu"); + real_type* result_data = vec_result->getData("cpu"); + real_type sum; + real_type y; + real_type t; + real_type c; + + //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference + for (int i = 0; i < A->getNumRows(); ++i) { + sum = 0.0; + c = 
0.0; + for (int j = ia[i]; j < ia[i+1]; ++j) { + y = ( a[j] * x_data[ja[j]]) - c; + t = sum + y; + c = (t - sum) - y; + sum = t; + // sum += ( a[j] * x_data[ja[j]]); + } + sum *= (*alpha); + result_data[i] = result_data[i]*(*beta) + sum; + } + vec_result->setDataUpdated("cpu"); + return 0; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format." << std::endl; + return 1; + } + } + + int MatrixHandlerCpu::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + /** + * @brief Convert CSC to CSR matrix on the host + * + * @authors Slaven Peles , Daniel Reynolds (SMU), and + * David Gardner and Carol Woodward (LLNL) + */ + int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + // int error_sum = 0; TODO: Collect error output! + assert(A_csc->getNnz() == A_csr->getNnz()); + assert(A_csc->getNumRows() == A_csr->getNumColumns()); + assert(A_csr->getNumRows() == A_csc->getNumColumns()); + + index_type nnz = A_csc->getNnz(); + index_type n = A_csc->getNumColumns(); + + index_type* rowIdxCsc = A_csc->getRowData("cpu"); + index_type* colPtrCsc = A_csc->getColData("cpu"); + real_type* valuesCsc = A_csc->getValues("cpu"); + + index_type* rowPtrCsr = A_csr->getRowData("cpu"); + index_type* colIdxCsr = A_csr->getColData("cpu"); + real_type* valuesCsr = A_csr->getValues("cpu"); + + // Set all CSR row pointers to zero + for (index_type i = 0; i <= n; ++i) { + rowPtrCsr[i] = 0; + } + + // Set all CSR values and column indices to zero + for (index_type i = 0; i < nnz; ++i) { + colIdxCsr[i] = 0; + valuesCsr[i] = 0.0; + } + + // Compute number of entries per row + for (index_type i = 0; i < nnz; ++i) { + rowPtrCsr[rowIdxCsc[i]]++; + } + + // Compute cumualtive sum of nnz per row + for (index_type row = 0, rowsum = 0; row < n; ++row) + { + // Store value in row pointer to temp + index_type temp = rowPtrCsr[row]; + + // Copy cumulative sum to the row pointer + rowPtrCsr[row] = rowsum; + + // 
Update row sum + rowsum += temp; + } + rowPtrCsr[n] = nnz; + + for (index_type col = 0; col < n; ++col) + { + // Compute positions of column indices and values in CSR matrix and store them there + // Overwrites CSR row pointers in the process + for (index_type jj = colPtrCsc[col]; jj < colPtrCsc[col+1]; jj++) + { + index_type row = rowIdxCsc[jj]; + index_type dest = rowPtrCsr[row]; + + colIdxCsr[dest] = col; + valuesCsr[dest] = valuesCsc[jj]; + + rowPtrCsr[row]++; + } + } + + // Restore CSR row pointer values + for (index_type row = 0, last = 0; row <= n; row++) + { + index_type temp = rowPtrCsr[row]; + rowPtrCsr[row] = last; + last = temp; + } + + return 0; + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp new file mode 100644 index 00000000..0b0afbd3 --- /dev/null +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -0,0 +1,57 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceCpu; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerCpu + * + * @brief CPU implementation of the matrix handler. 
+ */ + class MatrixHandlerCpu : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerCpu(); + MatrixHandlerCpu(LinAlgWorkspaceCpu* workspace); + virtual ~MatrixHandlerCpu(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool isValuesChanged); + + private: + LinAlgWorkspaceCpu* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp new file mode 100644 index 00000000..3405ba8d --- /dev/null +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -0,0 +1,173 @@ +#include + +#include +#include +#include +#include +#include +#include +#include "MatrixHandlerCuda.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerCuda::~MatrixHandlerCuda() + { + } + + MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspaceCUDA* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerCuda::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + int MatrixHandlerCuda::matvec(matrix::Sparse* Ageneric, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = dynamic_cast(Ageneric); + //result = alpha *A*x + beta * result + cusparseStatus_t status; + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX(); + cusparseCreateDnVec(&vecx, A->getNumRows(), 
vec_x->getData("cuda"), CUDA_R_64F); + + + cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY(); + cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F); + + cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor(); + + void* buffer_spmv = workspaceCUDA->getSpmvBuffer(); + cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle(); + if (values_changed_) { + status = cusparseCreateCsr(&matA, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + A->getRowData("cuda"), + A->getColData("cuda"), + A->getValues("cuda"), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + error_sum += status; + values_changed_ = false; + } + if (!workspaceCUDA->matvecSetup()) { + //setup first, allocate, etc. + size_t bufferSize = 0; + + status = cusparseSpMV_bufferSize(handle_cusparse, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &MINUSONE, + matA, + vecx, + &ONE, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &bufferSize); + error_sum += status; + mem_.deviceSynchronize(); + mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize); + workspaceCUDA->setSpmvMatrixDescriptor(matA); + workspaceCUDA->setSpmvBuffer(buffer_spmv); + + workspaceCUDA->matvecSetupDone(); + } + + status = cusparseSpMV(handle_cusparse, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha, + matA, + vecx, + beta, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer_spmv); + error_sum += status; + mem_.deviceSynchronize(); + if (status) + out::error() << "Matvec status: " << status + << "Last error code: " << mem_.getLastDeviceError() << std::endl; + vec_result->setDataUpdated("cuda"); + + cusparseDestroyDnVec(vecx); + cusparseDestroyDnVec(vecAx); + return error_sum; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format." 
<< std::endl; + return 1; + } + } + + int MatrixHandlerCuda::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + index_type error_sum = 0; + LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; + + A_csr->allocateMatrixData("cuda"); + index_type n = A_csc->getNumRows(); + index_type m = A_csc->getNumRows(); + index_type nnz = A_csc->getNnz(); + size_t bufferSize; + void* d_work; + cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &bufferSize); + error_sum += status; + mem_.allocateBufferOnDevice(&d_work, bufferSize); + status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + d_work); + error_sum += status; + return error_sum; + mem_.deleteOnDevice(d_work); + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp new file mode 100644 index 00000000..efd4c356 --- /dev/null +++ b/resolve/matrix/MatrixHandlerCuda.hpp @@ -0,0 +1,55 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceCUDA; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerCuda + * + * @brief CUDA implementation of the matrix handler. 
+ */ + class MatrixHandlerCuda : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerCuda(LinAlgWorkspaceCUDA* workspace); + virtual ~MatrixHandlerCuda(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool isValuesChanged); + + private: + LinAlgWorkspaceCUDA* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp new file mode 100644 index 00000000..2bef6b3d --- /dev/null +++ b/resolve/matrix/MatrixHandlerImpl.hpp @@ -0,0 +1,51 @@ +#pragma once +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } +} + + +namespace ReSolve { + /** + * @class MatrixHandlerImpl + * + * @brief Base class for different matrix handler implementations. 
+ */ + class MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerImpl() + {} + virtual ~MatrixHandlerImpl() + {} + + virtual int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) = 0; + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type) = 0; + virtual int Matrix1Norm(matrix::Sparse* A, real_type* norm) = 0; + + virtual void setValuesChanged(bool isValuesChanged) = 0; + }; + +} // namespace ReSolve + diff --git a/resolve/utilities/misc/IndexValuePair.hpp b/resolve/utilities/misc/IndexValuePair.hpp new file mode 100644 index 00000000..b09ccf97 --- /dev/null +++ b/resolve/utilities/misc/IndexValuePair.hpp @@ -0,0 +1,43 @@ +#pragma once + + +namespace ReSolve { + + /// @brief Helper class for COO matrix sorting + class IndexValuePair + { + public: + IndexValuePair() : idx_(0), value_(0.0) + {} + ~IndexValuePair() + {} + void setIdx (index_type new_idx) + { + idx_ = new_idx; + } + void setValue (real_type new_value) + { + value_ = new_value; + } + + index_type getIdx() + { + return idx_; + } + real_type getValue() + { + return value_; + } + + bool operator < (const IndexValuePair& str) const + { + return (idx_ < str.idx_); + } + + private: + index_type idx_; + real_type value_; + }; + +} // namespace ReSolve + diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 57feb5df..16d53010 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -1,32 +1,49 @@ +#[[ +@brief Build ReSolve (multi)vector module + +@author Slaven Peles + +]] + +# C++ code set(Vector_SRC Vector.cpp VectorHandler.cpp + VectorHandlerCpu.cpp ) - - -set(Vector_SRC_CUDA - cudaVectorKernels.cu +# C++ code that depends on CUDA SDK libraries +set(Vector_CUDASDK_SRC + VectorHandlerCuda.cpp ) +# Header files to be installed set(Vector_HEADER_INSTALL Vector.hpp VectorHandler.hpp + VectorKernels.hpp ) +# Add CUDA 
vector handler if CUDA support is enabled +if(RESOLVE_USE_CUDA) + set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC}) +endif() + +add_library(resolve_vector SHARED ${Vector_SRC}) +target_link_libraries(resolve_vector PRIVATE resolve_logger) + +# Link to ReSolve CUDA backend if CUDA is enabled if (RESOLVE_USE_CUDA) - set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA) - - # Build shared library ReSolve::vector - add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA}) - target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) -else() - # Build shared library ReSolve::vector - add_library(resolve_vector SHARED ${Vector_SRC}) - target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) endif() +# If no GPU is enabled link to dummy device backend +if(NOT RESOLVE_USE_GPU) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) +endif(NOT RESOLVE_USE_GPU) + + target_include_directories(resolve_vector INTERFACE $ $ diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp index 032cc9b1..8c89cb2f 100644 --- a/resolve/vector/VectorHandler.cpp +++ b/resolve/vector/VectorHandler.cpp @@ -1,11 +1,17 @@ #include +#include #include -#include -#include #include +#include +#include +#include #include "VectorHandler.hpp" +#ifdef RESOLVE_USE_CUDA +#include +#endif + namespace ReSolve { using out = io::Logger; @@ -14,17 +20,36 @@ namespace ReSolve { */ VectorHandler::VectorHandler() { + cpuImpl_ = new VectorHandlerCpu(); + isCpuEnabled_ = true; + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceCpu* new_workspace) + { + cpuImpl_ = new VectorHandlerCpu(new_workspace); + isCpuEnabled_ = true; } +#ifdef RESOLVE_USE_CUDA /** * @brief constructor * * @param new_workspace - workspace to be set */ - VectorHandler:: VectorHandler(LinAlgWorkspace* new_workspace) 
+ VectorHandler::VectorHandler(LinAlgWorkspaceCUDA* new_workspace) { - workspace_ = new_workspace; + cudaImpl_ = new VectorHandlerCuda(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isCudaEnabled_ = true; + isCpuEnabled_ = true; } +#endif /** * @brief destructor @@ -46,28 +71,11 @@ namespace ReSolve { real_type VectorHandler::dot(vector::Vector* x, vector::Vector* y, std::string memspace) { - if (memspace == "cuda" ){ - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - double nrm = 0.0; - cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); - if (st!=0) {printf("dot product crashed with code %d \n", st);} - return nrm; + if (memspace == "cuda" ) { + return cudaImpl_->dot(x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - real_type sum = 0.0; - real_type c = 0.0; - real_type t, y; - for (int i = 0; i < x->getSize(); ++i){ - y = (x_data[i] * y_data[i]) - c; - t = sum + y; - c = (t - sum) - y; - sum = t; - // sum += (x_data[i] * y_data[i]); - } - return sum; + return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; return NAN; @@ -85,20 +93,11 @@ namespace ReSolve { */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); - if (st!=0) { - ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; - } + if (memspace == "cuda" ) { + cudaImpl_->scal(alpha, x); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - - for (int i = 0; i < x->getSize(); ++i){ - x_data[i] *= 
(*alpha); - } + cpuImpl_->scal(alpha, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -114,26 +113,14 @@ namespace ReSolve { * @param[in] memspace String containg memspace (cpu or cuda) * */ - void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ) + void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace) { //AXPY: y = alpha * x + y - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDaxpy(handle_cublas, - x->getSize(), - alpha, - x->getData("cuda"), - 1, - y->getData("cuda"), - 1); + if (memspace == "cuda" ) { + cudaImpl_->axpy(alpha, x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - for (int i = 0; i < x->getSize(); ++i){ - y_data[i] = (*alpha) * x_data[i] + y_data[i]; - } + cpuImpl_->axpy(alpha, x, y); } else { out::error() <<"Not implemented (yet)" << std::endl; } @@ -160,38 +147,9 @@ namespace ReSolve { void VectorHandler::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace) { if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - if (transpose == "T") { - - cublasDgemv(handle_cublas, - CUBLAS_OP_T, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - - } else { - cublasDgemv(handle_cublas, - CUBLAS_OP_N, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - } - + cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "cpu") { + cpuImpl_->gemv(transpose, n, 
k, alpha, beta, V, y, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -213,26 +171,9 @@ namespace ReSolve { { using namespace constants; if (memspace == "cuda") { - if (k < 200) { - mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_N, - CUBLAS_OP_N, - size, // m - 1, // n - k + 1, // k - &MINUSONE, // alpha - x->getData("cuda"), // A - size, // lda - alpha->getData("cuda"), // B - k + 1, // ldb - &ONE, - y->getData("cuda"), // c - size); // ldc - } + cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "cpu") { + cpuImpl_->massAxpy(size, alpha, k, x, y); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -254,29 +195,10 @@ namespace ReSolve { */ void VectorHandler::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res, std::string memspace) { - using namespace constants; - if (memspace == "cuda") { - if (k < 200) { - mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_T, - CUBLAS_OP_N, - k + 1, //m - 2, //n - size, //k - &ONE, //alpha - V->getData("cuda"), //A - size, //lda - x->getData("cuda"), //B - size, //ldb - &ZERO, - res->getData("cuda"), //c - k + 1); //ldc - } + cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "cpu") { + cpuImpl_->massDot2Vec(size, V, k, x, res); } else { out::error() << "Not implemented (yet)" << std::endl; } diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index bf2d37d7..c17d4688 100644 --- 
a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -7,7 +7,9 @@ namespace ReSolve { class Vector; } - class LinAlgWorkspace; + class VectorHandlerImpl; + class LinAlgWorkspaceCpu; + class LinAlgWorkspaceCUDA; } @@ -15,14 +17,15 @@ namespace ReSolve { //namespace vector { class VectorHandler { public: VectorHandler(); - VectorHandler(LinAlgWorkspace* new_workspace); + VectorHandler(LinAlgWorkspaceCpu* new_workspace); + VectorHandler(LinAlgWorkspaceCUDA* new_workspace); ~VectorHandler(); //y = alpha x + y - void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ); + void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace); //dot: x \cdot y - real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace ); + real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace); //scal = alpha * x void scal(const real_type* alpha, vector::Vector* x, std::string memspace); @@ -40,9 +43,21 @@ namespace ReSolve { //namespace vector { * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
*/ - void gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace); + void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x, + std::string memspace); private: - LinAlgWorkspace* workspace_; + VectorHandlerImpl* cpuImpl_{nullptr}; + VectorHandlerImpl* cudaImpl_{nullptr}; + + bool isCpuEnabled_{false}; + bool isCudaEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp new file mode 100644 index 00000000..f5cc463d --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.cpp @@ -0,0 +1,166 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCpu.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCpu::VectorHandlerCpu() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspaceCpu* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCpu::~VectorHandlerCpu() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y) + { + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + real_type sum = 0.0; + real_type c = 0.0; + // real_type t, y; + for (int i = 0; i < x->getSize(); ++i) { + real_type y = (x_data[i] * y_data[i]) - c; + real_type t = sum + y; + c 
= (t - sum) - y; + sum = t; + // sum += (x_data[i] * y_data[i]); + } + return sum; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x) + { + real_type* x_data = x->getData("cpu"); + + for (int i = 0; i < x->getSize(); ++i){ + x_data[i] *= (*alpha); + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + for (int i = 0; i < x->getSize(); ++i) { + y_data[i] = (*alpha) * x_data[i] + y_data[i]; + } + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. 
+ * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCpu::gemv(std::string /* transpose */, + index_type /* n */, + index_type /* k */, + const real_type* /* alpha */, + const real_type* /* beta */, + vector::Vector* /* V */, + vector::Vector* /* y */, + vector::Vector* /* x */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCpu::massAxpy(index_type /* size */, vector::Vector* /* alpha */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* y */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector (a dense multivector consisting of k vectors size n) + * and x is k x 2 dense multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res 
Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCpu::massDot2Vec(index_type /* size */, vector::Vector* /* V */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* res */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCpu.hpp b/resolve/vector/VectorHandlerCpu.hpp new file mode 100644 index 00000000..3f2f6133 --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceCpu; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCpu : public VectorHandlerImpl + { + public: + VectorHandlerCpu(); + VectorHandlerCpu(LinAlgWorkspaceCpu* workspace); + virtual ~VectorHandlerCpu(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. 
+ * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. + */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceCpu* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp new file mode 100644 index 00000000..3c887e85 --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCuda.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCuda::VectorHandlerCuda() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCuda:: VectorHandlerCuda(LinAlgWorkspaceCUDA* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCuda::~VectorHandlerCuda() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCuda::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + double nrm = 0.0; + cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The 
constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDaxpy(handle_cublas, + x->getSize(), + alpha, + x->getData("cuda"), + 1, + y->getData("cuda"), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. 
+ * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCuda::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + if (transpose == "T") { + + cublasDgemv(handle_cublas, + CUBLAS_OP_T, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + + } else { + cublasDgemv(handle_cublas, + CUBLAS_OP_N, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCuda::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + 
cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + size, // m + 1, // n + k + 1, // k + &MINUSONE, // alpha + x->getData("cuda"), // A + size, // lda + alpha->getData("cuda"), // B + k + 1, // ldb + &ONE, + y->getData("cuda"), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCuda::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_T, + CUBLAS_OP_N, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData("cuda"), //A + size, //lda + x->getData("cuda"), //B + size, //ldb + &ZERO, + res->getData("cuda"), //c + k + 1); //ldc + } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCuda.hpp b/resolve/vector/VectorHandlerCuda.hpp new file mode 100644 index 00000000..0ee2752d --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve 
+{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceCUDA; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCuda : public VectorHandlerImpl + { + public: + VectorHandlerCuda(); + VectorHandlerCuda(LinAlgWorkspaceCUDA* workspace); + virtual ~VectorHandlerCuda(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceCUDA* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerImpl.hpp b/resolve/vector/VectorHandlerImpl.hpp new file mode 100644 index 00000000..229a7461 --- /dev/null +++ b/resolve/vector/VectorHandlerImpl.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class VectorHandlerCpu; + class VectorHandlerCuda; +} + + +namespace ReSolve +{ + class VectorHandlerImpl + { + public: + VectorHandlerImpl() + {} + virtual ~VectorHandlerImpl() + {} + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y ) = 0; + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y ) = 0; + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x) = 0; + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) = 0; + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) = 0; + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) = 0; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt new file mode 100644 index 00000000..673fac4b --- /dev/null +++ b/resolve/workspace/CMakeLists.txt @@ -0,0 +1,43 @@ +#[[ + +@brief Build ReSolve workspace module + +@author Slaven Peles + +]] + +# C++ code +set(ReSolve_Workspace_SRC + LinAlgWorkspaceCpu.cpp +) + +# C++ code that depends on CUDA SDK libraries +set(ReSolve_Workspace_CUDASDK_SRC + LinAlgWorkspaceCUDA.cpp +) + +set(ReSolve_Workspace_HEADER_INSTALL + LinAlgWorkspace.hpp + LinAlgWorkspaceCpu.hpp + LinAlgWorkspaceCUDA.hpp +) + +# If cuda is enabled, add CUDA SDK workspace files +if(RESOLVE_USE_CUDA) + set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC}) +endif() + +add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC}) + +# If CUDA is enabled, link to ReSolve CUDA backend +if(RESOLVE_USE_CUDA) + target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda) +endif(RESOLVE_USE_CUDA) + +target_include_directories(resolve_workspace INTERFACE + $ + $ +) + +# install include headers +install(FILES ${ReSolve_Workspace_HEADER_INSTALL} DESTINATION include/resolve/workspace) diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp new file mode 100644 index 00000000..6da58fda --- /dev/null +++ b/resolve/workspace/LinAlgWorkspace.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include + +#ifdef RESOLVE_USE_CUDA +#include +#endif + diff --git a/resolve/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspaceCUDA.cpp similarity index 84% rename from resolve/LinAlgWorkspace.cpp rename to resolve/workspace/LinAlgWorkspaceCUDA.cpp index 9dee78c5..200ac437 100644 --- a/resolve/LinAlgWorkspace.cpp +++ 
b/resolve/workspace/LinAlgWorkspaceCUDA.cpp @@ -1,15 +1,7 @@ -#include "LinAlgWorkspace.hpp" +#include namespace ReSolve { - LinAlgWorkspace::LinAlgWorkspace() - { - } - - LinAlgWorkspace::~LinAlgWorkspace() - { - } - LinAlgWorkspaceCUDA::LinAlgWorkspaceCUDA() { handle_cusolversp_ = nullptr; @@ -29,8 +21,6 @@ namespace ReSolve cusolverSpDestroy(handle_cusolversp_); cublasDestroy(handle_cublas_); cusparseDestroySpMat(mat_A_); - // cusparseDestroyDnVec(vec_x_); - // cusparseDestroyDnVec(vec_y_); } void* LinAlgWorkspaceCUDA::getSpmvBuffer() @@ -93,21 +83,23 @@ namespace ReSolve mat_A_ = mat; } - cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecX() + cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecX() { return vec_x_; } - cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecY() + cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecY() { return vec_y_; } - bool LinAlgWorkspaceCUDA::matvecSetup(){ + bool LinAlgWorkspaceCUDA::matvecSetup() + { return matvec_setup_done_; } - void LinAlgWorkspaceCUDA::matvecSetupDone() { + void LinAlgWorkspaceCUDA::matvecSetupDone() + { matvec_setup_done_ = true; } @@ -117,4 +109,4 @@ namespace ReSolve cublasCreate(&handle_cublas_); cusolverSpCreate(&handle_cusolversp_); } -} +} // namespace ReSolve diff --git a/resolve/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspaceCUDA.hpp similarity index 71% rename from resolve/LinAlgWorkspace.hpp rename to resolve/workspace/LinAlgWorkspaceCUDA.hpp index e5c79580..5076563a 100644 --- a/resolve/LinAlgWorkspace.hpp +++ b/resolve/workspace/LinAlgWorkspaceCUDA.hpp @@ -8,17 +8,7 @@ namespace ReSolve { - class LinAlgWorkspace - { - public: - LinAlgWorkspace(); - ~LinAlgWorkspace(); - protected: - MemoryHandler mem_; - }; - - - class LinAlgWorkspaceCUDA : public LinAlgWorkspace + class LinAlgWorkspaceCUDA { public: LinAlgWorkspaceCUDA(); @@ -65,20 +55,8 @@ namespace ReSolve void* buffer_1norm_; bool matvec_setup_done_; //check if setup is done for matvec i.e. if buffer is allocated, csr structure is set etc. 
+ + MemoryHandler mem_; }; - /// @brief Workspace factory - /// @param[in] memspace memory space ID - /// @return pointer to the linear algebra workspace - inline LinAlgWorkspace* createLinAlgWorkspace(std::string memspace) - { - if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); - workspace->initializeHandles(); - return workspace; - } - // If not CUDA, return default - return (new LinAlgWorkspace()); - } - -} +} // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp new file mode 100644 index 00000000..3ed9aa43 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp @@ -0,0 +1,16 @@ +#include "LinAlgWorkspaceCpu.hpp" + +namespace ReSolve +{ + LinAlgWorkspaceCpu::LinAlgWorkspaceCpu() + { + } + + LinAlgWorkspaceCpu::~LinAlgWorkspaceCpu() + { + } + + void LinAlgWorkspaceCpu::initializeHandles() + { + } +} diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp new file mode 100644 index 00000000..00e5f38e --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp @@ -0,0 +1,18 @@ +#pragma once + + +#include + +namespace ReSolve +{ + class LinAlgWorkspaceCpu + { + public: + LinAlgWorkspaceCpu(); + ~LinAlgWorkspaceCpu(); + void initializeHandles(); + private: + MemoryHandler mem_; + }; + +} diff --git a/runResolve b/runResolve index 0a8e5dbf..2d5c67ca 100755 --- a/runResolve +++ b/runResolve @@ -13,9 +13,9 @@ foo=" ${foo} ${i} ${i}" done -matfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_" +matfile="$HOME/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_" -rhsfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_" +rhsfile="$HOME/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_" -./resolve/klu_rf_fgmres.exe $matfile $rhsfile 10 $foo +./$1 $matfile $rhsfile 10 $foo diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae24f9db..19b799e4 100644 --- 
a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,9 +6,6 @@ ]] -if(RESOLVE_USE_CUDA) - add_subdirectory(functionality) -endif(RESOLVE_USE_CUDA) - +add_subdirectory(functionality) add_subdirectory(unit) diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index e2a6b4b6..a6652c26 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -10,26 +10,31 @@ add_executable(klu_klu_test.exe testKLU.cpp) target_link_libraries(klu_klu_test.exe PRIVATE ReSolve) -# Build KLU+Rf test -add_executable(klu_rf_test.exe testKLU_Rf.cpp) -target_link_libraries(klu_rf_test.exe PRIVATE ReSolve) +if(RESOLVE_USE_CUDA) + + # Build KLU+Rf test + add_executable(klu_rf_test.exe testKLU_Rf.cpp) + target_link_libraries(klu_rf_test.exe PRIVATE ReSolve) -# Build KLU+Rf+fgmres test -add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp) -target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve) + # Build KLU+Rf+fgmres test + add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp) + target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve) + # Build KLU+GLU test + add_executable(klu_glu_test.exe testKLU_GLU.cpp) + target_link_libraries(klu_glu_test.exe PRIVATE ReSolve) - -# Build KLU+GLU test -add_executable(klu_glu_test.exe testKLU_GLU.cpp) -target_link_libraries(klu_glu_test.exe PRIVATE ReSolve) +endif(RESOLVE_USE_CUDA) # Install tests -set(installable_tests - klu_klu_test.exe - klu_rf_test.exe - klu_rf_fgmres_test.exe - klu_glu_test.exe) +set(installable_tests klu_klu_test.exe) + +if(RESOLVE_USE_CUDA) + set(installable_tests ${installable_tests} + klu_rf_test.exe + klu_rf_fgmres_test.exe + klu_glu_test.exe) +endif(RESOLVE_USE_CUDA) install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/functionality) @@ -40,6 +45,8 @@ install(DIRECTORY data DESTINATION bin/resolve/tests/functionality) set(test_data_dir ${CMAKE_SOURCE_DIR}/tests/functionality/) add_test(NAME klu_klu_test COMMAND $ 
"${test_data_dir}") -add_test(NAME klu_rf_test COMMAND $ "${test_data_dir}") -add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") -add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") +if(RESOLVE_USE_CUDA) + add_test(NAME klu_rf_test COMMAND $ "${test_data_dir}") + add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") + add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") +endif(RESOLVE_USE_CUDA) diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp index d5ff9d6d..f3c1da57 100644 --- a/tests/functionality/testKLU.cpp +++ b/tests/functionality/testKLU.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether KLU works correctly. using namespace ReSolve::constants; @@ -26,10 +27,9 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; KLU->setupParameters(1, 0.1, false); @@ -101,32 +101,32 @@ int main(int argc, char *argv[]) } vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + vec_diff->update(x_data, "cpu", "cpu"); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); - status = 
matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu")); + matrix_handler->setValuesChanged(true, "cpu"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); error_sum += status; - real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //for testing only - control - real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cuda")); - real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cpu")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm - real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); - status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; - real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //evaluate the residual ON THE CPU using COMPUTED solution vec_r->update(rhs, "cpu", "cpu"); @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; - real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); 
std::cout<<"Results (first matrix): "<coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, "cpu", "cpu"); // and solve it too status = KLU->refactorize(); @@ -174,27 +174,27 @@ int main(int argc, char *argv[]) status = KLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + vec_r->update(rhs, "cpu", "cpu"); + matrix_handler->setValuesChanged(true, "cpu"); - status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); error_sum += status; - real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //for testing only - control - real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); + real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); - vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); + vec_diff->update(x_data, "cpu", "cpu"); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm - real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); + real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); - status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cpu"); error_sum += status; - real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); std::cout<<"Results (second matrix): "< #include #include -#include +#include 
//author: KS //functionality test to check whether cuSolverGLU works correctly. @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) error_sum += status; vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp index 21aba8b0..729968f5 100644 --- a/tests/functionality/testKLU_Rf.cpp +++ b/tests/functionality/testKLU_Rf.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether cuSolverRf works correctly. 
@@ -30,7 +30,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -194,8 +194,8 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + vec_r->update(rhs, "cpu", "cuda"); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index c63a3f99..a474e406 100644 --- a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether cuSolverRf/FGMRES works correctly. 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); //evaluate the residual ||b-Ax|| status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -221,7 +221,7 @@ int main(int argc, char *argv[]) error_sum += status; vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); //evaluate final residual status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -258,5 +258,18 @@ int main(int argc, char *argv[]) } else { std::cout<<"Test 4 (KLU with cuSolverRf refactorization + IR) FAILED, error sum: "< #include #include -#include +#include #include #include #include @@ -43,8 +43,7 @@ class MatrixHandlerTests : TestBase { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::MatrixHandler handler(workspace); + ReSolve::MatrixHandler* handler = createMatrixHandler(); matrix::Csr* A = createCsrMatrix(N, memspace_); vector::Vector x(N); @@ -57,12 +56,12 @@ class MatrixHandlerTests : TestBase real_type alpha = 2.0/30.0; real_type beta = 2.0; - handler.setValuesChanged(true); - handler.matvec(A, &x, &y, &alpha, &beta, "csr", memspace_); + handler->setValuesChanged(true, memspace_); + handler->matvec(A, &x, &y, &alpha, &beta, 
"csr", memspace_); status *= verifyAnswer(y, 4.0, memspace_); - delete workspace; + delete handler; delete A; return status.report(__func__); @@ -71,6 +70,23 @@ class MatrixHandlerTests : TestBase private: std::string memspace_{"cpu"}; + ReSolve::MatrixHandler* createMatrixHandler() + { + if (memspace_ == "cpu") { + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new MatrixHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new MatrixHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + bool verifyAnswer(vector::Vector& x, real_type answer, std::string memspace) { bool status = true; diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index d18f0965..6eee90d5 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -20,6 +20,7 @@ int main(int, char**) std::cout << "\n"; } +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::MatrixHandlerTests test("cuda"); @@ -30,6 +31,7 @@ int main(int, char**) std::cout << "\n"; } +#endif return result.summary(); } diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp index 6f0d305b..9981ea48 100644 --- a/tests/unit/vector/GramSchmidtTests.hpp +++ b/tests/unit/vector/GramSchmidtTests.hpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace ReSolve { namespace tests { @@ -66,8 +66,7 @@ namespace ReSolve { break; } - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler* handler = new ReSolve::VectorHandler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, 3); // we will be 
using a space of 3 vectors real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2 @@ -114,7 +113,7 @@ namespace ReSolve { status *= verifyAnswer(V, 3, handler, memspace_); - delete workspace; + delete handler; delete [] H; delete V; delete GS; @@ -125,6 +124,23 @@ namespace ReSolve { private: std::string memspace_{"cuda"}; + ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { // TODO: Fix memory leak here + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new VectorHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // x is a multivector containing K vectors bool verifyAnswer(vector::Vector* x, index_type K, ReSolve::VectorHandler* handler, std::string memspace) { diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index 0737385d..d2f8c73c 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include namespace ReSolve { namespace tests { @@ -38,8 +38,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -52,10 +51,10 @@ namespace ReSolve { real_type alpha = 0.5; //the result is a vector with y[i] = 2.5; - handler.axpy(&alpha, x, y, memspace_); + handler->axpy(&alpha, x, y, memspace_); status *= verifyAnswer(y, 2.5, memspace_); - delete workspace; + delete handler; delete x; delete y; @@ -66,8 +65,7 @@
namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -79,7 +77,7 @@ namespace ReSolve { y->setToConst(4.0, memspace_); real_type ans; //the result is N - ans = handler.dot(x, y, memspace_); + ans = handler->dot(x, y, memspace_); bool st = true;; if (ans != (real_type) N) { @@ -88,7 +86,7 @@ namespace ReSolve { } status *= st; - delete workspace; + delete handler; delete x; delete y; @@ -99,8 +97,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); @@ -111,10 +108,10 @@ namespace ReSolve { real_type alpha = 3.5; //the answer is x[i] = 4.375; - handler.scal(&alpha, x, memspace_); + handler->scal(&alpha, x, memspace_); status *= verifyAnswer(x, 4.375, memspace_); - delete workspace; + delete handler; delete x; return status.report(__func__); @@ -124,8 +121,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N); @@ -148,11 +144,11 @@ namespace ReSolve { index_type r = K % 2; real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5)); - handler.massAxpy(N, alpha, K, x, y, memspace_); + handler->massAxpy(N, alpha, K, x, y, memspace_); status *= verifyAnswer(y, 2.0 - res, memspace_); - delete workspace; + delete handler; delete x; delete y; delete alpha; @@ -164,8 +160,7 @@ namespace ReSolve { { TestStatus 
status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N, 2); @@ -176,11 +171,11 @@ namespace ReSolve { x->setToConst(1.0, memspace_); y->setToConst(-1.0, memspace_); - handler.massDot2Vec(N, x, K, y, res, memspace_); + handler->massDot2Vec(N, x, K, y, res, memspace_); status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_); - delete workspace; + delete handler; delete x; delete y; delete res; @@ -190,8 +185,7 @@ namespace ReSolve { TestOutcome gemv(index_type N, index_type K) { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, K); // for the test with NO TRANSPOSE vector::Vector* yN = new vector::Vector(K); @@ -214,9 +208,9 @@ namespace ReSolve { real_type alpha = -1.0; real_type beta = 1.0; - handler.gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); + handler->gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); status *= verifyAnswer(xN, (real_type) (K) + 0.5, memspace_); - handler.gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); + handler->gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); status *= verifyAnswer(xT, (real_type) (N) + 0.5, memspace_); return status.report(__func__); @@ -225,7 +219,24 @@ namespace ReSolve { private: std::string memspace_{"cpu"}; - // we can verify through norm but that would defeat the purpose of testing vector handler... 
+ ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new VectorHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // we can verify through norm but that would defeat the purpose of testing vector handler ... bool verifyAnswer(vector::Vector* x, real_type answer, std::string memspace) { bool status = true; diff --git a/tests/unit/vector/runGramSchmidtTests.cpp b/tests/unit/vector/runGramSchmidtTests.cpp index 2b5a3a17..e118eb6d 100644 --- a/tests/unit/vector/runGramSchmidtTests.cpp +++ b/tests/unit/vector/runGramSchmidtTests.cpp @@ -7,6 +7,7 @@ int main(int, char**) { ReSolve::tests::TestingResults result; +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::GramSchmidtTests test("cuda"); @@ -18,6 +19,7 @@ int main(int, char**) result += test.orthogonalize(5000, ReSolve::GramSchmidt::mgs_pm); std::cout << "\n"; } +#endif return result.summary(); } diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp index ae40f5d7..77e99471 100644 --- a/tests/unit/vector/runVectorHandlerTests.cpp +++ b/tests/unit/vector/runVectorHandlerTests.cpp @@ -19,6 +19,7 @@ int main(int, char**) std::cout << "\n"; } +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::VectorHandlerTests test("cuda"); @@ -34,6 +35,7 @@ int main(int, char**) std::cout << "\n"; } +#endif return result.summary(); }