From d806abf828f34246373ce34665cb9e298ae2b40e Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Sun, 15 Oct 2023 17:27:49 -0400
Subject: [PATCH 01/15] Don't build CUDA examples and tests when CUDA is
 disabled.

---
 CMakeLists.txt                                |  6 +-
 examples/CMakeLists.txt                       | 41 +++++++------
 examples/r_KLU_KLU.cpp                        | 23 ++++----
 examples/r_KLU_KLU_standalone.cpp             | 13 ++---
 resolve/cuda/CMakeLists.txt                   |  2 +
 resolve/{vector => cuda}/cudaVectorKernels.cu |  2 +-
 resolve/{vector => cuda}/cudaVectorKernels.h  |  0
 resolve/matrix/CMakeLists.txt                 |  8 +++
 resolve/vector/CMakeLists.txt                 | 22 +++----
 tests/CMakeLists.txt                          |  5 +-
 tests/functionality/CMakeLists.txt            | 43 ++++++++------
 tests/functionality/testKLU.cpp               | 57 +++++++++----------
 12 files changed, 122 insertions(+), 100 deletions(-)
 rename resolve/{vector => cuda}/cudaVectorKernels.cu (92%)
 rename resolve/{vector => cuda}/cudaVectorKernels.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5715e6e2..635fa390 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,10 +130,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake"
         "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfigVersion.cmake"
         DESTINATION share/resolve/cmake)
     
-# Add examples (for now only CUDA examples are available)
-if(RESOLVE_USE_CUDA)
-  add_subdirectory(examples)
-endif(RESOLVE_USE_CUDA)
+# Add usage examples
+add_subdirectory(examples)
 
 # Add tests
 add_subdirectory(tests)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 73dce68e..6250400b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -14,30 +14,39 @@ target_link_libraries(klu_klu.exe PRIVATE ReSolve)
 add_executable(klu_klu_standalone.exe r_KLU_KLU_standalone.cpp)
 target_link_libraries(klu_klu_standalone.exe PRIVATE ReSolve)
 
-# Build example with KLU factorization and GLU refactorization
-add_executable(klu_glu.exe r_KLU_GLU.cpp)
-target_link_libraries(klu_glu.exe PRIVATE ReSolve)
+if(RESOLVE_USE_CUDA)
+        
+  # Build example with KLU factorization and GLU refactorization
+  add_executable(klu_glu.exe r_KLU_GLU.cpp)
+  target_link_libraries(klu_glu.exe PRIVATE ReSolve)
 
-# Build example with KLU factorization and Rf refactorization
-add_executable(klu_rf.exe r_KLU_rf.cpp)
-target_link_libraries(klu_rf.exe PRIVATE ReSolve)
+  # Build example with KLU factorization and Rf refactorization
+  add_executable(klu_rf.exe r_KLU_rf.cpp)
+  target_link_libraries(klu_rf.exe PRIVATE ReSolve)
 
-# Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement
-add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp)
-target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve)
+  # Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement
+  add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp)
+  target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve)
 
-# Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED
-add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp)
-target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve)
+  # Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED
+  add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp)
+  target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve)
 
-# Build example where matrix data is updated 
-add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp)
-target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve)
+  # Build example where matrix data is updated 
+  add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp)
+  target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve)
+
+endif(RESOLVE_USE_CUDA)
 
 
 
 # Install all examples in bin directory
-set(installable_executables klu_klu.exe klu_klu_standalone.exe klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe)
+set(installable_executables klu_klu.exe klu_klu_standalone.exe)
+
+if(RESOLVE_USE_CUDA)
+  set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe)        
+endif(RESOLVE_USE_CUDA)
+
 install(TARGETS ${installable_executables} 
         RUNTIME DESTINATION bin)
 
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 63aad4b5..439c8f41 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -35,10 +35,9 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
-  workspace_CUDA->initializeHandles();
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
+  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
   real_type* rhs;
   real_type* x;
 
@@ -96,7 +95,11 @@ int main(int argc, char *argv[])
       ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
       ReSolve::io::readAndUpdateRhs(rhs_file, &rhs);
     }
-    std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
+    std::cout << "Finished reading the matrix and rhs, size: " << A->getNumRows()
+              << " x "           << A->getNumColumns()
+              << ", nnz: "       << A->getNnz()
+              << ", symmetric? " << A->symmetric()
+              << ", Expanded? "  << A->expanded() << std::endl;
     mat_file.close();
     rhs_file.close();
 
@@ -106,8 +109,8 @@ int main(int argc, char *argv[])
       vec_rhs->update(rhs, "cpu", "cpu");
       vec_rhs->setDataUpdated("cpu");
     } else { 
-      matrix_handler->coo2csr(A_coo, A, "cuda");
-      vec_rhs->update(rhs, "cpu", "cuda");
+      matrix_handler->coo2csr(A_coo, A, "cpu");
+      vec_rhs->update(rhs, "cpu", "cpu");
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -129,15 +132,15 @@ int main(int argc, char *argv[])
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
     }
-    vec_r->update(rhs, "cpu", "cuda");
+    // vec_r->update(rhs, "cpu", "cuda");
 
     matrix_handler->setValuesChanged(true);
 
-    matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
+    matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
     real_type* test = vec_r->getData("cpu");
     (void) test; // TODO: Do we need `test` variable in this example?
 
-    printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
+    printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cpu")));
 
 
   }
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index fd84efe0..3323b68b 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -31,10 +31,9 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
-  workspace_CUDA->initializeHandles();
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
+  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
   real_type* rhs;
   real_type* x;
 
@@ -95,15 +94,15 @@ int main(int argc, char *argv[])
   std::cout << "KLU factorization status: " << status << std::endl;
   status = KLU->solve(vec_rhs, vec_x);
   std::cout << "KLU solve status: " << status << std::endl;      
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, "cpu", "cpu");
 
   matrix_handler->setValuesChanged(true);
 
-  matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
+  matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   real_type* test = vec_r->getData("cpu");
   (void) test; // TODO: Do we need `test` variable in this example?
 
-  printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
+  printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cpu")));
 
 
 
diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index d3ead313..04a9c2ff 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -8,11 +8,13 @@
 
 set(ReSolve_CUDA_SRC
     cudaKernels.cu
+    cudaVectorKernels.cu
     MemoryUtils.cu
 )
 
 set(ReSolve_CUDA_HEADER_INSTALL
     cudaKernels.h
+    cudaVectorKernels.h
     CudaMemory.hpp
     cuda_check_errors.hpp
 )
diff --git a/resolve/vector/cudaVectorKernels.cu b/resolve/cuda/cudaVectorKernels.cu
similarity index 92%
rename from resolve/vector/cudaVectorKernels.cu
rename to resolve/cuda/cudaVectorKernels.cu
index ea54a880..a1c53198 100644
--- a/resolve/vector/cudaVectorKernels.cu
+++ b/resolve/cuda/cudaVectorKernels.cu
@@ -1,5 +1,5 @@
 #include "cudaVectorKernels.h"
-#include "VectorKernels.hpp"
+#include <resolve/vector/VectorKernels.hpp>
 
 
 namespace ReSolve { namespace vector {
diff --git a/resolve/vector/cudaVectorKernels.h b/resolve/cuda/cudaVectorKernels.h
similarity index 100%
rename from resolve/vector/cudaVectorKernels.h
rename to resolve/cuda/cudaVectorKernels.h
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 89ac925d..289f805a 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -1,3 +1,11 @@
+#[[
+
+@brief Build ReSolve matrix module
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
 
 set(Matrix_SRC 
     io.cpp
diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index 57feb5df..8fa9cd59 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -1,3 +1,11 @@
+#[[
+
+@brief Build ReSolve (multi)vector module
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
 
 set(Vector_SRC 
     Vector.cpp
@@ -5,25 +13,17 @@ set(Vector_SRC
 )
 
 
-
-set(Vector_SRC_CUDA
-    cudaVectorKernels.cu
-)
-
 set(Vector_HEADER_INSTALL
     Vector.hpp
     VectorHandler.hpp
+    VectorKernels.hpp
 )
 
-if (RESOLVE_USE_CUDA)
-    set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA)
+add_library(resolve_vector SHARED ${Vector_SRC})
 
-    # Build shared library ReSolve::vector
-    add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA})
+if (RESOLVE_USE_CUDA)
     target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
 else()
-    # Build shared library ReSolve::vector
-    add_library(resolve_vector SHARED ${Vector_SRC})
     target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu)
 endif()
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ae24f9db..19b799e4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,9 +6,6 @@
 
 ]]
 
-if(RESOLVE_USE_CUDA)
-  add_subdirectory(functionality) 
-endif(RESOLVE_USE_CUDA)
-
+add_subdirectory(functionality) 
 add_subdirectory(unit)
 
diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt
index e2a6b4b6..a6652c26 100644
--- a/tests/functionality/CMakeLists.txt
+++ b/tests/functionality/CMakeLists.txt
@@ -10,26 +10,31 @@
 add_executable(klu_klu_test.exe testKLU.cpp)
 target_link_libraries(klu_klu_test.exe PRIVATE ReSolve)
 
-# Build KLU+Rf test
-add_executable(klu_rf_test.exe testKLU_Rf.cpp)
-target_link_libraries(klu_rf_test.exe PRIVATE ReSolve)
+if(RESOLVE_USE_CUDA)
+        
+  # Build KLU+Rf test
+  add_executable(klu_rf_test.exe testKLU_Rf.cpp)
+  target_link_libraries(klu_rf_test.exe PRIVATE ReSolve)
 
-# Build KLU+Rf+fgmres test
-add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp)
-target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve)
+  # Build KLU+Rf+fgmres test
+  add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp)
+  target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve)
 
+  # Build KLU+GLU test
+  add_executable(klu_glu_test.exe testKLU_GLU.cpp)
+  target_link_libraries(klu_glu_test.exe PRIVATE ReSolve)
 
-
-# Build KLU+GLU test
-add_executable(klu_glu_test.exe testKLU_GLU.cpp)
-target_link_libraries(klu_glu_test.exe PRIVATE ReSolve)
+endif(RESOLVE_USE_CUDA)
 
 # Install tests
-set(installable_tests 
-        klu_klu_test.exe
-        klu_rf_test.exe
-        klu_rf_fgmres_test.exe
-        klu_glu_test.exe)
+set(installable_tests klu_klu_test.exe)
+
+if(RESOLVE_USE_CUDA)
+  set(installable_tests ${installable_tests}
+                        klu_rf_test.exe
+                        klu_rf_fgmres_test.exe
+                        klu_glu_test.exe)
+endif(RESOLVE_USE_CUDA)
 
 install(TARGETS ${installable_tests} 
         RUNTIME DESTINATION bin/resolve/tests/functionality)
@@ -40,6 +45,8 @@ install(DIRECTORY data DESTINATION bin/resolve/tests/functionality)
 set(test_data_dir ${CMAKE_SOURCE_DIR}/tests/functionality/)
 
 add_test(NAME klu_klu_test COMMAND $<TARGET_FILE:klu_klu_test.exe> "${test_data_dir}")
-add_test(NAME klu_rf_test  COMMAND $<TARGET_FILE:klu_rf_test.exe>  "${test_data_dir}")
-add_test(NAME klu_rf_fgmres_test COMMAND $<TARGET_FILE:klu_rf_fgmres_test.exe> "${test_data_dir}")
-add_test(NAME klu_glu_test COMMAND $<TARGET_FILE:klu_glu_test.exe> "${test_data_dir}")
+if(RESOLVE_USE_CUDA)
+  add_test(NAME klu_rf_test  COMMAND $<TARGET_FILE:klu_rf_test.exe>  "${test_data_dir}")
+  add_test(NAME klu_rf_fgmres_test COMMAND $<TARGET_FILE:klu_rf_fgmres_test.exe> "${test_data_dir}")
+  add_test(NAME klu_glu_test COMMAND $<TARGET_FILE:klu_glu_test.exe> "${test_data_dir}")
+endif(RESOLVE_USE_CUDA)
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index d5ff9d6d..229dfc1d 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -26,10 +26,9 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
-  workspace_CUDA->initializeHandles();
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
+  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
   ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
   KLU->setupParameters(1, 0.1, false);
 
@@ -101,32 +100,32 @@ int main(int argc, char *argv[])
   }
 
   vec_test->setData(x_data, "cpu");
-  vec_r->update(rhs, "cpu", "cuda");
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_r->update(rhs, "cpu", "cpu");
+  vec_diff->update(x_data, "cpu", "cpu");
 
-  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
+  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu"));
   matrix_handler->setValuesChanged(true);
-  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); 
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); 
   error_sum += status;
   
-  real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
+  real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
 
 
   //for testing only - control
   
-  real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cuda"));
-  real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda"));
+  real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cpu"));
+  real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu"));
   
   //compute x-x_true
-  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda");
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu");
   //evaluate its norm
-  real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
+  real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
-  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
+  vec_r->update(rhs, "cpu", "cpu");
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); 
   error_sum += status;
-  real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
+  real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
   //evaluate the residual ON THE CPU using COMPUTED solution
  
   vec_r->update(rhs, "cpu", "cpu");
@@ -134,7 +133,7 @@ int main(int argc, char *argv[])
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
   error_sum += status;
  
-  real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
+  real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
  
   std::cout<<"Results (first matrix): "<<std::endl<<std::endl;
   std::cout<<"\t ||b-A*x||_2                 : " << std::setprecision(16) << normRmatrix1    << " (residual norm)" << std::endl;
@@ -164,8 +163,8 @@ int main(int argc, char *argv[])
   ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs);
   rhs2_file.close();
 
-  matrix_handler->coo2csr(A_coo, A, "cuda");
-  vec_rhs->update(rhs, "cpu", "cuda");
+  matrix_handler->coo2csr(A_coo, A, "cpu");
+  vec_rhs->update(rhs, "cpu", "cpu");
 
   // and solve it too
   status =  KLU->refactorize();
@@ -174,27 +173,27 @@ int main(int argc, char *argv[])
   status = KLU->solve(vec_rhs, vec_x);
   error_sum += status;
 
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, "cpu", "cpu");
   matrix_handler->setValuesChanged(true);
 
-  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   error_sum += status;
 
-  real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
+  real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
   
   //for testing only - control
-  real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda"));
+  real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu"));
   //compute x-x_true
-  vec_diff->update(x_data, "cpu", "cuda");
-  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda");
+  vec_diff->update(x_data, "cpu", "cpu");
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu");
   //evaluate its norm
-  real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
+  real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
-  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
+  vec_r->update(rhs, "cpu", "cpu");
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   error_sum += status;
-  real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
+  real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
   
   std::cout<<"Results (second matrix): "<<std::endl<<std::endl;
   std::cout<<"\t ||b-A*x||_2                 : "<<normRmatrix2<<" (residual norm)"<<std::endl;

From 5ddd386e009ae8b108b9c2fbef95d25d3c0e1cd1 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Sun, 15 Oct 2023 20:16:00 -0400
Subject: [PATCH 02/15] Create modular (and eventually portable) workspace.

---
 examples/r_KLU_GLU.cpp                        |  2 +-
 examples/r_KLU_GLU_matrix_values_update.cpp   |  2 +-
 examples/r_KLU_KLU.cpp                        |  2 +-
 examples/r_KLU_KLU_standalone.cpp             |  2 +-
 examples/r_KLU_rf.cpp                         |  2 +-
 examples/r_KLU_rf_FGMRES.cpp                  |  2 +-
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |  2 +-
 resolve/CMakeLists.txt                        |  9 +++--
 resolve/LinSolverDirectCuSolverGLU.cpp        |  1 +
 resolve/LinSolverDirectCuSolverGLU.hpp        |  6 ++-
 resolve/matrix/MatrixHandler.cpp              |  2 +-
 resolve/vector/VectorHandler.cpp              |  2 +-
 resolve/workspace/CMakeLists.txt              | 37 +++++++++++++++++++
 resolve/workspace/LinAlgWorkspace.cpp         | 13 +++++++
 resolve/workspace/LinAlgWorkspace.hpp         | 17 +++++++++
 .../LinAlgWorkspaceCUDA.cpp}                  | 24 ++++--------
 .../LinAlgWorkspaceCUDA.hpp}                  | 28 +-------------
 resolve/workspace/LinAlgWorkspaceFactory.cpp  | 25 +++++++++++++
 resolve/workspace/LinAlgWorkspaceFactory.hpp  | 14 +++++++
 tests/functionality/testKLU.cpp               |  2 +-
 tests/functionality/testKLU_GLU.cpp           |  2 +-
 tests/functionality/testKLU_Rf.cpp            |  2 +-
 tests/functionality/testKLU_Rf_FGMRES.cpp     |  2 +-
 tests/unit/matrix/MatrixHandlerTests.hpp      |  2 +-
 tests/unit/vector/GramSchmidtTests.hpp        |  2 +-
 tests/unit/vector/VectorHandlerTests.hpp      |  2 +-
 26 files changed, 143 insertions(+), 63 deletions(-)
 create mode 100644 resolve/workspace/CMakeLists.txt
 create mode 100644 resolve/workspace/LinAlgWorkspace.cpp
 create mode 100644 resolve/workspace/LinAlgWorkspace.hpp
 rename resolve/{LinAlgWorkspace.cpp => workspace/LinAlgWorkspaceCUDA.cpp} (84%)
 rename resolve/{LinAlgWorkspace.hpp => workspace/LinAlgWorkspaceCUDA.hpp} (70%)
 create mode 100644 resolve/workspace/LinAlgWorkspaceFactory.cpp
 create mode 100644 resolve/workspace/LinAlgWorkspaceFactory.hpp

diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index a4c059a9..82c949db 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -9,7 +9,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index 96c77591..19d7ec3c 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -9,7 +9,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 // this updates the matrix values to simulate what CFD/optimization software does. 
 
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 439c8f41..b1e87203 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -9,7 +9,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index 3323b68b..f0b7c3a4 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -9,7 +9,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index 9e166d34..f2e96dfe 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -10,7 +10,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index db816206..d9543895 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -11,7 +11,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index d0ca3a56..d50bdb4a 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -11,7 +11,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 675fc11f..7551d2c3 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -11,7 +11,6 @@ add_subdirectory(utilities)
 set(ReSolve_SRC
     LinSolver.cpp
     LinSolverDirectKLU.cpp
-    LinAlgWorkspace.cpp
     LinSolverIterativeFGMRES.cpp
     GramSchmidt.cpp
     LinSolverDirectCuSolverGLU.cpp
@@ -22,7 +21,6 @@ set(ReSolve_SRC
 set(ReSolve_HEADER_INSTALL
     Common.hpp
     cusolver_defs.hpp
-    LinAlgWorkspace.hpp
     LinSolver.hpp
     LinSolverDirectCuSolverGLU.hpp
     LinSolverDirectCuSolverRf.hpp
@@ -39,11 +37,15 @@ if(NOT RESOLVE_USE_GPU)
     add_subdirectory(cpu)
 endif()
 
-# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
+# If CUDA support is enabled, create CUDA backend
+# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
 if(RESOLVE_USE_CUDA)
     add_subdirectory(cuda)
 endif()
 
+# Now, build workspaces
+add_subdirectory(workspace)
+
 # Next build vector and matrix objects that may use this backend.
 add_subdirectory(vector)
 add_subdirectory(matrix)
@@ -66,6 +68,7 @@ set(ReSolve_Targets_List
     resolve_vector
     resolve_logger
     resolve_tpl
+    resolve_workspace
 )
 
 if(RESOLVE_USE_GPU)
diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp
index 63413c5b..feaf11b8 100644
--- a/resolve/LinSolverDirectCuSolverGLU.cpp
+++ b/resolve/LinSolverDirectCuSolverGLU.cpp
@@ -3,6 +3,7 @@
 
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Csr.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 #include "LinSolverDirectCuSolverGLU.hpp"
 
 namespace ReSolve
diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp
index d76e1921..0aa27df9 100644
--- a/resolve/LinSolverDirectCuSolverGLU.hpp
+++ b/resolve/LinSolverDirectCuSolverGLU.hpp
@@ -1,6 +1,5 @@
 #pragma once
 #include "Common.hpp"
-#include <resolve/LinAlgWorkspace.hpp>
 #include "LinSolver.hpp"
 #include "cusolver_defs.hpp"
 #include <resolve/MemoryUtils.hpp>
@@ -19,6 +18,9 @@ namespace ReSolve
     class Sparse;
   }
 
+  // Forward declaration of ReSolve handlers workspace
+  class LinAlgWorkspace;
+
   class LinSolverDirectCuSolverGLU : public LinSolverDirect 
   {
     using vector_type = vector::Vector;
@@ -38,7 +40,7 @@ namespace ReSolve
       //note: we need cuSolver handle, we can copy it from the workspace to avoid double allocation
       cusparseMatDescr_t descr_M_; //this is NOT sparse matrix descriptor
       cusparseMatDescr_t descr_A_; //this is NOT sparse matrix descriptor
-      LinAlgWorkspace *workspace_;// so we can copy cusparse handle
+      LinAlgWorkspace* workspace_;// so we can copy cusparse handle
       cusolverSpHandle_t handle_cusolversp_; 
       cusolverStatus_t status_cusolver_;
       cusparseStatus_t status_cusparse_;
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 19119731..370d3a53 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -1,11 +1,11 @@
 #include <algorithm>
 
 #include <resolve/utilities/logger/Logger.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csc.hpp>
 #include <resolve/matrix/Csr.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 #include "MatrixHandler.hpp"
 
 namespace ReSolve {
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index 032cc9b1..71bfce9c 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -1,9 +1,9 @@
 #include <iostream>
 
 #include <resolve/utilities/logger/Logger.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
 #include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 #include "VectorHandler.hpp"
 
 namespace ReSolve {
diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt
new file mode 100644
index 00000000..a50982c8
--- /dev/null
+++ b/resolve/workspace/CMakeLists.txt
@@ -0,0 +1,37 @@
+#[[
+
+@brief Build ReSolve workspace module
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
+set(ReSolve_Workspace_SRC
+  LinAlgWorkspace.cpp
+  LinAlgWorkspaceFactory.cpp
+)
+
+set(ReSolve_Workspace_CUDA_SRC # consider adding a separate dir for cuda-sdk dependent code
+    LinAlgWorkspaceCUDA.cpp
+)
+
+set(ReSolve_Workspace_HEADER_INSTALL
+  LinAlgWorkspace.hpp
+  LinAlgWorkspaceFactory.hpp
+  LinAlgWorkspaceCUDA.hpp
+)
+
+if(RESOLVE_USE_CUDA)
+  add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDA_SRC})
+  target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda)
+else(RESOLVE_USE_CUDA)  
+  add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC})
+endif(RESOLVE_USE_CUDA)
+
+target_include_directories(resolve_workspace INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:include>
+)
+
+# install include headers
+install(FILES ${ReSolve_Workspace_HEADER_INSTALL} DESTINATION include/resolve/workspace)
diff --git a/resolve/workspace/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspace.cpp
new file mode 100644
index 00000000..d6d2085e
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspace.cpp
@@ -0,0 +1,13 @@
+#include "LinAlgWorkspace.hpp"
+
+namespace ReSolve
+{
+  LinAlgWorkspace::LinAlgWorkspace()
+  {
+  }
+  
+  LinAlgWorkspace::~LinAlgWorkspace()
+  {
+  }
+
+}
diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp
new file mode 100644
index 00000000..38dc0c19
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspace.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve
+{
+  class LinAlgWorkspace
+  {
+    public:
+      LinAlgWorkspace();
+      ~LinAlgWorkspace();
+    protected:
+      MemoryHandler mem_;
+  };
+
+}
diff --git a/resolve/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspaceCUDA.cpp
similarity index 84%
rename from resolve/LinAlgWorkspace.cpp
rename to resolve/workspace/LinAlgWorkspaceCUDA.cpp
index 9dee78c5..200ac437 100644
--- a/resolve/LinAlgWorkspace.cpp
+++ b/resolve/workspace/LinAlgWorkspaceCUDA.cpp
@@ -1,15 +1,7 @@
-#include "LinAlgWorkspace.hpp"
+#include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
 
 namespace ReSolve
 {
-  LinAlgWorkspace::LinAlgWorkspace()
-  {
-  }
-  
-  LinAlgWorkspace::~LinAlgWorkspace()
-  {
-  }
-
   LinAlgWorkspaceCUDA::LinAlgWorkspaceCUDA()
   {
     handle_cusolversp_ = nullptr;
@@ -29,8 +21,6 @@ namespace ReSolve
     cusolverSpDestroy(handle_cusolversp_);
     cublasDestroy(handle_cublas_);
     cusparseDestroySpMat(mat_A_);
-    // cusparseDestroyDnVec(vec_x_);
-    // cusparseDestroyDnVec(vec_y_);
   }
 
   void* LinAlgWorkspaceCUDA::getSpmvBuffer()
@@ -93,21 +83,23 @@ namespace ReSolve
     mat_A_ = mat;
   }
 
-  cusparseDnVecDescr_t   LinAlgWorkspaceCUDA::getVecX()
+  cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecX()
   {
     return vec_x_;
   }
 
-  cusparseDnVecDescr_t   LinAlgWorkspaceCUDA::getVecY()
+  cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecY()
   {
     return vec_y_;
   }
 
-  bool LinAlgWorkspaceCUDA::matvecSetup(){
+  bool LinAlgWorkspaceCUDA::matvecSetup()
+  {
     return matvec_setup_done_;
   }
 
-  void LinAlgWorkspaceCUDA::matvecSetupDone() {
+  void LinAlgWorkspaceCUDA::matvecSetupDone()
+  {
     matvec_setup_done_ = true;
   }
 
@@ -117,4 +109,4 @@ namespace ReSolve
     cublasCreate(&handle_cublas_);
     cusolverSpCreate(&handle_cusolversp_);
   }
-}
+} // namespace ReSolve
diff --git a/resolve/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspaceCUDA.hpp
similarity index 70%
rename from resolve/LinAlgWorkspace.hpp
rename to resolve/workspace/LinAlgWorkspaceCUDA.hpp
index e5c79580..c0d4554c 100644
--- a/resolve/LinAlgWorkspace.hpp
+++ b/resolve/workspace/LinAlgWorkspaceCUDA.hpp
@@ -4,20 +4,10 @@
 #include "cusparse.h"
 #include "cusolverSp.h"
 
-#include <resolve/MemoryUtils.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 namespace ReSolve
 {
-  class LinAlgWorkspace
-  {
-    public:
-      LinAlgWorkspace();
-      ~LinAlgWorkspace();
-    protected:
-      MemoryHandler mem_;
-  };
-
-
   class LinAlgWorkspaceCUDA : public LinAlgWorkspace
   {
     public:
@@ -67,18 +57,4 @@ namespace ReSolve
       bool matvec_setup_done_; //check if setup is done for matvec i.e. if buffer is allocated, csr structure is set etc.
   };
 
-  /// @brief  Workspace factory
-  /// @param[in] memspace memory space ID 
-  /// @return pointer to the linear algebra workspace
-  inline LinAlgWorkspace* createLinAlgWorkspace(std::string memspace)
-  {
-    if (memspace == "cuda") {
-      LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
-      workspace->initializeHandles();
-      return workspace;
-    } 
-    // If not CUDA, return default
-    return (new LinAlgWorkspace());
-  }
-
-}
+} // namespace ReSolve
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.cpp b/resolve/workspace/LinAlgWorkspaceFactory.cpp
new file mode 100644
index 00000000..11980c23
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceFactory.cpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "LinAlgWorkspaceFactory.hpp"
+
+namespace ReSolve
+{
+  /// @brief  Workspace factory
+  /// @param[in] memspace memory space ID 
+  /// @return pointer to the linear algebra workspace
+  LinAlgWorkspace* createLinAlgWorkspace(std::string memspace)
+  {
+    if (memspace == "cuda") {
+#ifdef RESOLVE_USE_CUDA
+      LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
+      workspace->initializeHandles();
+      return workspace;
+#else
+      return nullptr;
+#endif
+    } 
+    // If not CUDA, return default
+    return (new LinAlgWorkspace());
+  }
+
+}
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.hpp b/resolve/workspace/LinAlgWorkspaceFactory.hpp
new file mode 100644
index 00000000..2c30738e
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceFactory.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <string>
+
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+#ifdef RESOLVE_USE_CUDA
+#include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
+#endif
+
+namespace ReSolve
+{
+  LinAlgWorkspace* createLinAlgWorkspace(std::string memspace);
+}
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index 229dfc1d..a60083cd 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -9,7 +9,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 //author: KS
 //functionality test to check whether KLU works correctly.
 using namespace ReSolve::constants;
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index 37f888e4..60dcce50 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -10,7 +10,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 //author: KS
 //functionality test to check whether cuSolverGLU works correctly.
 
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index 21aba8b0..50add7cd 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -11,7 +11,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 //author: KS
 //functionality test to check whether cuSolverRf works correctly.
 
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index c63a3f99..2dae74b5 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -12,7 +12,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 //author: KS
 //functionality test to check whether cuSolverRf/FGMRES works correctly.
 
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 18fbe412..41df9f4b 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -5,7 +5,7 @@
 #include <iterator>
 #include <algorithm>
 #include <resolve/matrix/Csr.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <tests/unit/TestBase.hpp>
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 6f0d305b..2d801c49 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -6,7 +6,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <tests/unit/TestBase.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 namespace ReSolve { 
   namespace tests {
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index 0737385d..ad0d3cfb 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -7,7 +7,7 @@
 #include <resolve/vector/Vector.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <tests/unit/TestBase.hpp>
-#include <resolve/LinAlgWorkspace.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 
 namespace ReSolve { 
   namespace tests {

From d8cf0e66f5a9643d8243a8fae8ec983085b5301c Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Sun, 15 Oct 2023 21:26:19 -0400
Subject: [PATCH 03/15] Enforce dependencies between CMake options.

---
 CMakeLists.txt                               |  7 +++++++
 resolve/CMakeLists.txt                       | 12 ++++++------
 resolve/MemoryUtils.hpp                      |  4 +++-
 resolve/workspace/LinAlgWorkspace.cpp        |  3 +++
 resolve/workspace/LinAlgWorkspace.hpp        |  1 +
 resolve/workspace/LinAlgWorkspaceFactory.cpp |  2 --
 resolve/workspace/LinAlgWorkspaceFactory.hpp |  4 ++++
 7 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 635fa390..d3b566ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,13 @@ option(RESOLVE_USE_GPU  "Use GPU device for computations" ON)
 option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON)
 set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved")
 
+if(RESOLVE_USE_CUDA)
+  set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE)
+else()
+  set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE)
+endif()
+
+
 set(CMAKE_MACOSX_RPATH 1)
 # set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 # https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 7551d2c3..2efac125 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -71,13 +71,13 @@ set(ReSolve_Targets_List
     resolve_workspace
 )
 
-if(RESOLVE_USE_GPU)
-    if(RESOLVE_USE_CUDA)
-        set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
-    endif()
-else(RESOLVE_USE_GPU)
+if(RESOLVE_USE_CUDA)
+  set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
+endif()
+
+if(NOT RESOLVE_USE_GPU)
     set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
-endif(RESOLVE_USE_GPU)
+endif()
 
 
 install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets)
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
index 77c2a7af..00f3d653 100644
--- a/resolve/MemoryUtils.hpp
+++ b/resolve/MemoryUtils.hpp
@@ -48,14 +48,16 @@ namespace ReSolve
 
 } // namespace ReSolve
 
-// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager.
 #ifdef RESOLVE_USE_GPU
 
+// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager.
 #if defined RESOLVE_USE_CUDA
 #include <resolve/cuda/CudaMemory.hpp>
 using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Cuda>;
 #elif defined RESOLVE_USE_HIP
 #error HIP support requested, but not available! Probably a bug in CMake configuration.
+#else
+#error Unrecognized device, probably bug in CMake configuration
 #endif
 
 #else
diff --git a/resolve/workspace/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspace.cpp
index d6d2085e..5e4b55d8 100644
--- a/resolve/workspace/LinAlgWorkspace.cpp
+++ b/resolve/workspace/LinAlgWorkspace.cpp
@@ -10,4 +10,7 @@ namespace ReSolve
   {
   }
 
+  void LinAlgWorkspace::initializeHandles()
+  {
+  }
 }
diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp
index 38dc0c19..68074eb8 100644
--- a/resolve/workspace/LinAlgWorkspace.hpp
+++ b/resolve/workspace/LinAlgWorkspace.hpp
@@ -10,6 +10,7 @@ namespace ReSolve
     public:
       LinAlgWorkspace();
       ~LinAlgWorkspace();
+      void initializeHandles();
     protected:
       MemoryHandler mem_;
   };
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.cpp b/resolve/workspace/LinAlgWorkspaceFactory.cpp
index 11980c23..43a10870 100644
--- a/resolve/workspace/LinAlgWorkspaceFactory.cpp
+++ b/resolve/workspace/LinAlgWorkspaceFactory.cpp
@@ -1,5 +1,3 @@
-#pragma once
-
 #include "LinAlgWorkspaceFactory.hpp"
 
 namespace ReSolve
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.hpp b/resolve/workspace/LinAlgWorkspaceFactory.hpp
index 2c30738e..0f5860c1 100644
--- a/resolve/workspace/LinAlgWorkspaceFactory.hpp
+++ b/resolve/workspace/LinAlgWorkspaceFactory.hpp
@@ -8,6 +8,10 @@
 #include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
 #endif
 
+// #ifndef RESOLVE_USE_CUDA
+// using ReSolve::LinAlgWorkspace = ReSolve::LinAlgWorkspaceCUDA;
+// #endif
+
 namespace ReSolve
 {
   LinAlgWorkspace* createLinAlgWorkspace(std::string memspace);

From e40c43dfa95df269150c3a904bce2cb1321b75fe Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Mon, 16 Oct 2023 12:48:43 -0400
Subject: [PATCH 04/15] Working PIMPL in MatrixHandler; needs lots of cleanup.

---
 resolve/matrix/CMakeLists.txt        |   3 +
 resolve/matrix/MatrixHandler.cpp     | 145 ++--------
 resolve/matrix/MatrixHandler.hpp     |   6 +-
 resolve/matrix/MatrixHandlerCpu.cpp  | 338 +++++++++++++++++++++++
 resolve/matrix/MatrixHandlerCpu.hpp  |  66 +++++
 resolve/matrix/MatrixHandlerCuda.cpp | 387 +++++++++++++++++++++++++++
 resolve/matrix/MatrixHandlerCuda.hpp |  66 +++++
 resolve/matrix/MatrixHandlerImpl.hpp |  75 ++++++
 8 files changed, 967 insertions(+), 119 deletions(-)
 create mode 100644 resolve/matrix/MatrixHandlerCpu.cpp
 create mode 100644 resolve/matrix/MatrixHandlerCpu.hpp
 create mode 100644 resolve/matrix/MatrixHandlerCuda.cpp
 create mode 100644 resolve/matrix/MatrixHandlerCuda.hpp
 create mode 100644 resolve/matrix/MatrixHandlerImpl.hpp

diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 289f805a..8c74ad09 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -14,6 +14,8 @@ set(Matrix_SRC
     Csc.cpp
     Coo.cpp
     MatrixHandler.cpp
+    MatrixHandlerCpu.cpp
+    MatrixHandlerCuda.cpp
 )
 
 
@@ -24,6 +26,7 @@ set(Matrix_HEADER_INSTALL
     Csr.hpp
     Csc.hpp
     MatrixHandler.hpp
+    MatrixHandlerImpl.hpp
 )
 
 # Build shared library ReSolve::matrix
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 370d3a53..30c08991 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -7,6 +7,8 @@
 #include <resolve/matrix/Csr.hpp>
 #include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
 #include "MatrixHandler.hpp"
+#include "MatrixHandlerCpu.hpp"
+#include "MatrixHandlerCuda.hpp"
 
 namespace ReSolve {
   // Create a shortcut name for Logger static class
@@ -49,6 +51,8 @@ namespace ReSolve {
   {
     this->new_matrix_ = true;
     this->values_changed_ = true;
+    cpuImpl_  = new MatrixHandlerCpu();
+    cudaImpl_ = new MatrixHandlerCuda();
   }
 
   MatrixHandler::~MatrixHandler()
@@ -58,16 +62,20 @@ namespace ReSolve {
   MatrixHandler::MatrixHandler(LinAlgWorkspace* new_workspace)
   {
     workspace_ = new_workspace;
+    cpuImpl_  = new MatrixHandlerCpu(new_workspace);
+    cudaImpl_ = new MatrixHandlerCuda(new_workspace);
   }
 
   bool MatrixHandler::getValuesChanged()
   {
+    values_changed_ = cpuImpl_->isValuesChanged();
     return this->values_changed_;
   }
 
   void MatrixHandler::setValuesChanged(bool toWhat)
   {
     this->values_changed_ = toWhat;
+    cpuImpl_->setValuesChanged(values_changed_);
   }
 
   int MatrixHandler::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
@@ -229,131 +237,32 @@ namespace ReSolve {
     return 0;
   }
 
-  int MatrixHandler::matvec(matrix::Sparse* Ageneric, 
+  /**
+   * @brief Matrix vector product method  result = alpha *A*x + beta * result
+   * @param A 
+   * @param vec_x 
+   * @param vec_result 
+   * @param[in] alpha 
+   * @param[in] beta 
+   * @param[in] matrixFormat 
+   * @param[in] memspace 
+   * @return result := alpha * A * x + beta * result
+   */
+  int MatrixHandler::matvec(matrix::Sparse* A, 
                             vector_type* vec_x, 
                             vector_type* vec_result, 
                             const real_type* alpha, 
                             const real_type* beta,
                             std::string matrixFormat, 
-                            std::string memspace) 
+                            std::string memspace)
   {
-    using namespace constants;
-    int error_sum = 0;
-    if (matrixFormat == "csr"){
-      matrix::Csr* A = (matrix::Csr*) Ageneric;
-      cusparseStatus_t status;
-      //result = alpha *A*x + beta * result
-      if (memspace == "cuda" ){
-        // std::cout << "Matvec on NVIDIA GPU ...\n";
-        LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-        cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX();
-        //printf("is vec_x NULL? %d\n", vec_x->getData("cuda") == nullptr);
-        //printf("is vec_result NULL? %d\n", vec_result->getData("cuda") == nullptr);
-        cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F);
-
-
-        cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY();
-        cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F);
-
-        cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor();
-
-        void* buffer_spmv = workspaceCUDA->getSpmvBuffer();
-        cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle();
-        if (values_changed_){ 
-          status = cusparseCreateCsr(&matA, 
-                                     A->getNumRows(),
-                                     A->getNumColumns(),
-                                     A->getNnzExpanded(),
-                                     A->getRowData("cuda"),
-                                     A->getColData("cuda"),
-                                     A->getValues("cuda"), 
-                                     CUSPARSE_INDEX_32I, 
-                                     CUSPARSE_INDEX_32I,
-                                     CUSPARSE_INDEX_BASE_ZERO,
-                                     CUDA_R_64F);
-          error_sum += status;
-          values_changed_ = false;
-        }
-        if (!workspaceCUDA->matvecSetup()){
-          //setup first, allocate, etc.
-          size_t bufferSize = 0;
-
-          status = cusparseSpMV_bufferSize(handle_cusparse, 
-                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                           &MINUSONE,
-                                           matA,
-                                           vecx,
-                                           &ONE,
-                                           vecAx,
-                                           CUDA_R_64F,
-                                           CUSPARSE_SPMV_CSR_ALG2, 
-                                           &bufferSize);
-          error_sum += status;
-          mem_.deviceSynchronize();
-          mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize);
-          workspaceCUDA->setSpmvMatrixDescriptor(matA);
-          workspaceCUDA->setSpmvBuffer(buffer_spmv);
-
-          workspaceCUDA->matvecSetupDone();
-        } 
-
-        status = cusparseSpMV(handle_cusparse,
-                              CUSPARSE_OPERATION_NON_TRANSPOSE,       
-                              alpha, 
-                              matA, 
-                              vecx, 
-                              beta, 
-                              vecAx, 
-                              CUDA_R_64F,
-                              CUSPARSE_SPMV_CSR_ALG2, 
-                              buffer_spmv);
-        error_sum += status;
-        mem_.deviceSynchronize();
-        if (status)
-          out::error() << "Matvec status: " << status 
-                       << "Last error code: " << mem_.getLastDeviceError() << std::endl;
-        vec_result->setDataUpdated("cuda");
-
-        cusparseDestroyDnVec(vecx);
-        cusparseDestroyDnVec(vecAx);
-        return error_sum;
-      } else {
-        if (memspace == "cpu"){
-          //cpu csr matvec
-          index_type* ia = A->getRowData("cpu");
-          index_type* ja = A->getColData("cpu");
-          real_type* a = A->getValues("cpu");
-
-          real_type* x_data = vec_x->getData("cpu");
-          real_type* result_data = vec_result->getData("cpu");
-          real_type sum;
-          real_type y, t, c;
-          //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference   
-          for (int i = 0; i < A->getNumRows(); ++i) {
-            sum = 0.0;
-            c = 0.0;
-            for (int j = ia[i]; j < ia[i+1]; ++j) { 
-              y =  ( a[j] * x_data[ja[j]]) - c;
-              t = sum + y;
-              c = (t - sum) - y;
-              sum = t;
-              //  sum += ( a[j] * x_data[ja[j]]);
-            }
-            sum *= (*alpha);
-            result_data[i] = result_data[i]*(*beta) + sum;
-          } 
-          vec_result->setDataUpdated("cpu");
-          return 0;
-        } else {
-          out::error() << "Not implemented (yet)" << std::endl;
-
-          return 1;
-        }
-      }
+    if (memspace == "cuda" ) {
+      return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
+    } else if (memspace == "cpu") {
+        return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else {
-
-      out::error() << "Not implemented (yet), only csr matvec is implemented" << std::endl;
-      return 1;
+        out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl;
+        return 1;
     }
   }
 
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index 366b5422..ea6abbcb 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -7,6 +7,8 @@
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
 
+#include <resolve/matrix/MatrixHandlerImpl.hpp>
+
 namespace ReSolve
 { 
   namespace vector
@@ -68,7 +70,7 @@ namespace ReSolve {
                  const real_type* beta,
                  std::string matrix_type,
                  std::string memspace);
-      void Matrix1Norm(matrix::Sparse *A, real_type* norm);
+      int Matrix1Norm(matrix::Sparse *A, real_type* norm);
       bool getValuesChanged();
       void setValuesChanged(bool toWhat); 
     
@@ -78,6 +80,8 @@ namespace ReSolve {
       bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
+      MatrixHandlerImpl* cpuImpl_{nullptr};
+      MatrixHandlerImpl* cudaImpl_{nullptr};
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
new file mode 100644
index 00000000..5d61501d
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -0,0 +1,338 @@
+#include <algorithm>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include "MatrixHandlerCpu.hpp"
+
+namespace ReSolve {
+  // Create a shortcut name for Logger static class
+  using out = io::Logger;
+
+  // //helper class
+  // indexPlusValue::indexPlusValue()
+  // {
+  //   idx_ = 0;
+  //   value_ = 0.0;
+  // }
+
+
+  // indexPlusValue::~indexPlusValue()
+  // {
+  // }
+
+  // void indexPlusValue::setIdx(index_type new_idx)
+  // {
+  //   idx_ = new_idx;
+  // }
+
+  // void indexPlusValue::setValue(real_type new_value)
+  // {
+  //   value_ = new_value;
+  // }
+
+  // index_type indexPlusValue::getIdx()
+  // {
+  //   return idx_;
+  // }
+
+  // real_type indexPlusValue::getValue()
+  // {
+  //   return value_;
+  // }
+  // //end of helper class
+
+  MatrixHandlerCpu::MatrixHandlerCpu()
+  {
+    // new_matrix_ = true;
+    // values_changed_ = true;
+  }
+
+  MatrixHandlerCpu::~MatrixHandlerCpu()
+  {
+  }
+
+  MatrixHandlerCpu::MatrixHandlerCpu(LinAlgWorkspace* new_workspace)
+  {
+    workspace_ = new_workspace;
+  }
+
+  // bool MatrixHandlerCpu::getValuesChanged()
+  // {
+  //   return this->values_changed_;
+  // }
+
+  // void MatrixHandlerCpu::setValuesChanged(bool toWhat)
+  // {
+  //   this->values_changed_ = toWhat;
+  // }
+
+//   int MatrixHandlerCpu::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
+//   {
+//     //this happens on the CPU not on the GPU
+//     //but will return whatever memspace requested.
+
+//     //count nnzs first
+
+//     index_type nnz_unpacked = 0;
+//     index_type nnz = A_coo->getNnz();
+//     index_type n = A_coo->getNumRows();
+//     bool symmetric = A_coo->symmetric();
+//     bool expanded = A_coo->expanded();
+
+//     index_type* nnz_counts =  new index_type[n];
+//     std::fill_n(nnz_counts, n, 0);
+//     index_type* coo_rows = A_coo->getRowData("cpu");
+//     index_type* coo_cols = A_coo->getColData("cpu");
+//     real_type* coo_vals = A_coo->getValues("cpu");
+
+//     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
+//     std::fill_n(diag_control, n, 0);
+//     index_type nnz_unpacked_no_duplicates = 0;
+//     index_type nnz_no_duplicates = nnz;
+
+
+//     //maybe check if they exist?
+//     for (index_type i = 0; i < nnz; ++i)
+//     {
+//       nnz_counts[coo_rows[i]]++;
+//       nnz_unpacked++;
+//       nnz_unpacked_no_duplicates++;
+//       if ((coo_rows[i] != coo_cols[i])&& (symmetric) && (!expanded))
+//       {
+//         nnz_counts[coo_cols[i]]++;
+//         nnz_unpacked++;
+//         nnz_unpacked_no_duplicates++;
+//       }
+//       if (coo_rows[i] == coo_cols[i]){
+//         if (diag_control[coo_rows[i]] > 0) {
+//          //duplicate
+//           nnz_unpacked_no_duplicates--;
+//           nnz_no_duplicates--;
+//         }
+//         diag_control[coo_rows[i]]++;
+//       }
+//     }
+//     A_csr->setExpanded(true);
+//     A_csr->setNnzExpanded(nnz_unpacked_no_duplicates);
+//     index_type* csr_ia = new index_type[n+1];
+//     std::fill_n(csr_ia, n + 1, 0);
+//     index_type* csr_ja = new index_type[nnz_unpacked];
+//     real_type* csr_a = new real_type[nnz_unpacked];
+//     index_type* nnz_shifts = new index_type[n];
+//     std::fill_n(nnz_shifts, n , 0);
+
+//     indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; 
+
+//     csr_ia[0] = 0;
+
+//     for (index_type i = 1; i < n + 1; ++i){
+//       csr_ia[i] = csr_ia[i - 1] + nnz_counts[i - 1] - (diag_control[i-1] - 1);
+//     }
+
+//     int r, start;
+
+
+//     for (index_type i = 0; i < nnz; ++i){
+//       //which row
+//       r = coo_rows[i];
+//       start = csr_ia[r];
+
+//       if ((start + nnz_shifts[r]) > nnz_unpacked) {
+//         out::warning() << "index out of bounds (case 1) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
+//       }
+//       if ((r == coo_cols[i]) && (diag_control[r] > 1)) {//diagonal, and there are duplicates
+//         bool already_there = false;  
+//         for (index_type j = start; j < start + nnz_shifts[r]; ++j)
+//         {
+//           index_type c = tmp[j].getIdx();
+//           if (c == r) {
+//             real_type val = tmp[j].getValue();
+//             val += coo_vals[i];
+//             tmp[j].setValue(val);
+//             already_there = true;
+//             out::warning() << " duplicate found, row " << c << " adding in place " << j << " current value: " << val << std::endl;
+//           }  
+//         }  
+//         if (!already_there){ // first time this duplicates appears
+
+//           tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
+//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+
+//           nnz_shifts[r]++;
+//         }
+//       } else {//not diagonal
+//         tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
+//         tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+//         nnz_shifts[r]++;
+
+//         if ((coo_rows[i] != coo_cols[i]) && (symmetric == 1))
+//         {
+//           r = coo_cols[i];
+//           start = csr_ia[r];
+
+//           if ((start + nnz_shifts[r]) > nnz_unpacked)
+//             out::warning() << "index out of bounds (case 2) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
+//           tmp[start + nnz_shifts[r]].setIdx(coo_rows[i]);
+//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+//           nnz_shifts[r]++;
+//         }
+//       }
+//     }
+//     //now sort whatever is inside rows
+
+//     for (int i = 0; i < n; ++i)
+//     {
+
+//       //now sorting (and adding 1)
+//       int colStart = csr_ia[i];
+//       int colEnd = csr_ia[i + 1];
+//       int length = colEnd - colStart;
+//       std::sort(&tmp[colStart],&tmp[colStart] + length);
+//     }
+
+//     for (index_type i = 0; i < nnz_unpacked; ++i)
+//     {
+//       csr_ja[i] = tmp[i].getIdx();
+//       csr_a[i] = tmp[i].getValue();
+//     }
+// #if 0
+//     for (int i = 0; i<n; ++i){
+//       printf("Row: %d \n", i);
+//       for (int j = csr_ia[i]; j<csr_ia[i+1]; ++j){
+//         printf("(%d %16.16f) ", csr_ja[j], csr_a[j]);
+//       }
+//       printf("\n");
+//     }
+// #endif
+//     A_csr->setNnz(nnz_no_duplicates);
+//     if (memspace == "cpu"){
+//       A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
+//     } else {
+//       if (memspace == "cuda"){      
+//         A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+//       } else {
+//         //display error
+//       }
+//     }
+//     delete [] nnz_counts;
+//     delete [] tmp;
+//     delete [] nnz_shifts;
+//     delete [] csr_ia;
+//     delete [] csr_ja;
+//     delete [] csr_a;
+//     delete [] diag_control; 
+
+//     return 0;
+//   }
+
+  int MatrixHandlerCpu::matvec(matrix::Sparse* Ageneric, 
+                               vector_type* vec_x, 
+                               vector_type* vec_result, 
+                               const real_type* alpha, 
+                               const real_type* beta,
+                               std::string matrixFormat) 
+  {
+    using namespace constants;
+    // int error_sum = 0;
+    if (matrixFormat == "csr") {
+      matrix::Csr* A = (matrix::Csr*) Ageneric;
+      //result = alpha *A*x + beta * result
+      index_type* ia = A->getRowData("cpu");
+      index_type* ja = A->getColData("cpu");
+      real_type*   a = A->getValues("cpu");
+
+      real_type* x_data      = vec_x->getData("cpu");
+      real_type* result_data = vec_result->getData("cpu");
+      real_type sum;
+      real_type y;
+      real_type t;
+      real_type c;
+
+      //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference   
+      for (int i = 0; i < A->getNumRows(); ++i) {
+        sum = 0.0;
+        c = 0.0;
+        for (int j = ia[i]; j < ia[i+1]; ++j) { 
+          y =  ( a[j] * x_data[ja[j]]) - c;
+          t = sum + y;
+          c = (t - sum) - y;
+          sum = t;
+          //  sum += ( a[j] * x_data[ja[j]]);
+        }
+        sum *= (*alpha);
+        result_data[i] = result_data[i]*(*beta) + sum;
+      } 
+      vec_result->setDataUpdated("cpu");
+      return 0;
+    } else {
+      out::error() << "MatVec not implemented (yet) for " 
+                   << matrixFormat << " matrix format." << std::endl;
+      return 1;
+    }
+  }
+
+  int MatrixHandlerCpu::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */)
+  {
+    return -1;
+  }
+
+  // int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
+  // {
+  //   //it ONLY WORKS WITH CUDA
+  //   index_type error_sum = 0;
+  //   if (memspace == "cuda") { 
+  //     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+
+  //     A_csr->allocateMatrixData("cuda");
+  //     index_type n = A_csc->getNumRows();
+  //     index_type m = A_csc->getNumRows();
+  //     index_type nnz = A_csc->getNnz();
+  //     size_t bufferSize;
+  //     void* d_work;
+  //     cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
+  //                                                             n, 
+  //                                                             m, 
+  //                                                             nnz, 
+  //                                                             A_csc->getValues("cuda"), 
+  //                                                             A_csc->getColData("cuda"), 
+  //                                                             A_csc->getRowData("cuda"), 
+  //                                                             A_csr->getValues("cuda"), 
+  //                                                             A_csr->getRowData("cuda"),
+  //                                                             A_csr->getColData("cuda"), 
+  //                                                             CUDA_R_64F, 
+  //                                                             CUSPARSE_ACTION_NUMERIC,
+  //                                                             CUSPARSE_INDEX_BASE_ZERO, 
+  //                                                             CUSPARSE_CSR2CSC_ALG1, 
+  //                                                             &bufferSize);
+  //     error_sum += status;
+  //     mem_.allocateBufferOnDevice(&d_work, bufferSize);
+  //     status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
+  //                                 n, 
+  //                                 m, 
+  //                                 nnz, 
+  //                                 A_csc->getValues("cuda"), 
+  //                                 A_csc->getColData("cuda"), 
+  //                                 A_csc->getRowData("cuda"), 
+  //                                 A_csr->getValues("cuda"), 
+  //                                 A_csr->getRowData("cuda"),
+  //                                 A_csr->getColData("cuda"), 
+  //                                 CUDA_R_64F,
+  //                                 CUSPARSE_ACTION_NUMERIC,
+  //                                 CUSPARSE_INDEX_BASE_ZERO,
+  //                                 CUSPARSE_CSR2CSC_ALG1,
+  //                                 d_work);
+  //     error_sum += status;
+  //     return error_sum;
+  //     mem_.deleteOnDevice(d_work);
+  //   } else { 
+  //     out::error() << "Not implemented (yet)" << std::endl;
+  //     return -1;
+  //   } 
+
+
+  // }
+
+} // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
new file mode 100644
index 00000000..e4de02bf
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -0,0 +1,66 @@
+// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
+// this includes 
+// (1) Matrix format conversion: coo2csr, csr2csc
+// (2) Matrix vector product (SpMV)
+// (3) Matrix 1-norm
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/matrix/MatrixHandlerImpl.hpp>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  namespace matrix
+  {
+    class Sparse;
+    class Coo;
+    class Csc;
+    class Csr;
+  }
+  class LinAlgWorkspace;
+}
+
+
+namespace ReSolve {
+  /**
+   * @class MatrixHandlerCpu
+   * 
+   * @brief CPU implementation of the matrix handler.
+   */
+  class MatrixHandlerCpu : public MatrixHandlerImpl
+  {
+    using vector_type = vector::Vector;
+    
+    public:
+      MatrixHandlerCpu();
+      MatrixHandlerCpu(LinAlgWorkspace* workspace);
+      virtual ~MatrixHandlerCpu();
+
+      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
+
+      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
+      virtual int matvec(matrix::Sparse* A,
+                 vector_type* vec_x,
+                 vector_type* vec_result,
+                 const real_type* alpha,
+                 const real_type* beta,
+                 std::string matrix_type);
+      virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
+      // bool getValuesChanged();
+      // void setValuesChanged(bool toWhat); 
+    
+    private: 
+      LinAlgWorkspace* workspace_{nullptr};
+      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
+      // bool values_changed_{true}; ///< needed for matvec
+
+      MemoryHandler mem_; ///< Device memory manager object
+  };
+
+} // namespace ReSolve
+
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
new file mode 100644
index 00000000..bd151cbf
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -0,0 +1,387 @@
+#include <algorithm>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
+#include "MatrixHandlerCuda.hpp"
+
+namespace ReSolve {
+  // Create a shortcut name for Logger static class
+  using out = io::Logger;
+
+  // //helper class
+  // indexPlusValue::indexPlusValue()
+  // {
+  //   idx_ = 0;
+  //   value_ = 0.0;
+  // }
+
+
+  // indexPlusValue::~indexPlusValue()
+  // {
+  // }
+
+  // void indexPlusValue::setIdx(index_type new_idx)
+  // {
+  //   idx_ = new_idx;
+  // }
+
+  // void indexPlusValue::setValue(real_type new_value)
+  // {
+  //   value_ = new_value;
+  // }
+
+  // index_type indexPlusValue::getIdx()
+  // {
+  //   return idx_;
+  // }
+
+  // real_type indexPlusValue::getValue()
+  // {
+  //   return value_;
+  // }
+  // //end of helper class
+
+  MatrixHandlerCuda::MatrixHandlerCuda()
+  {
+    // new_matrix_ = true;
+    values_changed_ = true;
+  }
+
+  MatrixHandlerCuda::~MatrixHandlerCuda()
+  {
+  }
+
+  MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspace* new_workspace)
+  {
+    workspace_ = new_workspace;
+  }
+
+  // bool MatrixHandlerCuda::getValuesChanged()
+  // {
+  //   return this->values_changed_;
+  // }
+
+  // void MatrixHandlerCuda::setValuesChanged(bool toWhat)
+  // {
+  //   this->values_changed_ = toWhat;
+  // }
+
+//   int MatrixHandlerCuda::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
+//   {
+//     //this happens on the CPU not on the GPU
+//     //but will return whatever memspace requested.
+
+//     //count nnzs first
+
+//     index_type nnz_unpacked = 0;
+//     index_type nnz = A_coo->getNnz();
+//     index_type n = A_coo->getNumRows();
+//     bool symmetric = A_coo->symmetric();
+//     bool expanded = A_coo->expanded();
+
+//     index_type* nnz_counts =  new index_type[n];
+//     std::fill_n(nnz_counts, n, 0);
+//     index_type* coo_rows = A_coo->getRowData("cpu");
+//     index_type* coo_cols = A_coo->getColData("cpu");
+//     real_type* coo_vals = A_coo->getValues("cpu");
+
+//     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
+//     std::fill_n(diag_control, n, 0);
+//     index_type nnz_unpacked_no_duplicates = 0;
+//     index_type nnz_no_duplicates = nnz;
+
+
+//     //maybe check if they exist?
+//     for (index_type i = 0; i < nnz; ++i)
+//     {
+//       nnz_counts[coo_rows[i]]++;
+//       nnz_unpacked++;
+//       nnz_unpacked_no_duplicates++;
+//       if ((coo_rows[i] != coo_cols[i])&& (symmetric) && (!expanded))
+//       {
+//         nnz_counts[coo_cols[i]]++;
+//         nnz_unpacked++;
+//         nnz_unpacked_no_duplicates++;
+//       }
+//       if (coo_rows[i] == coo_cols[i]){
+//         if (diag_control[coo_rows[i]] > 0) {
+//          //duplicate
+//           nnz_unpacked_no_duplicates--;
+//           nnz_no_duplicates--;
+//         }
+//         diag_control[coo_rows[i]]++;
+//       }
+//     }
+//     A_csr->setExpanded(true);
+//     A_csr->setNnzExpanded(nnz_unpacked_no_duplicates);
+//     index_type* csr_ia = new index_type[n+1];
+//     std::fill_n(csr_ia, n + 1, 0);
+//     index_type* csr_ja = new index_type[nnz_unpacked];
+//     real_type* csr_a = new real_type[nnz_unpacked];
+//     index_type* nnz_shifts = new index_type[n];
+//     std::fill_n(nnz_shifts, n , 0);
+
+//     indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; 
+
+//     csr_ia[0] = 0;
+
+//     for (index_type i = 1; i < n + 1; ++i){
+//       csr_ia[i] = csr_ia[i - 1] + nnz_counts[i - 1] - (diag_control[i-1] - 1);
+//     }
+
+//     int r, start;
+
+
+//     for (index_type i = 0; i < nnz; ++i){
+//       //which row
+//       r = coo_rows[i];
+//       start = csr_ia[r];
+
+//       if ((start + nnz_shifts[r]) > nnz_unpacked) {
+//         out::warning() << "index out of bounds (case 1) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
+//       }
+//       if ((r == coo_cols[i]) && (diag_control[r] > 1)) {//diagonal, and there are duplicates
+//         bool already_there = false;  
+//         for (index_type j = start; j < start + nnz_shifts[r]; ++j)
+//         {
+//           index_type c = tmp[j].getIdx();
+//           if (c == r) {
+//             real_type val = tmp[j].getValue();
+//             val += coo_vals[i];
+//             tmp[j].setValue(val);
+//             already_there = true;
+//             out::warning() << " duplicate found, row " << c << " adding in place " << j << " current value: " << val << std::endl;
+//           }  
+//         }  
+//         if (!already_there){ // first time this duplicates appears
+
+//           tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
+//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+
+//           nnz_shifts[r]++;
+//         }
+//       } else {//not diagonal
+//         tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
+//         tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+//         nnz_shifts[r]++;
+
+//         if ((coo_rows[i] != coo_cols[i]) && (symmetric == 1))
+//         {
+//           r = coo_cols[i];
+//           start = csr_ia[r];
+
+//           if ((start + nnz_shifts[r]) > nnz_unpacked)
+//             out::warning() << "index out of bounds (case 2) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
+//           tmp[start + nnz_shifts[r]].setIdx(coo_rows[i]);
+//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
+//           nnz_shifts[r]++;
+//         }
+//       }
+//     }
+//     //now sort whatever is inside rows
+
+//     for (int i = 0; i < n; ++i)
+//     {
+
+//       //now sorting (and adding 1)
+//       int colStart = csr_ia[i];
+//       int colEnd = csr_ia[i + 1];
+//       int length = colEnd - colStart;
+//       std::sort(&tmp[colStart],&tmp[colStart] + length);
+//     }
+
+//     for (index_type i = 0; i < nnz_unpacked; ++i)
+//     {
+//       csr_ja[i] = tmp[i].getIdx();
+//       csr_a[i] = tmp[i].getValue();
+//     }
+// #if 0
+//     for (int i = 0; i<n; ++i){
+//       printf("Row: %d \n", i);
+//       for (int j = csr_ia[i]; j<csr_ia[i+1]; ++j){
+//         printf("(%d %16.16f) ", csr_ja[j], csr_a[j]);
+//       }
+//       printf("\n");
+//     }
+// #endif
+//     A_csr->setNnz(nnz_no_duplicates);
+//     if (memspace == "cpu"){
+//       A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
+//     } else {
+//       if (memspace == "cuda"){      
+//         A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+//       } else {
+//         //display error
+//       }
+//     }
+//     delete [] nnz_counts;
+//     delete [] tmp;
+//     delete [] nnz_shifts;
+//     delete [] csr_ia;
+//     delete [] csr_ja;
+//     delete [] csr_a;
+//     delete [] diag_control; 
+
+//     return 0;
+//   }
+
+  int MatrixHandlerCuda::matvec(matrix::Sparse* Ageneric, 
+                               vector_type* vec_x, 
+                               vector_type* vec_result, 
+                               const real_type* alpha, 
+                               const real_type* beta,
+                               std::string matrixFormat) 
+  {
+    using namespace constants;
+    int error_sum = 0;
+    if (matrixFormat == "csr") {
+      matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric);
+      //result = alpha *A*x + beta * result
+      // if (memspace == "cuda" ) {
+      cusparseStatus_t status;
+      // std::cout << "Matvec on NVIDIA GPU ...\n";
+      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+      cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX();
+      //printf("is vec_x NULL? %d\n", vec_x->getData("cuda") == nullptr);
+      //printf("is vec_result NULL? %d\n", vec_result->getData("cuda") == nullptr);
+      cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F);
+
+
+      cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY();
+      cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F);
+
+      cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor();
+
+      void* buffer_spmv = workspaceCUDA->getSpmvBuffer();
+      cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle();
+      if (values_changed_) { 
+        status = cusparseCreateCsr(&matA, 
+                                    A->getNumRows(),
+                                    A->getNumColumns(),
+                                    A->getNnzExpanded(),
+                                    A->getRowData("cuda"),
+                                    A->getColData("cuda"),
+                                    A->getValues("cuda"), 
+                                    CUSPARSE_INDEX_32I, 
+                                    CUSPARSE_INDEX_32I,
+                                    CUSPARSE_INDEX_BASE_ZERO,
+                                    CUDA_R_64F);
+        error_sum += status;
+        values_changed_ = false;
+      }
+      if (!workspaceCUDA->matvecSetup()) {
+        //setup first, allocate, etc.
+        size_t bufferSize = 0;
+
+        status = cusparseSpMV_bufferSize(handle_cusparse, 
+                                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         &MINUSONE,
+                                         matA,
+                                         vecx,
+                                         &ONE,
+                                         vecAx,
+                                         CUDA_R_64F,
+                                         CUSPARSE_SPMV_CSR_ALG2, 
+                                         &bufferSize);
+        error_sum += status;
+        mem_.deviceSynchronize();
+        mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize);
+        workspaceCUDA->setSpmvMatrixDescriptor(matA);
+        workspaceCUDA->setSpmvBuffer(buffer_spmv);
+
+        workspaceCUDA->matvecSetupDone();
+      } 
+
+      status = cusparseSpMV(handle_cusparse,
+                            CUSPARSE_OPERATION_NON_TRANSPOSE,       
+                            alpha, 
+                            matA, 
+                            vecx, 
+                            beta, 
+                            vecAx, 
+                            CUDA_R_64F,
+                            CUSPARSE_SPMV_CSR_ALG2, 
+                            buffer_spmv);
+      error_sum += status;
+      mem_.deviceSynchronize();
+      if (status)
+        out::error() << "Matvec status: " << status 
+                      << "Last error code: " << mem_.getLastDeviceError() << std::endl;
+      vec_result->setDataUpdated("cuda");
+
+      cusparseDestroyDnVec(vecx);
+      cusparseDestroyDnVec(vecAx);
+      return error_sum;
+    } else {
+      out::error() << "MatVec not implemented (yet) for " 
+                   << matrixFormat << " matrix format." << std::endl;
+      return 1;
+    }
+  }
+
+  int MatrixHandlerCuda::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */)
+  {
+    return -1;
+  }
+
+  // int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
+  // {
+  //   //it ONLY WORKS WITH CUDA
+  //   index_type error_sum = 0;
+  //   if (memspace == "cuda") { 
+  //     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+
+  //     A_csr->allocateMatrixData("cuda");
+  //     index_type n = A_csc->getNumRows();
+  //     index_type m = A_csc->getNumRows();
+  //     index_type nnz = A_csc->getNnz();
+  //     size_t bufferSize;
+  //     void* d_work;
+  //     cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
+  //                                                             n, 
+  //                                                             m, 
+  //                                                             nnz, 
+  //                                                             A_csc->getValues("cuda"), 
+  //                                                             A_csc->getColData("cuda"), 
+  //                                                             A_csc->getRowData("cuda"), 
+  //                                                             A_csr->getValues("cuda"), 
+  //                                                             A_csr->getRowData("cuda"),
+  //                                                             A_csr->getColData("cuda"), 
+  //                                                             CUDA_R_64F, 
+  //                                                             CUSPARSE_ACTION_NUMERIC,
+  //                                                             CUSPARSE_INDEX_BASE_ZERO, 
+  //                                                             CUSPARSE_CSR2CSC_ALG1, 
+  //                                                             &bufferSize);
+  //     error_sum += status;
+  //     mem_.allocateBufferOnDevice(&d_work, bufferSize);
+  //     status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
+  //                                 n, 
+  //                                 m, 
+  //                                 nnz, 
+  //                                 A_csc->getValues("cuda"), 
+  //                                 A_csc->getColData("cuda"), 
+  //                                 A_csc->getRowData("cuda"), 
+  //                                 A_csr->getValues("cuda"), 
+  //                                 A_csr->getRowData("cuda"),
+  //                                 A_csr->getColData("cuda"), 
+  //                                 CUDA_R_64F,
+  //                                 CUSPARSE_ACTION_NUMERIC,
+  //                                 CUSPARSE_INDEX_BASE_ZERO,
+  //                                 CUSPARSE_CSR2CSC_ALG1,
+  //                                 d_work);
+  //     error_sum += status;
+  //     return error_sum;
+  //     mem_.deleteOnDevice(d_work);
+  //   } else { 
+  //     out::error() << "Not implemented (yet)" << std::endl;
+  //     return -1;
+  //   } 
+
+
+  // }
+
+} // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
new file mode 100644
index 00000000..1e411ad1
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -0,0 +1,66 @@
+// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
+// this includes 
+// (1) Matrix format conversion: coo2csr, csr2csc
+// (2) Matrix vector product (SpMV)
+// (3) Matrix 1-norm
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/matrix/MatrixHandlerImpl.hpp>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  namespace matrix
+  {
+    class Sparse;
+    class Coo;
+    class Csc;
+    class Csr;
+  }
+  class LinAlgWorkspaceCUDA;
+}
+
+
+namespace ReSolve {
+  /**
+   * @class MatrixHandlerCuda
+   * 
+   * @brief CPU implementation of the matrix handler.
+   */
+  class MatrixHandlerCuda : public MatrixHandlerImpl
+  {
+    using vector_type = vector::Vector;
+    
+    public:
+      MatrixHandlerCuda();
+      MatrixHandlerCuda(LinAlgWorkspace* workspace);
+      virtual ~MatrixHandlerCuda();
+
+      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
+
+      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
+      virtual int matvec(matrix::Sparse* A,
+                 vector_type* vec_x,
+                 vector_type* vec_result,
+                 const real_type* alpha,
+                 const real_type* beta,
+                 std::string matrix_type);
+      virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
+      // bool getValuesChanged();
+      // void setValuesChanged(bool toWhat); 
+    
+    private: 
+      LinAlgWorkspace* workspace_{nullptr};
+      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
+      // bool values_changed_{true}; ///< needed for matvec
+
+      MemoryHandler mem_; ///< Device memory manager object
+  };
+
+} // namespace ReSolve
+
diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp
new file mode 100644
index 00000000..5a1d6b6f
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerImpl.hpp
@@ -0,0 +1,75 @@
+// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
+// this includes 
+// (1) Matrix format conversion: coo2csr, csr2csc
+// (2) Matrix vector product (SpMV)
+// (3) Matrix 1-norm
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  namespace matrix
+  {
+    class Sparse;
+    class Coo;
+    class Csc;
+    class Csr;
+  }
+  // class LinAlgWorkspace;
+}
+
+
+namespace ReSolve {
+  /**
+   * @class MatrixHandlerImpl
+   * 
+   * @brief Base class for different matrix handler implementations.
+   */
+  class MatrixHandlerImpl
+  {
+    using vector_type = vector::Vector;
+    
+    public:
+      MatrixHandlerImpl()
+      {}
+      // MatrixHandlerImpl(LinAlgWorkspace* workspace);
+      virtual ~MatrixHandlerImpl()
+      {}
+
+      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
+
+      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
+      virtual int matvec(matrix::Sparse* A,
+                 vector_type* vec_x,
+                 vector_type* vec_result,
+                 const real_type* alpha,
+                 const real_type* beta,
+                 std::string matrix_type) = 0;
+      virtual int Matrix1Norm(matrix::Sparse* A, real_type* norm) = 0;
+
+      bool isValuesChanged()
+      {
+        return values_changed_;
+      }
+
+      void setValuesChanged(bool isValuesChanged)
+      {
+        values_changed_ = isValuesChanged;
+      } 
+    
+    protected: 
+      // LinAlgWorkspace* workspace_{nullptr};
+      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
+      bool values_changed_{true}; ///< needed for matvec
+
+      // MemoryHandler mem_; ///< Device memory manager object
+  };
+
+} // namespace ReSolve
+

From 6ad87aad21a25ecacede4f42c501ac1bdc031a9f Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Mon, 16 Oct 2023 21:45:48 -0400
Subject: [PATCH 05/15] Separate CPU and CUDA mmethods for csc2csr conversion.

---
 examples/r_KLU_KLU.cpp               |   2 +-
 resolve/matrix/CMakeLists.txt        |   1 -
 resolve/matrix/MatrixHandler.cpp     |  62 ++-----------
 resolve/matrix/MatrixHandler.hpp     |   3 +-
 resolve/matrix/MatrixHandlerCpu.cpp  | 130 ++++++++++++++++-----------
 resolve/matrix/MatrixHandlerCpu.hpp  |   3 +-
 resolve/matrix/MatrixHandlerCuda.cpp | 123 ++++++++++++-------------
 resolve/matrix/MatrixHandlerCuda.hpp |   3 +-
 resolve/matrix/MatrixHandlerImpl.hpp |   2 +-
 9 files changed, 146 insertions(+), 183 deletions(-)

diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index b1e87203..85c84921 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -132,7 +132,7 @@ int main(int argc, char *argv[])
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
     }
-    // vec_r->update(rhs, "cpu", "cuda");
+    vec_r->update(rhs, "cpu", "cpu");
 
     matrix_handler->setValuesChanged(true);
 
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 8c74ad09..5d05df1b 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -26,7 +26,6 @@ set(Matrix_HEADER_INSTALL
     Csr.hpp
     Csc.hpp
     MatrixHandler.hpp
-    MatrixHandlerImpl.hpp
 )
 
 # Build shared library ReSolve::matrix
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 30c08991..7005046d 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -66,12 +66,6 @@ namespace ReSolve {
     cudaImpl_ = new MatrixHandlerCuda(new_workspace);
   }
 
-  bool MatrixHandler::getValuesChanged()
-  {
-    values_changed_ = cpuImpl_->isValuesChanged();
-    return this->values_changed_;
-  }
-
   void MatrixHandler::setValuesChanged(bool toWhat)
   {
     this->values_changed_ = toWhat;
@@ -269,58 +263,16 @@ namespace ReSolve {
 
   int MatrixHandler::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
   {
-    //it ONLY WORKS WITH CUDA
     index_type error_sum = 0;
     if (memspace == "cuda") { 
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-
-      A_csr->allocateMatrixData("cuda");
-      index_type n = A_csc->getNumRows();
-      index_type m = A_csc->getNumRows();
-      index_type nnz = A_csc->getNnz();
-      size_t bufferSize;
-      void* d_work;
-      cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
-                                                              n, 
-                                                              m, 
-                                                              nnz, 
-                                                              A_csc->getValues("cuda"), 
-                                                              A_csc->getColData("cuda"), 
-                                                              A_csc->getRowData("cuda"), 
-                                                              A_csr->getValues("cuda"), 
-                                                              A_csr->getRowData("cuda"),
-                                                              A_csr->getColData("cuda"), 
-                                                              CUDA_R_64F, 
-                                                              CUSPARSE_ACTION_NUMERIC,
-                                                              CUSPARSE_INDEX_BASE_ZERO, 
-                                                              CUSPARSE_CSR2CSC_ALG1, 
-                                                              &bufferSize);
-      error_sum += status;
-      mem_.allocateBufferOnDevice(&d_work, bufferSize);
-      status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
-                                  n, 
-                                  m, 
-                                  nnz, 
-                                  A_csc->getValues("cuda"), 
-                                  A_csc->getColData("cuda"), 
-                                  A_csc->getRowData("cuda"), 
-                                  A_csr->getValues("cuda"), 
-                                  A_csr->getRowData("cuda"),
-                                  A_csr->getColData("cuda"), 
-                                  CUDA_R_64F,
-                                  CUSPARSE_ACTION_NUMERIC,
-                                  CUSPARSE_INDEX_BASE_ZERO,
-                                  CUSPARSE_CSR2CSC_ALG1,
-                                  d_work);
-      error_sum += status;
-      return error_sum;
-      mem_.deleteOnDevice(d_work);
-    } else { 
-      out::error() << "Not implemented (yet)" << std::endl;
+      return cudaImpl_->csc2csr(A_csc, A_csr);
+    } else if (memspace == "cpu") { 
+      out::warning() << "Using untested csc2csr on CPU ..." << std::endl;
+      return cpuImpl_->csc2csr(A_csc, A_csr);
+    } else {
+      out::error() << "csc2csr not implemented for " << memspace << " device." << std::endl;
       return -1;
-    } 
-
-
+    }
   }
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index ea6abbcb..af444abd 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -7,7 +7,6 @@
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
 
-#include <resolve/matrix/MatrixHandlerImpl.hpp>
 
 namespace ReSolve
 { 
@@ -23,6 +22,7 @@ namespace ReSolve
     class Csr;
   }
   class LinAlgWorkspace;
+  class MatrixHandlerImpl;
 }
 
 
@@ -71,7 +71,6 @@ namespace ReSolve {
                  std::string matrix_type,
                  std::string memspace);
       int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      bool getValuesChanged();
       void setValuesChanged(bool toWhat); 
     
     private: 
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index 5d61501d..d06a4ce5 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cassert>
 
 #include <resolve/utilities/logger/Logger.hpp>
 #include <resolve/vector/Vector.hpp>
@@ -279,60 +280,83 @@ namespace ReSolve {
     return -1;
   }
 
-  // int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
-  // {
-  //   //it ONLY WORKS WITH CUDA
-  //   index_type error_sum = 0;
-  //   if (memspace == "cuda") { 
-  //     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-
-  //     A_csr->allocateMatrixData("cuda");
-  //     index_type n = A_csc->getNumRows();
-  //     index_type m = A_csc->getNumRows();
-  //     index_type nnz = A_csc->getNnz();
-  //     size_t bufferSize;
-  //     void* d_work;
-  //     cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
-  //                                                             n, 
-  //                                                             m, 
-  //                                                             nnz, 
-  //                                                             A_csc->getValues("cuda"), 
-  //                                                             A_csc->getColData("cuda"), 
-  //                                                             A_csc->getRowData("cuda"), 
-  //                                                             A_csr->getValues("cuda"), 
-  //                                                             A_csr->getRowData("cuda"),
-  //                                                             A_csr->getColData("cuda"), 
-  //                                                             CUDA_R_64F, 
-  //                                                             CUSPARSE_ACTION_NUMERIC,
-  //                                                             CUSPARSE_INDEX_BASE_ZERO, 
-  //                                                             CUSPARSE_CSR2CSC_ALG1, 
-  //                                                             &bufferSize);
-  //     error_sum += status;
-  //     mem_.allocateBufferOnDevice(&d_work, bufferSize);
-  //     status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
-  //                                 n, 
-  //                                 m, 
-  //                                 nnz, 
-  //                                 A_csc->getValues("cuda"), 
-  //                                 A_csc->getColData("cuda"), 
-  //                                 A_csc->getRowData("cuda"), 
-  //                                 A_csr->getValues("cuda"), 
-  //                                 A_csr->getRowData("cuda"),
-  //                                 A_csr->getColData("cuda"), 
-  //                                 CUDA_R_64F,
-  //                                 CUSPARSE_ACTION_NUMERIC,
-  //                                 CUSPARSE_INDEX_BASE_ZERO,
-  //                                 CUSPARSE_CSR2CSC_ALG1,
-  //                                 d_work);
-  //     error_sum += status;
-  //     return error_sum;
-  //     mem_.deleteOnDevice(d_work);
-  //   } else { 
-  //     out::error() << "Not implemented (yet)" << std::endl;
-  //     return -1;
-  //   } 
+  /**
+   * @authors Slaven Peles <peless@ornl.gov>, Daniel Reynolds (SMU), and
+   * David Gardner and Carol Woodward (LLNL)
+   */
+  int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr)
+  {
+    int error_sum = 0;
+    assert(A_csc->getNnz() == A_csr->getNnz());
+    assert(A_csc->getNumRows() == A_csr->getNumColumns());
+    assert(A_csr->getNumRows() == A_csc->getNumColumns());
 
+    index_type nnz = A_csc->getNnz();
+    index_type n   = A_csc->getNumColumns();
 
-  // }
+    index_type* rowIdxCsc = A_csc->getRowData("cpu");
+    index_type* colPtrCsc = A_csc->getColData("cpu");
+    real_type*  valuesCsc = A_csc->getValues("cpu");
+
+    index_type* rowPtrCsr = A_csr->getRowData("cpu");
+    index_type* colIdxCsr = A_csr->getColData("cpu");
+    real_type*  valuesCsr = A_csr->getValues("cpu");
+
+    // Set all CSR row pointers to zero
+    for (index_type i = 0; i <= n; ++i) {
+      rowPtrCsr[i] = 0;
+    }
+
+    // Set all CSR values and column indices to zero
+    for (index_type i = 0; i < nnz; ++i) {
+      colIdxCsr[i] = 0;
+      valuesCsr[i] = 0.0;
+    }
+
+    // Compute number of entries per row
+    for (index_type i = 0; i < nnz; ++i) {
+      rowPtrCsr[rowIdxCsc[i]]++;
+    }
+
+    // Compute cumualtive sum of nnz per row
+    for (index_type row = 0, rowsum = 0; row < n; ++row)
+    {
+      // Store value in row pointer to temp
+      index_type temp  = rowPtrCsr[row];
+
+      // Copy cumulative sum to the row pointer
+      rowPtrCsr[row] = rowsum;
+
+      // Update row sum
+      rowsum += temp;
+    }
+    rowPtrCsr[n] = nnz;
+
+    for (index_type col = 0; col < n; ++col)
+    {
+      // Compute positions of column indices and values in CSR matrix and store them there
+      // Overwrites CSR row pointers in the process
+      for (index_type jj = colPtrCsc[col]; jj < colPtrCsc[col+1]; jj++)
+      {
+          index_type row  = rowIdxCsc[jj];
+          index_type dest = rowPtrCsr[row];
+
+          colIdxCsr[dest] = col;
+          valuesCsr[dest] = valuesCsc[jj];
+
+          rowPtrCsr[row]++;
+      }
+    }
+
+    // Restore CSR row pointer values
+    for (index_type row = 0, last = 0; row <= n; row++)
+    {
+        index_type temp  = rowPtrCsr[row];
+        rowPtrCsr[row] = last;
+        last    = temp;
+    }
+
+    return 0;
+  }
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
index e4de02bf..60079a89 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -40,7 +40,7 @@ namespace ReSolve {
       MatrixHandlerCpu(LinAlgWorkspace* workspace);
       virtual ~MatrixHandlerCpu();
 
-      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
       // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
 
       /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
@@ -51,7 +51,6 @@ namespace ReSolve {
                  const real_type* beta,
                  std::string matrix_type);
       virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      // bool getValuesChanged();
       // void setValuesChanged(bool toWhat); 
     
     private: 
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index bd151cbf..31515398 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -241,7 +241,6 @@ namespace ReSolve {
     if (matrixFormat == "csr") {
       matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric);
       //result = alpha *A*x + beta * result
-      // if (memspace == "cuda" ) {
       cusparseStatus_t status;
       // std::cout << "Matvec on NVIDIA GPU ...\n";
       LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
@@ -260,16 +259,16 @@ namespace ReSolve {
       cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle();
       if (values_changed_) { 
         status = cusparseCreateCsr(&matA, 
-                                    A->getNumRows(),
-                                    A->getNumColumns(),
-                                    A->getNnzExpanded(),
-                                    A->getRowData("cuda"),
-                                    A->getColData("cuda"),
-                                    A->getValues("cuda"), 
-                                    CUSPARSE_INDEX_32I, 
-                                    CUSPARSE_INDEX_32I,
-                                    CUSPARSE_INDEX_BASE_ZERO,
-                                    CUDA_R_64F);
+                                   A->getNumRows(),
+                                   A->getNumColumns(),
+                                   A->getNnzExpanded(),
+                                   A->getRowData("cuda"),
+                                   A->getColData("cuda"),
+                                   A->getValues("cuda"), 
+                                   CUSPARSE_INDEX_32I, 
+                                   CUSPARSE_INDEX_32I,
+                                   CUSPARSE_INDEX_BASE_ZERO,
+                                   CUDA_R_64F);
         error_sum += status;
         values_changed_ = false;
       }
@@ -328,60 +327,52 @@ namespace ReSolve {
     return -1;
   }
 
-  // int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
-  // {
-  //   //it ONLY WORKS WITH CUDA
-  //   index_type error_sum = 0;
-  //   if (memspace == "cuda") { 
-  //     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-
-  //     A_csr->allocateMatrixData("cuda");
-  //     index_type n = A_csc->getNumRows();
-  //     index_type m = A_csc->getNumRows();
-  //     index_type nnz = A_csc->getNnz();
-  //     size_t bufferSize;
-  //     void* d_work;
-  //     cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
-  //                                                             n, 
-  //                                                             m, 
-  //                                                             nnz, 
-  //                                                             A_csc->getValues("cuda"), 
-  //                                                             A_csc->getColData("cuda"), 
-  //                                                             A_csc->getRowData("cuda"), 
-  //                                                             A_csr->getValues("cuda"), 
-  //                                                             A_csr->getRowData("cuda"),
-  //                                                             A_csr->getColData("cuda"), 
-  //                                                             CUDA_R_64F, 
-  //                                                             CUSPARSE_ACTION_NUMERIC,
-  //                                                             CUSPARSE_INDEX_BASE_ZERO, 
-  //                                                             CUSPARSE_CSR2CSC_ALG1, 
-  //                                                             &bufferSize);
-  //     error_sum += status;
-  //     mem_.allocateBufferOnDevice(&d_work, bufferSize);
-  //     status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
-  //                                 n, 
-  //                                 m, 
-  //                                 nnz, 
-  //                                 A_csc->getValues("cuda"), 
-  //                                 A_csc->getColData("cuda"), 
-  //                                 A_csc->getRowData("cuda"), 
-  //                                 A_csr->getValues("cuda"), 
-  //                                 A_csr->getRowData("cuda"),
-  //                                 A_csr->getColData("cuda"), 
-  //                                 CUDA_R_64F,
-  //                                 CUSPARSE_ACTION_NUMERIC,
-  //                                 CUSPARSE_INDEX_BASE_ZERO,
-  //                                 CUSPARSE_CSR2CSC_ALG1,
-  //                                 d_work);
-  //     error_sum += status;
-  //     return error_sum;
-  //     mem_.deleteOnDevice(d_work);
-  //   } else { 
-  //     out::error() << "Not implemented (yet)" << std::endl;
-  //     return -1;
-  //   } 
-
-
-  // }
+  int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr)
+  {
+    index_type error_sum = 0;
+    LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+
+    A_csr->allocateMatrixData("cuda");
+    index_type n = A_csc->getNumRows();
+    index_type m = A_csc->getNumRows();
+    index_type nnz = A_csc->getNnz();
+    size_t bufferSize;
+    void* d_work;
+    cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(),
+                                                            n, 
+                                                            m, 
+                                                            nnz, 
+                                                            A_csc->getValues("cuda"), 
+                                                            A_csc->getColData("cuda"), 
+                                                            A_csc->getRowData("cuda"), 
+                                                            A_csr->getValues("cuda"), 
+                                                            A_csr->getRowData("cuda"),
+                                                            A_csr->getColData("cuda"), 
+                                                            CUDA_R_64F, 
+                                                            CUSPARSE_ACTION_NUMERIC,
+                                                            CUSPARSE_INDEX_BASE_ZERO, 
+                                                            CUSPARSE_CSR2CSC_ALG1, 
+                                                            &bufferSize);
+    error_sum += status;
+    mem_.allocateBufferOnDevice(&d_work, bufferSize);
+    status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
+                                n, 
+                                m, 
+                                nnz, 
+                                A_csc->getValues("cuda"), 
+                                A_csc->getColData("cuda"), 
+                                A_csc->getRowData("cuda"), 
+                                A_csr->getValues("cuda"), 
+                                A_csr->getRowData("cuda"),
+                                A_csr->getColData("cuda"), 
+                                CUDA_R_64F,
+                                CUSPARSE_ACTION_NUMERIC,
+                                CUSPARSE_INDEX_BASE_ZERO,
+                                CUSPARSE_CSR2CSC_ALG1,
+                                d_work);
+    error_sum += status;
+    return error_sum;
+    mem_.deleteOnDevice(d_work);
+  }
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
index 1e411ad1..ae0c3709 100644
--- a/resolve/matrix/MatrixHandlerCuda.hpp
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -40,7 +40,7 @@ namespace ReSolve {
       MatrixHandlerCuda(LinAlgWorkspace* workspace);
       virtual ~MatrixHandlerCuda();
 
-      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
       // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
 
       /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
@@ -51,7 +51,6 @@ namespace ReSolve {
                  const real_type* beta,
                  std::string matrix_type);
       virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      // bool getValuesChanged();
       // void setValuesChanged(bool toWhat); 
     
     private: 
diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp
index 5a1d6b6f..ba24d498 100644
--- a/resolve/matrix/MatrixHandlerImpl.hpp
+++ b/resolve/matrix/MatrixHandlerImpl.hpp
@@ -41,7 +41,7 @@ namespace ReSolve {
       virtual ~MatrixHandlerImpl()
       {}
 
-      // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      virtual int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) = 0;
       // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
 
       /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped

From 23edbf7548b96e4e1c27cddb3cdb2f86e32745ff Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Mon, 16 Oct 2023 22:31:37 -0400
Subject: [PATCH 06/15] Method setValuesChanged in MatrixHandler now requires
 memory space as input.

---
 examples/r_KLU_GLU.cpp                        |  2 +-
 examples/r_KLU_GLU_matrix_values_update.cpp   |  2 +-
 examples/r_KLU_KLU.cpp                        |  2 +-
 examples/r_KLU_KLU_standalone.cpp             |  2 +-
 examples/r_KLU_rf.cpp                         |  2 +-
 examples/r_KLU_rf_FGMRES.cpp                  |  6 ++---
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |  6 ++---
 resolve/LinSolverIterativeFGMRES.cpp          |  2 +-
 resolve/matrix/MatrixHandler.cpp              | 12 ++++++----
 resolve/matrix/MatrixHandler.hpp              |  5 ++---
 resolve/matrix/MatrixHandlerCpu.cpp           |  8 +++----
 resolve/matrix/MatrixHandlerCpu.hpp           |  4 ++--
 resolve/matrix/MatrixHandlerCuda.cpp          |  8 +++----
 resolve/matrix/MatrixHandlerCuda.hpp          |  4 ++--
 resolve/matrix/MatrixHandlerImpl.hpp          | 22 ++++++-------------
 tests/functionality/testKLU.cpp               |  4 ++--
 tests/functionality/testKLU_GLU.cpp           |  4 ++--
 tests/functionality/testKLU_Rf.cpp            |  6 ++---
 tests/functionality/testKLU_Rf_FGMRES.cpp     |  4 ++--
 tests/unit/matrix/MatrixHandlerTests.hpp      |  2 +-
 20 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index 82c949db..c072eb34 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -145,7 +145,7 @@ int main(int argc, char *argv[])
     vec_r->update(rhs, "cpu", "cuda");
 
 
-    matrix_handler->setValuesChanged(true);
+    matrix_handler->setValuesChanged(true, "cuda");
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
     printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index 19d7ec3c..af797883 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -155,7 +155,7 @@ int main(int argc, char *argv[])
       vec_r->update(rhs, "cpu", "cuda");
 
 
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
       printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 85c84921..e777833a 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -134,7 +134,7 @@ int main(int argc, char *argv[])
     }
     vec_r->update(rhs, "cpu", "cpu");
 
-    matrix_handler->setValuesChanged(true);
+    matrix_handler->setValuesChanged(true, "cpu");
 
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
     real_type* test = vec_r->getData("cpu");
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index f0b7c3a4..daeae43a 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -96,7 +96,7 @@ int main(int argc, char *argv[])
   std::cout << "KLU solve status: " << status << std::endl;      
   vec_r->update(rhs, "cpu", "cpu");
 
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cpu");
 
   matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   real_type* test = vec_r->getData("cpu");
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index f2e96dfe..8954724d 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -158,7 +158,7 @@ int main(int argc, char *argv[] )
     }
     vec_r->update(rhs, "cpu", "cuda");
 
-    matrix_handler->setValuesChanged(true);
+    matrix_handler->setValuesChanged(true, "cuda");
 
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index d9543895..52f54a0d 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -125,7 +125,7 @@ int main(int argc, char *argv[])
     real_type norm_b;
     if (i < 2){
       KLU->setup(A);
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       status = KLU->analyze();
       std::cout<<"KLU analysis status: "<<status<<std::endl;
       status = KLU->factorize();
@@ -135,7 +135,7 @@ int main(int argc, char *argv[])
       vec_r->update(rhs, "cpu", "cuda");
       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
       printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b);
       if (i == 1) {
@@ -165,7 +165,7 @@ int main(int argc, char *argv[])
        norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
 
-      //matrix_handler->setValuesChanged(true);
+      //matrix_handler->setValuesChanged(true, "cuda");
       FGMRES->resetMatrix(A);
       FGMRES->setupPreconditioner("CuSolverRf", Rf);
       
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index d50bdb4a..2c484926 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -126,7 +126,7 @@ int main(int argc, char *argv[])
     real_type norm_b;
     if (i < 2){
       KLU->setup(A);
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       status = KLU->analyze();
       std::cout<<"KLU analysis status: "<<status<<std::endl;
       status = KLU->factorize();
@@ -136,7 +136,7 @@ int main(int argc, char *argv[])
       vec_r->update(rhs, "cpu", "cuda");
       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
       printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b);
       if (i == 1) {
@@ -178,7 +178,7 @@ int main(int argc, char *argv[])
        norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
 
-      matrix_handler->setValuesChanged(true);
+      matrix_handler->setValuesChanged(true, "cuda");
       FGMRES->resetMatrix(A);
       
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index 2a6da732..bd7dca42 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -308,7 +308,7 @@ namespace ReSolve
   int  LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix)
   {
     A_ = new_matrix;
-    matrix_handler_->setValuesChanged(true);
+    matrix_handler_->setValuesChanged(true, "cuda");
     return 0;
   }
 
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 7005046d..6ebadfa7 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -50,7 +50,6 @@ namespace ReSolve {
   MatrixHandler::MatrixHandler()
   {
     this->new_matrix_ = true;
-    this->values_changed_ = true;
     cpuImpl_  = new MatrixHandlerCpu();
     cudaImpl_ = new MatrixHandlerCuda();
   }
@@ -66,10 +65,15 @@ namespace ReSolve {
     cudaImpl_ = new MatrixHandlerCuda(new_workspace);
   }
 
-  void MatrixHandler::setValuesChanged(bool toWhat)
+  void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace)
   {
-    this->values_changed_ = toWhat;
-    cpuImpl_->setValuesChanged(values_changed_);
+    if (memspace == "cpu") {
+      cpuImpl_->setValuesChanged(isValuesChanged);
+    } else if (memspace == "cuda") {
+      cudaImpl_->setValuesChanged(isValuesChanged);
+    } else {
+      out::error() << "Unsupported device " << memspace << "\n";
+    }
   }
 
   int MatrixHandler::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index af444abd..59ace249 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -71,15 +71,14 @@ namespace ReSolve {
                  std::string matrix_type,
                  std::string memspace);
       int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      void setValuesChanged(bool toWhat); 
+      void setValuesChanged(bool toWhat, std::string memspace); 
     
     private: 
       LinAlgWorkspace* workspace_{nullptr};
       bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
-      bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
-      MatrixHandlerImpl* cpuImpl_{nullptr};
+      MatrixHandlerImpl*  cpuImpl_{nullptr};
       MatrixHandlerImpl* cudaImpl_{nullptr};
   };
 
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index d06a4ce5..522d2a1b 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -65,10 +65,10 @@ namespace ReSolve {
   //   return this->values_changed_;
   // }
 
-  // void MatrixHandlerCpu::setValuesChanged(bool toWhat)
-  // {
-  //   this->values_changed_ = toWhat;
-  // }
+  void MatrixHandlerCpu::setValuesChanged(bool values_changed)
+  {
+    values_changed_ = values_changed;
+  }
 
 //   int MatrixHandlerCpu::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
 //   {
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
index 60079a89..fb760750 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -51,12 +51,12 @@ namespace ReSolve {
                  const real_type* beta,
                  std::string matrix_type);
       virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      // void setValuesChanged(bool toWhat); 
+      void setValuesChanged(bool isValuesChanged); 
     
     private: 
       LinAlgWorkspace* workspace_{nullptr};
       // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
-      // bool values_changed_{true}; ///< needed for matvec
+      bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
   };
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index 31515398..f40de0d8 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -65,10 +65,10 @@ namespace ReSolve {
   //   return this->values_changed_;
   // }
 
-  // void MatrixHandlerCuda::setValuesChanged(bool toWhat)
-  // {
-  //   this->values_changed_ = toWhat;
-  // }
+  void MatrixHandlerCuda::setValuesChanged(bool values_changed)
+  {
+    values_changed_ = values_changed;
+  }
 
 //   int MatrixHandlerCuda::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
 //   {
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
index ae0c3709..f2250008 100644
--- a/resolve/matrix/MatrixHandlerCuda.hpp
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -51,12 +51,12 @@ namespace ReSolve {
                  const real_type* beta,
                  std::string matrix_type);
       virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
-      // void setValuesChanged(bool toWhat); 
+      void setValuesChanged(bool isValuesChanged); 
     
     private: 
       LinAlgWorkspace* workspace_{nullptr};
       // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
-      // bool values_changed_{true}; ///< needed for matvec
+      bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
   };
diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp
index ba24d498..015412d0 100644
--- a/resolve/matrix/MatrixHandlerImpl.hpp
+++ b/resolve/matrix/MatrixHandlerImpl.hpp
@@ -46,27 +46,19 @@ namespace ReSolve {
 
       /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
       virtual int matvec(matrix::Sparse* A,
-                 vector_type* vec_x,
-                 vector_type* vec_result,
-                 const real_type* alpha,
-                 const real_type* beta,
-                 std::string matrix_type) = 0;
+                         vector_type* vec_x,
+                         vector_type* vec_result,
+                         const real_type* alpha,
+                         const real_type* beta,
+                         std::string matrix_type) = 0;
       virtual int Matrix1Norm(matrix::Sparse* A, real_type* norm) = 0;
 
-      bool isValuesChanged()
-      {
-        return values_changed_;
-      }
-
-      void setValuesChanged(bool isValuesChanged)
-      {
-        values_changed_ = isValuesChanged;
-      } 
+      virtual void setValuesChanged(bool isValuesChanged) = 0;
     
     protected: 
       // LinAlgWorkspace* workspace_{nullptr};
       // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
-      bool values_changed_{true}; ///< needed for matvec
+      // bool values_changed_{true}; ///< needed for matvec
 
       // MemoryHandler mem_; ///< Device memory manager object
   };
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index a60083cd..70ee83b0 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -104,7 +104,7 @@ int main(int argc, char *argv[])
   vec_diff->update(x_data, "cpu", "cpu");
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu"));
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cpu");
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); 
   error_sum += status;
   
@@ -174,7 +174,7 @@ int main(int argc, char *argv[])
   error_sum += status;
 
   vec_r->update(rhs, "cpu", "cpu");
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cpu");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   error_sum += status;
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index 60dcce50..1c66fda1 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -127,7 +127,7 @@ int main(int argc, char *argv[])
   vec_diff->update(x_data, "cpu", "cuda");
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cuda");
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); 
   error_sum += status;
   
@@ -198,7 +198,7 @@ int main(int argc, char *argv[])
   error_sum += status;
 
    vec_r->update(rhs, "cpu", "cuda");
-   matrix_handler->setValuesChanged(true);
+   matrix_handler->setValuesChanged(true, "cuda");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
   error_sum += status;
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index 50add7cd..656aa724 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
   vec_diff->update(x_data, "cpu", "cuda");
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cuda");
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); 
   error_sum += status;
   
@@ -194,8 +194,8 @@ int main(int argc, char *argv[])
   status = Rf->solve(vec_rhs, vec_x);
   error_sum += status;
 
-   vec_r->update(rhs, "cpu", "cuda");
-  matrix_handler->setValuesChanged(true);
+  vec_r->update(rhs, "cpu", "cuda");
+  matrix_handler->setValuesChanged(true, "cuda");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
   error_sum += status;
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index 2dae74b5..14bb0f33 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -117,7 +117,7 @@ int main(int argc, char *argv[])
   vec_diff->update(x_data, "cpu", "cuda");
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cuda");
   //evaluate the residual ||b-Ax||
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); 
   error_sum += status;
@@ -221,7 +221,7 @@ int main(int argc, char *argv[])
   error_sum += status;
 
   vec_r->update(rhs, "cpu", "cuda");
-  matrix_handler->setValuesChanged(true);
+  matrix_handler->setValuesChanged(true, "cuda");
 
   //evaluate final residual
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 41df9f4b..16fd3215 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -57,7 +57,7 @@ class MatrixHandlerTests : TestBase
 
     real_type alpha = 2.0/30.0;
     real_type beta  = 2.0;
-    handler.setValuesChanged(true);
+    handler.setValuesChanged(true, memspace_);
     handler.matvec(A, &x, &y, &alpha, &beta, "csr", memspace_);
 
     status *= verifyAnswer(y, 4.0, memspace_);

From edc7e4c3ba3f5b795bfee73da5cb96718fb89b73 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Tue, 17 Oct 2023 11:53:28 -0400
Subject: [PATCH 07/15] Take helper class for index-value pairs outside
 MatrixHandler sources.

---
 resolve/matrix/MatrixHandler.cpp          | 36 ++-----------------
 resolve/matrix/MatrixHandler.hpp          | 22 ------------
 resolve/matrix/MatrixHandlerCpu.cpp       | 35 +-----------------
 resolve/matrix/MatrixHandlerCuda.cpp      | 35 +-----------------
 resolve/utilities/misc/IndexValuePair.hpp | 43 +++++++++++++++++++++++
 5 files changed, 47 insertions(+), 124 deletions(-)
 create mode 100644 resolve/utilities/misc/IndexValuePair.hpp

diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 6ebadfa7..c605ce5a 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -6,6 +6,7 @@
 #include <resolve/matrix/Csc.hpp>
 #include <resolve/matrix/Csr.hpp>
 #include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/utilities/misc/IndexValuePair.hpp>
 #include "MatrixHandler.hpp"
 #include "MatrixHandlerCpu.hpp"
 #include "MatrixHandlerCuda.hpp"
@@ -14,39 +15,6 @@ namespace ReSolve {
   // Create a shortcut name for Logger static class
   using out = io::Logger;
 
-  //helper class
-  indexPlusValue::indexPlusValue()
-  {
-    idx_ = 0;
-    value_ = 0.0;
-  }
-
-
-  indexPlusValue::~indexPlusValue()
-  {
-  }
-
-  void indexPlusValue::setIdx(index_type new_idx)
-  {
-    idx_ = new_idx;
-  }
-
-  void indexPlusValue::setValue(real_type new_value)
-  {
-    value_ = new_value;
-  }
-
-  index_type indexPlusValue::getIdx()
-  {
-    return idx_;
-  }
-
-  real_type indexPlusValue::getValue()
-  {
-    return value_;
-  }
-  //end of helper class
-
   MatrixHandler::MatrixHandler()
   {
     this->new_matrix_ = true;
@@ -131,7 +99,7 @@ namespace ReSolve {
     index_type* nnz_shifts = new index_type[n];
     std::fill_n(nnz_shifts, n , 0);
 
-    indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; 
+    IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; 
 
     csr_ia[0] = 0;
 
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index 59ace249..7ba01a08 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -28,28 +28,6 @@ namespace ReSolve
 
 namespace ReSolve {
 
-  //helper class
-  class indexPlusValue
-  {
-    public:
-      indexPlusValue();
-      ~indexPlusValue();
-      void setIdx (index_type new_idx);
-      void setValue (real_type new_value);
-
-      index_type getIdx();
-      real_type getValue();
-
-      bool operator < (const indexPlusValue& str) const
-      {
-        return (idx_ < str.idx_);
-      }  
-
-    private:
-      index_type idx_;
-      real_type value_;
-  };
-
   class MatrixHandler
   {
     using vector_type = vector::Vector;
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index 522d2a1b..cc211f5f 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -12,39 +12,6 @@ namespace ReSolve {
   // Create a shortcut name for Logger static class
   using out = io::Logger;
 
-  // //helper class
-  // indexPlusValue::indexPlusValue()
-  // {
-  //   idx_ = 0;
-  //   value_ = 0.0;
-  // }
-
-
-  // indexPlusValue::~indexPlusValue()
-  // {
-  // }
-
-  // void indexPlusValue::setIdx(index_type new_idx)
-  // {
-  //   idx_ = new_idx;
-  // }
-
-  // void indexPlusValue::setValue(real_type new_value)
-  // {
-  //   value_ = new_value;
-  // }
-
-  // index_type indexPlusValue::getIdx()
-  // {
-  //   return idx_;
-  // }
-
-  // real_type indexPlusValue::getValue()
-  // {
-  //   return value_;
-  // }
-  // //end of helper class
-
   MatrixHandlerCpu::MatrixHandlerCpu()
   {
     // new_matrix_ = true;
@@ -125,7 +92,7 @@ namespace ReSolve {
 //     index_type* nnz_shifts = new index_type[n];
 //     std::fill_n(nnz_shifts, n , 0);
 
-//     indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; 
+//     IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; 
 
 //     csr_ia[0] = 0;
 
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index f40de0d8..deb09180 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -12,39 +12,6 @@ namespace ReSolve {
   // Create a shortcut name for Logger static class
   using out = io::Logger;
 
-  // //helper class
-  // indexPlusValue::indexPlusValue()
-  // {
-  //   idx_ = 0;
-  //   value_ = 0.0;
-  // }
-
-
-  // indexPlusValue::~indexPlusValue()
-  // {
-  // }
-
-  // void indexPlusValue::setIdx(index_type new_idx)
-  // {
-  //   idx_ = new_idx;
-  // }
-
-  // void indexPlusValue::setValue(real_type new_value)
-  // {
-  //   value_ = new_value;
-  // }
-
-  // index_type indexPlusValue::getIdx()
-  // {
-  //   return idx_;
-  // }
-
-  // real_type indexPlusValue::getValue()
-  // {
-  //   return value_;
-  // }
-  // //end of helper class
-
   MatrixHandlerCuda::MatrixHandlerCuda()
   {
     // new_matrix_ = true;
@@ -125,7 +92,7 @@ namespace ReSolve {
 //     index_type* nnz_shifts = new index_type[n];
 //     std::fill_n(nnz_shifts, n , 0);
 
-//     indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; 
+//     IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; 
 
 //     csr_ia[0] = 0;
 
diff --git a/resolve/utilities/misc/IndexValuePair.hpp b/resolve/utilities/misc/IndexValuePair.hpp
new file mode 100644
index 00000000..b09ccf97
--- /dev/null
+++ b/resolve/utilities/misc/IndexValuePair.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+
+namespace ReSolve {
+
+  /// @brief Helper class for COO matrix sorting
+  class IndexValuePair
+  {
+    public:
+      IndexValuePair() : idx_(0), value_(0.0)
+      {}
+      ~IndexValuePair()
+      {}
+      void setIdx (index_type new_idx)
+      {
+        idx_ = new_idx;
+      }
+      void setValue (real_type new_value)
+      {
+        value_ = new_value;
+      }
+
+      index_type getIdx()
+      {
+        return idx_;
+      }
+      real_type getValue()
+      {
+        return value_;
+      }
+
+      bool operator < (const IndexValuePair& str) const
+      {
+        return (idx_ < str.idx_);
+      }  
+
+    private:
+      index_type idx_;
+      real_type value_;
+  };
+
+} // namespace ReSolve
+

From 678a3244d0e424152bb184f44b33e80971ffebf7 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Wed, 18 Oct 2023 19:25:51 -0400
Subject: [PATCH 08/15] Complete PIMPL implementation for MatrixHandler.

---
 examples/r_KLU_KLU.cpp                    |   6 +-
 examples/r_KLU_KLU_standalone.cpp         |   6 +-
 resolve/LinSolverIterativeFGMRES.cpp      |   4 +-
 resolve/matrix/MatrixHandler.cpp          |  39 ++++-
 resolve/matrix/MatrixHandler.hpp          |  23 ++-
 resolve/matrix/MatrixHandlerCuda.cpp      | 186 ++--------------------
 resolve/matrix/MatrixHandlerCuda.hpp      |   3 +-
 tests/functionality/testKLU.cpp           |   9 +-
 tests/functionality/testKLU_GLU.cpp       |   2 +-
 tests/functionality/testKLU_Rf.cpp        |   2 +-
 tests/functionality/testKLU_Rf_FGMRES.cpp |  15 +-
 tests/unit/matrix/MatrixHandlerTests.hpp  |  24 ++-
 12 files changed, 117 insertions(+), 202 deletions(-)

diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index e777833a..bee712bd 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -35,9 +35,9 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
+  ReSolve::LinAlgWorkspace* workspace    = new ReSolve::LinAlgWorkspace();
+  ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   real_type* rhs;
   real_type* x;
 
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index daeae43a..a1aa1fad 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -31,9 +31,9 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
+  ReSolve::LinAlgWorkspace* workspace    = new ReSolve::LinAlgWorkspace();
+  ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   real_type* rhs;
   real_type* x;
 
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index bd7dca42..fa63f2d5 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -66,12 +66,12 @@ namespace ReSolve
   {
     if (d_V_ != nullptr) {
       // cudaFree(d_V_);
-      delete [] d_V_;   
+      delete d_V_;   
     }
 
     if (d_Z_ != nullptr) {
       //      cudaFree(d_Z_);
-      delete [] d_Z_;   
+      delete d_Z_;   
     }
 
   }
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index c605ce5a..50dd3082 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -15,6 +15,15 @@ namespace ReSolve {
   // Create a shortcut name for Logger static class
   using out = io::Logger;
 
+  /**
+   * @brief Default constructor
+   * 
+   * @post Instantiates CPU and CUDA matrix handlers, but does not 
+   * create a workspace.
+   * 
+   * @todo There is little utility for the default constructor. Rethink its purpose.
+   * Consider making it private method.
+   */
   MatrixHandler::MatrixHandler()
   {
     this->new_matrix_ = true;
@@ -22,15 +31,43 @@ namespace ReSolve {
     cudaImpl_ = new MatrixHandlerCuda();
   }
 
+  /**
+   * @brief Destructor
+   * 
+   */
   MatrixHandler::~MatrixHandler()
   {
+    if (isCpuEnabled_)  delete cpuImpl_;
+    if (isCudaEnabled_) delete cudaImpl_;
   }
 
+  /**
+   * @brief Constructor taking pointer to the workspace as its parameter.
+   * 
+   * @note The CPU implementation currently does not require a workspace.
+   * The workspace pointer parameter is provided for forward compatibility.
+   */
   MatrixHandler::MatrixHandler(LinAlgWorkspace* new_workspace)
   {
-    workspace_ = new_workspace;
     cpuImpl_  = new MatrixHandlerCpu(new_workspace);
+    isCpuEnabled_  = true;
+    isCudaEnabled_ = false;
+  }
+
+  /**
+   * @brief Constructor taking pointer to the CUDA workspace as its parameter.
+   * 
+   * @post A CPU implementation instance is created because it is cheap and
+   * it does not require a workspace.
+   * 
+   * @post A CUDA implementation instance is created with supplied workspace.
+   */
+  MatrixHandler::MatrixHandler(LinAlgWorkspaceCUDA* new_workspace)
+  {
+    cpuImpl_  = new MatrixHandlerCpu();
     cudaImpl_ = new MatrixHandlerCuda(new_workspace);
+    isCpuEnabled_  = true;
+    isCudaEnabled_ = true;
   }
 
   void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace)
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index 7ba01a08..fed6b593 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -1,8 +1,3 @@
-// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
-// this includes 
-// (1) Matrix format conversion: coo2csr, csr2csc
-// (2) Matrix vector product (SpMV)
-// (3) Matrix 1-norm
 #pragma once
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
@@ -22,12 +17,25 @@ namespace ReSolve
     class Csr;
   }
   class LinAlgWorkspace;
+  class LinAlgWorkspaceCUDA;
   class MatrixHandlerImpl;
 }
 
 
 namespace ReSolve {
 
+  /**
+   * @brief this class encapsulates various matrix manipulation operations, 
+   * commonly required by linear solvers. 
+   * 
+   * This includes:
+   *  - Matrix format conversion: coo2csr, csr2csc
+   *  - Matrix vector product (SpMV)
+   *  - Matrix 1-norm
+   * 
+   * @author Kasia Swirydowicz <kasia.swirydowicz@pnnl.gov>
+   * @author Slaven Peles <peless@ornl.gov>
+   */
   class MatrixHandler
   {
     using vector_type = vector::Vector;
@@ -35,6 +43,7 @@ namespace ReSolve {
     public:
       MatrixHandler();
       MatrixHandler(LinAlgWorkspace* workspace);
+      MatrixHandler(LinAlgWorkspaceCUDA* workspace);
       ~MatrixHandler();
 
       int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
@@ -52,12 +61,14 @@ namespace ReSolve {
       void setValuesChanged(bool toWhat, std::string memspace); 
     
     private: 
-      LinAlgWorkspace* workspace_{nullptr};
       bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
 
       MemoryHandler mem_; ///< Device memory manager object
       MatrixHandlerImpl*  cpuImpl_{nullptr};
       MatrixHandlerImpl* cudaImpl_{nullptr};
+
+      bool isCpuEnabled_{false};
+      bool isCudaEnabled_{false};
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index deb09180..15252d85 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -14,7 +14,6 @@ namespace ReSolve {
 
   MatrixHandlerCuda::MatrixHandlerCuda()
   {
-    // new_matrix_ = true;
     values_changed_ = true;
   }
 
@@ -22,186 +21,28 @@ namespace ReSolve {
   {
   }
 
-  MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspace* new_workspace)
+  MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspaceCUDA* new_workspace)
   {
     workspace_ = new_workspace;
   }
 
-  // bool MatrixHandlerCuda::getValuesChanged()
-  // {
-  //   return this->values_changed_;
-  // }
+  MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspace* new_workspace)
+  {
+    workspace_ = (LinAlgWorkspaceCUDA*) new_workspace;
+  }
 
   void MatrixHandlerCuda::setValuesChanged(bool values_changed)
   {
     values_changed_ = values_changed;
   }
 
-//   int MatrixHandlerCuda::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
-//   {
-//     //this happens on the CPU not on the GPU
-//     //but will return whatever memspace requested.
-
-//     //count nnzs first
-
-//     index_type nnz_unpacked = 0;
-//     index_type nnz = A_coo->getNnz();
-//     index_type n = A_coo->getNumRows();
-//     bool symmetric = A_coo->symmetric();
-//     bool expanded = A_coo->expanded();
-
-//     index_type* nnz_counts =  new index_type[n];
-//     std::fill_n(nnz_counts, n, 0);
-//     index_type* coo_rows = A_coo->getRowData("cpu");
-//     index_type* coo_cols = A_coo->getColData("cpu");
-//     real_type* coo_vals = A_coo->getValues("cpu");
-
-//     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
-//     std::fill_n(diag_control, n, 0);
-//     index_type nnz_unpacked_no_duplicates = 0;
-//     index_type nnz_no_duplicates = nnz;
-
-
-//     //maybe check if they exist?
-//     for (index_type i = 0; i < nnz; ++i)
-//     {
-//       nnz_counts[coo_rows[i]]++;
-//       nnz_unpacked++;
-//       nnz_unpacked_no_duplicates++;
-//       if ((coo_rows[i] != coo_cols[i])&& (symmetric) && (!expanded))
-//       {
-//         nnz_counts[coo_cols[i]]++;
-//         nnz_unpacked++;
-//         nnz_unpacked_no_duplicates++;
-//       }
-//       if (coo_rows[i] == coo_cols[i]){
-//         if (diag_control[coo_rows[i]] > 0) {
-//          //duplicate
-//           nnz_unpacked_no_duplicates--;
-//           nnz_no_duplicates--;
-//         }
-//         diag_control[coo_rows[i]]++;
-//       }
-//     }
-//     A_csr->setExpanded(true);
-//     A_csr->setNnzExpanded(nnz_unpacked_no_duplicates);
-//     index_type* csr_ia = new index_type[n+1];
-//     std::fill_n(csr_ia, n + 1, 0);
-//     index_type* csr_ja = new index_type[nnz_unpacked];
-//     real_type* csr_a = new real_type[nnz_unpacked];
-//     index_type* nnz_shifts = new index_type[n];
-//     std::fill_n(nnz_shifts, n , 0);
-
-//     IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; 
-
-//     csr_ia[0] = 0;
-
-//     for (index_type i = 1; i < n + 1; ++i){
-//       csr_ia[i] = csr_ia[i - 1] + nnz_counts[i - 1] - (diag_control[i-1] - 1);
-//     }
-
-//     int r, start;
-
-
-//     for (index_type i = 0; i < nnz; ++i){
-//       //which row
-//       r = coo_rows[i];
-//       start = csr_ia[r];
-
-//       if ((start + nnz_shifts[r]) > nnz_unpacked) {
-//         out::warning() << "index out of bounds (case 1) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
-//       }
-//       if ((r == coo_cols[i]) && (diag_control[r] > 1)) {//diagonal, and there are duplicates
-//         bool already_there = false;  
-//         for (index_type j = start; j < start + nnz_shifts[r]; ++j)
-//         {
-//           index_type c = tmp[j].getIdx();
-//           if (c == r) {
-//             real_type val = tmp[j].getValue();
-//             val += coo_vals[i];
-//             tmp[j].setValue(val);
-//             already_there = true;
-//             out::warning() << " duplicate found, row " << c << " adding in place " << j << " current value: " << val << std::endl;
-//           }  
-//         }  
-//         if (!already_there){ // first time this duplicates appears
-
-//           tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
-//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-
-//           nnz_shifts[r]++;
-//         }
-//       } else {//not diagonal
-//         tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
-//         tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-//         nnz_shifts[r]++;
-
-//         if ((coo_rows[i] != coo_cols[i]) && (symmetric == 1))
-//         {
-//           r = coo_cols[i];
-//           start = csr_ia[r];
-
-//           if ((start + nnz_shifts[r]) > nnz_unpacked)
-//             out::warning() << "index out of bounds (case 2) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
-//           tmp[start + nnz_shifts[r]].setIdx(coo_rows[i]);
-//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-//           nnz_shifts[r]++;
-//         }
-//       }
-//     }
-//     //now sort whatever is inside rows
-
-//     for (int i = 0; i < n; ++i)
-//     {
-
-//       //now sorting (and adding 1)
-//       int colStart = csr_ia[i];
-//       int colEnd = csr_ia[i + 1];
-//       int length = colEnd - colStart;
-//       std::sort(&tmp[colStart],&tmp[colStart] + length);
-//     }
-
-//     for (index_type i = 0; i < nnz_unpacked; ++i)
-//     {
-//       csr_ja[i] = tmp[i].getIdx();
-//       csr_a[i] = tmp[i].getValue();
-//     }
-// #if 0
-//     for (int i = 0; i<n; ++i){
-//       printf("Row: %d \n", i);
-//       for (int j = csr_ia[i]; j<csr_ia[i+1]; ++j){
-//         printf("(%d %16.16f) ", csr_ja[j], csr_a[j]);
-//       }
-//       printf("\n");
-//     }
-// #endif
-//     A_csr->setNnz(nnz_no_duplicates);
-//     if (memspace == "cpu"){
-//       A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
-//     } else {
-//       if (memspace == "cuda"){      
-//         A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
-//       } else {
-//         //display error
-//       }
-//     }
-//     delete [] nnz_counts;
-//     delete [] tmp;
-//     delete [] nnz_shifts;
-//     delete [] csr_ia;
-//     delete [] csr_ja;
-//     delete [] csr_a;
-//     delete [] diag_control; 
-
-//     return 0;
-//   }
 
   int MatrixHandlerCuda::matvec(matrix::Sparse* Ageneric, 
-                               vector_type* vec_x, 
-                               vector_type* vec_result, 
-                               const real_type* alpha, 
-                               const real_type* beta,
-                               std::string matrixFormat) 
+                                vector_type* vec_x, 
+                                vector_type* vec_result, 
+                                const real_type* alpha, 
+                                const real_type* beta,
+                                std::string matrixFormat) 
   {
     using namespace constants;
     int error_sum = 0;
@@ -209,11 +50,8 @@ namespace ReSolve {
       matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric);
       //result = alpha *A*x + beta * result
       cusparseStatus_t status;
-      // std::cout << "Matvec on NVIDIA GPU ...\n";
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+      LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
       cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX();
-      //printf("is vec_x NULL? %d\n", vec_x->getData("cuda") == nullptr);
-      //printf("is vec_result NULL? %d\n", vec_result->getData("cuda") == nullptr);
       cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F);
 
 
@@ -224,7 +62,7 @@ namespace ReSolve {
 
       void* buffer_spmv = workspaceCUDA->getSpmvBuffer();
       cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle();
-      if (values_changed_) { 
+      if (values_changed_) {
         status = cusparseCreateCsr(&matA, 
                                    A->getNumRows(),
                                    A->getNumColumns(),
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
index f2250008..c77eb795 100644
--- a/resolve/matrix/MatrixHandlerCuda.hpp
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -38,6 +38,7 @@ namespace ReSolve {
     public:
       MatrixHandlerCuda();
       MatrixHandlerCuda(LinAlgWorkspace* workspace);
+      MatrixHandlerCuda(LinAlgWorkspaceCUDA* workspace);
       virtual ~MatrixHandlerCuda();
 
       int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
@@ -54,7 +55,7 @@ namespace ReSolve {
       void setValuesChanged(bool isValuesChanged); 
     
     private: 
-      LinAlgWorkspace* workspace_{nullptr};
+      LinAlgWorkspaceCUDA* workspace_{nullptr};
       // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
       bool values_changed_{true}; ///< needed for matvec
 
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index 70ee83b0..66790c83 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -1,6 +1,7 @@
 #include <string>
 #include <iostream>
 #include <iomanip>
+#include <cmath>
 
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/io.hpp>
@@ -9,7 +10,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 //author: KS
 //functionality test to check whether KLU works correctly.
 using namespace ReSolve::constants;
@@ -26,9 +27,9 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspace* workspace = ReSolve::createLinAlgWorkspace("cpu");
-  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace);
-  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace);
+  ReSolve::LinAlgWorkspace* workspace =    new ReSolve::LinAlgWorkspace();
+  ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
+  ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
   KLU->setupParameters(1, 0.1, false);
 
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index 1c66fda1..1384f405 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -30,7 +30,7 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
+  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA();
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index 656aa724..b7e725a7 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -30,7 +30,7 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
+  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA();
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index 14bb0f33..3e4b9d61 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -31,7 +31,7 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA;
+  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA();
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
@@ -258,5 +258,18 @@ int main(int argc, char *argv[])
   } else {
     std::cout<<"Test 4 (KLU with cuSolverRf refactorization + IR) FAILED, error sum: "<<error_sum<<std::endl<<std::endl;;
   }
+
+  delete A;
+  delete KLU;
+  delete GS;
+  delete FGMRES;
+  delete Rf;
+  delete x;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_CUDA;
+  delete matrix_handler;
+  delete vector_handler;
+
   return error_sum;
 }
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 16fd3215..3e0ec56d 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -43,8 +43,7 @@ class MatrixHandlerTests : TestBase
   {
     TestStatus status;
 
-    ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-    ReSolve::MatrixHandler handler(workspace);
+    ReSolve::MatrixHandler* handler = createMatrixHandler();
 
     matrix::Csr* A = createCsrMatrix(N, memspace_);
     vector::Vector x(N);
@@ -57,12 +56,12 @@ class MatrixHandlerTests : TestBase
 
     real_type alpha = 2.0/30.0;
     real_type beta  = 2.0;
-    handler.setValuesChanged(true, memspace_);
-    handler.matvec(A, &x, &y, &alpha, &beta, "csr", memspace_);
+    handler->setValuesChanged(true, memspace_);
+    handler->matvec(A, &x, &y, &alpha, &beta, "csr", memspace_);
 
     status *= verifyAnswer(y, 4.0, memspace_);
 
-    delete workspace;
+    delete handler;
     delete A;
 
     return status.report(__func__);
@@ -71,6 +70,21 @@ class MatrixHandlerTests : TestBase
 private:
   std::string memspace_{"cpu"};
 
+  ReSolve::MatrixHandler* createMatrixHandler()
+  {
+    if (memspace_ == "cpu") {
+      LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+      return new MatrixHandler(workpsace);
+    } else if (memspace_ == "cuda") {
+      LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
+      workspace->initializeHandles();
+      return new MatrixHandler(workspace);
+    } else {
+      std::cout << "Invalid memory space " << memspace_ << "\n";
+    }
+    return nullptr;
+  }
+
   bool verifyAnswer(vector::Vector& x, real_type answer, std::string memspace)
   {
     bool status = true;

From cec1aa74393bbe5f563912a8093340a112fcbfaa Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 00:24:56 -0400
Subject: [PATCH 09/15] Use PIMPL in VectorHandler class.

---
 resolve/vector/CMakeLists.txt            |   2 +
 resolve/vector/VectorHandler.cpp         | 165 ++++------------
 resolve/vector/VectorHandler.hpp         |  23 ++-
 resolve/vector/VectorHandlerCpu.cpp      | 159 +++++++++++++++
 resolve/vector/VectorHandlerCpu.hpp      |  57 ++++++
 resolve/vector/VectorHandlerCuda.cpp     | 236 +++++++++++++++++++++++
 resolve/vector/VectorHandlerCuda.hpp     |  57 ++++++
 resolve/vector/VectorHandlerImpl.hpp     |  57 ++++++
 tests/unit/vector/GramSchmidtTests.hpp   |  20 +-
 tests/unit/vector/VectorHandlerTests.hpp |  60 +++---
 10 files changed, 680 insertions(+), 156 deletions(-)
 create mode 100644 resolve/vector/VectorHandlerCpu.cpp
 create mode 100644 resolve/vector/VectorHandlerCpu.hpp
 create mode 100644 resolve/vector/VectorHandlerCuda.cpp
 create mode 100644 resolve/vector/VectorHandlerCuda.hpp
 create mode 100644 resolve/vector/VectorHandlerImpl.hpp

diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index 8fa9cd59..f418bf55 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -10,6 +10,8 @@
 set(Vector_SRC 
     Vector.cpp
     VectorHandler.cpp
+    VectorHandlerCpu.cpp
+    VectorHandlerCuda.cpp
 )
 
 
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index 71bfce9c..9648058e 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -4,6 +4,9 @@
 #include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/vector/VectorHandlerImpl.hpp>
+#include <resolve/vector/VectorHandlerCpu.hpp>
+#include <resolve/vector/VectorHandlerCuda.hpp>
 #include "VectorHandler.hpp"
 
 namespace ReSolve {
@@ -14,6 +17,8 @@ namespace ReSolve {
    */
   VectorHandler::VectorHandler()
   {
+    cpuImpl_ = new VectorHandlerCpu();
+    isCpuEnabled_ = true;
   }
 
   /** 
@@ -21,9 +26,24 @@ namespace ReSolve {
    * 
    * @param new_workspace - workspace to be set     
    */
-  VectorHandler:: VectorHandler(LinAlgWorkspace* new_workspace)
+  VectorHandler::VectorHandler(LinAlgWorkspace* new_workspace)
   {
-    workspace_ = new_workspace;
+    cpuImpl_ = new VectorHandlerCpu(new_workspace);
+    isCpuEnabled_ = true;
+  }
+
+  /** 
+   * @brief constructor
+   * 
+   * @param new_workspace - workspace to be set     
+   */
+  VectorHandler::VectorHandler(LinAlgWorkspaceCUDA* new_workspace)
+  {
+    cudaImpl_ = new VectorHandlerCuda(new_workspace);
+    cpuImpl_  = new  VectorHandlerCpu();
+
+    isCudaEnabled_ = true;
+    isCpuEnabled_  = true;
   }
 
   /** 
@@ -46,28 +66,11 @@ namespace ReSolve {
 
   real_type VectorHandler::dot(vector::Vector* x, vector::Vector* y, std::string memspace)
   { 
-    if (memspace == "cuda" ){ 
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-      double nrm = 0.0;
-      cublasStatus_t st= cublasDdot (handle_cublas,  x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm);
-      if (st!=0) {printf("dot product crashed with code %d \n", st);}
-      return nrm;
+    if (memspace == "cuda" ) {
+      return cudaImpl_->dot(x, y);
     } else {
       if (memspace == "cpu") {
-        real_type* x_data = x->getData("cpu");
-        real_type* y_data = y->getData("cpu");
-        real_type sum = 0.0;
-        real_type c = 0.0;
-        real_type t, y;
-        for (int i = 0; i < x->getSize(); ++i){
-          y = (x_data[i] * y_data[i]) - c;
-          t = sum + y;
-          c = (t - sum) - y;
-          sum = t;        
-          //   sum += (x_data[i] * y_data[i]);
-        } 
-        return sum;
+        return cpuImpl_->dot(x, y);
       } else {
         out::error() << "Not implemented (yet)" << std::endl;
         return NAN;
@@ -85,20 +88,11 @@ namespace ReSolve {
    */
   void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace)
   {
-    if (memspace == "cuda" ) { 
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-      cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1);
-      if (st!=0) {
-        ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
-      }
+    if (memspace == "cuda" ) {
+      cudaImpl_->scal(alpha, x);
     } else {
       if (memspace == "cpu") {
-        real_type* x_data = x->getData("cpu");
-
-        for (int i = 0; i < x->getSize(); ++i){
-          x_data[i] *= (*alpha);
-        }
+        cpuImpl_->scal(alpha, x);
       } else {      
         out::error() << "Not implemented (yet)" << std::endl;
       }  
@@ -114,26 +108,14 @@ namespace ReSolve {
    * @param[in]  memspace String containg memspace (cpu or cuda)
    * 
    */
-  void VectorHandler::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace )
+  void VectorHandler::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace)
   {
     //AXPY:  y = alpha * x + y
-    if (memspace == "cuda" ) { 
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-      cublasDaxpy(handle_cublas,
-                  x->getSize(),
-                  alpha,
-                  x->getData("cuda"),
-                  1,
-                  y->getData("cuda"),
-                  1);
+    if (memspace == "cuda" ) {
+      cudaImpl_->axpy(alpha, x, y);
     } else {
       if (memspace == "cpu") {
-        real_type* x_data = x->getData("cpu");
-        real_type* y_data = y->getData("cpu");
-        for (int i = 0; i < x->getSize(); ++i){
-          y_data[i] = (*alpha) * x_data[i] + y_data[i];
-        }
+        cpuImpl_->axpy(alpha, x, y);
       } else {
         out::error() <<"Not implemented (yet)" << std::endl;
       }
@@ -160,38 +142,9 @@ namespace ReSolve {
   void VectorHandler::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace)
   {
     if (memspace == "cuda") {
-      LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-      if (transpose == "T") {
-
-        cublasDgemv(handle_cublas,
-                    CUBLAS_OP_T,
-                    n,
-                    k,
-                    alpha,
-                    V->getData("cuda"),
-                    n,
-                    y->getData("cuda"),
-                    1,
-                    beta,
-                    x->getData("cuda"),
-                    1);
-
-      } else {
-        cublasDgemv(handle_cublas,
-                    CUBLAS_OP_N,
-                    n,
-                    k,
-                    alpha,
-                    V->getData("cuda"),
-                    n,
-                    y->getData("cuda"),
-                    1,
-                    beta,
-                    x->getData("cuda"),
-                    1);
-      }
-
+      cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x);
+    } else if (memspace == "cpu") {
+      cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x);
     } else {
       out::error() << "Not implemented (yet)" << std::endl;
     }
@@ -213,26 +166,9 @@ namespace ReSolve {
   {
     using namespace constants;
     if (memspace == "cuda") {
-      if (k < 200) {
-        mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda"));
-      } else {
-        LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-        cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-        cublasDgemm(handle_cublas,
-                    CUBLAS_OP_N,
-                    CUBLAS_OP_N,
-                    size,       // m
-                    1,          // n
-                    k + 1,      // k
-                    &MINUSONE, // alpha
-                    x->getData("cuda"), // A
-                    size,       // lda
-                    alpha->getData("cuda"), // B
-                    k + 1,      // ldb
-                    &ONE,
-                    y->getData("cuda"),          // c
-                    size);      // ldc     
-      }
+      cudaImpl_->massAxpy(size, alpha, k, x, y);
+    } else if (memspace == "cpu") {
+      cpuImpl_->massAxpy(size, alpha, k, x, y);
     } else {
       out::error() << "Not implemented (yet)" << std::endl;
     }
@@ -254,29 +190,10 @@ namespace ReSolve {
    */
   void VectorHandler::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res, std::string memspace)
   {
-    using namespace constants;
-
     if (memspace == "cuda") {
-      if (k < 200) {
-        mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda"));
-      } else {
-        LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
-        cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-        cublasDgemm(handle_cublas,
-                    CUBLAS_OP_T,
-                    CUBLAS_OP_N,
-                    k + 1,   //m
-                    2,       //n
-                    size,    //k
-                    &ONE,   //alpha
-                    V->getData("cuda"),       //A
-                    size,    //lda
-                    x->getData("cuda"),       //B
-                    size,    //ldb
-                    &ZERO,
-                    res->getData("cuda"),     //c
-                    k + 1);  //ldc 
-      }
+      cudaImpl_->massDot2Vec(size, V, k, x, res);
+    } else if (memspace == "cpu") {
+      cpuImpl_->massDot2Vec(size, V, k, x, res);
     } else {
       out::error() << "Not implemented (yet)" << std::endl;
     }
diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp
index bf2d37d7..53ec0bca 100644
--- a/resolve/vector/VectorHandler.hpp
+++ b/resolve/vector/VectorHandler.hpp
@@ -7,7 +7,9 @@ namespace ReSolve
   {
     class Vector;
   }
+  class VectorHandlerImpl;
   class LinAlgWorkspace;
+  class LinAlgWorkspaceCUDA;
 }
 
 
@@ -16,13 +18,14 @@ namespace ReSolve { //namespace vector {
     public:
       VectorHandler();
       VectorHandler(LinAlgWorkspace* new_workspace);
+      VectorHandler(LinAlgWorkspaceCUDA* new_workspace);
       ~VectorHandler();
 
       //y = alpha x + y
-      void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace );
+      void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace);
 
       //dot: x \cdot y
-      real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace );
+      real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace);
 
       //scal = alpha * x
       void scal(const real_type* alpha, vector::Vector* x, std::string memspace);
@@ -40,9 +43,21 @@ namespace ReSolve { //namespace vector {
        * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`,
        * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
        */ 
-      void gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace);
+      void gemv(std::string transpose,
+                index_type n,
+                index_type k,
+                const real_type* alpha,
+                const real_type* beta,
+                vector::Vector* V,
+                vector::Vector* y,
+                vector::Vector* x,
+                std::string memspace);
     private:
-      LinAlgWorkspace* workspace_;
+      VectorHandlerImpl*  cpuImpl_{nullptr};
+      VectorHandlerImpl* cudaImpl_{nullptr};
+
+      bool isCpuEnabled_{false};
+      bool isCudaEnabled_{false};
   };
 
 } //} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp
new file mode 100644
index 00000000..593ba5b0
--- /dev/null
+++ b/resolve/vector/VectorHandlerCpu.cpp
@@ -0,0 +1,159 @@
+#include <iostream>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/cuda/cudaKernels.h>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/vector/VectorHandlerImpl.hpp>
+#include "VectorHandlerCpu.hpp"
+
+namespace ReSolve {
+  using out = io::Logger;
+
+  /** 
+   * @brief empty constructor that does absolutely nothing        
+   */
+  VectorHandlerCpu::VectorHandlerCpu()
+  {
+  }
+
+  /** 
+   * @brief constructor
+   * 
+   * @param new_workspace - workspace to be set     
+   */
+  VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspace* new_workspace)
+  {
+    workspace_ = new_workspace;
+  }
+
+  /** 
+   * @brief destructor     
+   */
+  VectorHandlerCpu::~VectorHandlerCpu()
+  {
+    //delete the workspace TODO
+  }
+
+  /** 
+   * @brief dot product of two vectors i.e, a = x^Ty
+   * 
+   * @param[in] x The first vector
+   * @param[in] y The second vector
+   * @param[in] memspace String containg memspace (cpu or cuda)
+   * 
+   * @return dot product (real number) of _x_ and _y_
+   */
+
+  real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y)
+  { 
+    real_type* x_data = x->getData("cpu");
+    real_type* y_data = y->getData("cpu");
+    real_type sum = 0.0;
+    real_type c = 0.0;
+    // real_type t, y;
+    for (int i = 0; i < x->getSize(); ++i) {
+      real_type y = (x_data[i] * y_data[i]) - c;
+      real_type t = sum + y;
+      c = (t - sum) - y;
+      sum = t;        
+      //   sum += (x_data[i] * y_data[i]);
+    } 
+    return sum;
+  }
+
+  /** 
+   * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant
+   * 
+   * @param[in] alpha The constant
+   * @param[in,out] x The vector
+   * @param memspace string containg memspace (cpu or cuda)
+   * 
+   */
+  void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x)
+  {
+    real_type* x_data = x->getData("cpu");
+
+    for (int i = 0; i < x->getSize(); ++i){
+      x_data[i] *= (*alpha);
+    }
+  }
+
+  /** 
+   * @brief axpy i.e, y = alpha*x+y where alpha is a constant
+   * 
+   * @param[in] alpha The constant
+   * @param[in] x The first vector
+   * @param[in,out] y The second vector (result is return in y)
+   * @param[in]  memspace String containg memspace (cpu or cuda)
+   * 
+   */
+  void VectorHandlerCpu::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y)
+  {
+    //AXPY:  y = alpha * x + y
+    real_type* x_data = x->getData("cpu");
+    real_type* y_data = y->getData("cpu");
+    for (int i = 0; i < x->getSize(); ++i) {
+      y_data[i] = (*alpha) * x_data[i] + y_data[i];
+    }
+  }
+
+  /** 
+   * @brief gemv computes matrix-vector product where both matrix and vectors are dense.
+   *        i.e., x = beta*x +  alpha*V*y
+   *
+   * @param[in] Transpose - yes (T) or no (N)
+   * @param[in] n Number of rows in (non-transposed) matrix
+   * @param[in] k Number of columns in (non-transposed)   
+   * @param[in] alpha Constant real number
+   * @param[in] beta Constant real number
+   * @param[in] V Multivector containing the matrix, organized columnwise
+   * @param[in] y Vector, k x 1 if N and n x 1 if T
+   * @param[in,out] x Vector, n x 1 if N and k x 1 if T
+   * @param[in] memspace  cpu or cuda (for now)
+   *
+   * @pre   V is stored colum-wise, _n_ > 0, _k_ > 0
+   * 
+   */  
+  void VectorHandlerCpu::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x)
+  {
+    out::error() << "Not implemented (yet)" << std::endl;
+  }
+
+  /** 
+   * @brief mass (bulk) axpy i.e, y = y - x*alpha where  alpha is a vector
+   * 
+   * @param[in] size number of elements in y
+   * @param[in] alpha vector size k x 1
+   * @param[in] x (multi)vector size size x k
+   * @param[in,out] y vector size size x 1 (this is where the result is stored)
+   * @param[in] memspace string containg memspace (cpu or cuda)
+   *
+   * @pre   _k_ > 0, _size_ > 0, _size_ = x->getSize()
+   *
+   */
+  void VectorHandlerCpu::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y)
+  {
+    out::error() << "Not implemented (yet)" << std::endl;
+  }
+
+  /** 
+   * @brief mass (bulk) dot product i.e,  V^T x, where V is n x k dense multivector (a dense multivector consisting of k vectors size n)  
+   *        and x is k x 2 dense multivector (a multivector consisiting of two vectors size n each)
+   * 
+   * @param[in] size Number of elements in a single vector in V
+   * @param[in] V Multivector; k vectors size n x 1 each
+   * @param[in] k Number of vectors in V
+   * @param[in] x Multivector; 2 vectors size n x 1 each
+   * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res)
+   * @param[in] memspace String containg memspace (cpu or cuda)
+   *
+   * @pre   _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated
+   *
+   */
+  void VectorHandlerCpu::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res)
+  {
+    out::error() << "Not implemented (yet)" << std::endl;
+  }
+
+} // namespace ReSolve
diff --git a/resolve/vector/VectorHandlerCpu.hpp b/resolve/vector/VectorHandlerCpu.hpp
new file mode 100644
index 00000000..a5f09162
--- /dev/null
+++ b/resolve/vector/VectorHandlerCpu.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include <string>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  class LinAlgWorkspace;
+  class VectorHandlerImpl;
+}
+
+
+namespace ReSolve { //namespace vector {
+  class VectorHandlerCpu : public VectorHandlerImpl
+  { 
+    public:
+      VectorHandlerCpu();
+      VectorHandlerCpu(LinAlgWorkspace* workspace);
+      virtual ~VectorHandlerCpu();
+
+      //y = alpha x + y
+      virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y);
+
+      //dot: x \cdot y
+      virtual real_type dot(vector::Vector* x, vector::Vector* y);
+
+      //scal = alpha * x
+      virtual void scal(const real_type* alpha, vector::Vector* x);
+
+      //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise
+      virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y);
+
+      //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise
+      //Size = n
+      virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res);
+
+      /** gemv:
+       * if `transpose = N` (no), `x = beta*x +  alpha*V*y`,
+       * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`.
+       * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`,
+       * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
+       */ 
+      virtual void gemv(std::string transpose,
+                        index_type n,
+                        index_type k,
+                        const real_type* alpha,
+                        const real_type* beta,
+                        vector::Vector* V,
+                        vector::Vector* y,
+                        vector::Vector* x);
+    private:
+      LinAlgWorkspace* workspace_;
+  };
+
+} //} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp
new file mode 100644
index 00000000..69044f37
--- /dev/null
+++ b/resolve/vector/VectorHandlerCuda.cpp
@@ -0,0 +1,236 @@
+#include <iostream>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/cuda/cudaKernels.h>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/vector/VectorHandlerImpl.hpp>
+#include "VectorHandlerCuda.hpp"
+
+namespace ReSolve {
+  using out = io::Logger;
+
+  /** 
+   * @brief empty constructor that does absolutely nothing        
+   */
+  VectorHandlerCuda::VectorHandlerCuda()
+  {
+  }
+
+  /** 
+   * @brief constructor
+   * 
+   * @param new_workspace - workspace to be set     
+   */
+  VectorHandlerCuda:: VectorHandlerCuda(LinAlgWorkspaceCUDA* new_workspace)
+  {
+    workspace_ = new_workspace;
+  }
+
+  /** 
+   * @brief destructor     
+   */
+  VectorHandlerCuda::~VectorHandlerCuda()
+  {
+    //delete the workspace TODO
+  }
+
+  /** 
+   * @brief dot product of two vectors i.e, a = x^Ty
+   * 
+   * @param[in] x The first vector
+   * @param[in] y The second vector
+   * @param[in] memspace String containg memspace (cpu or cuda)
+   * 
+   * @return dot product (real number) of _x_ and _y_
+   */
+
+  real_type VectorHandlerCuda::dot(vector::Vector* x, vector::Vector* y)
+  { 
+    LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+    cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+    double nrm = 0.0;
+    cublasStatus_t st= cublasDdot (handle_cublas,  x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm);
+    if (st!=0) {printf("dot product crashed with code %d \n", st);}
+    return nrm;
+  }
+
+  /** 
+   * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant
+   * 
+   * @param[in] alpha The constant
+   * @param[in,out] x The vector
+   * @param memspace string containg memspace (cpu or cuda)
+   * 
+   */
+  void VectorHandlerCuda::scal(const real_type* alpha, vector::Vector* x)
+  {
+    LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+    cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+    cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1);
+    if (st!=0) {
+      ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
+    }
+  }
+
+  /** 
+   * @brief axpy i.e, y = alpha*x+y where alpha is a constant
+   * 
+   * @param[in] alpha The constant
+   * @param[in] x The first vector
+   * @param[in,out] y The second vector (result is return in y)
+   * @param[in]  memspace String containg memspace (cpu or cuda)
+   * 
+   */
+  void VectorHandlerCuda::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y)
+  {
+    //AXPY:  y = alpha * x + y
+    LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+    cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+    cublasDaxpy(handle_cublas,
+                x->getSize(),
+                alpha,
+                x->getData("cuda"),
+                1,
+                y->getData("cuda"),
+                1);
+  }
+
+  /** 
+   * @brief gemv computes matrix-vector product where both matrix and vectors are dense.
+   *        i.e., x = beta*x +  alpha*V*y
+   *
+   * @param[in] Transpose - yes (T) or no (N)
+   * @param[in] n Number of rows in (non-transposed) matrix
+   * @param[in] k Number of columns in (non-transposed)   
+   * @param[in] alpha Constant real number
+   * @param[in] beta Constant real number
+   * @param[in] V Multivector containing the matrix, organized columnwise
+   * @param[in] y Vector, k x 1 if N and n x 1 if T
+   * @param[in,out] x Vector, n x 1 if N and k x 1 if T
+   * @param[in] memspace  cpu or cuda (for now)
+   *
+   * @pre   V is stored colum-wise, _n_ > 0, _k_ > 0
+   * 
+   */  
+  void VectorHandlerCuda::gemv(std::string transpose,
+                               index_type n,
+                               index_type k,
+                               const real_type* alpha,
+                               const real_type* beta,
+                               vector::Vector* V,
+                               vector::Vector* y,
+                               vector::Vector* x)
+  {
+    LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+    cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+    if (transpose == "T") {
+
+      cublasDgemv(handle_cublas,
+                  CUBLAS_OP_T,
+                  n,
+                  k,
+                  alpha,
+                  V->getData("cuda"),
+                  n,
+                  y->getData("cuda"),
+                  1,
+                  beta,
+                  x->getData("cuda"),
+                  1);
+
+    } else {
+      cublasDgemv(handle_cublas,
+                  CUBLAS_OP_N,
+                  n,
+                  k,
+                  alpha,
+                  V->getData("cuda"),
+                  n,
+                  y->getData("cuda"),
+                  1,
+                  beta,
+                  x->getData("cuda"),
+                  1);
+    }
+  }
+
+  /** 
+   * @brief mass (bulk) axpy i.e, y = y - x*alpha where  alpha is a vector
+   * 
+   * @param[in] size number of elements in y
+   * @param[in] alpha vector size k x 1
+   * @param[in] x (multi)vector size size x k
+   * @param[in,out] y vector size size x 1 (this is where the result is stored)
+   * @param[in] memspace string containg memspace (cpu or cuda)
+   *
+   * @pre   _k_ > 0, _size_ > 0, _size_ = x->getSize()
+   *
+   */
+  void VectorHandlerCuda::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y)
+  {
+    using namespace constants;
+    if (k < 200) {
+      mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda"));
+    } else {
+      LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+      cublasDgemm(handle_cublas,
+                  CUBLAS_OP_N,
+                  CUBLAS_OP_N,
+                  size,       // m
+                  1,          // n
+                  k + 1,      // k
+                  &MINUSONE, // alpha
+                  x->getData("cuda"), // A
+                  size,       // lda
+                  alpha->getData("cuda"), // B
+                  k + 1,      // ldb
+                  &ONE,
+                  y->getData("cuda"),          // c
+                  size);      // ldc     
+    }
+  }
+
+  /** 
+   * @brief mass (bulk) dot product i.e,  V^T x, where V is n x k dense multivector
+   * (a dense multivector consisting of k vectors size n) and x is k x 2 dense
+   * multivector (a multivector consisiting of two vectors size n each)
+   * 
+   * @param[in] size Number of elements in a single vector in V
+   * @param[in] V Multivector; k vectors size n x 1 each
+   * @param[in] k Number of vectors in V
+   * @param[in] x Multivector; 2 vectors size n x 1 each
+   * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res)
+   * @param[in] memspace String containg memspace (cpu or cuda)
+   *
+   * @pre   _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated
+   *
+   */
+  void VectorHandlerCuda::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res)
+  {
+    using namespace constants;
+
+    if (k < 200) {
+      mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda"));
+    } else {
+      LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
+      cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
+      cublasDgemm(handle_cublas,
+                  CUBLAS_OP_T,
+                  CUBLAS_OP_N,
+                  k + 1,   //m
+                  2,       //n
+                  size,    //k
+                  &ONE,   //alpha
+                  V->getData("cuda"),       //A
+                  size,    //lda
+                  x->getData("cuda"),       //B
+                  size,    //ldb
+                  &ZERO,
+                  res->getData("cuda"),     //c
+                  k + 1);  //ldc 
+    }
+  }
+
+} // namespace ReSolve
diff --git a/resolve/vector/VectorHandlerCuda.hpp b/resolve/vector/VectorHandlerCuda.hpp
new file mode 100644
index 00000000..0ee2752d
--- /dev/null
+++ b/resolve/vector/VectorHandlerCuda.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include <string>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  class LinAlgWorkspaceCUDA;
+  class VectorHandlerImpl;
+}
+
+
+namespace ReSolve { //namespace vector {
+  class VectorHandlerCuda : public VectorHandlerImpl
+  { 
+    public:
+      VectorHandlerCuda();
+      VectorHandlerCuda(LinAlgWorkspaceCUDA* workspace);
+      virtual ~VectorHandlerCuda();
+
+      //y = alpha x + y
+      virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y);
+
+      //dot: x \cdot y
+      virtual real_type dot(vector::Vector* x, vector::Vector* y);
+
+      //scal = alpha * x
+      virtual void scal(const real_type* alpha, vector::Vector* x);
+
+      //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise
+      virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y);
+
+      //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise
+      //Size = n
+      virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res);
+
+      /** gemv:
+       * if `transpose = N` (no), `x = beta*x +  alpha*V*y`,
+       * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`.
+       * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`,
+       * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
+       */ 
+      virtual void gemv(std::string transpose,
+                        index_type n,
+                        index_type k,
+                        const real_type* alpha,
+                        const real_type* beta,
+                        vector::Vector* V,
+                        vector::Vector* y,
+                        vector::Vector* x);
+    private:
+      LinAlgWorkspaceCUDA* workspace_;
+  };
+
+} //} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandlerImpl.hpp b/resolve/vector/VectorHandlerImpl.hpp
new file mode 100644
index 00000000..229a7461
--- /dev/null
+++ b/resolve/vector/VectorHandlerImpl.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include <string>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  class VectorHandlerCpu;
+  class VectorHandlerCuda;
+}
+
+
+namespace ReSolve
+{
+  class VectorHandlerImpl
+  { 
+    public:
+      VectorHandlerImpl()
+      {}
+      virtual ~VectorHandlerImpl()
+      {}
+
+      //y = alpha x + y
+      virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y ) = 0;
+
+      //dot: x \cdot y
+      virtual real_type dot(vector::Vector* x, vector::Vector* y ) = 0;
+
+      //scal = alpha * x
+      virtual void scal(const real_type* alpha, vector::Vector* x) = 0;
+
+      //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise
+      virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) = 0;
+
+      //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise
+      //Size = n
+      virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) = 0;
+
+      /** gemv:
+       * if `transpose = N` (no), `x = beta*x +  alpha*V*y`,
+       * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`.
+       * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`,
+       * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
+       */ 
+      virtual void gemv(std::string transpose,
+                        index_type n,
+                        index_type k,
+                        const real_type* alpha,
+                        const real_type* beta,
+                        vector::Vector* V,
+                        vector::Vector* y,
+                        vector::Vector* x) = 0;
+  };
+
+} //} // namespace ReSolve::vector
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 2d801c49..eddd800e 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -66,8 +66,7 @@ namespace ReSolve {
               break;
           }
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler* handler = new ReSolve::VectorHandler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* V = new vector::Vector(N, 3); // we will be using a space of 3 vectors
           real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2
@@ -114,7 +113,7 @@ namespace ReSolve {
 
           status *= verifyAnswer(V, 3, handler, memspace_);
           
-          delete workspace;
+          delete handler;
           delete [] H;
           delete V; 
           delete GS;
@@ -125,6 +124,21 @@ namespace ReSolve {
       private:
         std::string memspace_{"cuda"};
 
+        ReSolve::VectorHandler* createVectorHandler()
+        {
+          if (memspace_ == "cpu") { // TODO: Fix memory leak here
+            LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+            return new VectorHandler(workpsace);
+          } else if (memspace_ == "cuda") {
+            LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
+            workspace->initializeHandles();
+            return new VectorHandler(workspace);
+          } else {
+            std::cout << "Invalid memory space " << memspace_ << "\n";
+          }
+          return nullptr;
+        }
+
         // x is a multivector containing K vectors 
         bool verifyAnswer(vector::Vector* x, index_type K,  ReSolve::VectorHandler* handler, std::string memspace)
         {
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index ad0d3cfb..cc84f25b 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -38,8 +38,7 @@ namespace ReSolve {
         {
           TestStatus status;
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x = new vector::Vector(N);
           vector::Vector* y = new vector::Vector(N);
@@ -52,10 +51,10 @@ namespace ReSolve {
 
           real_type alpha = 0.5;
           //the result is a vector with y[i] = 2.5;          
-          handler.axpy(&alpha, x, y, memspace_);
+          handler->axpy(&alpha, x, y, memspace_);
           status *= verifyAnswer(y, 2.5, memspace_);
 
-          delete workspace;
+          delete handler;
           delete x;
           delete y; 
 
@@ -66,8 +65,7 @@ namespace ReSolve {
         {
           TestStatus status;
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x = new vector::Vector(N);
           vector::Vector* y = new vector::Vector(N);
@@ -79,7 +77,7 @@ namespace ReSolve {
           y->setToConst(4.0, memspace_);
           real_type ans;
           //the result is N
-          ans = handler.dot(x, y, memspace_);
+          ans = handler->dot(x, y, memspace_);
 
           bool st = true;;
           if (ans != (real_type) N) {
@@ -88,7 +86,7 @@ namespace ReSolve {
           } 
           status *= st;
 
-          delete workspace;
+          delete handler;
           delete x;
           delete y; 
 
@@ -99,8 +97,7 @@ namespace ReSolve {
         {
           TestStatus status;
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x =  new vector::Vector(N);
 
@@ -111,10 +108,10 @@ namespace ReSolve {
           real_type alpha = 3.5;
 
           //the answer is x[i] = 4.375;         
-          handler.scal(&alpha, x, memspace_);
+          handler->scal(&alpha, x, memspace_);
           status *= verifyAnswer(x, 4.375, memspace_);
 
-          delete workspace;
+          delete handler;
           delete x;
 
           return status.report(__func__);
@@ -124,8 +121,7 @@ namespace ReSolve {
         {
           TestStatus status;
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
           
           vector::Vector* x =  new vector::Vector(N, K);
           vector::Vector* y =  new vector::Vector(N);
@@ -148,11 +144,11 @@ namespace ReSolve {
           index_type r = K % 2;
           real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5));
 
-          handler.massAxpy(N, alpha, K, x, y, memspace_);
+          handler->massAxpy(N, alpha, K, x, y, memspace_);
           status *= verifyAnswer(y, 2.0 - res, memspace_);
 
 
-          delete workspace;
+          delete handler;
           delete x;
           delete y;
           delete alpha;
@@ -164,8 +160,7 @@ namespace ReSolve {
         {
           TestStatus status;
 
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          ReSolve::VectorHandler* handler = createVectorHandler();
           
           vector::Vector* x =  new vector::Vector(N, K);
           vector::Vector* y =  new vector::Vector(N, 2);
@@ -176,11 +171,11 @@ namespace ReSolve {
           
           x->setToConst(1.0, memspace_);
           y->setToConst(-1.0, memspace_);
-          handler.massDot2Vec(N, x, K, y, res, memspace_);
+          handler->massDot2Vec(N, x, K, y, res, memspace_);
           
           status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_);
 
-          delete workspace;
+          delete handler;
           delete x;
           delete y;
           delete res;
@@ -190,8 +185,8 @@ namespace ReSolve {
         TestOutcome gemv(index_type N,  index_type K)
         {
           TestStatus status;
-          ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
-          ReSolve::VectorHandler handler(workspace);
+          // ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
+          ReSolve::VectorHandler* handler = createVectorHandler();
           vector::Vector* V = new vector::Vector(N, K);
           // for the test with NO TRANSPOSE
           vector::Vector* yN = new vector::Vector(K); 
@@ -214,9 +209,9 @@ namespace ReSolve {
           
           real_type alpha = -1.0;
           real_type beta = 1.0;
-          handler.gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_);
+          handler->gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_);
           status *= verifyAnswer(xN, (real_type) (K) + 0.5, memspace_);
-          handler.gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_);
+          handler->gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_);
           status *= verifyAnswer(xT, (real_type) (N) + 0.5, memspace_);
 
           return status.report(__func__);
@@ -225,7 +220,22 @@ namespace ReSolve {
       private:
         std::string memspace_{"cpu"};
 
-        // we can verify through norm but that would defeat the purpose of testing vector handler...
+        ReSolve::VectorHandler* createVectorHandler()
+        {
+          if (memspace_ == "cpu") {
+            LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+            return new VectorHandler(workpsace);
+          } else if (memspace_ == "cuda") {
+            LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
+            workspace->initializeHandles();
+            return new VectorHandler(workspace);
+          } else {
+            std::cout << "Invalid memory space " << memspace_ << "\n";
+          }
+          return nullptr;
+        }
+
+        // we can verify through norm but that would defeat the purpose of testing vector handler ...
         bool verifyAnswer(vector::Vector* x, real_type answer, std::string memspace)
         {
           bool status = true;

From 1e8f84a048c1303969b4a70782ed4e5a5fbff1ed Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 12:05:12 -0400
Subject: [PATCH 10/15] Remove workspace factory.

---
 examples/r_KLU_GLU.cpp                        |   2 +-
 examples/r_KLU_GLU_matrix_values_update.cpp   |   2 +-
 examples/r_KLU_KLU.cpp                        |   4 +-
 examples/r_KLU_KLU_standalone.cpp             |   4 +-
 examples/r_KLU_rf.cpp                         |   2 +-
 examples/r_KLU_rf_FGMRES.cpp                  |   2 +-
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |   2 +-
 resolve/LinSolverDirectCuSolverGLU.cpp        |   6 +-
 resolve/LinSolverDirectCuSolverGLU.hpp        |   4 +-
 resolve/matrix/MatrixHandler.cpp              |   5 +-
 resolve/matrix/MatrixHandler.hpp              |   4 +-
 resolve/matrix/MatrixHandlerCpu.cpp           | 162 +-----------------
 resolve/matrix/MatrixHandlerCpu.hpp           |   6 +-
 resolve/matrix/MatrixHandlerCuda.cpp          |   5 -
 resolve/matrix/MatrixHandlerCuda.hpp          |   1 -
 resolve/matrix/MatrixHandlerImpl.hpp          |   6 -
 resolve/vector/VectorHandler.cpp              |   4 +-
 resolve/vector/VectorHandler.hpp              |   4 +-
 resolve/vector/VectorHandlerCpu.cpp           |  17 +-
 resolve/vector/VectorHandlerCpu.hpp           |   6 +-
 resolve/vector/VectorHandlerCuda.cpp          |   2 +-
 resolve/workspace/CMakeLists.txt              |   5 +-
 resolve/workspace/LinAlgWorkspace.cpp         |  16 --
 resolve/workspace/LinAlgWorkspace.hpp         |  18 +-
 resolve/workspace/LinAlgWorkspaceCUDA.hpp     |   6 +-
 resolve/workspace/LinAlgWorkspaceCpu.cpp      |  16 ++
 resolve/workspace/LinAlgWorkspaceCpu.hpp      |  18 ++
 resolve/workspace/LinAlgWorkspaceFactory.cpp  |  23 ---
 resolve/workspace/LinAlgWorkspaceFactory.hpp  |  18 --
 tests/functionality/testKLU.cpp               |   2 +-
 tests/functionality/testKLU_GLU.cpp           |   2 +-
 tests/functionality/testKLU_Rf.cpp            |   2 +-
 tests/functionality/testKLU_Rf_FGMRES.cpp     |   2 +-
 tests/unit/matrix/MatrixHandlerTests.hpp      |   4 +-
 tests/unit/vector/GramSchmidtTests.hpp        |   4 +-
 tests/unit/vector/VectorHandlerTests.hpp      |   5 +-
 36 files changed, 97 insertions(+), 294 deletions(-)
 delete mode 100644 resolve/workspace/LinAlgWorkspace.cpp
 create mode 100644 resolve/workspace/LinAlgWorkspaceCpu.cpp
 create mode 100644 resolve/workspace/LinAlgWorkspaceCpu.hpp
 delete mode 100644 resolve/workspace/LinAlgWorkspaceFactory.cpp
 delete mode 100644 resolve/workspace/LinAlgWorkspaceFactory.hpp

diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index c072eb34..e0831976 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -9,7 +9,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index af797883..e6e09673 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -9,7 +9,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 // this updates the matrix values to simulate what CFD/optimization software does. 
 
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index bee712bd..550144ae 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -9,7 +9,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspace* workspace    = new ReSolve::LinAlgWorkspace();
+  ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   real_type* rhs;
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index a1aa1fad..628eeb06 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -9,7 +9,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[])
 
   ReSolve::matrix::Coo* A_coo;
   ReSolve::matrix::Csr* A;
-  ReSolve::LinAlgWorkspace* workspace    = new ReSolve::LinAlgWorkspace();
+  ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   real_type* rhs;
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index 8954724d..5dce2dfe 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -10,7 +10,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index 52f54a0d..2e43f9c3 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -11,7 +11,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index 2c484926..2317680c 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -11,7 +11,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 using namespace ReSolve::constants;
 
diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp
index feaf11b8..75039ff4 100644
--- a/resolve/LinSolverDirectCuSolverGLU.cpp
+++ b/resolve/LinSolverDirectCuSolverGLU.cpp
@@ -3,12 +3,12 @@
 
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Csr.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include "LinSolverDirectCuSolverGLU.hpp"
 
 namespace ReSolve
 {
-  LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace)
+  LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace)
   {
     this->workspace_ = workspace;
   }
@@ -26,7 +26,7 @@ namespace ReSolve
   {
     int error_sum = 0;
 
-    LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
+    LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
     //get the handle
     handle_cusolversp_ = workspaceCUDA->getCusolverSpHandle();
     A_ = (matrix::Csr*) A;
diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp
index 0aa27df9..a48c8cba 100644
--- a/resolve/LinSolverDirectCuSolverGLU.hpp
+++ b/resolve/LinSolverDirectCuSolverGLU.hpp
@@ -26,7 +26,7 @@ namespace ReSolve
     using vector_type = vector::Vector;
     
     public:
-      LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace);
+      LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace);
       ~LinSolverDirectCuSolverGLU();
 
       int refactorize();
@@ -40,7 +40,7 @@ namespace ReSolve
       //note: we need cuSolver handle, we can copy it from the workspace to avoid double allocation
       cusparseMatDescr_t descr_M_; //this is NOT sparse matrix descriptor
       cusparseMatDescr_t descr_A_; //this is NOT sparse matrix descriptor
-      LinAlgWorkspace* workspace_;// so we can copy cusparse handle
+      LinAlgWorkspaceCUDA* workspace_;// so we can copy cusparse handle
       cusolverSpHandle_t handle_cusolversp_; 
       cusolverStatus_t status_cusolver_;
       cusparseStatus_t status_cusparse_;
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 50dd3082..c4849c18 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -5,7 +5,7 @@
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csc.hpp>
 #include <resolve/matrix/Csr.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/utilities/misc/IndexValuePair.hpp>
 #include "MatrixHandler.hpp"
 #include "MatrixHandlerCpu.hpp"
@@ -47,7 +47,7 @@ namespace ReSolve {
    * @note The CPU implementation currently does not require a workspace.
    * The workspace pointer parameter is provided for forward compatibility.
    */
-  MatrixHandler::MatrixHandler(LinAlgWorkspace* new_workspace)
+  MatrixHandler::MatrixHandler(LinAlgWorkspaceCpu* new_workspace)
   {
     cpuImpl_  = new MatrixHandlerCpu(new_workspace);
     isCpuEnabled_  = true;
@@ -272,7 +272,6 @@ namespace ReSolve {
 
   int MatrixHandler::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace)
   {
-    index_type error_sum = 0;
     if (memspace == "cuda") { 
       return cudaImpl_->csc2csr(A_csc, A_csr);
     } else if (memspace == "cpu") { 
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index fed6b593..e05e6c8f 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -16,7 +16,7 @@ namespace ReSolve
     class Csc;
     class Csr;
   }
-  class LinAlgWorkspace;
+  class LinAlgWorkspaceCpu;
   class LinAlgWorkspaceCUDA;
   class MatrixHandlerImpl;
 }
@@ -42,7 +42,7 @@ namespace ReSolve {
     
     public:
       MatrixHandler();
-      MatrixHandler(LinAlgWorkspace* workspace);
+      MatrixHandler(LinAlgWorkspaceCpu* workspace);
       MatrixHandler(LinAlgWorkspaceCUDA* workspace);
       ~MatrixHandler();
 
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index cc211f5f..95637402 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -22,7 +22,7 @@ namespace ReSolve {
   {
   }
 
-  MatrixHandlerCpu::MatrixHandlerCpu(LinAlgWorkspace* new_workspace)
+  MatrixHandlerCpu::MatrixHandlerCpu(LinAlgWorkspaceCpu* new_workspace)
   {
     workspace_ = new_workspace;
   }
@@ -37,164 +37,6 @@ namespace ReSolve {
     values_changed_ = values_changed;
   }
 
-//   int MatrixHandlerCpu::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
-//   {
-//     //this happens on the CPU not on the GPU
-//     //but will return whatever memspace requested.
-
-//     //count nnzs first
-
-//     index_type nnz_unpacked = 0;
-//     index_type nnz = A_coo->getNnz();
-//     index_type n = A_coo->getNumRows();
-//     bool symmetric = A_coo->symmetric();
-//     bool expanded = A_coo->expanded();
-
-//     index_type* nnz_counts =  new index_type[n];
-//     std::fill_n(nnz_counts, n, 0);
-//     index_type* coo_rows = A_coo->getRowData("cpu");
-//     index_type* coo_cols = A_coo->getColData("cpu");
-//     real_type* coo_vals = A_coo->getValues("cpu");
-
-//     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
-//     std::fill_n(diag_control, n, 0);
-//     index_type nnz_unpacked_no_duplicates = 0;
-//     index_type nnz_no_duplicates = nnz;
-
-
-//     //maybe check if they exist?
-//     for (index_type i = 0; i < nnz; ++i)
-//     {
-//       nnz_counts[coo_rows[i]]++;
-//       nnz_unpacked++;
-//       nnz_unpacked_no_duplicates++;
-//       if ((coo_rows[i] != coo_cols[i])&& (symmetric) && (!expanded))
-//       {
-//         nnz_counts[coo_cols[i]]++;
-//         nnz_unpacked++;
-//         nnz_unpacked_no_duplicates++;
-//       }
-//       if (coo_rows[i] == coo_cols[i]){
-//         if (diag_control[coo_rows[i]] > 0) {
-//          //duplicate
-//           nnz_unpacked_no_duplicates--;
-//           nnz_no_duplicates--;
-//         }
-//         diag_control[coo_rows[i]]++;
-//       }
-//     }
-//     A_csr->setExpanded(true);
-//     A_csr->setNnzExpanded(nnz_unpacked_no_duplicates);
-//     index_type* csr_ia = new index_type[n+1];
-//     std::fill_n(csr_ia, n + 1, 0);
-//     index_type* csr_ja = new index_type[nnz_unpacked];
-//     real_type* csr_a = new real_type[nnz_unpacked];
-//     index_type* nnz_shifts = new index_type[n];
-//     std::fill_n(nnz_shifts, n , 0);
-
-//     IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; 
-
-//     csr_ia[0] = 0;
-
-//     for (index_type i = 1; i < n + 1; ++i){
-//       csr_ia[i] = csr_ia[i - 1] + nnz_counts[i - 1] - (diag_control[i-1] - 1);
-//     }
-
-//     int r, start;
-
-
-//     for (index_type i = 0; i < nnz; ++i){
-//       //which row
-//       r = coo_rows[i];
-//       start = csr_ia[r];
-
-//       if ((start + nnz_shifts[r]) > nnz_unpacked) {
-//         out::warning() << "index out of bounds (case 1) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
-//       }
-//       if ((r == coo_cols[i]) && (diag_control[r] > 1)) {//diagonal, and there are duplicates
-//         bool already_there = false;  
-//         for (index_type j = start; j < start + nnz_shifts[r]; ++j)
-//         {
-//           index_type c = tmp[j].getIdx();
-//           if (c == r) {
-//             real_type val = tmp[j].getValue();
-//             val += coo_vals[i];
-//             tmp[j].setValue(val);
-//             already_there = true;
-//             out::warning() << " duplicate found, row " << c << " adding in place " << j << " current value: " << val << std::endl;
-//           }  
-//         }  
-//         if (!already_there){ // first time this duplicates appears
-
-//           tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
-//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-
-//           nnz_shifts[r]++;
-//         }
-//       } else {//not diagonal
-//         tmp[start + nnz_shifts[r]].setIdx(coo_cols[i]);
-//         tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-//         nnz_shifts[r]++;
-
-//         if ((coo_rows[i] != coo_cols[i]) && (symmetric == 1))
-//         {
-//           r = coo_cols[i];
-//           start = csr_ia[r];
-
-//           if ((start + nnz_shifts[r]) > nnz_unpacked)
-//             out::warning() << "index out of bounds (case 2) start: " << start << "nnz_shifts[" << r << "] = " << nnz_shifts[r] << std::endl;
-//           tmp[start + nnz_shifts[r]].setIdx(coo_rows[i]);
-//           tmp[start + nnz_shifts[r]].setValue(coo_vals[i]);
-//           nnz_shifts[r]++;
-//         }
-//       }
-//     }
-//     //now sort whatever is inside rows
-
-//     for (int i = 0; i < n; ++i)
-//     {
-
-//       //now sorting (and adding 1)
-//       int colStart = csr_ia[i];
-//       int colEnd = csr_ia[i + 1];
-//       int length = colEnd - colStart;
-//       std::sort(&tmp[colStart],&tmp[colStart] + length);
-//     }
-
-//     for (index_type i = 0; i < nnz_unpacked; ++i)
-//     {
-//       csr_ja[i] = tmp[i].getIdx();
-//       csr_a[i] = tmp[i].getValue();
-//     }
-// #if 0
-//     for (int i = 0; i<n; ++i){
-//       printf("Row: %d \n", i);
-//       for (int j = csr_ia[i]; j<csr_ia[i+1]; ++j){
-//         printf("(%d %16.16f) ", csr_ja[j], csr_a[j]);
-//       }
-//       printf("\n");
-//     }
-// #endif
-//     A_csr->setNnz(nnz_no_duplicates);
-//     if (memspace == "cpu"){
-//       A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
-//     } else {
-//       if (memspace == "cuda"){      
-//         A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
-//       } else {
-//         //display error
-//       }
-//     }
-//     delete [] nnz_counts;
-//     delete [] tmp;
-//     delete [] nnz_shifts;
-//     delete [] csr_ia;
-//     delete [] csr_ja;
-//     delete [] csr_a;
-//     delete [] diag_control; 
-
-//     return 0;
-//   }
 
   int MatrixHandlerCpu::matvec(matrix::Sparse* Ageneric, 
                                vector_type* vec_x, 
@@ -253,7 +95,7 @@ namespace ReSolve {
    */
   int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr)
   {
-    int error_sum = 0;
+    // int error_sum = 0; TODO: Collect error output!
     assert(A_csc->getNnz() == A_csr->getNnz());
     assert(A_csc->getNumRows() == A_csr->getNumColumns());
     assert(A_csr->getNumRows() == A_csc->getNumColumns());
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
index fb760750..3b6c91c0 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -21,7 +21,7 @@ namespace ReSolve
     class Csc;
     class Csr;
   }
-  class LinAlgWorkspace;
+  class LinAlgWorkspaceCpu;
 }
 
 
@@ -37,7 +37,7 @@ namespace ReSolve {
     
     public:
       MatrixHandlerCpu();
-      MatrixHandlerCpu(LinAlgWorkspace* workspace);
+      MatrixHandlerCpu(LinAlgWorkspaceCpu* workspace);
       virtual ~MatrixHandlerCpu();
 
       int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
@@ -54,7 +54,7 @@ namespace ReSolve {
       void setValuesChanged(bool isValuesChanged); 
     
     private: 
-      LinAlgWorkspace* workspace_{nullptr};
+      LinAlgWorkspaceCpu* workspace_{nullptr};
       // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
       bool values_changed_{true}; ///< needed for matvec
 
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index 15252d85..b27a63c0 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -26,11 +26,6 @@ namespace ReSolve {
     workspace_ = new_workspace;
   }
 
-  MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspace* new_workspace)
-  {
-    workspace_ = (LinAlgWorkspaceCUDA*) new_workspace;
-  }
-
   void MatrixHandlerCuda::setValuesChanged(bool values_changed)
   {
     values_changed_ = values_changed;
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
index c77eb795..a8fec606 100644
--- a/resolve/matrix/MatrixHandlerCuda.hpp
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -37,7 +37,6 @@ namespace ReSolve {
     
     public:
       MatrixHandlerCuda();
-      MatrixHandlerCuda(LinAlgWorkspace* workspace);
       MatrixHandlerCuda(LinAlgWorkspaceCUDA* workspace);
       virtual ~MatrixHandlerCuda();
 
diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp
index 015412d0..b901e2b5 100644
--- a/resolve/matrix/MatrixHandlerImpl.hpp
+++ b/resolve/matrix/MatrixHandlerImpl.hpp
@@ -55,12 +55,6 @@ namespace ReSolve {
 
       virtual void setValuesChanged(bool isValuesChanged) = 0;
     
-    protected: 
-      // LinAlgWorkspace* workspace_{nullptr};
-      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
-      // bool values_changed_{true}; ///< needed for matvec
-
-      // MemoryHandler mem_; ///< Device memory manager object
   };
 
 } // namespace ReSolve
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index 9648058e..0feb96a8 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -3,7 +3,7 @@
 #include <resolve/utilities/logger/Logger.hpp>
 #include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/vector/VectorHandlerImpl.hpp>
 #include <resolve/vector/VectorHandlerCpu.hpp>
 #include <resolve/vector/VectorHandlerCuda.hpp>
@@ -26,7 +26,7 @@ namespace ReSolve {
    * 
    * @param new_workspace - workspace to be set     
    */
-  VectorHandler::VectorHandler(LinAlgWorkspace* new_workspace)
+  VectorHandler::VectorHandler(LinAlgWorkspaceCpu* new_workspace)
   {
     cpuImpl_ = new VectorHandlerCpu(new_workspace);
     isCpuEnabled_ = true;
diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp
index 53ec0bca..c17d4688 100644
--- a/resolve/vector/VectorHandler.hpp
+++ b/resolve/vector/VectorHandler.hpp
@@ -8,7 +8,7 @@ namespace ReSolve
     class Vector;
   }
   class VectorHandlerImpl;
-  class LinAlgWorkspace;
+  class LinAlgWorkspaceCpu;
   class LinAlgWorkspaceCUDA;
 }
 
@@ -17,7 +17,7 @@ namespace ReSolve { //namespace vector {
   class VectorHandler { 
     public:
       VectorHandler();
-      VectorHandler(LinAlgWorkspace* new_workspace);
+      VectorHandler(LinAlgWorkspaceCpu* new_workspace);
       VectorHandler(LinAlgWorkspaceCUDA* new_workspace);
       ~VectorHandler();
 
diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp
index 593ba5b0..f5cc463d 100644
--- a/resolve/vector/VectorHandlerCpu.cpp
+++ b/resolve/vector/VectorHandlerCpu.cpp
@@ -3,7 +3,7 @@
 #include <resolve/utilities/logger/Logger.hpp>
 #include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/vector/VectorHandlerImpl.hpp>
 #include "VectorHandlerCpu.hpp"
 
@@ -22,7 +22,7 @@ namespace ReSolve {
    * 
    * @param new_workspace - workspace to be set     
    */
-  VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspace* new_workspace)
+  VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspaceCpu* new_workspace)
   {
     workspace_ = new_workspace;
   }
@@ -115,7 +115,14 @@ namespace ReSolve {
    * @pre   V is stored colum-wise, _n_ > 0, _k_ > 0
    * 
    */  
-  void VectorHandlerCpu::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x)
+  void VectorHandlerCpu::gemv(std::string /* transpose */,
+                              index_type /* n */,
+                              index_type /* k */,
+                              const real_type* /* alpha */,
+                              const real_type* /* beta */,
+                              vector::Vector* /* V */,
+                              vector::Vector* /* y */,
+                              vector::Vector* /* x */)
   {
     out::error() << "Not implemented (yet)" << std::endl;
   }
@@ -132,7 +139,7 @@ namespace ReSolve {
    * @pre   _k_ > 0, _size_ > 0, _size_ = x->getSize()
    *
    */
-  void VectorHandlerCpu::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y)
+  void VectorHandlerCpu::massAxpy(index_type /* size */, vector::Vector* /* alpha */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* y */)
   {
     out::error() << "Not implemented (yet)" << std::endl;
   }
@@ -151,7 +158,7 @@ namespace ReSolve {
    * @pre   _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated
    *
    */
-  void VectorHandlerCpu::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res)
+  void VectorHandlerCpu::massDot2Vec(index_type /* size */, vector::Vector* /* V */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* res */)
   {
     out::error() << "Not implemented (yet)" << std::endl;
   }
diff --git a/resolve/vector/VectorHandlerCpu.hpp b/resolve/vector/VectorHandlerCpu.hpp
index a5f09162..3f2f6133 100644
--- a/resolve/vector/VectorHandlerCpu.hpp
+++ b/resolve/vector/VectorHandlerCpu.hpp
@@ -7,7 +7,7 @@ namespace ReSolve
   {
     class Vector;
   }
-  class LinAlgWorkspace;
+  class LinAlgWorkspaceCpu;
   class VectorHandlerImpl;
 }
 
@@ -17,7 +17,7 @@ namespace ReSolve { //namespace vector {
   { 
     public:
       VectorHandlerCpu();
-      VectorHandlerCpu(LinAlgWorkspace* workspace);
+      VectorHandlerCpu(LinAlgWorkspaceCpu* workspace);
       virtual ~VectorHandlerCpu();
 
       //y = alpha x + y
@@ -51,7 +51,7 @@ namespace ReSolve { //namespace vector {
                         vector::Vector* y,
                         vector::Vector* x);
     private:
-      LinAlgWorkspace* workspace_;
+      LinAlgWorkspaceCpu* workspace_;
   };
 
 } //} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp
index 69044f37..3c887e85 100644
--- a/resolve/vector/VectorHandlerCuda.cpp
+++ b/resolve/vector/VectorHandlerCuda.cpp
@@ -3,7 +3,7 @@
 #include <resolve/utilities/logger/Logger.hpp>
 #include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/vector/VectorHandlerImpl.hpp>
 #include "VectorHandlerCuda.hpp"
 
diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt
index a50982c8..c6225487 100644
--- a/resolve/workspace/CMakeLists.txt
+++ b/resolve/workspace/CMakeLists.txt
@@ -7,8 +7,7 @@
 ]]
 
 set(ReSolve_Workspace_SRC
-  LinAlgWorkspace.cpp
-  LinAlgWorkspaceFactory.cpp
+  LinAlgWorkspaceCpu.cpp
 )
 
 set(ReSolve_Workspace_CUDA_SRC # consider adding a separate dir for cuda-sdk dependent code
@@ -17,7 +16,7 @@ set(ReSolve_Workspace_CUDA_SRC # consider adding a separate dir for cuda-sdk dep
 
 set(ReSolve_Workspace_HEADER_INSTALL
   LinAlgWorkspace.hpp
-  LinAlgWorkspaceFactory.hpp
+  LinAlgWorkspaceCpu.hpp
   LinAlgWorkspaceCUDA.hpp
 )
 
diff --git a/resolve/workspace/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspace.cpp
deleted file mode 100644
index 5e4b55d8..00000000
--- a/resolve/workspace/LinAlgWorkspace.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "LinAlgWorkspace.hpp"
-
-namespace ReSolve
-{
-  LinAlgWorkspace::LinAlgWorkspace()
-  {
-  }
-  
-  LinAlgWorkspace::~LinAlgWorkspace()
-  {
-  }
-
-  void LinAlgWorkspace::initializeHandles()
-  {
-  }
-}
diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp
index 68074eb8..6da58fda 100644
--- a/resolve/workspace/LinAlgWorkspace.hpp
+++ b/resolve/workspace/LinAlgWorkspace.hpp
@@ -1,18 +1,8 @@
 #pragma once
 
+#include <resolve/workspace/LinAlgWorkspaceCpu.hpp>
 
-#include <resolve/MemoryUtils.hpp>
+#ifdef RESOLVE_USE_CUDA
+#include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
+#endif
 
-namespace ReSolve
-{
-  class LinAlgWorkspace
-  {
-    public:
-      LinAlgWorkspace();
-      ~LinAlgWorkspace();
-      void initializeHandles();
-    protected:
-      MemoryHandler mem_;
-  };
-
-}
diff --git a/resolve/workspace/LinAlgWorkspaceCUDA.hpp b/resolve/workspace/LinAlgWorkspaceCUDA.hpp
index c0d4554c..5076563a 100644
--- a/resolve/workspace/LinAlgWorkspaceCUDA.hpp
+++ b/resolve/workspace/LinAlgWorkspaceCUDA.hpp
@@ -4,11 +4,11 @@
 #include "cusparse.h"
 #include "cusolverSp.h"
 
-#include <resolve/workspace/LinAlgWorkspace.hpp>
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve
 {
-  class LinAlgWorkspaceCUDA : public LinAlgWorkspace
+  class LinAlgWorkspaceCUDA
   {
     public:
       LinAlgWorkspaceCUDA();
@@ -55,6 +55,8 @@ namespace ReSolve
       void* buffer_1norm_;
 
       bool matvec_setup_done_; //check if setup is done for matvec i.e. if buffer is allocated, csr structure is set etc.
+
+      MemoryHandler mem_;
   };
 
 } // namespace ReSolve
diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp
new file mode 100644
index 00000000..3ed9aa43
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp
@@ -0,0 +1,16 @@
+#include "LinAlgWorkspaceCpu.hpp"
+
+namespace ReSolve
+{
+  LinAlgWorkspaceCpu::LinAlgWorkspaceCpu()
+  {
+  }
+  
+  LinAlgWorkspaceCpu::~LinAlgWorkspaceCpu()
+  {
+  }
+
+  void LinAlgWorkspaceCpu::initializeHandles()
+  {
+  }
+}
diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp
new file mode 100644
index 00000000..00e5f38e
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve
+{
+  class LinAlgWorkspaceCpu
+  {
+    public:
+      LinAlgWorkspaceCpu();
+      ~LinAlgWorkspaceCpu();
+      void initializeHandles();
+    private:
+      MemoryHandler mem_;
+  };
+
+}
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.cpp b/resolve/workspace/LinAlgWorkspaceFactory.cpp
deleted file mode 100644
index 43a10870..00000000
--- a/resolve/workspace/LinAlgWorkspaceFactory.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "LinAlgWorkspaceFactory.hpp"
-
-namespace ReSolve
-{
-  /// @brief  Workspace factory
-  /// @param[in] memspace memory space ID 
-  /// @return pointer to the linear algebra workspace
-  LinAlgWorkspace* createLinAlgWorkspace(std::string memspace)
-  {
-    if (memspace == "cuda") {
-#ifdef RESOLVE_USE_CUDA
-      LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
-      workspace->initializeHandles();
-      return workspace;
-#else
-      return nullptr;
-#endif
-    } 
-    // If not CUDA, return default
-    return (new LinAlgWorkspace());
-  }
-
-}
diff --git a/resolve/workspace/LinAlgWorkspaceFactory.hpp b/resolve/workspace/LinAlgWorkspaceFactory.hpp
deleted file mode 100644
index 0f5860c1..00000000
--- a/resolve/workspace/LinAlgWorkspaceFactory.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <resolve/workspace/LinAlgWorkspace.hpp>
-
-#ifdef RESOLVE_USE_CUDA
-#include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
-#endif
-
-// #ifndef RESOLVE_USE_CUDA
-// using ReSolve::LinAlgWorkspace = ReSolve::LinAlgWorkspaceCUDA;
-// #endif
-
-namespace ReSolve
-{
-  LinAlgWorkspace* createLinAlgWorkspace(std::string memspace);
-}
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index 66790c83..f3c1da57 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -27,7 +27,7 @@ int main(int argc, char *argv[])
   int error_sum = 0;
   int status = 0;
 
-  ReSolve::LinAlgWorkspace* workspace =    new ReSolve::LinAlgWorkspace();
+  ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
   ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index 1384f405..0e9bb4bd 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -10,7 +10,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverGLU.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 //author: KS
 //functionality test to check whether cuSolverGLU works correctly.
 
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index b7e725a7..729968f5 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -11,7 +11,7 @@
 #include <resolve/vector/VectorHandler.hpp>
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 //author: KS
 //functionality test to check whether cuSolverRf works correctly.
 
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index 3e4b9d61..a474e406 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -12,7 +12,7 @@
 #include <resolve/LinSolverDirectKLU.hpp>
 #include <resolve/LinSolverDirectCuSolverRf.hpp>
 #include <resolve/LinSolverIterativeFGMRES.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 //author: KS
 //functionality test to check whether cuSolverRf/FGMRES works correctly.
 
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 3e0ec56d..c94524e3 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -5,7 +5,7 @@
 #include <iterator>
 #include <algorithm>
 #include <resolve/matrix/Csr.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <tests/unit/TestBase.hpp>
@@ -73,7 +73,7 @@ class MatrixHandlerTests : TestBase
   ReSolve::MatrixHandler* createMatrixHandler()
   {
     if (memspace_ == "cpu") {
-      LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+      LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
       return new MatrixHandler(workpsace);
     } else if (memspace_ == "cuda") {
       LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index eddd800e..6f562739 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -6,7 +6,7 @@
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <tests/unit/TestBase.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 namespace ReSolve { 
   namespace tests {
@@ -127,7 +127,7 @@ namespace ReSolve {
         ReSolve::VectorHandler* createVectorHandler()
         {
           if (memspace_ == "cpu") { // TODO: Fix memory leak here
-            LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+            LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
             return new VectorHandler(workpsace);
           } else if (memspace_ == "cuda") {
             LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index cc84f25b..dc6dabb8 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -7,7 +7,7 @@
 #include <resolve/vector/Vector.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <tests/unit/TestBase.hpp>
-#include <resolve/workspace/LinAlgWorkspaceFactory.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
 
 namespace ReSolve { 
   namespace tests {
@@ -185,7 +185,6 @@ namespace ReSolve {
         TestOutcome gemv(index_type N,  index_type K)
         {
           TestStatus status;
-          // ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_);
           ReSolve::VectorHandler* handler = createVectorHandler();
           vector::Vector* V = new vector::Vector(N, K);
           // for the test with NO TRANSPOSE
@@ -223,7 +222,7 @@ namespace ReSolve {
         ReSolve::VectorHandler* createVectorHandler()
         {
           if (memspace_ == "cpu") {
-            LinAlgWorkspace* workpsace = new LinAlgWorkspace();
+            LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
             return new VectorHandler(workpsace);
           } else if (memspace_ == "cuda") {
             LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();

From 7a920bed286c1a3d0cdfbc90c59ab61378811d4a Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 18:25:04 -0400
Subject: [PATCH 11/15] Put preprocessor guards around CUDA code in unit tests.

---
 tests/unit/matrix/MatrixHandlerTests.hpp    | 4 +++-
 tests/unit/matrix/runMatrixHandlerTests.cpp | 2 ++
 tests/unit/vector/GramSchmidtTests.hpp      | 4 +++-
 tests/unit/vector/VectorHandlerTests.hpp    | 4 +++-
 tests/unit/vector/runGramSchmidtTests.cpp   | 2 ++
 tests/unit/vector/runVectorHandlerTests.cpp | 2 ++
 6 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index c94524e3..0aa476f8 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -75,12 +75,14 @@ class MatrixHandlerTests : TestBase
     if (memspace_ == "cpu") {
       LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
       return new MatrixHandler(workpsace);
+#ifdef RESOLVE_USE_CUDA
     } else if (memspace_ == "cuda") {
       LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
       workspace->initializeHandles();
       return new MatrixHandler(workspace);
+#endif
     } else {
-      std::cout << "Invalid memory space " << memspace_ << "\n";
+      std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
     }
     return nullptr;
   }
diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp
index d18f0965..6eee90d5 100644
--- a/tests/unit/matrix/runMatrixHandlerTests.cpp
+++ b/tests/unit/matrix/runMatrixHandlerTests.cpp
@@ -20,6 +20,7 @@ int main(int, char**)
     std::cout << "\n";
   }
 
+#ifdef RESOLVE_USE_CUDA
   {
     std::cout << "Running tests with CUDA backend:\n";
     ReSolve::tests::MatrixHandlerTests test("cuda");
@@ -30,6 +31,7 @@ int main(int, char**)
 
     std::cout << "\n";
   }
+#endif
 
   return result.summary();
 }
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 6f562739..9981ea48 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -129,12 +129,14 @@ namespace ReSolve {
           if (memspace_ == "cpu") { // TODO: Fix memory leak here
             LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
             return new VectorHandler(workpsace);
+#ifdef RESOLVE_USE_CUDA
           } else if (memspace_ == "cuda") {
             LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
             workspace->initializeHandles();
             return new VectorHandler(workspace);
+#endif
           } else {
-            std::cout << "Invalid memory space " << memspace_ << "\n";
+            std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
           }
           return nullptr;
         }
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index dc6dabb8..d2f8c73c 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -224,12 +224,14 @@ namespace ReSolve {
           if (memspace_ == "cpu") {
             LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
             return new VectorHandler(workpsace);
+#ifdef RESOLVE_USE_CUDA
           } else if (memspace_ == "cuda") {
             LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
             workspace->initializeHandles();
             return new VectorHandler(workspace);
+#endif
           } else {
-            std::cout << "Invalid memory space " << memspace_ << "\n";
+            std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
           }
           return nullptr;
         }
diff --git a/tests/unit/vector/runGramSchmidtTests.cpp b/tests/unit/vector/runGramSchmidtTests.cpp
index 2b5a3a17..e118eb6d 100644
--- a/tests/unit/vector/runGramSchmidtTests.cpp
+++ b/tests/unit/vector/runGramSchmidtTests.cpp
@@ -7,6 +7,7 @@ int main(int, char**)
 {
   ReSolve::tests::TestingResults result; 
 
+#ifdef RESOLVE_USE_CUDA
   {
     std::cout << "Running tests with CUDA backend:\n";
     ReSolve::tests::GramSchmidtTests test("cuda");
@@ -18,6 +19,7 @@ int main(int, char**)
     result += test.orthogonalize(5000, ReSolve::GramSchmidt::mgs_pm);
     std::cout << "\n";
   }
+#endif
 
   return result.summary();
 }
diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp
index ae40f5d7..77e99471 100644
--- a/tests/unit/vector/runVectorHandlerTests.cpp
+++ b/tests/unit/vector/runVectorHandlerTests.cpp
@@ -19,6 +19,7 @@ int main(int, char**)
     std::cout << "\n";
   }
 
+#ifdef RESOLVE_USE_CUDA
   {
     std::cout << "Running tests with CUDA backend:\n";
     ReSolve::tests::VectorHandlerTests test("cuda");
@@ -34,6 +35,7 @@ int main(int, char**)
 
     std::cout << "\n";
   }
+#endif
 
   return result.summary();
 }

From 5c5bdd85279e0e89efb02e2daaef7b489920a7b0 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 19:46:13 -0400
Subject: [PATCH 12/15] Make CUDA optional dependency (lots of cleanup needed.

---
 examples/CMakeLists.txt           |  1 +
 examples/r_KLU_KLU.cpp            |  1 +
 examples/r_KLU_KLU_standalone.cpp |  1 +
 resolve/CMakeLists.txt            | 15 +++++++++++++--
 resolve/cpu/CMakeLists.txt        |  1 +
 resolve/cpu/cpuVectorKernels.cpp  | 15 +++++++++++++++
 resolve/matrix/CMakeLists.txt     | 24 +++++++++++++++++++-----
 resolve/matrix/MatrixHandler.cpp  |  7 +++++++
 resolve/vector/CMakeLists.txt     | 24 +++++++++++++++++++-----
 resolve/vector/VectorHandler.cpp  |  9 +++++++--
 resolve/workspace/CMakeLists.txt  | 17 ++++++++++++-----
 11 files changed, 96 insertions(+), 19 deletions(-)
 create mode 100644 resolve/cpu/cpuVectorKernels.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6250400b..f2d323f4 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -14,6 +14,7 @@ target_link_libraries(klu_klu.exe PRIVATE ReSolve)
 add_executable(klu_klu_standalone.exe r_KLU_KLU_standalone.cpp)
 target_link_libraries(klu_klu_standalone.exe PRIVATE ReSolve)
 
+# Create CUDA examples
 if(RESOLVE_USE_CUDA)
         
   # Build example with KLU factorization and GLU refactorization
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 550144ae..19c9fc33 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <cmath>
 
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index 628eeb06..9d658b6e 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <cmath>
 
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 2efac125..8dbcc467 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -8,16 +8,21 @@
 
 add_subdirectory(utilities)
 
+# C++ files
 set(ReSolve_SRC
     LinSolver.cpp
     LinSolverDirectKLU.cpp
+)
+
+# C++ code that links to CUDA SDK libraries
+set(ReSolve_CUDASDK_SRC
     LinSolverIterativeFGMRES.cpp
     GramSchmidt.cpp
     LinSolverDirectCuSolverGLU.cpp
     LinSolverDirectCuSolverRf.cpp
 )
 
-
+# Header files to be installed
 set(ReSolve_HEADER_INSTALL
     Common.hpp
     cusolver_defs.hpp
@@ -71,17 +76,21 @@ set(ReSolve_Targets_List
     resolve_workspace
 )
 
+# If CUDA support is enabled add CUDA SDK specific code and dependencies
 if(RESOLVE_USE_CUDA)
+  set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC})
   set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
 endif()
 
+# If no GPU support is enabled, link to dummy device backend
 if(NOT RESOLVE_USE_GPU)
     set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
 endif()
 
-
+# Set installable targets
 install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets)
 
+# Create ReSolve library
 add_library(ReSolve SHARED ${ReSolve_SRC})
 
 target_include_directories(ReSolve INTERFACE 
@@ -92,10 +101,12 @@ target_include_directories(ReSolve INTERFACE
 # TODO: Make this PRIVATE dependency (requires refactoring ReSolve code)
 target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List})
 
+# Install targets
 install(TARGETS ReSolve
         EXPORT ReSolveTargets
         ARCHIVE DESTINATION lib
         LIBRARY DESTINATION lib)
+
 # install include headers
 install(FILES ${ReSolve_HEADER_INSTALL} DESTINATION include/resolve)
 
diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt
index cb84dc55..4e58e94a 100644
--- a/resolve/cpu/CMakeLists.txt
+++ b/resolve/cpu/CMakeLists.txt
@@ -8,6 +8,7 @@
 
 set(ReSolve_CPU_SRC
     MemoryUtils.cpp
+    cpuVectorKernels.cpp
 )
 
 set(ReSolve_CPU_HEADER_INSTALL
diff --git a/resolve/cpu/cpuVectorKernels.cpp b/resolve/cpu/cpuVectorKernels.cpp
new file mode 100644
index 00000000..26a20413
--- /dev/null
+++ b/resolve/cpu/cpuVectorKernels.cpp
@@ -0,0 +1,15 @@
+#include <resolve/Common.hpp>
+#include <resolve/vector/VectorKernels.hpp>
+
+
+namespace ReSolve { namespace vector {
+
+
+void set_array_const(index_type n, real_type val, real_type* arr)
+{
+  for(index_type i = 0; i < n; ++i) {
+    arr[i] = val;
+  }
+}
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 5d05df1b..78b1689c 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -6,7 +6,7 @@
 
 ]]
 
-
+# C++ code
 set(Matrix_SRC 
     io.cpp
     Sparse.cpp
@@ -15,10 +15,14 @@ set(Matrix_SRC
     Coo.cpp
     MatrixHandler.cpp
     MatrixHandlerCpu.cpp
-    MatrixHandlerCuda.cpp
 )
 
+# C++ code that depends on CUDA SDK libraries
+set(Matrix_CUDASDK_SRC 
+    MatrixHandlerCuda.cpp
+)
 
+# Header files to be installed
 set(Matrix_HEADER_INSTALL
     io.hpp
     Sparse.hpp
@@ -28,13 +32,23 @@ set(Matrix_HEADER_INSTALL
     MatrixHandler.hpp
 )
 
+# Add CUDA matrix handler if CUDA support is enabled
+if(RESOLVE_USE_CUDA)
+  set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC})
+endif()
+
+
 # Build shared library ReSolve::matrix
 add_library(resolve_matrix SHARED ${Matrix_SRC})
 
+# Link to CUDA ReSolve backend if CUDA is support enabled
 if (RESOLVE_USE_CUDA)
-    target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
-else()
-    target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)
+  target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
+endif()
+
+# Link to dummy device backend if GPU support is not enabled
+if (NOT RESOLVE_USE_GPU)
+  target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)
 endif()
 
 
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index c4849c18..b0ae516e 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -9,7 +9,10 @@
 #include <resolve/utilities/misc/IndexValuePair.hpp>
 #include "MatrixHandler.hpp"
 #include "MatrixHandlerCpu.hpp"
+
+#ifdef RESOLVE_USE_CUDA
 #include "MatrixHandlerCuda.hpp"
+#endif
 
 namespace ReSolve {
   // Create a shortcut name for Logger static class
@@ -28,7 +31,9 @@ namespace ReSolve {
   {
     this->new_matrix_ = true;
     cpuImpl_  = new MatrixHandlerCpu();
+#ifdef RESOLVE_USE_CUDA
     cudaImpl_ = new MatrixHandlerCuda();
+#endif
   }
 
   /**
@@ -62,6 +67,7 @@ namespace ReSolve {
    * 
    * @post A CUDA implementation instance is created with supplied workspace.
    */
+#ifdef RESOLVE_USE_CUDA
   MatrixHandler::MatrixHandler(LinAlgWorkspaceCUDA* new_workspace)
   {
     cpuImpl_  = new MatrixHandlerCpu();
@@ -69,6 +75,7 @@ namespace ReSolve {
     isCpuEnabled_  = true;
     isCudaEnabled_ = true;
   }
+#endif
 
   void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace)
   {
diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index f418bf55..ca056acb 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -6,29 +6,43 @@
 
 ]]
 
-
+# C++ code
 set(Vector_SRC 
     Vector.cpp
     VectorHandler.cpp
     VectorHandlerCpu.cpp
-    VectorHandlerCuda.cpp
 )
 
+# C++ code that depends on CUDA SDK libraries
+set(Vector_CUDASDK_SRC 
+    VectorHandlerCuda.cpp
+)
 
+# Header files to be installed
 set(Vector_HEADER_INSTALL
     Vector.hpp
     VectorHandler.hpp
     VectorKernels.hpp
 )
 
+# Add CUDA vector handler if CUDA support is enabled
+if(RESOLVE_USE_CUDA)
+  set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC})
+endif()
+
 add_library(resolve_vector SHARED ${Vector_SRC})
 
+# Link to ReSolve CUDA backend if CUDA is enabled
 if (RESOLVE_USE_CUDA)
-    target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
-else()
-    target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu)
+  target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
 endif()
 
+# If no GPU is enabled link to dummy device backend
+if(NOT RESOLVE_USE_GPU)
+  target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu)
+endif(NOT RESOLVE_USE_GPU)
+
+
 target_include_directories(resolve_vector INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
     $<INSTALL_INTERFACE:include>
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index 0feb96a8..8c89cb2f 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -1,14 +1,17 @@
 #include <iostream>
+#include <cmath>
 
 #include <resolve/utilities/logger/Logger.hpp>
-#include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/workspace/LinAlgWorkspace.hpp>
 #include <resolve/vector/VectorHandlerImpl.hpp>
 #include <resolve/vector/VectorHandlerCpu.hpp>
-#include <resolve/vector/VectorHandlerCuda.hpp>
 #include "VectorHandler.hpp"
 
+#ifdef RESOLVE_USE_CUDA
+#include <resolve/vector/VectorHandlerCuda.hpp>
+#endif
+
 namespace ReSolve {
   using out = io::Logger;
 
@@ -32,6 +35,7 @@ namespace ReSolve {
     isCpuEnabled_ = true;
   }
 
+#ifdef RESOLVE_USE_CUDA
   /** 
    * @brief constructor
    * 
@@ -45,6 +49,7 @@ namespace ReSolve {
     isCudaEnabled_ = true;
     isCpuEnabled_  = true;
   }
+#endif
 
   /** 
    * @brief destructor     
diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt
index c6225487..673fac4b 100644
--- a/resolve/workspace/CMakeLists.txt
+++ b/resolve/workspace/CMakeLists.txt
@@ -6,11 +6,13 @@
 
 ]]
 
+# C++ code
 set(ReSolve_Workspace_SRC
   LinAlgWorkspaceCpu.cpp
 )
 
-set(ReSolve_Workspace_CUDA_SRC # consider adding a separate dir for cuda-sdk dependent code
+# C++ code that depends on CUDA SDK libraries
+set(ReSolve_Workspace_CUDASDK_SRC
     LinAlgWorkspaceCUDA.cpp
 )
 
@@ -20,12 +22,17 @@ set(ReSolve_Workspace_HEADER_INSTALL
   LinAlgWorkspaceCUDA.hpp
 )
 
+# If cuda is enabled, add CUDA SDK workspace files
+if(RESOLVE_USE_CUDA)
+  set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC})
+endif()
+
+add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC})
+
+# If CUDA is enabled, link to ReSolve CUDA backend
 if(RESOLVE_USE_CUDA)
-  add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDA_SRC})
   target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda)
-else(RESOLVE_USE_CUDA)  
-  add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC})
-endif(RESOLVE_USE_CUDA)
+endif(RESOLVE_USE_CUDA)  
 
 target_include_directories(resolve_workspace INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>

From 977e1dab2b9a8eef2b633bc343e7988ad772f022 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 22:48:39 -0400
Subject: [PATCH 13/15] Fixes to CMake

---
 resolve/cpu/CMakeLists.txt    | 2 ++
 resolve/matrix/CMakeLists.txt | 1 +
 resolve/vector/CMakeLists.txt | 1 +
 tests/unit/CMakeLists.txt     | 8 +++-----
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt
index 4e58e94a..7105655c 100644
--- a/resolve/cpu/CMakeLists.txt
+++ b/resolve/cpu/CMakeLists.txt
@@ -17,6 +17,8 @@ set(ReSolve_CPU_HEADER_INSTALL
 
 # First create dummy backend
 add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC})
+target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger)
+
 target_include_directories(resolve_backend_cpu INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
     $<INSTALL_INTERFACE:include>
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 78b1689c..554c0ba7 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -40,6 +40,7 @@ endif()
 
 # Build shared library ReSolve::matrix
 add_library(resolve_matrix SHARED ${Matrix_SRC})
+target_link_libraries(resolve_matrix PRIVATE resolve_logger resolve_vector)
 
 # Link to CUDA ReSolve backend if CUDA is support enabled
 if (RESOLVE_USE_CUDA)
diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index ca056acb..16d53010 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -31,6 +31,7 @@ if(RESOLVE_USE_CUDA)
 endif()
 
 add_library(resolve_vector SHARED ${Vector_SRC})
+target_link_libraries(resolve_vector PRIVATE resolve_logger)
 
 # Link to ReSolve CUDA backend if CUDA is enabled
 if (RESOLVE_USE_CUDA)
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d61250fe..f91c2ff7 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -6,8 +6,6 @@
 
 ]]
 
-if(RESOLVE_USE_CUDA)
-  add_subdirectory(matrix)
-  add_subdirectory(vector)
-  add_subdirectory(utilities)
-endif(RESOLVE_USE_CUDA)
+add_subdirectory(matrix)
+add_subdirectory(vector)
+add_subdirectory(utilities)

From 3037513c3e1b59877cc047e56d96f054dffa4886 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Thu, 19 Oct 2023 23:57:37 -0400
Subject: [PATCH 14/15] Code cleanup.

---
 resolve/cuda/CMakeLists.txt              |  5 +++-
 resolve/matrix/MatrixHandler.cpp         | 35 ++++++++++++------------
 resolve/matrix/MatrixHandler.hpp         | 18 +++++++-----
 resolve/matrix/MatrixHandlerCpu.cpp      | 13 ++++-----
 resolve/matrix/MatrixHandlerCpu.hpp      | 10 +------
 resolve/matrix/MatrixHandlerCuda.cpp     |  5 ----
 resolve/matrix/MatrixHandlerCuda.hpp     | 14 ++--------
 resolve/matrix/MatrixHandlerImpl.hpp     | 12 +-------
 tests/unit/matrix/MatrixHandlerTests.hpp |  4 +--
 9 files changed, 43 insertions(+), 73 deletions(-)

diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index 04a9c2ff..f97267bc 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -21,8 +21,11 @@ set(ReSolve_CUDA_HEADER_INSTALL
 
 set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
 
-# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
+# First create CUDA backend 
+# (this should really be CUDA _API_ backend, 
+# separate backend will be needed for CUDA SDK)
 add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC})
+target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger)
 target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda)
 target_include_directories(resolve_backend_cuda INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index b0ae516e..8bf4302c 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -29,11 +29,8 @@ namespace ReSolve {
    */
   MatrixHandler::MatrixHandler()
   {
-    this->new_matrix_ = true;
-    cpuImpl_  = new MatrixHandlerCpu();
-#ifdef RESOLVE_USE_CUDA
-    cudaImpl_ = new MatrixHandlerCuda();
-#endif
+    new_matrix_ = true;
+    cpuImpl_    = new MatrixHandlerCpu();
   }
 
   /**
@@ -59,6 +56,7 @@ namespace ReSolve {
     isCudaEnabled_ = false;
   }
 
+#ifdef RESOLVE_USE_CUDA
   /**
    * @brief Constructor taking pointer to the CUDA workspace as its parameter.
    * 
@@ -67,7 +65,6 @@ namespace ReSolve {
    * 
    * @post A CUDA implementation instance is created with supplied workspace.
    */
-#ifdef RESOLVE_USE_CUDA
   MatrixHandler::MatrixHandler(LinAlgWorkspaceCUDA* new_workspace)
   {
     cpuImpl_  = new MatrixHandlerCpu();
@@ -88,13 +85,14 @@ namespace ReSolve {
     }
   }
 
+  /**
+   * @brief Converts COO to CSR matrix format.
+   * 
+   * Conversion takes place on CPU, and then CSR matrix is copied to `memspace`.
+   */
   int MatrixHandler::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace)
   {
-    //this happens on the CPU not on the GPU
-    //but will return whatever memspace requested.
-
     //count nnzs first
-
     index_type nnz_unpacked = 0;
     index_type nnz = A_coo->getNnz();
     index_type n = A_coo->getNumRows();
@@ -248,14 +246,15 @@ namespace ReSolve {
   }
 
   /**
-   * @brief Matrix vector product method  result = alpha *A*x + beta * result
-   * @param A 
-   * @param vec_x 
-   * @param vec_result 
-   * @param[in] alpha 
-   * @param[in] beta 
-   * @param[in] matrixFormat 
-   * @param[in] memspace 
+   * @brief Matrix vector product: result = alpha * A * x + beta * result
+   * 
+   * @param[in]  A - Sparse matrix
+   * @param[in]  vec_x - Vector multiplied by the matrix
+   * @param[out] vec_result - Vector where the result is stored
+   * @param[in]  alpha - scalar parameter
+   * @param[in]  beta  - scalar parameter
+   * @param[in]  matrixFormat - Only CSR format is supported at this time
+   * @param[in]  memspace     - Device where the product is computed
    * @return result := alpha * A * x + beta * result
    */
   int MatrixHandler::matvec(matrix::Sparse* A, 
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index e05e6c8f..398a8039 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -33,6 +33,10 @@ namespace ReSolve {
    *  - Matrix vector product (SpMV)
    *  - Matrix 1-norm
    * 
+   * The class uses pointer to implementation (PIMPL) idiom to create
+   * multiple matrix operation implementations running on CUDA and HIP devices
+   * as well as on CPU.
+   * 
    * @author Kasia Swirydowicz <kasia.swirydowicz@pnnl.gov>
    * @author Slaven Peles <peless@ornl.gov>
    */
@@ -46,7 +50,7 @@ namespace ReSolve {
       MatrixHandler(LinAlgWorkspaceCUDA* workspace);
       ~MatrixHandler();
 
-      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer)
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace);
       int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
 
       /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
@@ -61,14 +65,14 @@ namespace ReSolve {
       void setValuesChanged(bool toWhat, std::string memspace); 
     
     private: 
-      bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
+      bool new_matrix_{true};  ///< if the structure changed, you need a new handler.
 
-      MemoryHandler mem_; ///< Device memory manager object
-      MatrixHandlerImpl*  cpuImpl_{nullptr};
-      MatrixHandlerImpl* cudaImpl_{nullptr};
+      MemoryHandler mem_;      ///< Device memory manager object
+      MatrixHandlerImpl*  cpuImpl_{nullptr}; ///< Pointer to CPU implementation
+      MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation
 
-      bool isCpuEnabled_{false};
-      bool isCudaEnabled_{false};
+      bool isCpuEnabled_{false};  ///< true if CPU  implementation is instantiated
+      bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index 95637402..2c434dcb 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -14,8 +14,6 @@ namespace ReSolve {
 
   MatrixHandlerCpu::MatrixHandlerCpu()
   {
-    // new_matrix_ = true;
-    // values_changed_ = true;
   }
 
   MatrixHandlerCpu::~MatrixHandlerCpu()
@@ -27,17 +25,15 @@ namespace ReSolve {
     workspace_ = new_workspace;
   }
 
-  // bool MatrixHandlerCpu::getValuesChanged()
-  // {
-  //   return this->values_changed_;
-  // }
-
   void MatrixHandlerCpu::setValuesChanged(bool values_changed)
   {
     values_changed_ = values_changed;
   }
 
 
+  /**
+   * @brief result := alpha * A * x + beta * result
+   */
   int MatrixHandlerCpu::matvec(matrix::Sparse* Ageneric, 
                                vector_type* vec_x, 
                                vector_type* vec_result, 
@@ -49,7 +45,6 @@ namespace ReSolve {
     // int error_sum = 0;
     if (matrixFormat == "csr") {
       matrix::Csr* A = (matrix::Csr*) Ageneric;
-      //result = alpha *A*x + beta * result
       index_type* ia = A->getRowData("cpu");
       index_type* ja = A->getColData("cpu");
       real_type*   a = A->getValues("cpu");
@@ -90,6 +85,8 @@ namespace ReSolve {
   }
 
   /**
+   * @brief Convert CSC to CSR matrix on the host
+   * 
    * @authors Slaven Peles <peless@ornl.gov>, Daniel Reynolds (SMU), and
    * David Gardner and Carol Woodward (LLNL)
    */
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
index 3b6c91c0..0b0afbd3 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -1,8 +1,3 @@
-// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
-// this includes 
-// (1) Matrix format conversion: coo2csr, csr2csc
-// (2) Matrix vector product (SpMV)
-// (3) Matrix 1-norm
 #pragma once
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
@@ -40,10 +35,8 @@ namespace ReSolve {
       MatrixHandlerCpu(LinAlgWorkspaceCpu* workspace);
       virtual ~MatrixHandlerCpu();
 
-      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
-      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr);
 
-      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
       virtual int matvec(matrix::Sparse* A,
                  vector_type* vec_x,
                  vector_type* vec_result,
@@ -55,7 +48,6 @@ namespace ReSolve {
     
     private: 
       LinAlgWorkspaceCpu* workspace_{nullptr};
-      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
       bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index b27a63c0..3405ba8d 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -12,11 +12,6 @@ namespace ReSolve {
   // Create a shortcut name for Logger static class
   using out = io::Logger;
 
-  MatrixHandlerCuda::MatrixHandlerCuda()
-  {
-    values_changed_ = true;
-  }
-
   MatrixHandlerCuda::~MatrixHandlerCuda()
   {
   }
diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp
index a8fec606..efd4c356 100644
--- a/resolve/matrix/MatrixHandlerCuda.hpp
+++ b/resolve/matrix/MatrixHandlerCuda.hpp
@@ -1,8 +1,3 @@
-// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
-// this includes 
-// (1) Matrix format conversion: coo2csr, csr2csc
-// (2) Matrix vector product (SpMV)
-// (3) Matrix 1-norm
 #pragma once
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
@@ -29,21 +24,17 @@ namespace ReSolve {
   /**
    * @class MatrixHandlerCuda
    * 
-   * @brief CPU implementation of the matrix handler.
+   * @brief CUDA implementation of the matrix handler.
    */
   class MatrixHandlerCuda : public MatrixHandlerImpl
   {
     using vector_type = vector::Vector;
     
     public:
-      MatrixHandlerCuda();
       MatrixHandlerCuda(LinAlgWorkspaceCUDA* workspace);
       virtual ~MatrixHandlerCuda();
 
-      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer)
-      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
-
-      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr);
       virtual int matvec(matrix::Sparse* A,
                  vector_type* vec_x,
                  vector_type* vec_result,
@@ -55,7 +46,6 @@ namespace ReSolve {
     
     private: 
       LinAlgWorkspaceCUDA* workspace_{nullptr};
-      // bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
       bool values_changed_{true}; ///< needed for matvec
 
       MemoryHandler mem_; ///< Device memory manager object
diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp
index b901e2b5..2bef6b3d 100644
--- a/resolve/matrix/MatrixHandlerImpl.hpp
+++ b/resolve/matrix/MatrixHandlerImpl.hpp
@@ -1,8 +1,3 @@
-// this class encapsulates various matrix manipulation operations, commonly required by linear solvers:
-// this includes 
-// (1) Matrix format conversion: coo2csr, csr2csc
-// (2) Matrix vector product (SpMV)
-// (3) Matrix 1-norm
 #pragma once
 #include <resolve/Common.hpp>
 #include <resolve/MemoryUtils.hpp>
@@ -20,7 +15,6 @@ namespace ReSolve
     class Csc;
     class Csr;
   }
-  // class LinAlgWorkspace;
 }
 
 
@@ -37,14 +31,11 @@ namespace ReSolve {
     public:
       MatrixHandlerImpl()
       {}
-      // MatrixHandlerImpl(LinAlgWorkspace* workspace);
       virtual ~MatrixHandlerImpl()
       {}
 
       virtual int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) = 0;
-      // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace);
 
-      /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped
       virtual int matvec(matrix::Sparse* A,
                          vector_type* vec_x,
                          vector_type* vec_result,
@@ -53,8 +44,7 @@ namespace ReSolve {
                          std::string matrix_type) = 0;
       virtual int Matrix1Norm(matrix::Sparse* A, real_type* norm) = 0;
 
-      virtual void setValuesChanged(bool isValuesChanged) = 0;
-    
+      virtual void setValuesChanged(bool isValuesChanged) = 0;    
   };
 
 } // namespace ReSolve
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 0aa476f8..e203017a 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -73,8 +73,8 @@ class MatrixHandlerTests : TestBase
   ReSolve::MatrixHandler* createMatrixHandler()
   {
     if (memspace_ == "cpu") {
-      LinAlgWorkspaceCpu* workpsace = new LinAlgWorkspaceCpu();
-      return new MatrixHandler(workpsace);
+      LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu();
+      return new MatrixHandler(workspace);
 #ifdef RESOLVE_USE_CUDA
     } else if (memspace_ == "cuda") {
       LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();

From 126b16aa0f1fe5a9ac11a4c62b232eb78da0c74b Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Sat, 21 Oct 2023 16:16:06 -0400
Subject: [PATCH 15/15] Fix I/O in examples.

---
 examples/r_KLU_GLU.cpp                        |  5 ++-
 examples/r_KLU_GLU_matrix_values_update.cpp   |  5 ++-
 examples/r_KLU_KLU.cpp                        |  9 ++---
 examples/r_KLU_KLU_standalone.cpp             |  7 ++--
 examples/r_KLU_rf.cpp                         |  5 ++-
 examples/r_KLU_rf_FGMRES.cpp                  | 12 +++++-
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   | 38 ++++++++++++++-----
 runResolve                                    |  6 +--
 8 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index e0831976..e2cbfde4 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
@@ -148,7 +149,9 @@ int main(int argc, char *argv[])
     matrix_handler->setValuesChanged(true, "cuda");
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
-    printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
+    std::cout << "\t 2-Norm of the residual: " 
+              << std::scientific << std::setprecision(16) 
+              << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n";
 
   } // for (int i = 0; i < numSystems; ++i)
 
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index e6e09673..7d1bb141 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/io.hpp>
@@ -158,7 +159,9 @@ int main(int argc, char *argv[])
       matrix_handler->setValuesChanged(true, "cuda");
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
-      printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
+      std::cout << "\t 2-Norm of the residual: " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n";
 
 
     }
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 19c9fc33..8b0ea59a 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 #include <cmath>
 
 #include <resolve/matrix/Coo.hpp>
@@ -138,12 +139,10 @@ int main(int argc, char *argv[])
     matrix_handler->setValuesChanged(true, "cpu");
 
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
-    real_type* test = vec_r->getData("cpu");
-    (void) test; // TODO: Do we need `test` variable in this example?
-
-    printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cpu")));
-
 
+    std::cout << "\t 2-Norm of the residual: " 
+              << std::scientific << std::setprecision(16) 
+              << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n";
   }
 
   //now DELETE
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index 9d658b6e..77e5b97a 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 #include <cmath>
 
 #include <resolve/matrix/Coo.hpp>
@@ -100,10 +101,10 @@ int main(int argc, char *argv[])
   matrix_handler->setValuesChanged(true, "cpu");
 
   matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
-  real_type* test = vec_r->getData("cpu");
-  (void) test; // TODO: Do we need `test` variable in this example?
 
-  printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cpu")));
+  std::cout << "\t 2-Norm of the residual: " 
+            << std::scientific << std::setprecision(16) 
+            << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n";
 
 
 
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index 5dce2dfe..01fa0f3c 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
@@ -162,7 +163,9 @@ int main(int argc, char *argv[] )
 
     matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
-    printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda")));
+    std::cout << "\t 2-Norm of the residual: " 
+              << std::scientific << std::setprecision(16) 
+              << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n";
 
   } // for (int i = 0; i < numSystems; ++i)
 
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index 2e43f9c3..ee674869 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -1,5 +1,6 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
 
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
@@ -171,12 +172,19 @@ int main(int argc, char *argv[])
       
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
-      printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b);
+      std::cout << "\t 2-Norm of the residual (before IR): " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n";
 
       vec_rhs->update(rhs, "cpu", "cuda");
       FGMRES->solve(vec_rhs, vec_x);
 
-      printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter());
+      std::cout << "FGMRES: init nrm: " 
+                << std::scientific << std::setprecision(16) 
+                << FGMRES->getInitResidualNorm()/norm_b
+                << " final nrm: "
+                << FGMRES->getFinalResidualNorm()/norm_b
+                << " iter: " << FGMRES->getNumIter() << "\n";
     }
 
   } // for (int i = 0; i < numSystems; ++i)
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index 2317680c..6a520a7a 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -1,5 +1,7 @@
 #include <string>
 #include <iostream>
+#include <iomanip>
+
 #include <resolve/GramSchmidt.hpp>
 #include <resolve/matrix/Coo.hpp>
 #include <resolve/matrix/Csr.hpp>
@@ -138,7 +140,9 @@ int main(int argc, char *argv[])
       norm_b = sqrt(norm_b);
       matrix_handler->setValuesChanged(true, "cuda");
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
-      printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b);
+      std::cout << "\t 2-Norm of the residual : " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n";
       if (i == 1) {
         ReSolve::matrix::Csc* L_csc = (ReSolve::matrix::Csc*) KLU->getLFactor();
         ReSolve::matrix::Csc* U_csc = (ReSolve::matrix::Csc*) KLU->getUFactor();
@@ -147,7 +151,9 @@ int main(int argc, char *argv[])
 
         matrix_handler->csc2csr(L_csc,L, "cuda");
         matrix_handler->csc2csr(U_csc,U, "cuda");
-        if (L == nullptr) {printf("ERROR");}
+        if (L == nullptr) {
+          std::cout << "ERROR\n";
+        }
         index_type* P = KLU->getPOrdering();
         index_type* Q = KLU->getQOrdering();
         Rf->setup(A, L, U, P, Q);
@@ -163,14 +169,17 @@ int main(int argc, char *argv[])
       if ((i % 2 == 0))
       {
         status = Rf->refactorize();
-        std::cout<<"CUSOLVER RF, using REAL refactorization, refactorization status: "<<status<<std::endl;      
+        std::cout << "CUSOLVER RF, using REAL refactorization, refactorization status: "
+                  << status << std::endl;    
         vec_rhs->update(rhs, "cpu", "cuda");
         status = Rf->solve(vec_rhs, vec_x);
         FGMRES->setupPreconditioner("CuSolverRf", Rf);
       }
-     //if (i%2!=0)  vec_x->setToZero("cuda");
+      //if (i%2!=0)  vec_x->setToZero("cuda");
       real_type norm_x =  vector_handler->dot(vec_x, vec_x, "cuda");
-      printf("Norm of x(before solve): %16.16e \n", sqrt(norm_x));
+      std::cout << "Norm of x (before solve): " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(norm_x) << "\n";
       std::cout<<"CUSOLVER RF solve status: "<<status<<std::endl;      
       
       vec_rhs->update(rhs, "cpu", "cuda");
@@ -183,15 +192,26 @@ int main(int argc, char *argv[])
       
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
-      printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b);
-      printf("\t 2-Norm of the RIGHT HAND SIDE: %16.16e\n", norm_b);
+      std::cout << "\t 2-Norm of the residual (before IR): " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n";
+      std::cout << "\t 2-Norm of the RIGHT HAND SIDE: " 
+                << std::scientific << std::setprecision(16) 
+                << norm_b << "\n";
 
       vec_rhs->update(rhs, "cpu", "cuda");
       FGMRES->solve(vec_rhs, vec_x);
 
-      printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter());
+      std::cout << "FGMRES: init nrm: " 
+                << std::scientific << std::setprecision(16) 
+                << FGMRES->getInitResidualNorm()/norm_b
+                << " final nrm: "
+                << FGMRES->getFinalResidualNorm()/norm_b
+                << " iter: " << FGMRES->getNumIter() << "\n";
       norm_x = vector_handler->dot(vec_x, vec_x, "cuda");
-      printf("Norm of x (after IR): %16.16e \n", sqrt(norm_x));
+      std::cout << "Norm of x (after IR): " 
+                << std::scientific << std::setprecision(16) 
+                << sqrt(norm_x) << "\n";
     }
 
 
diff --git a/runResolve b/runResolve
index 0a8e5dbf..2d5c67ca 100755
--- a/runResolve
+++ b/runResolve
@@ -13,9 +13,9 @@ foo=" ${foo} ${i} ${i}"
 done
 
 
-matfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_"
+matfile="$HOME/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_"
 
-rhsfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_"
+rhsfile="$HOME/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_"
 
 
-./resolve/klu_rf_fgmres.exe $matfile $rhsfile 10 $foo
+./$1 $matfile $rhsfile 10 $foo