diff --git a/CMakeLists.txt b/CMakeLists.txt index 5715e6e2..d3b566ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,13 @@ option(RESOLVE_USE_GPU "Use GPU device for computations" ON) option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON) set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") +if(RESOLVE_USE_CUDA) + set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE) +else() + set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE) +endif() + + set(CMAKE_MACOSX_RPATH 1) # set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") # https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath @@ -130,10 +137,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfigVersion.cmake" DESTINATION share/resolve/cmake) -# Add examples (for now only CUDA examples are available) -if(RESOLVE_USE_CUDA) - add_subdirectory(examples) -endif(RESOLVE_USE_CUDA) +# Add usage examples +add_subdirectory(examples) # Add tests add_subdirectory(tests) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 73dce68e..f2d323f4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,30 +14,40 @@ target_link_libraries(klu_klu.exe PRIVATE ReSolve) add_executable(klu_klu_standalone.exe r_KLU_KLU_standalone.cpp) target_link_libraries(klu_klu_standalone.exe PRIVATE ReSolve) -# Build example with KLU factorization and GLU refactorization -add_executable(klu_glu.exe r_KLU_GLU.cpp) -target_link_libraries(klu_glu.exe PRIVATE ReSolve) +# Create CUDA examples +if(RESOLVE_USE_CUDA) + + # Build example with KLU factorization and GLU refactorization + add_executable(klu_glu.exe r_KLU_GLU.cpp) + target_link_libraries(klu_glu.exe PRIVATE ReSolve) -# Build example with KLU factorization and Rf refactorization -add_executable(klu_rf.exe r_KLU_rf.cpp) -target_link_libraries(klu_rf.exe PRIVATE ReSolve) + # Build example with KLU 
factorization and Rf refactorization + add_executable(klu_rf.exe r_KLU_rf.cpp) + target_link_libraries(klu_rf.exe PRIVATE ReSolve) -# Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement -add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp) -target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve) + # Build example with KLU factorization, Rf refactorization, and FGMRES iterative refinement + add_executable(klu_rf_fgmres.exe r_KLU_rf_FGMRES.cpp) + target_link_libraries(klu_rf_fgmres.exe PRIVATE ReSolve) -# Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED -add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp) -target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve) + # Build example where matrix is factorized once, refactorized once and then the preconditioner is REUSED + add_executable(klu_rf_fgmres_reuse_refactorization.exe r_KLU_rf_FGMRES_reuse_factorization.cpp) + target_link_libraries(klu_rf_fgmres_reuse_refactorization.exe PRIVATE ReSolve) -# Build example where matrix data is updated -add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp) -target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve) + # Build example where matrix data is updated + add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp) + target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve) + +endif(RESOLVE_USE_CUDA) # Install all examples in bin directory -set(installable_executables klu_klu.exe klu_klu_standalone.exe klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) +set(installable_executables klu_klu.exe klu_klu_standalone.exe) + +if(RESOLVE_USE_CUDA) + set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) +endif(RESOLVE_USE_CUDA) + install(TARGETS ${installable_executables} RUNTIME 
DESTINATION bin) diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp index a4c059a9..e2cbfde4 100644 --- a/examples/r_KLU_GLU.cpp +++ b/examples/r_KLU_GLU.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -145,10 +146,12 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } // for (int i = 0; i < numSystems; ++i) diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp index 96c77591..7d1bb141 100644 --- a/examples/r_KLU_GLU_matrix_values_update.cpp +++ b/examples/r_KLU_GLU_matrix_values_update.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include // this updates the matrix values to simulate what CFD/optimization software does. 
@@ -155,10 +156,12 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index 63aad4b5..8b0ea59a 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -9,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -35,10 +37,9 @@ int main(int argc, char *argv[]) ReSolve::matrix::Coo* A_coo; ReSolve::matrix::Csr* A; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); real_type* rhs; real_type* x; @@ -96,7 +97,11 @@ int main(int argc, char *argv[]) ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); } - std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<getNumRows() + << " x " << A->getNumColumns() + << ", nnz: " << A->getNnz() + << ", symmetric? " << A->symmetric() + << ", Expanded? 
" << A->expanded() << std::endl; mat_file.close(); rhs_file.close(); @@ -106,8 +111,8 @@ int main(int argc, char *argv[]) vec_rhs->update(rhs, "cpu", "cpu"); vec_rhs->setDataUpdated("cpu"); } else { - matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, "cpu", "cpu"); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); - - matrix_handler->setValuesChanged(true); - - matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); - real_type* test = vec_r->getData("cpu"); - (void) test; // TODO: Do we need `test` variable in this example? + vec_r->update(rhs, "cpu", "cpu"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + matrix_handler->setValuesChanged(true, "cpu"); + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n"; } //now DELETE diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp index fd84efe0..77e5b97a 100644 --- a/examples/r_KLU_KLU_standalone.cpp +++ b/examples/r_KLU_KLU_standalone.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -9,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -31,10 +33,9 @@ int main(int argc, char *argv[]) ReSolve::matrix::Coo* A_coo; ReSolve::matrix::Csr* A; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new 
ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); real_type* rhs; real_type* x; @@ -95,15 +96,15 @@ int main(int argc, char *argv[]) std::cout << "KLU factorization status: " << status << std::endl; status = KLU->solve(vec_rhs, vec_x); std::cout << "KLU solve status: " << status << std::endl; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cpu"); - matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); - real_type* test = vec_r->getData("cpu"); - (void) test; // TODO: Do we need `test` variable in this example? + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cpu")) << "\n"; diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index 9e166d34..01fa0f3c 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -10,7 +11,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -158,11 +159,13 @@ int main(int argc, char *argv[] ) } vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual: %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))); + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda")) << "\n"; } // for (int i = 0; i < numSystems; ++i) diff --git 
a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index db816206..ee674869 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -11,7 +12,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -125,7 +126,7 @@ int main(int argc, char *argv[]) real_type norm_b; if (i < 2){ KLU->setup(A); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = KLU->analyze(); std::cout<<"KLU analysis status: "<factorize(); @@ -135,7 +136,7 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); if (i == 1) { @@ -165,18 +166,25 @@ int main(int argc, char *argv[]) norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - //matrix_handler->setValuesChanged(true); + //matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); FGMRES->setupPreconditioner("CuSolverRf", Rf); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; vec_rhs->update(rhs, "cpu", "cuda"); FGMRES->solve(vec_rhs, vec_x); - printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter()); + std::cout << "FGMRES: init nrm: " + << std::scientific << 
std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; } } // for (int i = 0; i < numSystems; ++i) diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index d0ca3a56..6a520a7a 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -11,7 +13,7 @@ #include #include #include -#include +#include using namespace ReSolve::constants; @@ -126,7 +128,7 @@ int main(int argc, char *argv[]) real_type norm_b; if (i < 2){ KLU->setup(A); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = KLU->analyze(); std::cout<<"KLU analysis status: "<factorize(); @@ -136,9 +138,11 @@ int main(int argc, char *argv[]) vec_r->update(rhs, "cpu", "cuda"); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); + std::cout << "\t 2-Norm of the residual : " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; if (i == 1) { ReSolve::matrix::Csc* L_csc = (ReSolve::matrix::Csc*) KLU->getLFactor(); ReSolve::matrix::Csc* U_csc = (ReSolve::matrix::Csc*) KLU->getUFactor(); @@ -147,7 +151,9 @@ int main(int argc, char *argv[]) matrix_handler->csc2csr(L_csc,L, "cuda"); matrix_handler->csc2csr(U_csc,U, "cuda"); - if (L == nullptr) {printf("ERROR");} + if (L == nullptr) { + std::cout << "ERROR\n"; + } index_type* P = KLU->getPOrdering(); index_type* Q = KLU->getQOrdering(); Rf->setup(A, L, U, P, Q); @@ 
-163,14 +169,17 @@ int main(int argc, char *argv[]) if ((i % 2 == 0)) { status = Rf->refactorize(); - std::cout<<"CUSOLVER RF, using REAL refactorization, refactorization status: "<update(rhs, "cpu", "cuda"); status = Rf->solve(vec_rhs, vec_x); FGMRES->setupPreconditioner("CuSolverRf", Rf); } - //if (i%2!=0) vec_x->setToZero("cuda"); + //if (i%2!=0) vec_x->setToZero("cuda"); real_type norm_x = vector_handler->dot(vec_x, vec_x, "cuda"); - printf("Norm of x(before solve): %16.16e \n", sqrt(norm_x)); + std::cout << "Norm of x (before solve): " + << std::scientific << std::setprecision(16) + << sqrt(norm_x) << "\n"; std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); @@ -178,20 +187,31 @@ int main(int argc, char *argv[]) norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); - printf("\t 2-Norm of the residual (before IR): %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b); - printf("\t 2-Norm of the RIGHT HAND SIDE: %16.16e\n", norm_b); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; + std::cout << "\t 2-Norm of the RIGHT HAND SIDE: " + << std::scientific << std::setprecision(16) + << norm_b << "\n"; vec_rhs->update(rhs, "cpu", "cuda"); FGMRES->solve(vec_rhs, vec_x); - printf("FGMRES: init nrm: %16.16e final nrm: %16.16e iter: %d \n", FGMRES->getInitResidualNorm()/norm_b, FGMRES->getFinalResidualNorm()/norm_b, FGMRES->getNumIter()); + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; norm_x = vector_handler->dot(vec_x, vec_x, 
"cuda"); - printf("Norm of x (after IR): %16.16e \n", sqrt(norm_x)); + std::cout << "Norm of x (after IR): " + << std::scientific << std::setprecision(16) + << sqrt(norm_x) << "\n"; } diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 675fc11f..8dbcc467 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -8,21 +8,24 @@ add_subdirectory(utilities) +# C++ files set(ReSolve_SRC LinSolver.cpp LinSolverDirectKLU.cpp - LinAlgWorkspace.cpp +) + +# C++ code that links to CUDA SDK libraries +set(ReSolve_CUDASDK_SRC LinSolverIterativeFGMRES.cpp GramSchmidt.cpp LinSolverDirectCuSolverGLU.cpp LinSolverDirectCuSolverRf.cpp ) - +# Header files to be installed set(ReSolve_HEADER_INSTALL Common.hpp cusolver_defs.hpp - LinAlgWorkspace.hpp LinSolver.hpp LinSolverDirectCuSolverGLU.hpp LinSolverDirectCuSolverRf.hpp @@ -39,11 +42,15 @@ if(NOT RESOLVE_USE_GPU) add_subdirectory(cpu) endif() -# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) +# If CUDA support is enabled, create CUDA backend +# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) if(RESOLVE_USE_CUDA) add_subdirectory(cuda) endif() +# Now, build workspaces +add_subdirectory(workspace) + # Next build vector and matrix objects that may use this backend. 
add_subdirectory(vector) add_subdirectory(matrix) @@ -66,19 +73,24 @@ set(ReSolve_Targets_List resolve_vector resolve_logger resolve_tpl + resolve_workspace ) -if(RESOLVE_USE_GPU) - if(RESOLVE_USE_CUDA) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) - endif() -else(RESOLVE_USE_GPU) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) -endif(RESOLVE_USE_GPU) +# If CUDA support is enabled add CUDA SDK specific code and dependencies +if(RESOLVE_USE_CUDA) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC}) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) +endif() +# If no GPU support is enabled, link to dummy device backend +if(NOT RESOLVE_USE_GPU) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) +endif() +# Set installable targets install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets) +# Create ReSolve library add_library(ReSolve SHARED ${ReSolve_SRC}) target_include_directories(ReSolve INTERFACE @@ -89,10 +101,12 @@ target_include_directories(ReSolve INTERFACE # TODO: Make this PRIVATE dependency (requires refactoring ReSolve code) target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List}) +# Install targets install(TARGETS ReSolve EXPORT ReSolveTargets ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) + # install include headers install(FILES ${ReSolve_HEADER_INSTALL} DESTINATION include/resolve) diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index 63413c5b..75039ff4 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -3,11 +3,12 @@ #include #include +#include #include "LinSolverDirectCuSolverGLU.hpp" namespace ReSolve { - LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace) + LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace) { this->workspace_ = workspace; } @@ -25,7 +26,7 @@ namespace ReSolve { int 
error_sum = 0; - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; //get the handle handle_cusolversp_ = workspaceCUDA->getCusolverSpHandle(); A_ = (matrix::Csr*) A; diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp index d76e1921..a48c8cba 100644 --- a/resolve/LinSolverDirectCuSolverGLU.hpp +++ b/resolve/LinSolverDirectCuSolverGLU.hpp @@ -1,6 +1,5 @@ #pragma once #include "Common.hpp" -#include #include "LinSolver.hpp" #include "cusolver_defs.hpp" #include @@ -19,12 +18,15 @@ namespace ReSolve class Sparse; } + // Forward declaration of ReSolve handlers workspace + class LinAlgWorkspace; + class LinSolverDirectCuSolverGLU : public LinSolverDirect { using vector_type = vector::Vector; public: - LinSolverDirectCuSolverGLU(LinAlgWorkspace* workspace); + LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace); ~LinSolverDirectCuSolverGLU(); int refactorize(); @@ -38,7 +40,7 @@ namespace ReSolve //note: we need cuSolver handle, we can copy it from the workspace to avoid double allocation cusparseMatDescr_t descr_M_; //this is NOT sparse matrix descriptor cusparseMatDescr_t descr_A_; //this is NOT sparse matrix descriptor - LinAlgWorkspace *workspace_;// so we can copy cusparse handle + LinAlgWorkspaceCUDA* workspace_;// so we can copy cusparse handle cusolverSpHandle_t handle_cusolversp_; cusolverStatus_t status_cusolver_; cusparseStatus_t status_cusparse_; diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index 2a6da732..fa63f2d5 100644 --- a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -66,12 +66,12 @@ namespace ReSolve { if (d_V_ != nullptr) { // cudaFree(d_V_); - delete [] d_V_; + delete d_V_; } if (d_Z_ != nullptr) { // cudaFree(d_Z_); - delete [] d_Z_; + delete d_Z_; } } @@ -308,7 +308,7 @@ namespace ReSolve int LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* 
new_matrix) { A_ = new_matrix; - matrix_handler_->setValuesChanged(true); + matrix_handler_->setValuesChanged(true, "cuda"); return 0; } diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 77c2a7af..00f3d653 100644 --- a/resolve/MemoryUtils.hpp +++ b/resolve/MemoryUtils.hpp @@ -48,14 +48,16 @@ namespace ReSolve } // namespace ReSolve -// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager. #ifdef RESOLVE_USE_GPU +// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager. #if defined RESOLVE_USE_CUDA #include using MemoryHandler = ReSolve::MemoryUtils; #elif defined RESOLVE_USE_HIP #error HIP support requested, but not available! Probably a bug in CMake configuration. +#else +#error Unrecognized device, probably bug in CMake configuration #endif #else diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt index cb84dc55..7105655c 100644 --- a/resolve/cpu/CMakeLists.txt +++ b/resolve/cpu/CMakeLists.txt @@ -8,6 +8,7 @@ set(ReSolve_CPU_SRC MemoryUtils.cpp + cpuVectorKernels.cpp ) set(ReSolve_CPU_HEADER_INSTALL @@ -16,6 +17,8 @@ set(ReSolve_CPU_HEADER_INSTALL # First create dummy backend add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC}) +target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger) + target_include_directories(resolve_backend_cpu INTERFACE $ $ diff --git a/resolve/cpu/cpuVectorKernels.cpp b/resolve/cpu/cpuVectorKernels.cpp new file mode 100644 index 00000000..26a20413 --- /dev/null +++ b/resolve/cpu/cpuVectorKernels.cpp @@ -0,0 +1,15 @@ +#include +#include + + +namespace ReSolve { namespace vector { + + +void set_array_const(index_type n, real_type val, real_type* arr) +{ + for(index_type i = 0; i < n; ++i) { + arr[i] = val; + } +} + +}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt index d3ead313..f97267bc 100644 --- a/resolve/cuda/CMakeLists.txt +++ 
b/resolve/cuda/CMakeLists.txt @@ -8,19 +8,24 @@ set(ReSolve_CUDA_SRC cudaKernels.cu + cudaVectorKernels.cu MemoryUtils.cu ) set(ReSolve_CUDA_HEADER_INSTALL cudaKernels.h + cudaVectorKernels.h CudaMemory.hpp cuda_check_errors.hpp ) set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA) -# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) +# First create CUDA backend +# (this should really be CUDA _API_ backend, +# separate backend will be needed for CUDA SDK) add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC}) +target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger) target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda) target_include_directories(resolve_backend_cuda INTERFACE $ diff --git a/resolve/vector/cudaVectorKernels.cu b/resolve/cuda/cudaVectorKernels.cu similarity index 92% rename from resolve/vector/cudaVectorKernels.cu rename to resolve/cuda/cudaVectorKernels.cu index ea54a880..a1c53198 100644 --- a/resolve/vector/cudaVectorKernels.cu +++ b/resolve/cuda/cudaVectorKernels.cu @@ -1,5 +1,5 @@ #include "cudaVectorKernels.h" -#include "VectorKernels.hpp" +#include namespace ReSolve { namespace vector { diff --git a/resolve/vector/cudaVectorKernels.h b/resolve/cuda/cudaVectorKernels.h similarity index 100% rename from resolve/vector/cudaVectorKernels.h rename to resolve/cuda/cudaVectorKernels.h diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 89ac925d..554c0ba7 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -1,4 +1,12 @@ +#[[ +@brief Build ReSolve matrix module + +@author Slaven Peles + +]] + +# C++ code set(Matrix_SRC io.cpp Sparse.cpp @@ -6,9 +14,15 @@ set(Matrix_SRC Csc.cpp Coo.cpp MatrixHandler.cpp + MatrixHandlerCpu.cpp ) +# C++ code that depends on CUDA SDK libraries +set(Matrix_CUDASDK_SRC + MatrixHandlerCuda.cpp +) +# Header files to be installed set(Matrix_HEADER_INSTALL 
io.hpp Sparse.hpp @@ -18,13 +32,24 @@ set(Matrix_HEADER_INSTALL MatrixHandler.hpp ) +# Add CUDA matrix handler if CUDA support is enabled +if(RESOLVE_USE_CUDA) + set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC}) +endif() + + # Build shared library ReSolve::matrix add_library(resolve_matrix SHARED ${Matrix_SRC}) +target_link_libraries(resolve_matrix PRIVATE resolve_logger resolve_vector) +# Link to CUDA ReSolve backend if CUDA is support enabled if (RESOLVE_USE_CUDA) - target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) -else() - target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) +endif() + +# Link to dummy device backend if GPU support is not enabled +if (NOT RESOLVE_USE_GPU) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) endif() diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 19119731..8bf4302c 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -1,82 +1,98 @@ #include #include -#include #include #include #include #include +#include +#include #include "MatrixHandler.hpp" +#include "MatrixHandlerCpu.hpp" + +#ifdef RESOLVE_USE_CUDA +#include "MatrixHandlerCuda.hpp" +#endif namespace ReSolve { // Create a shortcut name for Logger static class using out = io::Logger; - //helper class - indexPlusValue::indexPlusValue() - { - idx_ = 0; - value_ = 0.0; - } - - - indexPlusValue::~indexPlusValue() - { - } - - void indexPlusValue::setIdx(index_type new_idx) - { - idx_ = new_idx; - } - - void indexPlusValue::setValue(real_type new_value) - { - value_ = new_value; - } - - index_type indexPlusValue::getIdx() - { - return idx_; - } - - real_type indexPlusValue::getValue() - { - return value_; - } - //end of helper class - + /** + * @brief Default constructor + * + * @post Instantiates CPU and CUDA matrix handlers, but does not + * create a workspace. 
+ * + * @todo There is little utility for the default constructor. Rethink its purpose. + * Consider making it private method. + */ MatrixHandler::MatrixHandler() { - this->new_matrix_ = true; - this->values_changed_ = true; + new_matrix_ = true; + cpuImpl_ = new MatrixHandlerCpu(); } + /** + * @brief Destructor + * + */ MatrixHandler::~MatrixHandler() { + if (isCpuEnabled_) delete cpuImpl_; + if (isCudaEnabled_) delete cudaImpl_; } - MatrixHandler::MatrixHandler(LinAlgWorkspace* new_workspace) + /** + * @brief Constructor taking pointer to the workspace as its parameter. + * + * @note The CPU implementation currently does not require a workspace. + * The workspace pointer parameter is provided for forward compatibility. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceCpu* new_workspace) { - workspace_ = new_workspace; + cpuImpl_ = new MatrixHandlerCpu(new_workspace); + isCpuEnabled_ = true; + isCudaEnabled_ = false; } - bool MatrixHandler::getValuesChanged() +#ifdef RESOLVE_USE_CUDA + /** + * @brief Constructor taking pointer to the CUDA workspace as its parameter. + * + * @post A CPU implementation instance is created because it is cheap and + * it does not require a workspace. + * + * @post A CUDA implementation instance is created with supplied workspace. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceCUDA* new_workspace) { - return this->values_changed_; + cpuImpl_ = new MatrixHandlerCpu(); + cudaImpl_ = new MatrixHandlerCuda(new_workspace); + isCpuEnabled_ = true; + isCudaEnabled_ = true; } +#endif - void MatrixHandler::setValuesChanged(bool toWhat) + void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace) { - this->values_changed_ = toWhat; + if (memspace == "cpu") { + cpuImpl_->setValuesChanged(isValuesChanged); + } else if (memspace == "cuda") { + cudaImpl_->setValuesChanged(isValuesChanged); + } else { + out::error() << "Unsupported device " << memspace << "\n"; + } } + /** + * @brief Converts COO to CSR matrix format. 
+ * + * Conversion takes place on CPU, and then CSR matrix is copied to `memspace`. + */ int MatrixHandler::coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace) { - //this happens on the CPU not on the GPU - //but will return whatever memspace requested. - //count nnzs first - index_type nnz_unpacked = 0; index_type nnz = A_coo->getNnz(); index_type n = A_coo->getNumRows(); @@ -125,7 +141,7 @@ namespace ReSolve { index_type* nnz_shifts = new index_type[n]; std::fill_n(nnz_shifts, n , 0); - indexPlusValue* tmp = new indexPlusValue[nnz_unpacked]; + IndexValuePair* tmp = new IndexValuePair[nnz_unpacked]; csr_ia[0] = 0; @@ -229,189 +245,48 @@ namespace ReSolve { return 0; } - int MatrixHandler::matvec(matrix::Sparse* Ageneric, + /** + * @brief Matrix vector product: result = alpha * A * x + beta * result + * + * @param[in] A - Sparse matrix + * @param[in] vec_x - Vector multiplied by the matrix + * @param[out] vec_result - Vector where the result is stored + * @param[in] alpha - scalar parameter + * @param[in] beta - scalar parameter + * @param[in] matrixFormat - Only CSR format is supported at this time + * @param[in] memspace - Device where the product is computed + * @return result := alpha * A * x + beta * result + */ + int MatrixHandler::matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, const real_type* alpha, const real_type* beta, std::string matrixFormat, - std::string memspace) + std::string memspace) { - using namespace constants; - int error_sum = 0; - if (matrixFormat == "csr"){ - matrix::Csr* A = (matrix::Csr*) Ageneric; - cusparseStatus_t status; - //result = alpha *A*x + beta * result - if (memspace == "cuda" ){ - // std::cout << "Matvec on NVIDIA GPU ...\n"; - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX(); - //printf("is vec_x NULL? %d\n", vec_x->getData("cuda") == nullptr); - //printf("is vec_result NULL? 
%d\n", vec_result->getData("cuda") == nullptr); - cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F); - - - cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY(); - cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F); - - cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor(); - - void* buffer_spmv = workspaceCUDA->getSpmvBuffer(); - cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle(); - if (values_changed_){ - status = cusparseCreateCsr(&matA, - A->getNumRows(), - A->getNumColumns(), - A->getNnzExpanded(), - A->getRowData("cuda"), - A->getColData("cuda"), - A->getValues("cuda"), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); - error_sum += status; - values_changed_ = false; - } - if (!workspaceCUDA->matvecSetup()){ - //setup first, allocate, etc. - size_t bufferSize = 0; - - status = cusparseSpMV_bufferSize(handle_cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &MINUSONE, - matA, - vecx, - &ONE, - vecAx, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSize); - error_sum += status; - mem_.deviceSynchronize(); - mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize); - workspaceCUDA->setSpmvMatrixDescriptor(matA); - workspaceCUDA->setSpmvBuffer(buffer_spmv); - - workspaceCUDA->matvecSetupDone(); - } - - status = cusparseSpMV(handle_cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - alpha, - matA, - vecx, - beta, - vecAx, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - buffer_spmv); - error_sum += status; - mem_.deviceSynchronize(); - if (status) - out::error() << "Matvec status: " << status - << "Last error code: " << mem_.getLastDeviceError() << std::endl; - vec_result->setDataUpdated("cuda"); - - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecAx); - return error_sum; - } else { - if (memspace == "cpu"){ - //cpu csr matvec - index_type* ia = A->getRowData("cpu"); - index_type* ja = A->getColData("cpu"); - real_type* a = 
A->getValues("cpu"); - - real_type* x_data = vec_x->getData("cpu"); - real_type* result_data = vec_result->getData("cpu"); - real_type sum; - real_type y, t, c; - //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference - for (int i = 0; i < A->getNumRows(); ++i) { - sum = 0.0; - c = 0.0; - for (int j = ia[i]; j < ia[i+1]; ++j) { - y = ( a[j] * x_data[ja[j]]) - c; - t = sum + y; - c = (t - sum) - y; - sum = t; - // sum += ( a[j] * x_data[ja[j]]); - } - sum *= (*alpha); - result_data[i] = result_data[i]*(*beta) + sum; - } - vec_result->setDataUpdated("cpu"); - return 0; - } else { - out::error() << "Not implemented (yet)" << std::endl; - - return 1; - } - } + if (memspace == "cuda" ) { + return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); + } else if (memspace == "cpu") { + return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else { - - out::error() << "Not implemented (yet), only csr matvec is implemented" << std::endl; - return 1; + out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl; + return 1; } } int MatrixHandler::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace) { - //it ONLY WORKS WITH CUDA - index_type error_sum = 0; if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - - A_csr->allocateMatrixData("cuda"); - index_type n = A_csc->getNumRows(); - index_type m = A_csc->getNumRows(); - index_type nnz = A_csc->getNnz(); - size_t bufferSize; - void* d_work; - cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &bufferSize); - error_sum += status; - 
mem_.allocateBufferOnDevice(&d_work, bufferSize); - status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - d_work); - error_sum += status; - return error_sum; - mem_.deleteOnDevice(d_work); - } else { - out::error() << "Not implemented (yet)" << std::endl; + return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "cpu") { + out::warning() << "Using untested csc2csr on CPU ..." << std::endl; + return cpuImpl_->csc2csr(A_csc, A_csr); + } else { + out::error() << "csc2csr not implemented for " << memspace << " device." << std::endl; return -1; - } - - + } } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 366b5422..398a8039 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -1,12 +1,8 @@ -// this class encapsulates various matrix manipulation operations, commonly required by linear solvers: -// this includes -// (1) Matrix format conversion: coo2csr, csr2csc -// (2) Matrix vector product (SpMV) -// (3) Matrix 1-norm #pragma once #include #include + namespace ReSolve { namespace vector @@ -20,44 +16,41 @@ namespace ReSolve class Csc; class Csr; } - class LinAlgWorkspace; + class LinAlgWorkspaceCpu; + class LinAlgWorkspaceCUDA; + class MatrixHandlerImpl; } namespace ReSolve { - //helper class - class indexPlusValue - { - public: - indexPlusValue(); - ~indexPlusValue(); - void setIdx (index_type new_idx); - void setValue (real_type new_value); - - index_type getIdx(); - real_type getValue(); - - bool operator < (const indexPlusValue& str) const - { - return (idx_ < str.idx_); - } - - private: - index_type idx_; - real_type value_; - }; - + /** + * @brief this class 
encapsulates various matrix manipulation operations, + * commonly required by linear solvers. + * + * This includes: + * - Matrix format conversion: coo2csr, csr2csc + * - Matrix vector product (SpMV) + * - Matrix 1-norm + * + * The class uses pointer to implementation (PIMPL) idiom to create + * multiple matrix operation implementations running on CUDA and HIP devices + * as well as on CPU. + * + * @author Kasia Swirydowicz + * @author Slaven Peles + */ class MatrixHandler { using vector_type = vector::Vector; public: MatrixHandler(); - MatrixHandler(LinAlgWorkspace* workspace); + MatrixHandler(LinAlgWorkspaceCpu* workspace); + MatrixHandler(LinAlgWorkspaceCUDA* workspace); ~MatrixHandler(); - int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer) + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace); /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped @@ -68,16 +61,18 @@ namespace ReSolve { const real_type* beta, std::string matrix_type, std::string memspace); - void Matrix1Norm(matrix::Sparse *A, real_type* norm); - bool getValuesChanged(); - void setValuesChanged(bool toWhat); + int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool toWhat, std::string memspace); private: - LinAlgWorkspace* workspace_{nullptr}; - bool new_matrix_{true}; ///< if the structure changed, you need a new handler. - bool values_changed_{true}; ///< needed for matvec + bool new_matrix_{true}; ///< if the structure changed, you need a new handler. 
+ + MemoryHandler mem_; ///< Device memory manager object + MatrixHandlerImpl* cpuImpl_{nullptr}; ///< Pointer to CPU implementation + MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation - MemoryHandler mem_; ///< Device memory manager object + bool isCpuEnabled_{false}; ///< true if CPU implementation is instantiated + bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated }; } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp new file mode 100644 index 00000000..2c434dcb --- /dev/null +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -0,0 +1,168 @@ +#include +#include + +#include +#include +#include +#include +#include +#include "MatrixHandlerCpu.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerCpu::MatrixHandlerCpu() + { + } + + MatrixHandlerCpu::~MatrixHandlerCpu() + { + } + + MatrixHandlerCpu::MatrixHandlerCpu(LinAlgWorkspaceCpu* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerCpu::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + /** + * @brief result := alpha * A * x + beta * result + */ + int MatrixHandlerCpu::matvec(matrix::Sparse* Ageneric, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + // int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = (matrix::Csr*) Ageneric; + index_type* ia = A->getRowData("cpu"); + index_type* ja = A->getColData("cpu"); + real_type* a = A->getValues("cpu"); + + real_type* x_data = vec_x->getData("cpu"); + real_type* result_data = vec_result->getData("cpu"); + real_type sum; + real_type y; + real_type t; + real_type c; + + //Kahan algorithm for stability; Kahan-Babushka version didnt make a difference + for (int i = 0; i < A->getNumRows(); ++i) { + sum = 0.0; + c = 
0.0; + for (int j = ia[i]; j < ia[i+1]; ++j) { + y = ( a[j] * x_data[ja[j]]) - c; + t = sum + y; + c = (t - sum) - y; + sum = t; + // sum += ( a[j] * x_data[ja[j]]); + } + sum *= (*alpha); + result_data[i] = result_data[i]*(*beta) + sum; + } + vec_result->setDataUpdated("cpu"); + return 0; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format." << std::endl; + return 1; + } + } + + int MatrixHandlerCpu::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + /** + * @brief Convert CSC to CSR matrix on the host + * + * @authors Slaven Peles , Daniel Reynolds (SMU), and + * David Gardner and Carol Woodward (LLNL) + */ + int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + // int error_sum = 0; TODO: Collect error output! + assert(A_csc->getNnz() == A_csr->getNnz()); + assert(A_csc->getNumRows() == A_csr->getNumColumns()); + assert(A_csr->getNumRows() == A_csc->getNumColumns()); + + index_type nnz = A_csc->getNnz(); + index_type n = A_csc->getNumColumns(); + + index_type* rowIdxCsc = A_csc->getRowData("cpu"); + index_type* colPtrCsc = A_csc->getColData("cpu"); + real_type* valuesCsc = A_csc->getValues("cpu"); + + index_type* rowPtrCsr = A_csr->getRowData("cpu"); + index_type* colIdxCsr = A_csr->getColData("cpu"); + real_type* valuesCsr = A_csr->getValues("cpu"); + + // Set all CSR row pointers to zero + for (index_type i = 0; i <= n; ++i) { + rowPtrCsr[i] = 0; + } + + // Set all CSR values and column indices to zero + for (index_type i = 0; i < nnz; ++i) { + colIdxCsr[i] = 0; + valuesCsr[i] = 0.0; + } + + // Compute number of entries per row + for (index_type i = 0; i < nnz; ++i) { + rowPtrCsr[rowIdxCsc[i]]++; + } + + // Compute cumualtive sum of nnz per row + for (index_type row = 0, rowsum = 0; row < n; ++row) + { + // Store value in row pointer to temp + index_type temp = rowPtrCsr[row]; + + // Copy cumulative sum to the row pointer + rowPtrCsr[row] = rowsum; + + // 
Update row sum + rowsum += temp; + } + rowPtrCsr[n] = nnz; + + for (index_type col = 0; col < n; ++col) + { + // Compute positions of column indices and values in CSR matrix and store them there + // Overwrites CSR row pointers in the process + for (index_type jj = colPtrCsc[col]; jj < colPtrCsc[col+1]; jj++) + { + index_type row = rowIdxCsc[jj]; + index_type dest = rowPtrCsr[row]; + + colIdxCsr[dest] = col; + valuesCsr[dest] = valuesCsc[jj]; + + rowPtrCsr[row]++; + } + } + + // Restore CSR row pointer values + for (index_type row = 0, last = 0; row <= n; row++) + { + index_type temp = rowPtrCsr[row]; + rowPtrCsr[row] = last; + last = temp; + } + + return 0; + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp new file mode 100644 index 00000000..0b0afbd3 --- /dev/null +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -0,0 +1,57 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceCpu; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerCpu + * + * @brief CPU implementation of the matrix handler. 
+ */ + class MatrixHandlerCpu : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerCpu(); + MatrixHandlerCpu(LinAlgWorkspaceCpu* workspace); + virtual ~MatrixHandlerCpu(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool isValuesChanged); + + private: + LinAlgWorkspaceCpu* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp new file mode 100644 index 00000000..3405ba8d --- /dev/null +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -0,0 +1,173 @@ +#include + +#include +#include +#include +#include +#include +#include +#include "MatrixHandlerCuda.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerCuda::~MatrixHandlerCuda() + { + } + + MatrixHandlerCuda::MatrixHandlerCuda(LinAlgWorkspaceCUDA* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerCuda::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + int MatrixHandlerCuda::matvec(matrix::Sparse* Ageneric, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = dynamic_cast(Ageneric); + //result = alpha *A*x + beta * result + cusparseStatus_t status; + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX(); + cusparseCreateDnVec(&vecx, A->getNumRows(), 
vec_x->getData("cuda"), CUDA_R_64F); + + + cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY(); + cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F); + + cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor(); + + void* buffer_spmv = workspaceCUDA->getSpmvBuffer(); + cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle(); + if (values_changed_) { + status = cusparseCreateCsr(&matA, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + A->getRowData("cuda"), + A->getColData("cuda"), + A->getValues("cuda"), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + error_sum += status; + values_changed_ = false; + } + if (!workspaceCUDA->matvecSetup()) { + //setup first, allocate, etc. + size_t bufferSize = 0; + + status = cusparseSpMV_bufferSize(handle_cusparse, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &MINUSONE, + matA, + vecx, + &ONE, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &bufferSize); + error_sum += status; + mem_.deviceSynchronize(); + mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize); + workspaceCUDA->setSpmvMatrixDescriptor(matA); + workspaceCUDA->setSpmvBuffer(buffer_spmv); + + workspaceCUDA->matvecSetupDone(); + } + + status = cusparseSpMV(handle_cusparse, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha, + matA, + vecx, + beta, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer_spmv); + error_sum += status; + mem_.deviceSynchronize(); + if (status) + out::error() << "Matvec status: " << status + << "Last error code: " << mem_.getLastDeviceError() << std::endl; + vec_result->setDataUpdated("cuda"); + + cusparseDestroyDnVec(vecx); + cusparseDestroyDnVec(vecAx); + return error_sum; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format." 
<< std::endl; + return 1; + } + } + + int MatrixHandlerCuda::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + index_type error_sum = 0; + LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; + + A_csr->allocateMatrixData("cuda"); + index_type n = A_csc->getNumRows(); + index_type m = A_csc->getNumRows(); + index_type nnz = A_csc->getNnz(); + size_t bufferSize; + void* d_work; + cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &bufferSize); + error_sum += status; + mem_.allocateBufferOnDevice(&d_work, bufferSize); + status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + d_work); + error_sum += status; + return error_sum; + mem_.deleteOnDevice(d_work); + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp new file mode 100644 index 00000000..efd4c356 --- /dev/null +++ b/resolve/matrix/MatrixHandlerCuda.hpp @@ -0,0 +1,55 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceCUDA; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerCuda + * + * @brief CUDA implementation of the matrix handler. 
+ */ + class MatrixHandlerCuda : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerCuda(LinAlgWorkspaceCUDA* workspace); + virtual ~MatrixHandlerCuda(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + void setValuesChanged(bool isValuesChanged); + + private: + LinAlgWorkspaceCUDA* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp new file mode 100644 index 00000000..2bef6b3d --- /dev/null +++ b/resolve/matrix/MatrixHandlerImpl.hpp @@ -0,0 +1,51 @@ +#pragma once +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } +} + + +namespace ReSolve { + /** + * @class MatrixHandlerImpl + * + * @brief Base class for different matrix handler implementations. 
+ */ + class MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + MatrixHandlerImpl() + {} + virtual ~MatrixHandlerImpl() + {} + + virtual int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) = 0; + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type) = 0; + virtual int Matrix1Norm(matrix::Sparse* A, real_type* norm) = 0; + + virtual void setValuesChanged(bool isValuesChanged) = 0; + }; + +} // namespace ReSolve + diff --git a/resolve/utilities/misc/IndexValuePair.hpp b/resolve/utilities/misc/IndexValuePair.hpp new file mode 100644 index 00000000..b09ccf97 --- /dev/null +++ b/resolve/utilities/misc/IndexValuePair.hpp @@ -0,0 +1,43 @@ +#pragma once + + +namespace ReSolve { + + /// @brief Helper class for COO matrix sorting + class IndexValuePair + { + public: + IndexValuePair() : idx_(0), value_(0.0) + {} + ~IndexValuePair() + {} + void setIdx (index_type new_idx) + { + idx_ = new_idx; + } + void setValue (real_type new_value) + { + value_ = new_value; + } + + index_type getIdx() + { + return idx_; + } + real_type getValue() + { + return value_; + } + + bool operator < (const IndexValuePair& str) const + { + return (idx_ < str.idx_); + } + + private: + index_type idx_; + real_type value_; + }; + +} // namespace ReSolve + diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 57feb5df..16d53010 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -1,32 +1,49 @@ +#[[ +@brief Build ReSolve (multi)vector module + +@author Slaven Peles + +]] + +# C++ code set(Vector_SRC Vector.cpp VectorHandler.cpp + VectorHandlerCpu.cpp ) - - -set(Vector_SRC_CUDA - cudaVectorKernels.cu +# C++ code that depends on CUDA SDK libraries +set(Vector_CUDASDK_SRC + VectorHandlerCuda.cpp ) +# Header files to be installed set(Vector_HEADER_INSTALL Vector.hpp VectorHandler.hpp + VectorKernels.hpp ) +# Add CUDA 
vector handler if CUDA support is enabled +if(RESOLVE_USE_CUDA) + set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC}) +endif() + +add_library(resolve_vector SHARED ${Vector_SRC}) +target_link_libraries(resolve_vector PRIVATE resolve_logger) + +# Link to ReSolve CUDA backend if CUDA is enabled if (RESOLVE_USE_CUDA) - set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA) - - # Build shared library ReSolve::vector - add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA}) - target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) -else() - # Build shared library ReSolve::vector - add_library(resolve_vector SHARED ${Vector_SRC}) - target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) endif() +# If no GPU is enabled link to dummy device backend +if(NOT RESOLVE_USE_GPU) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) +endif(NOT RESOLVE_USE_GPU) + + target_include_directories(resolve_vector INTERFACE $ $ diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp index 032cc9b1..8c89cb2f 100644 --- a/resolve/vector/VectorHandler.cpp +++ b/resolve/vector/VectorHandler.cpp @@ -1,11 +1,17 @@ #include +#include #include -#include -#include #include +#include +#include +#include #include "VectorHandler.hpp" +#ifdef RESOLVE_USE_CUDA +#include +#endif + namespace ReSolve { using out = io::Logger; @@ -14,17 +20,36 @@ namespace ReSolve { */ VectorHandler::VectorHandler() { + cpuImpl_ = new VectorHandlerCpu(); + isCpuEnabled_ = true; + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceCpu* new_workspace) + { + cpuImpl_ = new VectorHandlerCpu(new_workspace); + isCpuEnabled_ = true; } +#ifdef RESOLVE_USE_CUDA /** * @brief constructor * * @param new_workspace - workspace to be set */ - VectorHandler:: VectorHandler(LinAlgWorkspace* new_workspace) 
+ VectorHandler::VectorHandler(LinAlgWorkspaceCUDA* new_workspace) { - workspace_ = new_workspace; + cudaImpl_ = new VectorHandlerCuda(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isCudaEnabled_ = true; + isCpuEnabled_ = true; } +#endif /** * @brief destructor @@ -46,28 +71,11 @@ namespace ReSolve { real_type VectorHandler::dot(vector::Vector* x, vector::Vector* y, std::string memspace) { - if (memspace == "cuda" ){ - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - double nrm = 0.0; - cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); - if (st!=0) {printf("dot product crashed with code %d \n", st);} - return nrm; + if (memspace == "cuda" ) { + return cudaImpl_->dot(x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - real_type sum = 0.0; - real_type c = 0.0; - real_type t, y; - for (int i = 0; i < x->getSize(); ++i){ - y = (x_data[i] * y_data[i]) - c; - t = sum + y; - c = (t - sum) - y; - sum = t; - // sum += (x_data[i] * y_data[i]); - } - return sum; + return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; return NAN; @@ -85,20 +93,11 @@ namespace ReSolve { */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); - if (st!=0) { - ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; - } + if (memspace == "cuda" ) { + cudaImpl_->scal(alpha, x); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - - for (int i = 0; i < x->getSize(); ++i){ - x_data[i] *= 
(*alpha); - } + cpuImpl_->scal(alpha, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -114,26 +113,14 @@ namespace ReSolve { * @param[in] memspace String containg memspace (cpu or cuda) * */ - void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ) + void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace) { //AXPY: y = alpha * x + y - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDaxpy(handle_cublas, - x->getSize(), - alpha, - x->getData("cuda"), - 1, - y->getData("cuda"), - 1); + if (memspace == "cuda" ) { + cudaImpl_->axpy(alpha, x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - for (int i = 0; i < x->getSize(); ++i){ - y_data[i] = (*alpha) * x_data[i] + y_data[i]; - } + cpuImpl_->axpy(alpha, x, y); } else { out::error() <<"Not implemented (yet)" << std::endl; } @@ -160,38 +147,9 @@ namespace ReSolve { void VectorHandler::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace) { if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - if (transpose == "T") { - - cublasDgemv(handle_cublas, - CUBLAS_OP_T, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - - } else { - cublasDgemv(handle_cublas, - CUBLAS_OP_N, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - } - + cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "cpu") { + cpuImpl_->gemv(transpose, n, 
k, alpha, beta, V, y, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -213,26 +171,9 @@ namespace ReSolve { { using namespace constants; if (memspace == "cuda") { - if (k < 200) { - mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_N, - CUBLAS_OP_N, - size, // m - 1, // n - k + 1, // k - &MINUSONE, // alpha - x->getData("cuda"), // A - size, // lda - alpha->getData("cuda"), // B - k + 1, // ldb - &ONE, - y->getData("cuda"), // c - size); // ldc - } + cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "cpu") { + cpuImpl_->massAxpy(size, alpha, k, x, y); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -254,29 +195,10 @@ namespace ReSolve { */ void VectorHandler::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res, std::string memspace) { - using namespace constants; - if (memspace == "cuda") { - if (k < 200) { - mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_T, - CUBLAS_OP_N, - k + 1, //m - 2, //n - size, //k - &ONE, //alpha - V->getData("cuda"), //A - size, //lda - x->getData("cuda"), //B - size, //ldb - &ZERO, - res->getData("cuda"), //c - k + 1); //ldc - } + cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "cpu") { + cpuImpl_->massDot2Vec(size, V, k, x, res); } else { out::error() << "Not implemented (yet)" << std::endl; } diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index bf2d37d7..c17d4688 100644 --- 
a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -7,7 +7,9 @@ namespace ReSolve { class Vector; } - class LinAlgWorkspace; + class VectorHandlerImpl; + class LinAlgWorkspaceCpu; + class LinAlgWorkspaceCUDA; } @@ -15,14 +17,15 @@ namespace ReSolve { //namespace vector { class VectorHandler { public: VectorHandler(); - VectorHandler(LinAlgWorkspace* new_workspace); + VectorHandler(LinAlgWorkspaceCpu* new_workspace); + VectorHandler(LinAlgWorkspaceCUDA* new_workspace); ~VectorHandler(); //y = alpha x + y - void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ); + void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace); //dot: x \cdot y - real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace ); + real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace); //scal = alpha * x void scal(const real_type* alpha, vector::Vector* x, std::string memspace); @@ -40,9 +43,21 @@ namespace ReSolve { //namespace vector { * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
*/ - void gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace); + void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x, + std::string memspace); private: - LinAlgWorkspace* workspace_; + VectorHandlerImpl* cpuImpl_{nullptr}; + VectorHandlerImpl* cudaImpl_{nullptr}; + + bool isCpuEnabled_{false}; + bool isCudaEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp new file mode 100644 index 00000000..f5cc463d --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.cpp @@ -0,0 +1,166 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCpu.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCpu::VectorHandlerCpu() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspaceCpu* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCpu::~VectorHandlerCpu() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y) + { + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + real_type sum = 0.0; + real_type c = 0.0; + // real_type t, y; + for (int i = 0; i < x->getSize(); ++i) { + real_type y = (x_data[i] * y_data[i]) - c; + real_type t = sum + y; + c 
= (t - sum) - y; + sum = t; + // sum += (x_data[i] * y_data[i]); + } + return sum; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x) + { + real_type* x_data = x->getData("cpu"); + + for (int i = 0; i < x->getSize(); ++i){ + x_data[i] *= (*alpha); + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + for (int i = 0; i < x->getSize(); ++i) { + y_data[i] = (*alpha) * x_data[i] + y_data[i]; + } + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. 
+ * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCpu::gemv(std::string /* transpose */, + index_type /* n */, + index_type /* k */, + const real_type* /* alpha */, + const real_type* /* beta */, + vector::Vector* /* V */, + vector::Vector* /* y */, + vector::Vector* /* x */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCpu::massAxpy(index_type /* size */, vector::Vector* /* alpha */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* y */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector (a dense multivector consisting of k vectors size n) + * and x is k x 2 dense multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res 
Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCpu::massDot2Vec(index_type /* size */, vector::Vector* /* V */, index_type /* k */, vector::Vector* /* x */, vector::Vector* /* res */) + { + out::error() << "Not implemented (yet)" << std::endl; + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCpu.hpp b/resolve/vector/VectorHandlerCpu.hpp new file mode 100644 index 00000000..3f2f6133 --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceCpu; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCpu : public VectorHandlerImpl + { + public: + VectorHandlerCpu(); + VectorHandlerCpu(LinAlgWorkspaceCpu* workspace); + virtual ~VectorHandlerCpu(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. 
+ * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. + */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceCpu* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp new file mode 100644 index 00000000..3c887e85 --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCuda.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCuda::VectorHandlerCuda() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCuda:: VectorHandlerCuda(LinAlgWorkspaceCUDA* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCuda::~VectorHandlerCuda() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCuda::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + double nrm = 0.0; + cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The 
constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDaxpy(handle_cublas, + x->getSize(), + alpha, + x->getData("cuda"), + 1, + y->getData("cuda"), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. 
+ * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCuda::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + if (transpose == "T") { + + cublasDgemv(handle_cublas, + CUBLAS_OP_T, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + + } else { + cublasDgemv(handle_cublas, + CUBLAS_OP_N, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCuda::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + 
cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + size, // m + 1, // n + k + 1, // k + &MINUSONE, // alpha + x->getData("cuda"), // A + size, // lda + alpha->getData("cuda"), // B + k + 1, // ldb + &ONE, + y->getData("cuda"), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCuda::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_T, + CUBLAS_OP_N, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData("cuda"), //A + size, //lda + x->getData("cuda"), //B + size, //ldb + &ZERO, + res->getData("cuda"), //c + k + 1); //ldc + } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCuda.hpp b/resolve/vector/VectorHandlerCuda.hpp new file mode 100644 index 00000000..0ee2752d --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve 
+{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceCUDA; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCuda : public VectorHandlerImpl + { + public: + VectorHandlerCuda(); + VectorHandlerCuda(LinAlgWorkspaceCUDA* workspace); + virtual ~VectorHandlerCuda(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceCUDA* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerImpl.hpp b/resolve/vector/VectorHandlerImpl.hpp new file mode 100644 index 00000000..229a7461 --- /dev/null +++ b/resolve/vector/VectorHandlerImpl.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class VectorHandlerCpu; + class VectorHandlerCuda; +} + + +namespace ReSolve +{ + class VectorHandlerImpl + { + public: + VectorHandlerImpl() + {} + virtual ~VectorHandlerImpl() + {} + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y ) = 0; + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y ) = 0; + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x) = 0; + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) = 0; + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) = 0; + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) = 0; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt new file mode 100644 index 00000000..673fac4b --- /dev/null +++ b/resolve/workspace/CMakeLists.txt @@ -0,0 +1,43 @@ +#[[ + +@brief Build ReSolve workspace module + +@author Slaven Peles + +]] + +# C++ code +set(ReSolve_Workspace_SRC + LinAlgWorkspaceCpu.cpp +) + +# C++ code that depends on CUDA SDK libraries +set(ReSolve_Workspace_CUDASDK_SRC + LinAlgWorkspaceCUDA.cpp +) + +set(ReSolve_Workspace_HEADER_INSTALL + LinAlgWorkspace.hpp + LinAlgWorkspaceCpu.hpp + LinAlgWorkspaceCUDA.hpp +) + +# If cuda is enabled, add CUDA SDK workspace files +if(RESOLVE_USE_CUDA) + set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC}) +endif() + +add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC}) + +# If CUDA is enabled, link to ReSolve CUDA backend +if(RESOLVE_USE_CUDA) + target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda) +endif(RESOLVE_USE_CUDA) + +target_include_directories(resolve_workspace INTERFACE + $ + $ +) + +# install include headers +install(FILES ${ReSolve_Workspace_HEADER_INSTALL} DESTINATION include/resolve/workspace) diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp new file mode 100644 index 00000000..6da58fda --- /dev/null +++ b/resolve/workspace/LinAlgWorkspace.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include + +#ifdef RESOLVE_USE_CUDA +#include +#endif + diff --git a/resolve/LinAlgWorkspace.cpp b/resolve/workspace/LinAlgWorkspaceCUDA.cpp similarity index 84% rename from resolve/LinAlgWorkspace.cpp rename to resolve/workspace/LinAlgWorkspaceCUDA.cpp index 9dee78c5..200ac437 100644 --- a/resolve/LinAlgWorkspace.cpp +++ 
b/resolve/workspace/LinAlgWorkspaceCUDA.cpp @@ -1,15 +1,7 @@ -#include "LinAlgWorkspace.hpp" +#include namespace ReSolve { - LinAlgWorkspace::LinAlgWorkspace() - { - } - - LinAlgWorkspace::~LinAlgWorkspace() - { - } - LinAlgWorkspaceCUDA::LinAlgWorkspaceCUDA() { handle_cusolversp_ = nullptr; @@ -29,8 +21,6 @@ namespace ReSolve cusolverSpDestroy(handle_cusolversp_); cublasDestroy(handle_cublas_); cusparseDestroySpMat(mat_A_); - // cusparseDestroyDnVec(vec_x_); - // cusparseDestroyDnVec(vec_y_); } void* LinAlgWorkspaceCUDA::getSpmvBuffer() @@ -93,21 +83,23 @@ namespace ReSolve mat_A_ = mat; } - cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecX() + cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecX() { return vec_x_; } - cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecY() + cusparseDnVecDescr_t LinAlgWorkspaceCUDA::getVecY() { return vec_y_; } - bool LinAlgWorkspaceCUDA::matvecSetup(){ + bool LinAlgWorkspaceCUDA::matvecSetup() + { return matvec_setup_done_; } - void LinAlgWorkspaceCUDA::matvecSetupDone() { + void LinAlgWorkspaceCUDA::matvecSetupDone() + { matvec_setup_done_ = true; } @@ -117,4 +109,4 @@ namespace ReSolve cublasCreate(&handle_cublas_); cusolverSpCreate(&handle_cusolversp_); } -} +} // namespace ReSolve diff --git a/resolve/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspaceCUDA.hpp similarity index 71% rename from resolve/LinAlgWorkspace.hpp rename to resolve/workspace/LinAlgWorkspaceCUDA.hpp index e5c79580..5076563a 100644 --- a/resolve/LinAlgWorkspace.hpp +++ b/resolve/workspace/LinAlgWorkspaceCUDA.hpp @@ -8,17 +8,7 @@ namespace ReSolve { - class LinAlgWorkspace - { - public: - LinAlgWorkspace(); - ~LinAlgWorkspace(); - protected: - MemoryHandler mem_; - }; - - - class LinAlgWorkspaceCUDA : public LinAlgWorkspace + class LinAlgWorkspaceCUDA { public: LinAlgWorkspaceCUDA(); @@ -65,20 +55,8 @@ namespace ReSolve void* buffer_1norm_; bool matvec_setup_done_; //check if setup is done for matvec i.e. if buffer is allocated, csr structure is set etc. 
+ + MemoryHandler mem_; }; - /// @brief Workspace factory - /// @param[in] memspace memory space ID - /// @return pointer to the linear algebra workspace - inline LinAlgWorkspace* createLinAlgWorkspace(std::string memspace) - { - if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); - workspace->initializeHandles(); - return workspace; - } - // If not CUDA, return default - return (new LinAlgWorkspace()); - } - -} +} // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp new file mode 100644 index 00000000..3ed9aa43 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp @@ -0,0 +1,16 @@ +#include "LinAlgWorkspaceCpu.hpp" + +namespace ReSolve +{ + LinAlgWorkspaceCpu::LinAlgWorkspaceCpu() + { + } + + LinAlgWorkspaceCpu::~LinAlgWorkspaceCpu() + { + } + + void LinAlgWorkspaceCpu::initializeHandles() + { + } +} diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp new file mode 100644 index 00000000..00e5f38e --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp @@ -0,0 +1,18 @@ +#pragma once + + +#include + +namespace ReSolve +{ + class LinAlgWorkspaceCpu + { + public: + LinAlgWorkspaceCpu(); + ~LinAlgWorkspaceCpu(); + void initializeHandles(); + private: + MemoryHandler mem_; + }; + +} diff --git a/runResolve b/runResolve index 0a8e5dbf..2d5c67ca 100755 --- a/runResolve +++ b/runResolve @@ -13,9 +13,9 @@ foo=" ${foo} ${i} ${i}" done -matfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_" +matfile="$HOME/testcases/ACTIVSg10k_AC/matrix_ACTIVSg10k_AC_" -rhsfile="/gpfs/wolf/csc359/scratch/peles/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_" +rhsfile="$HOME/testcases/ACTIVSg10k_AC/rhs_ACTIVSg10k_AC_" -./resolve/klu_rf_fgmres.exe $matfile $rhsfile 10 $foo +./$1 $matfile $rhsfile 10 $foo diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae24f9db..19b799e4 100644 --- 
a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,9 +6,6 @@ ]] -if(RESOLVE_USE_CUDA) - add_subdirectory(functionality) -endif(RESOLVE_USE_CUDA) - +add_subdirectory(functionality) add_subdirectory(unit) diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index e2a6b4b6..a6652c26 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -10,26 +10,31 @@ add_executable(klu_klu_test.exe testKLU.cpp) target_link_libraries(klu_klu_test.exe PRIVATE ReSolve) -# Build KLU+Rf test -add_executable(klu_rf_test.exe testKLU_Rf.cpp) -target_link_libraries(klu_rf_test.exe PRIVATE ReSolve) +if(RESOLVE_USE_CUDA) + + # Build KLU+Rf test + add_executable(klu_rf_test.exe testKLU_Rf.cpp) + target_link_libraries(klu_rf_test.exe PRIVATE ReSolve) -# Build KLU+Rf+fgmres test -add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp) -target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve) + # Build KLU+Rf+fgmres test + add_executable(klu_rf_fgmres_test.exe testKLU_Rf_FGMRES.cpp) + target_link_libraries(klu_rf_fgmres_test.exe PRIVATE ReSolve) + # Build KLU+GLU test + add_executable(klu_glu_test.exe testKLU_GLU.cpp) + target_link_libraries(klu_glu_test.exe PRIVATE ReSolve) - -# Build KLU+GLU test -add_executable(klu_glu_test.exe testKLU_GLU.cpp) -target_link_libraries(klu_glu_test.exe PRIVATE ReSolve) +endif(RESOLVE_USE_CUDA) # Install tests -set(installable_tests - klu_klu_test.exe - klu_rf_test.exe - klu_rf_fgmres_test.exe - klu_glu_test.exe) +set(installable_tests klu_klu_test.exe) + +if(RESOLVE_USE_CUDA) + set(installable_tests ${installable_tests} + klu_rf_test.exe + klu_rf_fgmres_test.exe + klu_glu_test.exe) +endif(RESOLVE_USE_CUDA) install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/functionality) @@ -40,6 +45,8 @@ install(DIRECTORY data DESTINATION bin/resolve/tests/functionality) set(test_data_dir ${CMAKE_SOURCE_DIR}/tests/functionality/) add_test(NAME klu_klu_test COMMAND $ 
"${test_data_dir}") -add_test(NAME klu_rf_test COMMAND $ "${test_data_dir}") -add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") -add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") +if(RESOLVE_USE_CUDA) + add_test(NAME klu_rf_test COMMAND $ "${test_data_dir}") + add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") + add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") +endif(RESOLVE_USE_CUDA) diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp index d5ff9d6d..f3c1da57 100644 --- a/tests/functionality/testKLU.cpp +++ b/tests/functionality/testKLU.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -9,7 +10,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether KLU works correctly. using namespace ReSolve::constants; @@ -26,10 +27,9 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; - workspace_CUDA->initializeHandles(); - ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); - ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; KLU->setupParameters(1, 0.1, false); @@ -101,32 +101,32 @@ int main(int argc, char *argv[]) } vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + vec_diff->update(x_data, "cpu", "cpu"); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); - status = 
matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu")); + matrix_handler->setValuesChanged(true, "cpu"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); error_sum += status; - real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //for testing only - control - real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cuda")); - real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "cpu")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm - real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); - status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; - real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //evaluate the residual ON THE CPU using COMPUTED solution vec_r->update(rhs, "cpu", "cpu"); @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; - real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); 
std::cout<<"Results (first matrix): "<coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, "cpu", "cpu"); // and solve it too status = KLU->refactorize(); @@ -174,27 +174,27 @@ int main(int argc, char *argv[]) status = KLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + vec_r->update(rhs, "cpu", "cpu"); + matrix_handler->setValuesChanged(true, "cpu"); - status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); error_sum += status; - real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //for testing only - control - real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); + real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); - vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); + vec_diff->update(x_data, "cpu", "cpu"); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm - real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); + real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); - status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cpu"); error_sum += status; - real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); + real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); std::cout<<"Results (second matrix): "< #include #include -#include +#include 
//author: KS //functionality test to check whether cuSolverGLU works correctly. @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) error_sum += status; vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp index 21aba8b0..729968f5 100644 --- a/tests/functionality/testKLU_Rf.cpp +++ b/tests/functionality/testKLU_Rf.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether cuSolverRf works correctly. 
@@ -30,7 +30,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -194,8 +194,8 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + vec_r->update(rhs, "cpu", "cuda"); + matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index c63a3f99..a474e406 100644 --- a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include //author: KS //functionality test to check whether cuSolverRf/FGMRES works correctly. 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) int error_sum = 0; int status = 0; - ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) vec_diff->update(x_data, "cpu", "cuda"); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); //evaluate the residual ||b-Ax|| status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); error_sum += status; @@ -221,7 +221,7 @@ int main(int argc, char *argv[]) error_sum += status; vec_r->update(rhs, "cpu", "cuda"); - matrix_handler->setValuesChanged(true); + matrix_handler->setValuesChanged(true, "cuda"); //evaluate final residual status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -258,5 +258,18 @@ int main(int argc, char *argv[]) } else { std::cout<<"Test 4 (KLU with cuSolverRf refactorization + IR) FAILED, error sum: "< #include #include -#include +#include #include #include #include @@ -43,8 +43,7 @@ class MatrixHandlerTests : TestBase { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::MatrixHandler handler(workspace); + ReSolve::MatrixHandler* handler = createMatrixHandler(); matrix::Csr* A = createCsrMatrix(N, memspace_); vector::Vector x(N); @@ -57,12 +56,12 @@ class MatrixHandlerTests : TestBase real_type alpha = 2.0/30.0; real_type beta = 2.0; - handler.setValuesChanged(true); - handler.matvec(A, &x, &y, &alpha, &beta, "csr", memspace_); + handler->setValuesChanged(true, memspace_); + handler->matvec(A, &x, &y, &alpha, &beta, 
"csr", memspace_); status *= verifyAnswer(y, 4.0, memspace_); - delete workspace; + delete handler; delete A; return status.report(__func__); @@ -71,6 +70,23 @@ class MatrixHandlerTests : TestBase private: std::string memspace_{"cpu"}; + ReSolve::MatrixHandler* createMatrixHandler() + { + if (memspace_ == "cpu") { + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new MatrixHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new MatrixHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + bool verifyAnswer(vector::Vector& x, real_type answer, std::string memspace) { bool status = true; diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index d18f0965..6eee90d5 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -20,6 +20,7 @@ int main(int, char**) std::cout << "\n"; } +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::MatrixHandlerTests test("cuda"); @@ -30,6 +31,7 @@ int main(int, char**) std::cout << "\n"; } +#endif return result.summary(); } diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp index 6f0d305b..9981ea48 100644 --- a/tests/unit/vector/GramSchmidtTests.hpp +++ b/tests/unit/vector/GramSchmidtTests.hpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace ReSolve { namespace tests { @@ -66,8 +66,7 @@ namespace ReSolve { break; } - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler* handler = new ReSolve::VectorHandler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, 3); // we will be 
using a space of 3 vectors real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2 @@ -114,7 +113,7 @@ namespace ReSolve { status *= verifyAnswer(V, 3, handler, memspace_); - delete workspace; + delete handler; delete [] H; delete V; delete GS; @@ -125,6 +124,23 @@ namespace ReSolve { private: std::string memspace_{"cuda"}; + ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { // TODO: Fix memory leak here + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new VectorHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // x is a multivector containing K vectors bool verifyAnswer(vector::Vector* x, index_type K, ReSolve::VectorHandler* handler, std::string memspace) { diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index 0737385d..d2f8c73c 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include namespace ReSolve { namespace tests { @@ -38,8 +38,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -52,10 +51,10 @@ namespace ReSolve { real_type alpha = 0.5; //the result is a vector with y[i] = 2.5; - handler.axpy(&alpha, x, y, memspace_); + handler->axpy(&alpha, x, y, memspace_); status *= verifyAnswer(y, 2.5, memspace_); - delete workspace; + delete handler; delete x; delete y; @@ -66,8 +65,7 @@
namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -79,7 +77,7 @@ namespace ReSolve { y->setToConst(4.0, memspace_); real_type ans; //the result is N - ans = handler.dot(x, y, memspace_); + ans = handler->dot(x, y, memspace_); bool st = true;; if (ans != (real_type) N) { @@ -88,7 +86,7 @@ namespace ReSolve { } status *= st; - delete workspace; + delete handler; delete x; delete y; @@ -99,8 +97,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); @@ -111,10 +108,10 @@ namespace ReSolve { real_type alpha = 3.5; //the answer is x[i] = 4.375; - handler.scal(&alpha, x, memspace_); + handler->scal(&alpha, x, memspace_); status *= verifyAnswer(x, 4.375, memspace_); - delete workspace; + delete handler; delete x; return status.report(__func__); @@ -124,8 +121,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N); @@ -148,11 +144,11 @@ namespace ReSolve { index_type r = K % 2; real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5)); - handler.massAxpy(N, alpha, K, x, y, memspace_); + handler->massAxpy(N, alpha, K, x, y, memspace_); status *= verifyAnswer(y, 2.0 - res, memspace_); - delete workspace; + delete handler; delete x; delete y; delete alpha; @@ -164,8 +160,7 @@ namespace ReSolve { { TestStatus 
status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N, 2); @@ -176,11 +171,11 @@ namespace ReSolve { x->setToConst(1.0, memspace_); y->setToConst(-1.0, memspace_); - handler.massDot2Vec(N, x, K, y, res, memspace_); + handler->massDot2Vec(N, x, K, y, res, memspace_); status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_); - delete workspace; + delete handler; delete x; delete y; delete res; @@ -190,8 +185,7 @@ namespace ReSolve { TestOutcome gemv(index_type N, index_type K) { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, K); // for the test with NO TRANSPOSE vector::Vector* yN = new vector::Vector(K); @@ -214,9 +208,9 @@ namespace ReSolve { real_type alpha = -1.0; real_type beta = 1.0; - handler.gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); + handler->gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); status *= verifyAnswer(xN, (real_type) (K) + 0.5, memspace_); - handler.gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); + handler->gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); status *= verifyAnswer(xT, (real_type) (N) + 0.5, memspace_); return status.report(__func__); @@ -225,7 +219,24 @@ namespace ReSolve { private: std::string memspace_{"cpu"}; - // we can verify through norm but that would defeat the purpose of testing vector handler... 
+ ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { + LinAlgWorkspaceCpu* workspace = new LinAlgWorkspaceCpu(); + return new VectorHandler(workspace); +#ifdef RESOLVE_USE_CUDA + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); +#endif + } else { + std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // we can verify through norm but that would defeat the purpose of testing vector handler ... bool verifyAnswer(vector::Vector* x, real_type answer, std::string memspace) { bool status = true; diff --git a/tests/unit/vector/runGramSchmidtTests.cpp b/tests/unit/vector/runGramSchmidtTests.cpp index 2b5a3a17..e118eb6d 100644 --- a/tests/unit/vector/runGramSchmidtTests.cpp +++ b/tests/unit/vector/runGramSchmidtTests.cpp @@ -7,6 +7,7 @@ int main(int, char**) { ReSolve::tests::TestingResults result; +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::GramSchmidtTests test("cuda"); @@ -18,6 +19,7 @@ int main(int, char**) result += test.orthogonalize(5000, ReSolve::GramSchmidt::mgs_pm); std::cout << "\n"; } +#endif return result.summary(); } diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp index ae40f5d7..77e99471 100644 --- a/tests/unit/vector/runVectorHandlerTests.cpp +++ b/tests/unit/vector/runVectorHandlerTests.cpp @@ -19,6 +19,7 @@ int main(int, char**) std::cout << "\n"; } +#ifdef RESOLVE_USE_CUDA { std::cout << "Running tests with CUDA backend:\n"; ReSolve::tests::VectorHandlerTests test("cuda"); @@ -34,6 +35,7 @@ int main(int, char**) std::cout << "\n"; } +#endif return result.summary(); }