diff --git a/.github/workflows/run_tests/action.yml b/.github/workflows/run_tests/action.yml
index 17a5cf769..928991ce3 100644
--- a/.github/workflows/run_tests/action.yml
+++ b/.github/workflows/run_tests/action.yml
@@ -23,6 +23,8 @@ runs:
           mpirun -n 3 --oversubscribe tests/test_StaticSVD
           ./tests/test_IncrementalSVDBrand
           mpirun -n 3 --oversubscribe tests/test_IncrementalSVDBrand
+          ./tests/test_NNLS
+          mpirun -n 3 --oversubscribe tests/test_NNLS
       shell: bash
     - name: Basis dataset update test
       run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8356f3e92..f3f1eaf15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -275,6 +275,7 @@ if(GTEST_FOUND)
     IncrementalSVD
     IncrementalSVDBrand
     GreedyCustomSampler
+    NNLS
     basis_conversion)
   foreach(stem IN LISTS unit_test_stems)
     add_executable(test_${stem} unit_tests/test_${stem}.cpp)
diff --git a/lib/linalg/NNLS.cpp b/lib/linalg/NNLS.cpp
index 5c794f1a2..cc73e4456 100644
--- a/lib/linalg/NNLS.cpp
+++ b/lib/linalg/NNLS.cpp
@@ -58,15 +58,18 @@ extern "C" {
 NNLSSolver::NNLSSolver(double const_tol, int min_nnz, int max_nnz,
                        int verbosity,
                        double res_change_termination_tol,
-                       double zero_tol, int n_outer, int n_inner)
+                       double zero_tol, int n_outer, int n_inner,
+                       const NNLS_termination criterion)
     : const_tol_(const_tol), min_nnz_(min_nnz), max_nnz_(max_nnz),
       verbosity_(verbosity),
       res_change_termination_tol_(res_change_termination_tol),
       zero_tol_(zero_tol), n_outer_(n_outer), n_inner_(n_inner),
-      n_proc_max_for_partial_matrix_(15),
       NNLS_qrres_on_(false),
-      qr_residual_mode_(QRresidualMode::hybrid)
+      qr_residual_mode_(QRresidualMode::hybrid),
+      d_criterion(criterion)
 {
+    CAROM_VERIFY((d_criterion == NNLS_termination::L2)
+                 || (d_criterion == NNLS_termination::LINF));
     MPI_Comm_rank(MPI_COMM_WORLD, &d_rank);
     MPI_Comm_size(MPI_COMM_WORLD, &d_num_procs);
     std::cout << "NNLSSolver init on rank " << d_rank << " out of "
@@ -147,6 +150,13 @@ void NNLSSolver::solve_parallel_with_scalapack(const Matrix& matTrans,
     rhs_halfgap -= rhs_lb;
     rhs_halfgap *= 0.5;
 
+    double l2norm_threshold;
+    if (d_criterion == NNLS_termination::L2) {
+        l2norm_threshold = rhs_halfgap.norm();
+        if (d_rank == 0 && verbosity_ > 1)
+            printf("L2 norm threshold: %.5e\n", l2norm_threshold);
+    }
+
     Vector rhs_avg_glob(rhs_avg);
     Vector rhs_halfgap_glob(rhs_halfgap);
 
@@ -159,7 +169,7 @@ void NNLSSolver::solve_parallel_with_scalapack(const Matrix& matTrans,
     char notrans = 'N';
     char layout = 'R';
 
-    int n_proc = std::min(n_proc_max_for_partial_matrix_,d_num_procs);
+    int n_proc = d_num_procs;
     int nb = 3; // block column size
 
     // initialize blacs process grid
@@ -168,8 +178,8 @@ void NNLSSolver::solve_parallel_with_scalapack(const Matrix& matTrans,
 
     int n_dist_loc_max = 0; // maximum number of columns in distributed matrix
     if (d_rank < n_proc) {
-        // m is the maximum number of columns in the global matrix
-        n_dist_loc_max = ((m/nb + 1)/n_proc + 1)*nb;
+        // n_tot is the maximum number of columns in the global (not transposed) matrix
+        n_dist_loc_max = ((n_tot/nb + 1)/n_proc + 1)*nb;
     }
 
     std::vector<double> mu_max_array(d_num_procs);
@@ -254,6 +264,7 @@ void NNLSSolver::solve_parallel_with_scalapack(const Matrix& matTrans,
 
     double mumax_glob;
     double rmax;
+    bool tolerance_met;
 
     for (unsigned int oiter = 0; oiter < n_outer_; ++oiter) {
         stalledFlag = 0;
@@ -265,12 +276,17 @@ void NNLSSolver::solve_parallel_with_scalapack(const Matrix& matTrans,
         // This norm of a non-distributed vector is identical on all ranks.
         l2_res_hist(oiter) = res_glob.norm();
 
+        if (d_criterion == NNLS_termination::LINF)
+            tolerance_met = (rmax <= const_tol_);
+        else if (d_criterion == NNLS_termination::L2)
+            tolerance_met = (l2_res_hist(oiter) <= l2norm_threshold);
+
         if (verbosity_ > 1 && d_rank == 0) {
             printf("%d %d %d %d %d %.15e %.15e\n", oiter, n_total_inner_iter,
                    m, n_tot, n_glob, rmax, l2_res_hist(oiter));
             fflush(stdout);
         }
-        if (rmax <= const_tol_ && n_glob >= min_nnz_cap) {
+        if (tolerance_met && n_glob >= min_nnz_cap) {
             if (d_rank == 0 && verbosity_ > 1) {
                 printf("target tolerance met\n");
                 fflush(stdout);
diff --git a/lib/linalg/NNLS.h b/lib/linalg/NNLS.h
index 7ccf3a9b9..459e43379 100644
--- a/lib/linalg/NNLS.h
+++ b/lib/linalg/NNLS.h
@@ -19,6 +19,16 @@
 
 namespace CAROM {
 
+/**
+ * @brief Termination criterion for NNLS solver:
+ *        LINF: L-infinity norm (maximum value) of the residual
+ *        L2: L2 norm of the residual
+ */
+enum class NNLS_termination {
+    LINF,
+    L2
+};
+
 /**
  * \class NNLSSolver
  * Class for solving non-negative least-squares problems, cf. T. Chapman et al,
@@ -34,7 +44,8 @@ class NNLSSolver {
                int verbosity=0,
                double res_change_termination_tol=1.0e-4,
                double zero_tol=1.0e-14, int n_outer=100000,
-               int n_inner=100000);
+               int n_inner=100000,
+               const NNLS_termination criterion=NNLS_termination::LINF);
 
     /**
      * Destructor*/
@@ -115,12 +126,6 @@ class NNLSSolver {
      */
     double res_change_termination_tol_;
 
-    /**
-     * @brief Maximum number of processors used in the partial matrix containing
-     * only the nonzero quadrature points.
-     */
-    int n_proc_max_for_partial_matrix_;
-
     bool normalize_const_;
     bool QR_reduce_const_;
     bool NNLS_qrres_on_;
@@ -128,6 +133,8 @@ class NNLSSolver {
 
     int d_num_procs;
     int d_rank;
+
+    NNLS_termination d_criterion;
 };
 
 }
diff --git a/unit_tests/test_NNLS.cpp b/unit_tests/test_NNLS.cpp
new file mode 100644
index 000000000..cc501127e
--- /dev/null
+++ b/unit_tests/test_NNLS.cpp
@@ -0,0 +1,244 @@
+/******************************************************************************
+ *
+ * Copyright (c) 2013-2024, Lawrence Livermore National Security, LLC
+ * and other libROM project developers. See the top-level COPYRIGHT
+ * file for details.
+ *
+ * SPDX-License-Identifier: (Apache-2.0 OR MIT)
+ *
+ *****************************************************************************/
+
+#ifdef CAROM_HAS_GTEST
+
+#include<gtest/gtest.h>
+#include "linalg/NNLS.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstring> // for memcpy
+#include <random>
+#include "mpi.h"
+#include "utils/mpi_utils.h"
+
+/**
+ * Simple smoke test to make sure Google Test is properly linked
+ */
+TEST(GoogleTestFramework, GoogleTestFrameworkFound) {
+    SUCCEED();
+}
+
+TEST(NNLS, solve_with_LINF)
+{
+    int nproc;
+    int rank;
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    const int nrow = 50;
+    const int ncol = 200;
+    const int ncol_local = CAROM::split_dimension(ncol);
+    std::vector<int> row_offset(nproc + 1);
+    const int total_cols = CAROM::get_global_offsets(ncol_local, row_offset,
+                           MPI_COMM_WORLD);
+    const double rel_tol = 0.05;
+    const double nnls_tol = 1.0e-11;
+    const int true_nnz = 44;
+
+    std::default_random_engine generator;
+    generator.seed(
+        1234); // fix the seed to keep the same result for different nproc.
+    std::uniform_real_distribution<> uniform_distribution(0.0, 1.0);
+    std::normal_distribution<double> normal_distribution(0.0, 1.0);
+
+    // distribute from a global matrix to keep the same system for different nproc.
+    CAROM::Matrix Gt(ncol, nrow, false);
+    for (int i = 0; i < ncol; i++)
+        for (int j = 0; j < nrow; j++)
+            Gt(i, j) = normal_distribution(generator);
+    Gt.distribute(ncol_local);
+
+    CAROM::Vector fom_sol(ncol_local, true);
+    CAROM::Vector rom_sol(ncol_local, true);
+    CAROM::Vector rhs(nrow, false);
+
+    // distribute from a global matrix to keep the same system for different nproc.
+    CAROM::Vector fom_sol_serial(ncol, false);
+    for (int c = 0; c < ncol; c++)
+        fom_sol_serial(c) = uniform_distribution(generator);
+    for (int c = 0; c < ncol_local; c++)
+        fom_sol(c) = fom_sol_serial(row_offset[rank] + c);
+
+    Gt.transposeMult(fom_sol, rhs);
+    rom_sol = 0.0;
+
+    CAROM::Vector rhs_lb(rhs);
+    CAROM::Vector rhs_ub(rhs);
+
+    for (int i = 0; i < rhs.dim(); ++i)
+    {
+        double delta = rel_tol * abs(rhs(i));
+        rhs_lb(i) -= delta;
+        rhs_ub(i) += delta;
+    }
+
+    CAROM::NNLSSolver nnls(nnls_tol, 0, 0, 2);
+    nnls.solve_parallel_with_scalapack(Gt, rhs_lb, rhs_ub, rom_sol);
+
+    int nnz = 0;
+    for (int i = 0; i < rom_sol.dim(); ++i)
+    {
+        if (rom_sol(i) != 0.0)
+        {
+            nnz++;
+        }
+    }
+
+    std::cout << rank << ": Number of nonzeros in NNLS solution: " << nnz
+              << ", out of " << rom_sol.dim() << std::endl;
+
+    MPI_Allreduce(MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    if (rank == 0)
+        std::cout << "Global number of nonzeros in NNLS solution: " << nnz << std::endl;
+    EXPECT_EQ(nnz, true_nnz);
+
+    // Check residual of NNLS solution
+    CAROM::Vector res(Gt.numColumns(), false);
+    Gt.transposeMult(rom_sol, res);
+
+    const double normGsol = res.norm();
+    const double normRHS = rhs.norm();
+
+    res -= rhs;
+    const double relNorm = res.norm() / std::max(normGsol, normRHS);
+    std::cout << rank << ": relative residual norm for NNLS solution of Gs = Gw: "
+              << relNorm << std::endl;
+
+    double max_error = 0.0;
+    for (int k = 0; k < res.dim(); k++)
+    {
+        max_error = std::max(max_error, abs(res(k) / rhs(k)));
+        // printf("rank %d, error(%d): %.3e\n", rank, k, abs(res(k) / rhs(k)));
+        EXPECT_TRUE(abs(res(k)) < rel_tol * abs(rhs(k)) + nnls_tol);
+    }
+    if (rank == 0)
+        printf("maximum error: %.5e\n", max_error);
+}
+
+TEST(NNLS, solve_with_L2)
+{
+    int nproc;
+    int rank;
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    const int nrow = 30;
+    const int ncol = 100;
+    const int ncol_local = CAROM::split_dimension(ncol);
+    std::vector<int> row_offset(nproc + 1);
+    const int total_cols = CAROM::get_global_offsets(ncol_local, row_offset,
+                           MPI_COMM_WORLD);
+    const double rel_tol = 0.05;
+    const double nnls_tol = 1.0e-11;
+    const int true_nnz = 21;
+
+    std::default_random_engine generator;
+    generator.seed(
+        1234); // fix the seed to keep the same result for different nproc.
+    std::uniform_real_distribution<> uniform_distribution(0.0, 1.0);
+    std::normal_distribution<double> normal_distribution(0.0, 1.0);
+
+    // distribute from a global matrix to keep the same system for different nproc.
+    CAROM::Matrix Gt(ncol, nrow, false);
+    for (int i = 0; i < ncol; i++)
+        for (int j = 0; j < nrow; j++)
+            Gt(i, j) = normal_distribution(generator);
+    Gt.distribute(ncol_local);
+
+    CAROM::Vector fom_sol(ncol_local, true);
+    CAROM::Vector rom_sol(ncol_local, true);
+    CAROM::Vector rhs(nrow, false);
+
+    // distribute from a global matrix to keep the same system for different nproc.
+    CAROM::Vector fom_sol_serial(ncol, false);
+    for (int c = 0; c < ncol; c++)
+        fom_sol_serial(c) = uniform_distribution(generator);
+    for (int c = 0; c < ncol_local; c++)
+        fom_sol(c) = fom_sol_serial(row_offset[rank] + c);
+
+    Gt.transposeMult(fom_sol, rhs);
+    rom_sol = 0.0;
+
+    CAROM::Vector rhs_lb(rhs);
+    CAROM::Vector rhs_ub(rhs);
+
+    for (int i = 0; i < rhs.dim(); ++i)
+    {
+        double delta = rel_tol * abs(rhs(i));
+        rhs_lb(i) -= delta;
+        rhs_ub(i) += delta;
+    }
+
+    CAROM::NNLSSolver nnls(nnls_tol, 0, 0, 2, 1.0e-4, 1.0e-14, 100000, 100000,
+                           CAROM::NNLS_termination::L2);
+    nnls.solve_parallel_with_scalapack(Gt, rhs_lb, rhs_ub, rom_sol);
+
+    int nnz = 0;
+    for (int i = 0; i < rom_sol.dim(); ++i)
+    {
+        if (rom_sol(i) != 0.0)
+        {
+            nnz++;
+        }
+    }
+
+    std::cout << rank << ": Number of nonzeros in NNLS solution: " << nnz
+              << ", out of " << rom_sol.dim() << std::endl;
+
+    MPI_Allreduce(MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    if (rank == 0)
+        std::cout << "Global number of nonzeros in NNLS solution: " << nnz << std::endl;
+    EXPECT_EQ(nnz, true_nnz);
+
+    // Check residual of NNLS solution
+    CAROM::Vector res(Gt.numColumns(), false);
+    Gt.transposeMult(rom_sol, res);
+
+    const double normGsol = res.norm();
+    const double normRHS = rhs.norm();
+
+    res -= rhs;
+    const double relNorm = res.norm() / std::max(normGsol, normRHS);
+    std::cout << rank << ": relative residual norm for NNLS solution of Gs = Gw: "
+              << relNorm << std::endl;
+
+    EXPECT_TRUE(res.norm() < rel_tol * normRHS + nnls_tol);
+    double max_error = 0.0;
+    for (int k = 0; k < res.dim(); k++)
+    {
+        max_error = std::max(max_error, abs(res(k) / rhs(k)));
+    }
+    if (rank == 0)
+        printf("maximum error: %.5e\n", max_error);
+}
+
+int main(int argc, char* argv[])
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    MPI_Init(&argc, &argv);
+    int result = RUN_ALL_TESTS();
+    MPI_Finalize();
+    return result;
+}
+
+#else // #ifndef CAROM_HAS_GTEST
+
+int main()
+{
+    std::cout << "libROM was compiled without Google Test support, so unit "
+              << "tests have been disabled. To enable unit tests, compile "
+              << "libROM with Google Test support." << std::endl;
+}
+
+#endif // #endif CAROM_HAS_GTEST
\ No newline at end of file