diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt
index 16ee502f8..d85471f9b 100644
--- a/cpp/src/dual_simplex/CMakeLists.txt
+++ b/cpp/src/dual_simplex/CMakeLists.txt
@@ -16,6 +16,7 @@
 set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/basis_solves.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/basis_updates.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/bound_flipping_ratio_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/crossover.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/initial_basis.cpp
@@ -30,6 +31,7 @@ set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/singletons.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/solve.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/sparse_matrix.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/sparse_vector.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tic_toc.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp)
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index d363c3ee8..53464fc66 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -364,7 +364,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         for (i_t h = 0; h < Sdim; ++h) {
           identity[h] = h;
         }
-        Srank = right_looking_lu(S, medium_tol, identity, S_col_perm, SL, SU, S_perm_inv);
+        Srank = right_looking_lu(
+          S, settings.threshold_partial_pivoting_tol, identity, S_col_perm, SL, SU, S_perm_inv);
         if (Srank != Sdim) {
           // Get the rank deficient columns
           deficient.resize(Sdim - Srank);
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index 78737c167..a7654ce12 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -30,6 +30,14 @@ i_t basis_update_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs, std::vector<f
   return b_solve(rhs, solution, Lsol);
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                      sparse_vector_t<i_t, f_t>& solution) const
+{
+  sparse_vector_t<i_t, f_t> Lsol(rhs.n, 0);
+  return b_solve(rhs, solution, Lsol);
+}
+
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
                                       std::vector<f_t>& solution,
@@ -55,6 +63,33 @@ i_t basis_update_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                      sparse_vector_t<i_t, f_t>& solution,
+                                      sparse_vector_t<i_t, f_t>& Lsol) const
+{
+  const i_t m = L0_.m;
+  assert(row_permutation_.size() == m);
+  assert(rhs.n == m);
+  assert(solution.n == m);
+  assert(Lsol.n == m);
+
+  // P*B = L*U
+  // B*x = b
+  // P*B*x = P*b = b'
+  solution = rhs;
+  solution.inverse_permute_vector(inverse_row_permutation_);
+
+  // L*U*x = b'
+  // Solve for v such that L*v = b'
+  l_solve(solution);
+  Lsol = solution;
+
+  // Solve for x such that U*x = v
+  u_solve(solution);
+  return 0;
+}
+
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
                                                 std::vector<f_t>& solution) const
@@ -87,6 +122,112 @@ i_t basis_update_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                                sparse_vector_t<i_t, f_t>& solution) const
+{
+  // Observe that
+  // P*B = L*U
+  // B'*P' = U'*L'
+  // We want to solve
+  // B'*y = c
+  // Let y = P'*w
+  // B'*y = B'*P'*w = U'*L'*w = c
+  // 1. Solve U'*r = c for r
+  // 2. Solve L'*w = r for w
+  // 3. Compute y = P'*w
+
+  const i_t m = L0_.m;
+  assert(rhs.n == m);
+  assert(solution.n == m);
+
+  // Solve for r such that U'*r = c
+  // Actually Q*U'*Q'*r = c
+  sparse_vector_t<i_t, f_t> r = rhs;
+  u_transpose_solve(r);
+
+#ifdef CHECK_U_TRANSPOSE_SOLVE
+  std::vector<f_t> residual;
+  rhs.to_dense(residual);
+  std::vector<f_t> r_dense;
+  r.to_dense(r_dense);
+  std::vector<f_t> product(m);
+  // Q * U' * Q' * r_dense - c
+
+  std::vector<f_t> r_dense_permuted(m);
+  inverse_permute_vector(col_permutation_, r_dense, r_dense_permuted);
+
+  // product = U' * Q' * r_dense
+  matrix_transpose_vector_multiply(U_, 1.0, r_dense_permuted, 0.0, product);
+  std::vector<f_t> product_permuted(m);
+  permute_vector(col_permutation_, product, product_permuted);
+  // residual = product_permuted - c
+  for (i_t k = 0; k < m; ++k) {
+    residual[k] -= product_permuted[k];
+  }
+
+  const f_t Ut_error = vector_norm_inf<i_t, f_t>(residual);
+  if (Ut_error > 1e-6) {
+    printf("|| U' * r - c || %e\n", Ut_error);
+    for (i_t k = 0; k < m; ++k) {
+      if (std::abs(residual[k]) > 1e-6) { printf("%d residual %e\n", k, residual[k]); }
+    }
+    printf("rhs nz %d\n", rhs.i.size());
+  }
+#endif
+
+  // Solve for w such that L'*w = r
+  l_transpose_solve(r);
+
+  // y = P'*w
+  r.inverse_permute_vector(row_permutation_, solution);
+
+#ifdef CHECK_PERMUTATION
+  std::vector<f_t> r_dense2;
+  r.to_dense(r_dense2);
+  std::vector<f_t> solution_dense_permuted(m);
+  permute_vector(inverse_row_permutation_, r_dense2, solution_dense_permuted);
+  std::vector<f_t> solution_dense;
+  solution.to_dense(solution_dense);
+  bool found_error = false;
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(solution_dense[k] - solution_dense_permuted[k]) > 1e-6) {
+      printf("B transpose inverse permutation error %d %e %e\n",
+             k,
+             solution_dense[k],
+             solution_dense_permuted[k]);
+      found_error = true;
+    }
+  }
+  if (found_error) {
+    for (i_t k = 0; k < m; ++k) {
+      printf("%d (sparse -> permuted -> dense) %e (sparse -> dense -> permuted)%e\n",
+             k,
+             solution_dense[k],
+             solution_dense_permuted[k]);
+    }
+    for (i_t k = 0; k < solution.i.size(); ++k) {
+      printf("%d solution sparse %d %e\n", k, solution.i[k], solution.x[k]);
+    }
+    for (i_t k = 0; k < m; ++k) {
+      if (solution_dense[k] != 0.0) { printf("%d solution dense %e\n", k, solution_dense[k]); }
+    }
+    for (i_t k = 0; k < m; ++k) {
+      printf("inv permutation %d %d\n", k, inverse_row_permutation_[k]);
+    }
+    for (i_t k = 0; k < m; ++k) {
+      if (r_dense2[k] != 0.0) { printf("%d r dense %e\n", k, r_dense2[k]); }
+    }
+    for (i_t k = 0; k < m; ++k) {
+      if (solution_dense_permuted[k] != 0.0) {
+        printf("%d solution dense permuted %e\n", k, solution_dense_permuted[k]);
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
 // Solve for x such that L*x = b
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
@@ -101,7 +242,6 @@ i_t basis_update_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
 #endif
   // First solve
   // L0*x0 = b
-  // TODO: Handle a sparse rhs
   dual_simplex::lower_triangular_solve(L0_, rhs);
 #ifdef CHECK_LOWER_SOLVE
   {
@@ -129,6 +269,92 @@ i_t basis_update_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::l_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  // L = L0 * R1^{-1} * R2^{-1} * ... * Rk^{-1}
+  //
+  // where Ri = I + e_r d^T
+
+  // First solve
+  // L0*x0 = b
+  const i_t m = L0_.m;
+
+  i_t top = sparse_triangle_solve<i_t, f_t, true>(
+    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
+
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> residual(m, 0.0);
+  const i_t col_start = B.col_start[0];
+  const i_t col_end   = B.col_start[1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    residual[B.i[p]] = B.x[p];
+  }
+
+  std::vector<f_t> x0;
+  rhs.to_dense(x0);
+  matrix_vector_multiply(L0_, 1.0, x0, -1.0, residual);
+  const f_t L0_solve_error = vector_norm_inf<i_t, f_t>(residual);
+  if (L0_solve_error > 1e-10) { printf("L0 solve error %e\n", L0_solve_error); }
+#endif
+
+  // then solve R1^{-1}*x1 = x0     ->  x1 = R1*x0
+  // then solve R2^{-1}*x2 = x1     ->  x2 = R2*x1
+  // until we get to
+  // Rk^{-1}*x = xk-1               -> x = Rk*xk-1
+  // Rk = (I + e_rk dk^T)
+  // x = Rk*xk-1 = xk-1 + erk (dk^T xk-1)
+
+#ifdef CHECK_MULTIPLY
+  std::vector<f_t> multiply;
+  rhs.to_dense(multiply);
+#endif
+
+  i_t nz = scatter_into_workspace(rhs);
+
+  for (i_t k = 0; k < num_updates_; ++k) {
+    const i_t r         = pivot_indices_[k];
+    f_t dot             = 0.0;
+    const i_t col_start = S_.col_start[k];
+    const i_t col_end   = S_.col_start[k + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
+      if (xi_workspace_[S_.i[p]]) { dot += S_.x[p] * x_workspace_[S_.i[p]]; }
+    }
+    if (!xi_workspace_[r]) {
+      xi_workspace_[r]      = 1;
+      xi_workspace_[m + nz] = r;
+      nz++;
+    }
+    x_workspace_[r] += dot;
+
+#ifdef CHECK_MULTIPLY
+    f_t dot2 = 0.0;
+    for (i_t p = col_start; p < col_end; ++p) {
+      dot2 += S_.x[p] * multiply[S_.i[p]];
+    }
+    multiply[r] += dot2;
+#endif
+  }
+
+  // Gather the solution into rhs
+  gather_into_sparse_vector(nz, rhs);
+
+  rhs.sort();
+
+#ifdef CHECK_MULTIPLY
+  std::vector<f_t> rhs_dense;
+  rhs.to_dense(rhs_dense);
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(rhs_dense[k] - multiply[k]) > 1e-10) {
+      printf("l_solve rhs dense/multiply error %d %e %e\n", k, rhs_dense[k], multiply[k]);
+    }
+  }
+#endif
+
+  return 0;
+}
+
 // Solve for y such that L'*y = c
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::l_transpose_solve(std::vector<f_t>& rhs) const
@@ -153,6 +379,149 @@ i_t basis_update_t<i_t, f_t>::l_transpose_solve(std::vector<f_t>& rhs) const
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::scatter_into_workspace(const sparse_vector_t<i_t, f_t>& in) const
+{
+  const i_t m = L0_.m;
+  // scatter pattern into xi_workspace_
+  i_t nz = in.i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i          = in.i[k];
+    xi_workspace_[i]     = 1;
+    xi_workspace_[m + k] = i;
+  }
+  // scatter values into x_workspace_
+  for (i_t k = 0; k < nz; ++k) {
+    x_workspace_[in.i[k]] = in.x[k];
+  }
+  return nz;
+}
+
+template <typename i_t, typename f_t>
+void basis_update_t<i_t, f_t>::gather_into_sparse_vector(i_t nz,
+                                                         sparse_vector_t<i_t, f_t>& out) const
+{
+  const i_t m = L0_.m;
+  out.i.clear();
+  out.x.clear();
+  out.i.resize(nz);
+  out.x.resize(nz);
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i          = xi_workspace_[m + k];
+    out.i[k]             = i;
+    out.x[k]             = x_workspace_[i];
+    xi_workspace_[m + k] = 0;
+    xi_workspace_[i]     = 0;
+    x_workspace_[i]      = 0.0;
+  }
+}
+
+template <typename i_t, typename f_t>
+void basis_update_t<i_t, f_t>::solve_to_sparse_vector(i_t top, sparse_vector_t<i_t, f_t>& out) const
+{
+  const i_t m = L0_.m;
+  out.n       = m;
+  out.i.clear();
+  out.x.clear();
+  const i_t nz = m - top;
+  out.x.resize(nz);
+  out.i.resize(nz);
+  i_t k = 0;
+  for (i_t p = top; p < m; ++p) {
+    const i_t i      = xi_workspace_[p];
+    out.i[k]         = i;
+    out.x[k]         = x_workspace_[i];
+    x_workspace_[i]  = 0.0;
+    xi_workspace_[p] = 0;
+    k++;
+  }
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::l_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  // L = L0*R1^{-1}* ... * Rk^{-1}
+  // L' = Rk^{-T} * Rk-1^{-T} * ... * R2^{-T} * R1^{-T} * L0^T
+  // L'*y = c
+  // Rk^{-T} * Rk-1^{-T} * ... * R2^{-T} * R1^{-T} * L0^T * y = c
+  // L0^T * y = cprime = R1^1 * ... * Rk^T * c
+  const i_t m = L0_.m;
+
+  i_t nz = 0;
+
+#ifdef CHECK_UPDATES
+  std::vector<f_t> multiply;
+  rhs.to_dense(multiply);
+  for (i_t k = 0; k < 2 * m; ++k) {
+    if (xi_workspace_[k]) { printf("xi workspace %d %d\n", k, xi_workspace_[k]); }
+  }
+#endif
+
+  if (num_updates_ > 0) { nz = scatter_into_workspace(rhs); }
+
+  for (i_t k = num_updates_ - 1; k >= 0; --k) {
+    const i_t r = pivot_indices_[k];
+    assert(r < m);
+    const i_t col_start = S_.col_start[k];
+    const i_t col_end   = S_.col_start[k + 1];
+    if (xi_workspace_[r]) {
+      for (i_t p = col_start; p < col_end; ++p) {
+        // rhs.x[S_.i[p]] += rhs.x[r] * S_.x[p];
+        if (!xi_workspace_[S_.i[p]]) {
+          xi_workspace_[S_.i[p]] = 1;
+          xi_workspace_[m + nz]  = S_.i[p];
+          nz++;
+        }
+        x_workspace_[S_.i[p]] += x_workspace_[r] * S_.x[p];
+      }
+    }
+#ifdef CHECK_UPDATES
+    for (i_t p = col_start; p < col_end; ++p) {
+      multiply[S_.i[p]] += multiply[r] * S_.x[p];
+    }
+#endif
+  }
+
+  // Gather into rhs
+  if (num_updates_ > 0) {
+    gather_into_sparse_vector(nz, rhs);
+
+    rhs.sort();
+
+#ifdef CHECK_UPDATES
+    std::vector<f_t> rhs_dense;
+    rhs.to_dense(rhs_dense);
+    for (i_t k = 0; k < m; ++k) {
+      if (std::abs(rhs_dense[k] - multiply[k]) > 1e-6) {
+        printf("rhs dense/multiply error %d %e %e\n", k, rhs_dense[k], multiply[k]);
+      }
+    }
+#endif
+  }
+
+  // L0^T * y = cprime
+#ifdef CHECK_LOWER_TRANSPOSE_SOLVE
+  std::vector<f_t> cprime_dense;
+  rhs.to_dense(cprime_dense);
+#endif
+
+  i_t top = sparse_triangle_solve<i_t, f_t, false>(
+    rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
+
+#ifdef CHECK_LOWER_TRANSPOSE_SOLVE
+  std::vector<f_t> y_dense;
+  rhs.to_dense(y_dense);
+
+  std::vector<f_t> residual = cprime_dense;
+  matrix_transpose_vector_multiply(L0_, 1.0, y_dense, -1.0, residual);
+  const f_t L0_solve_error = vector_norm_inf<i_t, f_t>(residual);
+  if (L0_solve_error > 1e-6) { printf("L0 solve error %e\n", L0_solve_error); }
+
+#endif
+  return 0;
+}
+
 template <typename i_t, typename f_t>
 f_t basis_update_t<i_t, f_t>::update_lower(const std::vector<i_t>& sind,
                                            const std::vector<f_t>& sval,
@@ -205,6 +574,28 @@ i_t basis_update_t<i_t, f_t>::u_solve(std::vector<f_t>& x) const
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::u_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  // Solve Q*U*Q'*x = b
+  // Multiplying by Q' we have U*Q'*x = Q'*b = bprime
+  // Let y = Q'*x so U*y = bprime
+  // 1. Compute bprime = Q'*b
+  // 2. Solve for y such that U*y = bprime
+  // 3. Compute Q*y = x
+  const i_t m = U_.m;
+  sparse_vector_t<i_t, f_t> bprime(m, 0);
+  rhs.inverse_permute_vector(col_permutation_, bprime);
+
+  i_t top = sparse_triangle_solve<i_t, f_t, false>(
+    bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
+
+  rhs.inverse_permute_vector(inverse_col_permutation_);
+
+  return 0;
+}
+
 // x = U'(q,q)\b
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::u_transpose_solve(std::vector<f_t>& x) const
@@ -223,6 +614,113 @@ i_t basis_update_t<i_t, f_t>::u_transpose_solve(std::vector<f_t>& x) const
   return 0;
 }
 
+template <typename i_t, typename f_t>
+i_t basis_update_t<i_t, f_t>::u_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  // Solve Q*U'*Q'*x = b
+  // Multiplying by Q' we have U'*Q'*x = Q'*b = bprime
+  // Let y = Q'*x so U'*y = bprime
+  // 1. Compute bprime = Q'*b
+  // 2. Solve for y such that U'*y = bprime
+  // 3. Compute Q*y = x
+  const i_t m = U_.m;
+  sparse_vector_t<i_t, f_t> bprime(1, 0);
+#ifdef CHECK_PERMUTATION
+  std::vector<f_t> rhs_dense(m);
+  rhs.to_dense(rhs_dense);
+#endif
+  rhs.inverse_permute_vector(col_permutation_, bprime);
+#ifdef CHECK_PERMUTATION
+  std::vector<f_t> bprime_dense;
+  bprime.to_dense(bprime_dense);
+  std::vector<f_t> rhs_dense_permuted(m);
+  inverse_permute_vector(col_permutation_, rhs_dense, rhs_dense_permuted);
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(bprime_dense[k] - rhs_dense_permuted[k]) > 1e-6) {
+      printf("u_transpose inverse permutation error %d %e %e\n",
+             k,
+             bprime_dense[k],
+             rhs_dense_permuted[k]);
+    }
+  }
+#endif
+
+#ifdef CHECK_WORKSPACE
+  for (i_t k = 0; k < 2 * m; ++k) {
+    if (xi_workspace_[k]) {
+      printf("before Utranspose m %d solve xi workspace %d %d\n", m, k, xi_workspace_[k]);
+    }
+  }
+#endif
+
+  // U'*y = bprime
+  i_t top = sparse_triangle_solve<i_t, f_t, true>(
+    bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
+
+#ifdef CHECK_WORKSPACE
+  for (i_t k = 0; k < 2 * m; ++k) {
+    if (xi_workspace_[k]) {
+      printf(
+        "after Utranspose m %d top %d solve xi workspace %d %d\n", m, top, k, xi_workspace_[k]);
+    }
+  }
+#endif
+
+#ifdef CHECK_PERMUTATION
+  std::vector<f_t> rhs_dense2;
+  rhs.to_dense(rhs_dense2);
+#endif
+
+  // Q*y = x
+  rhs.inverse_permute_vector(inverse_col_permutation_);
+#ifdef CHECK_PERMUTATION
+  rhs.to_dense(rhs_dense_permuted);
+  std::vector<f_t> rhs_dense_permuted2(m);
+  permute_vector(col_permutation_, rhs_dense2, rhs_dense_permuted2);
+  bool found_error = false;
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(rhs_dense_permuted[k] - rhs_dense_permuted2[k]) > 1e-6) {
+      printf("u_transpose2 permutation error %d %e %e\n",
+             k,
+             rhs_dense_permuted[k],
+             rhs_dense_permuted2[k]);
+      found_error = true;
+    }
+  }
+  if (found_error) {
+    for (i_t k = 0; k < m; ++k) {
+      printf("%d (sparse -> permuted -> dense) %e (sparse -> dense -> permuted)%e\n",
+             k,
+             rhs_dense_permuted[k],
+             rhs_dense_permuted2[k]);
+    }
+    for (i_t k = 0; k < rhs.i.size(); ++k) {
+      printf("%d rhs sparse %d %e\n", k, rhs.i[k], rhs.x[k]);
+    }
+    for (i_t k = 0; k < m; ++k) {
+      if (rhs_dense_permuted[k] != 0.0) {
+        printf("%d rhs dense permuted %e\n", k, rhs_dense_permuted[k]);
+      }
+    }
+    for (i_t k = 0; k < m; ++k) {
+      if (rhs_dense2[k] != 0.0) { printf("%d rhs dense2 %e\n", k, rhs_dense2[k]); }
+    }
+    printf("col permutation %d rhs dense 2 %d rhs dense permuted %d\n",
+           col_permutation_.size(),
+           rhs_dense2.size(),
+           rhs_dense_permuted.size());
+    for (i_t k = 0; k < col_permutation_.size(); ++k) {
+      printf("%d col permutation %d\n", k, col_permutation_[k]);
+    }
+    for (i_t k = 0; k < m; ++k) {
+      printf("%d col permutation inverse %d\n", k, inverse_col_permutation_[k]);
+    }
+  }
+#endif
+  return 0;
+}
+
 template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::index_map(i_t r) const
 {
@@ -334,6 +832,7 @@ i_t basis_update_t<i_t, f_t>::update_upper(const std::vector<i_t>& ind,
   U_.col_start[n] = new_nz;
 
   // Check to ensure that U remains upper triangular
+#ifdef CHECK_UPPER_TRIANGULAR
   for (i_t k = 0; k < n; ++k) {
     const i_t col_start = U_.col_start[k];
     const i_t col_end   = U_.col_start[k + 1];
@@ -341,6 +840,10 @@ i_t basis_update_t<i_t, f_t>::update_upper(const std::vector<i_t>& ind,
       assert(U_.i[p] <= k);
     }
   }
+#endif
+
+  // Update U transpose
+  U_.transpose(U_transpose_);
 
   return 0;
 }
@@ -436,7 +939,7 @@ i_t basis_update_t<i_t, f_t>::update(std::vector<f_t>& utilde, i_t leaving_index
     norm_s = update_lower(sind, sval, leaving_index);
   }
 
-#ifdef PARANOID
+#ifdef CHECK_ABAR
   {
     sparse_matrix_t abar_test(m, 1, 1);
     const Int nz           = lower_triangular_multiply(U_, m - 1, abar_test, 1);
@@ -473,21 +976,19 @@ i_t basis_update_t<i_t, f_t>::multiply_lu(csc_matrix_t<i_t, f_t>& out)
   out.col_start.resize(m + 1);
   assert(out.m == m);
   const i_t nz_estimate = L0_.col_start[m] + U_.col_start[m];
-#if 0
-    printf("Nz estimate %d m %d num updates %d\n", nz_estimate, m, num_updates_);
-    printf("q = [");
-    for (Int k = 0; k < m; ++k)
-    {
-        printf("%d ", col_permutation_[k]);
-    }
-    printf("];\n");
-    //PrintMatrix(L0_);
-    printf("p = [");
-    for (Int k = 0; k < m; ++k)
-    {
-        printf("%d ", row_permutation_[k]);
-    }
-    printf("];\n");
+#ifdef PRINT_PERMUTATIONS
+  printf("Nz estimate %d m %d num updates %d\n", nz_estimate, m, num_updates_);
+  printf("q = [");
+  for (i_t k = 0; k < m; ++k) {
+    printf("%d ", col_permutation_[k]);
+  }
+  printf("];\n");
+  // PrintMatrix(L0_);
+  printf("p = [");
+  for (i_t k = 0; k < m; ++k) {
+    printf("%d ", row_permutation_[k]);
+  }
+  printf("];\n");
 #endif
   out.reallocate(nz_estimate);
 
@@ -547,16 +1048,14 @@ i_t basis_update_t<i_t, f_t>::lower_triangular_multiply(const csc_matrix_t<i_t,
     const i_t r = pivot_indices_[k];
     f_t dot     = sparse_dot(sind, sval, S_, k);
     if (dot == 0.0) { continue; }
-#if 0
-        for (Int p = 0; p < sind.size(); ++p)
-        {
-            printf("s %d %e\n", sind[p], sval[p]);
-        }
-        printf("S col start %d %d\n", S_.col_start[k], S_.col_start[k+1]);
-        for (Int p = S_.col_start[k]; p < S_.col_start[k+1]; ++p)
-        {
-            printf("S %d %d %e\n", k, S_.i[p], S_.x[p]);
-        }
+#ifdef DEBUG_MULTIPLY
+    for (i_t p = 0; p < sind.size(); ++p) {
+      printf("s %d %e\n", sind[p], sval[p]);
+    }
+    printf("S col start %d %d\n", S_.col_start[k], S_.col_start[k + 1]);
+    for (i_t p = S_.col_start[k]; p < S_.col_start[k + 1]; ++p) {
+      printf("S %d %d %e\n", k, S_.i[p], S_.x[p]);
+    }
 #endif
     bool fill = true;
     for (i_t p = 0; p < sind.size(); ++p) {
@@ -617,8 +1116,928 @@ i_t basis_update_t<i_t, f_t>::lower_triangular_multiply(const csc_matrix_t<i_t,
   return new_nz;
 }
 
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::gather_into_sparse_vector(i_t nz,
+                                                             sparse_vector_t<i_t, f_t>& out) const
+{
+  const i_t m = L0_.m;
+  out.i.clear();
+  out.x.clear();
+  out.i.reserve(nz);
+  out.x.reserve(nz);
+  const f_t zero_tol = 1e-13;
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i = xi_workspace_[m + k];
+    if (std::abs(x_workspace_[i]) > zero_tol) {
+      out.i.push_back(i);
+      out.x.push_back(x_workspace_[i]);
+    }
+    xi_workspace_[m + k] = 0;
+    xi_workspace_[i]     = 0;
+    x_workspace_[i]      = 0.0;
+  }
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::solve_to_workspace(i_t top) const
+{
+  const i_t m = L0_.m;
+  i_t nz      = 0;
+  for (i_t p = top; p < m; ++p) {
+    const i_t i           = xi_workspace_[p];
+    xi_workspace_[m + nz] = i;
+    xi_workspace_[p]      = 0;
+    nz++;
+  }
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i      = xi_workspace_[m + k];
+    xi_workspace_[i] = 1;
+  }
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::solve_to_sparse_vector(i_t top,
+                                                          sparse_vector_t<i_t, f_t>& out) const
+{
+  const i_t m  = L0_.m;
+  out.n        = m;
+  const i_t nz = m - top;
+  out.x.clear();
+  out.i.clear();
+  out.x.reserve(nz);
+  out.i.reserve(nz);
+  i_t k              = 0;
+  const f_t zero_tol = 1e-13;
+  for (i_t p = top; p < m; ++p) {
+    const i_t i = xi_workspace_[p];
+    if (std::abs(x_workspace_[i]) > zero_tol) {
+      out.i.push_back(i);
+      out.x.push_back(x_workspace_[i]);
+    }
+    x_workspace_[i]  = 0.0;
+    xi_workspace_[p] = 0;
+    k++;
+  }
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::scatter_into_workspace(const sparse_vector_t<i_t, f_t>& in) const
+{
+  const i_t m = L0_.m;
+  // scatter pattern into xi_workspace_
+  i_t nz = in.i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i          = in.i[k];
+    xi_workspace_[i]     = 1;
+    xi_workspace_[m + k] = i;
+  }
+  // scatter values into x_workspace_
+  for (i_t k = 0; k < nz; ++k) {
+    x_workspace_[in.i[k]] = in.x[k];
+  }
+  return nz;
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::grow_storage(i_t nz, i_t& S_start, i_t& S_nz)
+{
+  const i_t last_S_col     = num_updates_ * 2;
+  const i_t new_last_S_col = last_S_col + 2;
+  if (new_last_S_col >= S_.col_start.size()) {
+    S_.col_start.resize(new_last_S_col + refactor_frequency_);
+  }
+  S_nz = S_.col_start[last_S_col];
+  if (S_nz + nz > S_.i.size()) {
+    S_.i.resize(std::max(2 * S_nz, S_nz + nz));
+    S_.x.resize(std::max(2 * S_nz, S_nz + nz));
+  }
+  S_start = last_S_col;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::nonzeros(const std::vector<f_t>& x) const
+{
+  i_t nz        = 0;
+  const i_t xsz = x.size();
+  for (i_t i = 0; i < xsz; ++i) {
+    if (x[i] != 0.0) { nz++; }
+  }
+  return nz;
+}
+
+// dot = S(:, col)' * x
+template <typename i_t, typename f_t>
+f_t basis_update_mpf_t<i_t, f_t>::dot_product(i_t col, const std::vector<f_t>& x) const
+{
+  f_t dot             = 0.0;
+  const i_t col_start = S_.col_start[col];
+  const i_t col_end   = S_.col_start[col + 1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    const i_t i = S_.i[p];
+    dot += S_.x[p] * x[i];
+  }
+  return dot;
+}
+
+// dot = S(:, col)' * x
+template <typename i_t, typename f_t>
+f_t basis_update_mpf_t<i_t, f_t>::dot_product(i_t col,
+                                              const std::vector<i_t>& mark,
+                                              const std::vector<f_t>& x) const
+{
+  f_t dot             = 0.0;
+  const i_t col_start = S_.col_start[col];
+  const i_t col_end   = S_.col_start[col + 1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    const i_t i = S_.i[p];
+    if (mark[i]) { dot += S_.x[p] * x[i]; }
+  }
+  return dot;
+}
+
+// x <- x + theta * S(:, col)
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::add_sparse_column(const csc_matrix_t<i_t, f_t>& S,
+                                                     i_t col,
+                                                     f_t theta,
+                                                     std::vector<f_t>& x) const
+{
+  const i_t col_start = S.col_start[col];
+  const i_t col_end   = S.col_start[col + 1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    const i_t i = S.i[p];
+    x[i] += theta * S.x[p];
+  }
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::add_sparse_column(const csc_matrix_t<i_t, f_t>& S,
+                                                     i_t col,
+                                                     f_t theta,
+                                                     std::vector<i_t>& mark,
+                                                     i_t& nz,
+                                                     std::vector<f_t>& x) const
+{
+  const i_t m         = L0_.m;
+  const i_t col_start = S.col_start[col];
+  const i_t col_end   = S.col_start[col + 1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    const i_t i = S.i[p];
+    if (!mark[i]) {
+      // Fill occured
+      mark[i]      = 1;
+      mark[m + nz] = i;
+      nz++;
+    }
+    x[i] += theta * S.x[p];
+  }
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
+                                                    std::vector<f_t>& solution) const
+{
+  std::vector<f_t> UTsol;
+  return b_transpose_solve(rhs, solution, UTsol);
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
+                                                    std::vector<f_t>& solution,
+                                                    std::vector<f_t>& UTsol) const
+{
+  const i_t m = L0_.m;
+  // P*B = L*U
+  // B'*P' = U'*L'
+  // We want to solve
+  // B'*y = c
+  // Let y = P'*w
+  // B'*y = B'*P'*w = U'*L'*w = c
+  // 1. Solve U'*r = c for r
+  // 2. Solve L'*w = r for w
+  // 3. Compute y = P'*w
+
+  // Solve for r such that U'*r = c
+  std::vector<f_t> r = rhs;
+  u_transpose_solve(r);
+  UTsol = r;
+
+  // Solve for w such that L'*w = r
+  l_transpose_solve(r);
+
+  // Compute y = P'*w
+  inverse_permute_vector(row_permutation_, r, solution);
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                                    sparse_vector_t<i_t, f_t>& solution) const
+{
+  sparse_vector_t<i_t, f_t> UTsol(1, 0);
+  return b_transpose_solve(rhs, solution, UTsol);
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                                    sparse_vector_t<i_t, f_t>& solution,
+                                                    sparse_vector_t<i_t, f_t>& UTsol) const
+{
+  // Solve for r such that U'*r = c
+
+  bool use_hypersparse = false;
+  const f_t input_size = static_cast<f_t>(rhs.i.size());
+  estimate_solution_density(input_size, sum_U_transpose_, num_calls_U_transpose_, use_hypersparse);
+  if (use_hypersparse) {
+    solution = rhs;
+    u_transpose_solve(solution);
+  } else {
+    std::vector<f_t> solution_dense;
+    rhs.to_dense(solution_dense);
+    u_transpose_solve(solution_dense);
+    solution.from_dense(solution_dense);
+  }
+  UTsol = solution;
+  sum_U_transpose_ += static_cast<f_t>(solution.i.size()) / input_size;
+
+#ifdef CHECK_U_TRANSPOSE_SOLVE
+  std::vector<f_t> UTsol_dense;
+  UTsol.to_dense(UTsol_dense);
+  std::vector<f_t> rhs_dense;
+  rhs.to_dense(rhs_dense);
+
+  matrix_transpose_vector_multiply(U0_, 1.0, UTsol_dense, -1.0, rhs_dense);
+  if (vector_norm_inf<i_t, f_t>(rhs_dense) > 1e-10) {
+    printf("B transpose solve U transpose residual %e\n", vector_norm_inf<i_t, f_t>(rhs_dense));
+  }
+#endif
+
+  // Solve for w such that L'*w = r
+#ifdef CHECK_L_TRANSPOSE_SOLVE
+  std::vector<f_t> r_dense;
+  solution.to_dense(r_dense);
+#endif
+  const f_t rhs_size = static_cast<f_t>(solution.i.size());
+  estimate_solution_density(rhs_size, sum_L_transpose_, num_calls_L_transpose_, use_hypersparse);
+  if (use_hypersparse) {
+    l_transpose_solve(solution);
+  } else {
+    std::vector<f_t> solution_dense;
+    solution.to_dense(solution_dense);
+    l_transpose_solve(solution_dense);
+    solution.from_dense(solution_dense);
+  }
+  sum_L_transpose_ += static_cast<f_t>(solution.i.size()) / rhs_size;
+
+#ifdef CHECK_L_TRANSPOSE_SOLVE
+  std::vector<f_t> solution_dense;
+  solution.to_dense(solution_dense);
+  l_transpose_multiply(solution_dense);
+  f_t max_error = 0.0;
+  for (i_t k = 0; k < L0_.m; ++k) {
+    if (std::abs(solution_dense[k] - r_dense[k]) > 1e-4) {
+      printf(
+        "B transpose solve L transpose solve error %e: index %d multiply %e rhs %e. update %d. use "
+        "hypersparse %d\n",
+        std::abs(solution_dense[k] - r_dense[k]),
+        k,
+        solution_dense[k],
+        r_dense[k],
+        num_updates_,
+        use_hypersparse);
+    }
+
+    max_error = std::max(max_error, std::abs(solution_dense[k] - r_dense[k]));
+  }
+  if (max_error > 1e-4) { printf("B transpose solve L transpose solve residual %e\n", max_error); }
+#endif
+  // Compute y = P'*w
+  solution.inverse_permute_vector(row_permutation_);
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::u_transpose_solve(std::vector<f_t>& rhs) const
+{
+  total_dense_U_transpose_++;
+  dual_simplex::upper_triangular_transpose_solve(U0_, rhs);
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::u_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  total_sparse_U_transpose_++;
+  // U0'*x = y
+  // Solve U0'*x0 = y
+  i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, true>(
+    rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::l_transpose_solve(std::vector<f_t>& rhs) const
+{
+  total_dense_L_transpose_++;
+  // L = L0 * T0 * T1 * ... * T_{num_updates_ - 1}
+  // L' = T_{num_updates_ - 1}^T * T_{num_updates_ - 2}^T * ... * T0^T * L0^T
+  // L'*x = b
+  // L0^T *x = T_0^-T * T_1^-T * ... * T_{num_updates_ - 1}^-T * b = b'
+
+  const f_t zero_tol = 1e-13;
+  // Compute b'
+  for (i_t k = num_updates_ - 1; k >= 0; --k) {
+    // T_k^{-T} = ( I - v u^T/(1 + u^T v))
+    // T_k^{-T} * b = b - v * (u^T * b) / (1 + u^T * v) = b - theta * v, theta = u^T b / mu
+
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+    const f_t mu    = mu_values_[k];
+
+    // dot = u^T * b
+    f_t dot         = dot_product(u_col, rhs);
+    const f_t theta = dot / mu;
+
+    if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, -theta, rhs); }
+  }
+
+  // Solve for x such that L0^T * x = b'
+  dual_simplex::lower_triangular_transpose_solve(L0_, rhs);
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::l_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  total_sparse_L_transpose_++;
+  const i_t m = L0_.m;
+  // L'*x = b
+  // L0^T * x = T_0^-T * T_1^-T * ... * T_{num_updates_ - 1}^-T * b = b'
+
+  scatter_into_workspace(rhs);
+  i_t nz = rhs.i.size();
+
+#ifdef CHECK_MULTIPLY
+  std::vector<f_t> rhs_dense_0;
+  rhs.to_dense(rhs_dense_0);
+#endif
+  const f_t zero_tol = 1e-13;
+  // Compute b'
+  for (i_t k = num_updates_ - 1; k >= 0; --k) {
+    // T_k^{-T} = ( I - v u^T/(1 + u^T v))
+    // T_k^{-T} * b = b - v * (u^T * b) / (1 + u^T * v) = b - theta * v, theta = u^T b / mu
+
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+    const f_t mu    = mu_values_[k];
+
+    // dot = u^T * b
+    f_t dot = dot_product(u_col, xi_workspace_, x_workspace_);
+
+#ifdef CHECK_MULTIPLY
+    f_t dot_check = 0.0;
+    for (i_t p = S_.col_start[u_col]; p < S_.col_start[u_col + 1]; ++p) {
+      const i_t i = S_.i[p];
+      dot_check += S_.x[p] * rhs_dense_0[i];
+    }
+    if (std::abs(dot - dot_check) > 1e-10) {
+      printf("L transpose solve dot erorr: index %d dot %e dot check %e\n", k, dot, dot_check);
+    }
+#endif
+
+    const f_t theta = dot / mu;
+    if (std::abs(theta) > zero_tol) {
+      add_sparse_column(S_, v_col, -theta, xi_workspace_, nz, x_workspace_);
+    }
+
+#ifdef CHECK_MULTIPLY
+    for (i_t p = S_.col_start[v_col]; p < S_.col_start[v_col + 1]; ++p) {
+      const i_t i = S_.i[p];
+      rhs_dense_0[i] -= theta * S_.x[p];
+    }
+#endif
+  }
+
+#ifdef CHECK_MULTIPLY
+  for (i_t i = 0; i < m; ++i) {
+    if (std::abs(rhs_dense_0[i] - x_workspace_[i]) > 1e-9) {
+      printf("L transpose solve multiply error %e index %d sparse %e dense %e\n",
+             std::abs(rhs_dense_0[i] - x_workspace_[i]),
+             i,
+             x_workspace_[i],
+             rhs_dense_0[i]);
+    }
+  }
+#endif
+
+  sparse_vector_t<i_t, f_t> b(m, nz);
+  gather_into_sparse_vector(nz, b);
+  i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, false>(
+    b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);
+
+#ifdef CHECK_SPARSE_SOLVE
+  std::vector<f_t> rhs_dense;
+  rhs.to_dense(rhs_dense);
+
+  std::vector<f_t> b_dense(m, 0.0);
+  for (i_t p = 0; p < nz; ++p) {
+    const i_t i = b.i[p];
+    b_dense[i]  = b.x[p];
+  }
+  matrix_vector_multiply(L0_transpose_, 1.0, rhs_dense, -1.0, b_dense);
+  if (vector_norm_inf<i_t, f_t>(b_dense) > 1e-9) {
+    printf("L0 transpose solve residual %e\n", vector_norm_inf<i_t, f_t>(b_dense));
+  }
+#endif
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
+                                          std::vector<f_t>& solution) const
+{
+  const i_t m = L0_.m;
+  std::vector<f_t> Lsol(m);
+  return b_solve(rhs, solution, Lsol);
+}
+
+// Solve for x such that B*x = y
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
+                                          std::vector<f_t>& solution,
+                                          std::vector<f_t>& Lsol,
+                                          bool need_Lsol) const
+{
+  const i_t m = L0_.m;
+  // P*B = L*U
+  // B*x = b
+  // P*B*x = P*b
+
+  permute_vector(row_permutation_, rhs, solution);
+
+  // L*U*x = b'
+  // Solve for v such that L*v = b'
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> rhs_permuted = solution;
+#endif
+  l_solve(solution);
+  if (need_Lsol) { Lsol = solution; }
+
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> Lsol_check = Lsol;
+  l_multiply(Lsol_check);
+  f_t max_lsol_err = 0.0;
+  for (i_t k = 0; k < m; ++k) {
+    const f_t err = std::abs(Lsol_check[k] - rhs_permuted[k]);
+    max_lsol_err  = std::max(max_lsol_err, err);
+  }
+  printf("B solve L multiply error %e\n", max_lsol_err);
+#endif
+
+  // Solve for x such that U*x = v
+  u_solve(solution);
+
+#ifdef CHECK_U_SOLVE
+  std::vector<f_t> residual = Lsol;
+  matrix_vector_multiply(U0_, 1.0, solution, -1.0, residual);
+  f_t max_err = vector_norm_inf<i_t, f_t>(residual);
+  printf("B solve U solve residual %e\n", max_err);
+#endif
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                          sparse_vector_t<i_t, f_t>& solution) const
+{
+  sparse_vector_t<i_t, f_t> Lsol(1, 0);
+  return b_solve(rhs, solution, Lsol, false);
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                                          sparse_vector_t<i_t, f_t>& solution,
+                                          sparse_vector_t<i_t, f_t>& Lsol,
+                                          bool need_Lsol) const
+{
+  const i_t m = L0_.m;
+  solution    = rhs;
+  solution.inverse_permute_vector(inverse_row_permutation_);
+
+#ifdef CHECK_PERMUTATION
+  std::vector<f_t> permuation_rhs;
+  rhs.to_dense(permuation_rhs);
+  std::vector<f_t> finish_perm(m);
+  permute_vector(row_permutation_, permuation_rhs, finish_perm);
+
+  std::vector<f_t> solution_dense2;
+  solution.to_dense(solution_dense2);
+  for (i_t k = 0; k < m; ++k) {
+    if (finish_perm[k] != solution_dense2[k]) {
+      printf("B solve permutation error %e %e %d\n", finish_perm[k], solution_dense2[k], k);
+    }
+  }
+#endif
+
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> l_solve_rhs;
+  solution.to_dense(l_solve_rhs);
+#endif
+
+  bool use_hypersparse;
+  const f_t input_size = static_cast<f_t>(rhs.i.size());
+  estimate_solution_density(input_size, sum_L_, num_calls_L_, use_hypersparse);
+  if (use_hypersparse) {
+    l_solve(solution);
+  } else {
+    std::vector<f_t> solution_dense;
+    solution.to_dense(solution_dense);
+    l_solve(solution_dense);
+    solution.from_dense(solution_dense);
+  }
+  if (need_Lsol) { Lsol = solution; }
+  sum_L_ += static_cast<f_t>(solution.i.size()) / input_size;
+
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> l_solve_dense;
+  Lsol.to_dense(l_solve_dense);
+
+  l_multiply(l_solve_dense);
+  f_t max_err_l_solve = 0.0;
+  for (i_t k = 0; k < m; ++k) {
+    const f_t err   = std::abs(l_solve_dense[k] - l_solve_rhs[k]);
+    max_err_l_solve = std::max(max_err_l_solve, err);
+  }
+  if (max_err_l_solve > 1e-9) { printf("B solve L solve residual %e\n", max_err_l_solve); }
+#endif
+
+#ifdef CHECK_U_SOLVE
+  std::vector<f_t> rhs_dense;
+  solution.to_dense(rhs_dense);
+#endif
+
+  const f_t rhs_size = static_cast<f_t>(solution.i.size());
+  estimate_solution_density(rhs_size, sum_U_, num_calls_U_, use_hypersparse);
+  if (use_hypersparse) {
+    u_solve(solution);
+  } else {
+    std::vector<f_t> solution_dense;
+    solution.to_dense(solution_dense);
+    u_solve(solution_dense);
+    solution.from_dense(solution_dense);
+  }
+  sum_U_ += static_cast<f_t>(solution.i.size()) / rhs_size;
+
+#ifdef CHECK_U_SOLVE
+  std::vector<f_t> solution_dense;
+  solution.to_dense(solution_dense);
+
+  matrix_vector_multiply(U0_, 1.0, solution_dense, -1.0, rhs_dense);
+
+  const f_t max_err = vector_norm_inf<i_t, f_t>(rhs_dense);
+  if (max_err > 1e-9) { printf("B solve U0 solve residual %e\n", max_err); }
+#endif
+  return 0;
+}
+
+// Solve for x such that U*x = y
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::u_solve(std::vector<f_t>& rhs) const
+{
+  total_dense_U_++;
+  const i_t m = L0_.m;
+  // U*x = y
+  dual_simplex::upper_triangular_solve(U0_, rhs);
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::u_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  total_sparse_U_++;
+  const i_t m = L0_.m;
+  // U*x = y
+
+  // Solve U0*x = y
+  i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, false>(
+    rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data());
+  solve_to_sparse_vector(top, rhs);
+
+  return 0;
+}
+// Solve for x such that L*x = y
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
+{
+  total_dense_L_++;
+  const i_t m = L0_.m;
+  // L*x = y
+  // L0 * T0 * T1 * ... * T_{num_updates_ - 1} * x = y
+
+  // First solve L0*x0 = y
+#ifdef CHECK_L0_SOLVE
+  std::vector<f_t> residual = rhs;
+#endif
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> rhs_check = rhs;
+#endif
+  dual_simplex::lower_triangular_solve(L0_, rhs);
+
+#ifdef CHECK_L0_SOLVE
+  matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual);
+  f_t max_err = vector_norm_inf<i_t, f_t>(residual);
+  printf("L solve: L0 solve residual %e\n", max_err);
+#endif
+
+  // Then T0 * T1 * ... * T_{num_updates_ - 1} * x = x0
+  // Or x = T_{num_updates}^{-1} * T_1^{-1} * T_0^{-1}  x0
+  const f_t zero_tol = 1e-16;  // Any higher and pilot_ja fails
+  for (i_t k = 0; k < num_updates_; ++k) {
+    // T = I + u*v^T
+    // T^{-1} = I - u*v^T / (1 + v^T*u)
+    // T^{-1} * x = x - u*v^T * x / (1 + v^T*u) = x - theta * u, theta = v^T * x / (1 + v^T*u) = v^T
+    // x / mu
+    const f_t mu    = mu_values_[k];
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+    f_t dot         = dot_product(v_col, rhs);
+    const f_t theta = dot / mu;
+
+    if (std::abs(theta) > zero_tol) { add_sparse_column(S_, u_col, -theta, rhs); }
+  }
+
+#ifdef CHECK_L_SOLVE
+  std::vector<f_t> inout = rhs;
+  l_multiply(inout);
+  f_t err_max = 0.0;
+  for (i_t k = 0; k < m; ++k) {
+    const f_t err = std::abs(inout[k] - rhs_check[k]);
+    err_max       = std::max(err_max, err);
+  }
+  printf("L solve residual %e\n", err_max);
+#endif
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::l_solve(sparse_vector_t<i_t, f_t>& rhs) const
+{
+  total_sparse_L_++;
+  const i_t m = L0_.m;
+  // L*x = y
+  // L0 * T0 * T1 * ... * T_{num_updates_ - 1} * x = y
+
+  // First solve L0*x0 = y
+  i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, true>(
+    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data());
+  solve_to_workspace(top);  // Uses xi_workspace_ and x_workspace_ to fill rhs
+  i_t nz = m - top;
+  // Then T0 * T1 * ... * T_{num_updates_ - 1} * x = x0
+  // Or x = T_{num_updates}^{-1} * T_1^{-1} * T_0^{-1}  x0
+  const f_t zero_tol = 1e-13;
+  for (i_t k = 0; k < num_updates_; ++k) {
+    // T = I + u*v^T
+    // T^{-1} = I - u*v^T / (1 + v^T*u)
+    // T^{-1} * x = x - u*v^T * x / (1 + v^T*u) = x - theta * u, theta = v^T * x / (1 + v^T*u) = v^T
+    // x / mu
+    const f_t mu    = mu_values_[k];
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+
+    // dot = v^T * x
+    f_t dot = dot_product(v_col, xi_workspace_, x_workspace_);
+
+    const f_t theta = dot / mu;
+    if (std::abs(theta) > zero_tol) {
+      add_sparse_column(S_, u_col, -theta, xi_workspace_, nz, x_workspace_);
+    }
+  }
+
+  gather_into_sparse_vector(nz, rhs);
+
+  return 0;
+}
+
+// Takes in utilde such that L*utilde = abar, where abar is the column to add to the basis
+// and etilde such that U'*etilde = e_leaving
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::update(const std::vector<f_t>& utilde,
+                                         const std::vector<f_t>& etilde,
+                                         i_t leaving_index)
+{
+  const i_t m = L0_.m;
+#ifdef PRINT_NUM_UPDATES
+  printf("Update: num_updates_ %d\n", num_updates_);
+#endif
+
+  // We are going to create a new matrix T = I + u*v^T
+  const i_t col_start = U0_.col_start[leaving_index];
+  const i_t col_end   = U0_.col_start[leaving_index + 1];
+  std::vector<f_t> u  = utilde;
+  // u = utilde - U0(:, leaving_index)
+  add_sparse_column(U0_, leaving_index, -1.0, u);
+
+  i_t u_nz = nonzeros(u);
+
+  // v = etilde
+  i_t v_nz = nonzeros(etilde);
+
+  i_t nz = u_nz + v_nz;
+  i_t S_start;
+  i_t S_nz;
+  grow_storage(nz, S_start, S_nz);
+#ifdef PRINT_NZ_INFO
+  printf("Update: S_start %d S_nz %d num updates %d S.n %d\n", S_start, S_nz, num_updates_, S_.n);
+#endif
+
+  i_t S_nz_start = S_nz;
+
+  // Scatter u into S
+  S_.append_column(u);
+
+  // Scatter v into S
+  S_.append_column(etilde);
+
+  // Compute mu = 1 + v^T * u
+  const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start],
+                                  S_.x.data() + S_.col_start[S_start],
+                                  S_.col_start[S_start + 1] - S_.col_start[S_start],
+                                  S_.i.data() + S_.col_start[S_start + 1],
+                                  S_.x.data() + S_.col_start[S_start + 1],
+                                  v_nz);
+
+#ifdef CHECK_MU
+  const f_t mu_check = 1.0 + dot<i_t, f_t>(etilde, u);
+  printf("Update: mu %e mu_check %e diff %e\n", mu, mu_check, std::abs(mu - mu_check));
+#endif
+  mu_values_.push_back(mu);
+
+#ifdef PRINT_MU_INFO
+  printf("Update mu %e u nz %d v nz %d\n",
+         mu_values_.back(),
+         S_.col_start[S_start + 1] - S_.col_start[S_start],
+         S_.col_start[S_start + 2] - S_.col_start[S_start + 1]);
+#endif
+  num_updates_++;
+
+  return 0;
+}
+
+// Takes in utilde such that L*utilde = abar, where abar is the column to add to the basis
+template <typename i_t, typename f_t>
+i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde,
+                                         sparse_vector_t<i_t, f_t>& etilde,
+                                         i_t leaving_index)
+{
+  const i_t m = L0_.m;
+#ifdef PRINT_NUM_UPDATES
+  printf("Update: num_updates_ %d\n", num_updates_);
+#endif
+
+  // We are going to create a new matrix T = I + u*v^T
+  // where u = utilde - U0(:, p) and v = etilde
+
+  // Scatter utilde into the workspace
+  i_t nz = scatter_into_workspace(utilde);
+
+  // Subtract the column of U0 corresponding to the leaving index
+  add_sparse_column(U0_, leaving_index, -1.0, xi_workspace_, nz, x_workspace_);
+
+  // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect.
+  std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less<i_t>());
+
+  // Gather the workspace into a column of S
+  i_t S_start;
+  i_t S_nz;
+  grow_storage(nz + etilde.i.size(), S_start, S_nz);
+
+  S_.append_column(nz, xi_workspace_.data() + m, x_workspace_.data());
+
+  // Gather etilde into a column of S
+  etilde.sort();  // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input?
+  S_.append_column(etilde);
+
+  // Compute mu = 1 + v^T * u
+  mu_values_.push_back(1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start],
+                                        S_.x.data() + S_.col_start[S_start],
+                                        S_.col_start[S_start + 1] - S_.col_start[S_start],
+                                        S_.i.data() + S_.col_start[S_start + 1],
+                                        S_.x.data() + S_.col_start[S_start + 1],
+                                        S_.col_start[S_start + 2] - S_.col_start[S_start + 1]));
+  // Clear the workspace
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t i          = xi_workspace_[m + k];
+    xi_workspace_[i]     = 0;
+    x_workspace_[i]      = 0.0;
+    xi_workspace_[m + k] = 0;
+  }
+
+#ifdef PRINT_MU_INFO
+  printf("Update mu %e u nz %d v nz %d\n",
+         mu_values_.back(),
+         S_.col_start[S_start + 1] - S_.col_start[S_start],
+         S_.col_start[S_start + 2] - S_.col_start[S_start + 1]);
+#endif
+
+  num_updates_++;
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::l_multiply(std::vector<f_t>& inout) const
+{
+  const i_t m = L0_.m;
+  // L*x = y
+  // L0 * T0 * T1 * ... * T_{num_updates_ - 1} * x = y
+
+  for (i_t k = num_updates_ - 1; k >= 0; --k) {
+    // T_k = ( I + u v^T)
+    // T_k * b = b + u * (v^T * b) = b + theta * u, theta = v^T b
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+    const f_t mu    = mu_values_[k];
+
+    // dot = v^T b
+    f_t dot         = dot_product(v_col, inout);
+    const f_t theta = dot;
+    add_sparse_column(S_, u_col, theta, inout);
+  }
+
+  std::vector<f_t> out(m, 0.0);
+  matrix_vector_multiply(L0_, 1.0, inout, 0.0, out);
+  inout = out;
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::l_transpose_multiply(std::vector<f_t>& inout) const
+{
+  const i_t m = L0_.m;
+  std::vector<f_t> out(m, 0.0);
+  matrix_vector_multiply(L0_transpose_, 1.0, inout, 0.0, out);
+
+  inout = out;
+
+  const f_t zero_tol = 1e-13;
+  for (i_t k = 0; k < num_updates_; ++k) {
+    const i_t u_col = 2 * k;
+    const i_t v_col = 2 * k + 1;
+    const f_t mu    = mu_values_[k];
+
+    // T_k = ( I + u v^T)
+    // T_k^T = ( I + v u^T)
+    // T_k^T * b = b + v * (u^T * b) = b + theta * v, theta = u^T * b
+    f_t dot         = dot_product(u_col, inout);
+    const f_t theta = dot;
+    if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, theta, inout); }
+  }
+}
+
+template <typename i_t, typename f_t>
+void basis_update_mpf_t<i_t, f_t>::multiply_lu(csc_matrix_t<i_t, f_t>& out) const
+{
+  // P*B = L*U
+  // B = P'*L*U
+  const i_t m = L0_.m;
+
+  out.col_start.resize(m + 1);
+  out.col_start[0] = 0;
+  out.i.clear();
+  out.x.clear();
+
+  i_t B_nz = 0;
+
+  for (i_t j = 0; j < m; ++j) {
+    // B(:, j) = L*U(:, j)
+    out.col_start[j] = B_nz;
+
+    std::vector<f_t> Uj(m, 0.0);
+    U0_.load_a_column(j, Uj);
+    l_multiply(Uj);
+    for (i_t i = 0; i < m; ++i) {
+      if (Uj[i] != 0.0) {
+        out.i.push_back(row_permutation_[i]);
+        out.x.push_back(Uj[i]);
+        B_nz++;
+      }
+    }
+  }
+  out.col_start[m] = B_nz;
+
+  out.m      = m;
+  out.n      = m;
+  out.nz_max = B_nz;
+}
+
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
 template class basis_update_t<int, double>;
+template class basis_update_mpf_t<int, double>;
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index 73bec6a5d..73592e180 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -18,8 +18,11 @@
 #pragma once
 
 #include <dual_simplex/sparse_matrix.hpp>
+#include <dual_simplex/sparse_vector.hpp>
 #include <dual_simplex/types.hpp>
 
+#include <numeric>
+
 namespace cuopt::linear_programming::dual_simplex {
 
 // Forrest-Tomlin update to the LU factorization of a basis matrix B
@@ -32,11 +35,18 @@ class basis_update_t {
     : L0_(Linit),
       U_(Uinit),
       row_permutation_(p),
+      inverse_row_permutation_(p.size()),
       S_(Linit.m, 1, 0),
       col_permutation_(Linit.m),
-      inverse_col_permutation_(Linit.m)
+      inverse_col_permutation_(Linit.m),
+      xi_workspace_(2 * Linit.m, 0),
+      x_workspace_(Linit.m, 0.0),
+      U_transpose_(1, 1, 1),
+      L0_transpose_(1, 1, 1)
   {
+    inverse_permutation(row_permutation_, inverse_row_permutation_);
     clear();
+    compute_transposes();
   }
 
   i_t reset(const csc_matrix_t<i_t, f_t>& Linit,
@@ -47,34 +57,60 @@ class basis_update_t {
     U_  = Uinit;
     assert(p.size() == Linit.m);
     row_permutation_ = p;
+    inverse_permutation(row_permutation_, inverse_row_permutation_);
     clear();
+    compute_transposes();
     return 0;
   }
 
   // Solves for x such that B*x = b, where B is the basis matrix
   i_t b_solve(const std::vector<f_t>& rhs, std::vector<f_t>& solution) const;
 
+  // Solves for x such that B*x = b, where B is the basis matrix
+  i_t b_solve(const sparse_vector_t<i_t, f_t>& rhs, sparse_vector_t<i_t, f_t>& solution) const;
+
   // Solves for x such that B*x = b, where B is the basis matrix, also returns L*v = P*b
   // This is useful for avoiding an extra solve with the update
   i_t b_solve(const std::vector<f_t>& rhs,
               std::vector<f_t>& solution,
               std::vector<f_t>& Lsol) const;
 
+  // Solves for x such that B*x = b, where B is the basis matrix, also returns L*v = P*b
+  // This is useful for avoiding an extra solve with the update
+  i_t b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+              sparse_vector_t<i_t, f_t>& solution,
+              sparse_vector_t<i_t, f_t>& Lsol) const;
+
   // Solves for y such that B'*y = c, where B is the basis matrix
   i_t b_transpose_solve(const std::vector<f_t>& rhs, std::vector<f_t>& solution) const;
 
+  i_t b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                        sparse_vector_t<i_t, f_t>& solution) const;
+
   // Solve for x such that L*x = y
   i_t l_solve(std::vector<f_t>& rhs) const;
 
+  // Solve for x such that L*x = y
+  i_t l_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
   // Solve for x such that L'*x = y
   i_t l_transpose_solve(std::vector<f_t>& rhs) const;
 
+  // Solve for x such that L'*x = y
+  i_t l_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
   // Solve for x such that U*x = y
   i_t u_solve(std::vector<f_t>& rhs) const;
 
+  // Solve for x such that U*x = y
+  i_t u_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
   // Solve for x such that U'*x = y
   i_t u_transpose_solve(std::vector<f_t>& rhs) const;
 
+  // Solve for x such that U'*x = y
+  i_t u_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
   // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde
   // = abar
   i_t update(std::vector<f_t>& utilde, i_t leaving_index);
@@ -85,6 +121,12 @@ class basis_update_t {
 
   const std::vector<i_t>& row_permutation() const { return row_permutation_; }
 
+  void compute_transposes()
+  {
+    L0_.transpose(L0_transpose_);
+    U_.transpose(U_transpose_);
+  }
+
  private:
   void clear()
   {
@@ -110,14 +152,271 @@ class basis_update_t {
                                 csc_matrix_t<i_t, f_t>& out,
                                 i_t out_col) const;
 
-  i_t num_updates_;                   // Number of rank-1 updates to L0
-  csc_matrix_t<i_t, f_t> L0_;         // Sparse lower triangular matrix from initial factorization
-  csc_matrix_t<i_t, f_t> U_;          // Sparse upper triangular matrix. Is modified by updates
-  std::vector<i_t> row_permutation_;  // Row permutation from initial factorization L*U = P*B
-  std::vector<i_t> pivot_indices_;    // indicies for rank-1 updates to L
-  csc_matrix_t<i_t, f_t> S_;          // stores the pivot elements for rank-1 updates to L
-  std::vector<i_t> col_permutation_;  // symmetric permuation q used in U(q, q) represents Q
+  void solve_to_sparse_vector(i_t top, sparse_vector_t<i_t, f_t>& out) const;
+  i_t scatter_into_workspace(const sparse_vector_t<i_t, f_t>& in) const;
+  void gather_into_sparse_vector(i_t nz, sparse_vector_t<i_t, f_t>& out) const;
+
+  i_t num_updates_;                    // Number of rank-1 updates to L0
+  mutable csc_matrix_t<i_t, f_t> L0_;  // Sparse lower triangular matrix from initial factorization
+  mutable csc_matrix_t<i_t, f_t> U_;   // Sparse upper triangular matrix. Is modified by updates
+  std::vector<i_t> row_permutation_;   // Row permutation from initial factorization L*U = P*B
+  std::vector<i_t>
+    inverse_row_permutation_;       // Inverse row permutation from initial factorization L*U = P*B
+  std::vector<i_t> pivot_indices_;  // indicies for rank-1 updates to L
+  csc_matrix_t<i_t, f_t> S_;        // stores the pivot elements for rank-1 updates to L
+  std::vector<i_t> col_permutation_;          // symmetric permuation q used in U(q, q) represents Q
   std::vector<i_t> inverse_col_permutation_;  // inverse permutation represents Q'
+  mutable std::vector<i_t> xi_workspace_;
+  mutable std::vector<f_t> x_workspace_;
+  mutable csc_matrix_t<i_t, f_t> U_transpose_;   // Needed for sparse solves
+  mutable csc_matrix_t<i_t, f_t> L0_transpose_;  // Needed for sparse solves
+};
+
+// Middle product form update to the LU factorization of a basis matrix B
+template <typename i_t, typename f_t>
+class basis_update_mpf_t {
+ public:
+  basis_update_mpf_t(const csc_matrix_t<i_t, f_t>& Linit,
+                     const csc_matrix_t<i_t, f_t>& Uinit,
+                     const std::vector<i_t>& p,
+                     const i_t refactor_frequency)
+    : L0_(Linit),
+      U0_(Uinit),
+      row_permutation_(p),
+      inverse_row_permutation_(p.size()),
+      S_(Linit.m, 0, 0),
+      col_permutation_(Linit.m),
+      inverse_col_permutation_(Linit.m),
+      xi_workspace_(2 * Linit.m, 0),
+      x_workspace_(Linit.m, 0.0),
+      U0_transpose_(1, 1, 1),
+      L0_transpose_(1, 1, 1),
+      refactor_frequency_(refactor_frequency),
+      total_sparse_L_transpose_(0),
+      total_dense_L_transpose_(0),
+      total_sparse_L_(0),
+      total_dense_L_(0),
+      total_sparse_U_transpose_(0),
+      total_dense_U_transpose_(0),
+      total_sparse_U_(0),
+      total_dense_U_(0),
+      hypersparse_threshold_(0.05)
+  {
+    inverse_permutation(row_permutation_, inverse_row_permutation_);
+    clear();
+    compute_transposes();
+    reset_stas();
+  }
+
+  void print_stats() const
+  {
+    i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_;
+    i_t total_U_transpose_calls = total_sparse_U_transpose_ + total_dense_U_transpose_;
+    i_t total_L_calls           = total_sparse_L_ + total_dense_L_;
+    i_t total_U_calls           = total_sparse_U_ + total_dense_U_;
+    // clang-format off
+    printf("sparse L transpose  %8d %8.2f%\n", total_sparse_L_transpose_, 100.0 * total_sparse_L_transpose_ / total_L_transpose_calls);
+    printf("dense  L transpose  %8d %8.2f%\n", total_dense_L_transpose_, 100.0 * total_dense_L_transpose_ / total_L_transpose_calls);
+    printf("sparse U transpose  %8d %8.2f%\n", total_sparse_U_transpose_, 100.0 * total_sparse_U_transpose_ / total_U_transpose_calls);
+    printf("dense  U transpose  %8d %8.2f%\n", total_dense_U_transpose_, 100.0 * total_dense_U_transpose_ / total_U_transpose_calls);
+    printf("sparse L            %8d %8.2f%\n", total_sparse_L_, 100.0 * total_sparse_L_ / total_L_calls);
+    printf("dense  L            %8d %8.2f%\n", total_dense_L_, 100.0 * total_dense_L_ / total_L_calls);
+    printf("sparse U            %8d %8.2f%\n", total_sparse_U_, 100.0 * total_sparse_U_ / total_U_calls);
+    printf("dense  U            %8d %8.2f%\n", total_dense_U_, 100.0 * total_dense_U_ / total_U_calls);
+    // clang-format on
+  }
+
+  void reset_stas()
+  {
+    num_calls_L_           = 0;
+    num_calls_U_           = 0;
+    num_calls_L_transpose_ = 0;
+    num_calls_U_transpose_ = 0;
+    sum_L_                 = 0.0;
+    sum_U_                 = 0.0;
+    sum_L_transpose_       = 0.0;
+    sum_U_transpose_       = 0.0;
+  }
+
+  i_t reset(const csc_matrix_t<i_t, f_t>& Linit,
+            const csc_matrix_t<i_t, f_t>& Uinit,
+            const std::vector<i_t>& p)
+  {
+    L0_ = Linit;
+    U0_ = Uinit;
+    assert(p.size() == Linit.m);
+    row_permutation_ = p;
+    inverse_permutation(row_permutation_, inverse_row_permutation_);
+    clear();
+    compute_transposes();
+    reset_stas();
+    return 0;
+  }
+
+  f_t estimate_solution_density(f_t rhs_nz, f_t sum, i_t& num_calls, bool& use_hypersparse) const
+  {
+    num_calls++;
+    const f_t average_growth    = std::max(1.0, sum / static_cast<f_t>(num_calls));
+    const f_t predicted_nz      = rhs_nz * average_growth;
+    const f_t predicted_density = predicted_nz / static_cast<f_t>(L0_.m);
+    use_hypersparse             = predicted_density < hypersparse_threshold_;
+    return predicted_nz;
+  }
+
+  // Solves for x such that B*x = b, where B is the basis matrix
+  i_t b_solve(const std::vector<f_t>& rhs, std::vector<f_t>& solution) const;
+  i_t b_solve(const sparse_vector_t<i_t, f_t>& rhs, sparse_vector_t<i_t, f_t>& solution) const;
+  i_t b_solve(const std::vector<f_t>& rhs,
+              std::vector<f_t>& solution,
+              std::vector<f_t>& Lsol,
+              bool need_Lsol = true) const;
+  i_t b_solve(const sparse_vector_t<i_t, f_t>& rhs,
+              sparse_vector_t<i_t, f_t>& solution,
+              sparse_vector_t<i_t, f_t>& Lsol,
+              bool need_Lsol = true) const;
+
+  // Solves for y such that B'*y = c, where B is the basis matrix
+  i_t b_transpose_solve(const std::vector<f_t>& rhs, std::vector<f_t>& solution) const;
+  i_t b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                        sparse_vector_t<i_t, f_t>& solution) const;
+  i_t b_transpose_solve(const std::vector<f_t>& rhs,
+                        std::vector<f_t>& solution,
+                        std::vector<f_t>& UTsol) const;
+  i_t b_transpose_solve(const sparse_vector_t<i_t, f_t>& rhs,
+                        sparse_vector_t<i_t, f_t>& solution,
+                        sparse_vector_t<i_t, f_t>& UTsol) const;
+  // Solve for x such that L*x = y
+  i_t l_solve(std::vector<f_t>& rhs) const;
+
+  // Solve for x such that L*x = y
+  i_t l_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
+  // Solve for x such that L'*x = y
+  i_t l_transpose_solve(std::vector<f_t>& rhs) const;
+
+  // Solve for x such that L'*x = y
+  i_t l_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
+  // Solve for x such that U*x = y
+  i_t u_solve(std::vector<f_t>& rhs) const;
+
+  // Solve for x such that U*x = y
+  i_t u_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
+  // Solve for x such that U'*x = y
+  i_t u_transpose_solve(std::vector<f_t>& rhs) const;
+
+  // Solve for x such that U'*x = y
+  i_t u_transpose_solve(sparse_vector_t<i_t, f_t>& rhs) const;
+
+  // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde
+  // = abar
+  i_t update(const std::vector<f_t>& utilde, const std::vector<f_t>& etilde, i_t leaving_index);
+
+  // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde
+  // = abar
+  i_t update(const sparse_vector_t<i_t, f_t>& utilde,
+             sparse_vector_t<i_t, f_t>& etilde,
+             i_t leaving_index);
+
+  i_t num_updates() const { return num_updates_; }
+
+  const std::vector<i_t>& row_permutation() const { return row_permutation_; }
+
+  void compute_transposes()
+  {
+    L0_.transpose(L0_transpose_);
+    U0_.transpose(U0_transpose_);
+  }
+
+  void multiply_lu(csc_matrix_t<i_t, f_t>& out) const;
+
+ private:
+  void clear()
+  {
+    pivot_indices_.clear();
+    pivot_indices_.reserve(L0_.m);
+    std::iota(col_permutation_.begin(), col_permutation_.end(), 0);
+    std::iota(inverse_col_permutation_.begin(), inverse_col_permutation_.end(), 0);
+    S_.col_start.resize(refactor_frequency_ + 1);
+    S_.col_start[0] = 0;
+    S_.col_start[1] = 0;
+    S_.i.clear();
+    S_.x.clear();
+    S_.n = 0;
+    mu_values_.clear();
+    mu_values_.reserve(refactor_frequency_);
+    num_updates_ = 0;
+  }
+  void grow_storage(i_t nz, i_t& S_start, i_t& S_nz);
+  i_t index_map(i_t leaving) const;
+  f_t u_diagonal(i_t j) const;
+  i_t place_diagonals();
+  f_t update_lower(const std::vector<i_t>& sind, const std::vector<f_t>& sval, i_t leaving);
+  i_t update_upper(const std::vector<i_t>& ind, const std::vector<f_t>& baru, i_t t);
+  i_t lower_triangular_multiply(const csc_matrix_t<i_t, f_t>& in,
+                                i_t in_col,
+                                csc_matrix_t<i_t, f_t>& out,
+                                i_t out_col) const;
+
+  void solve_to_workspace(i_t top) const;
+  void solve_to_sparse_vector(i_t top, sparse_vector_t<i_t, f_t>& out) const;
+  i_t scatter_into_workspace(const sparse_vector_t<i_t, f_t>& in) const;
+  void gather_into_sparse_vector(i_t nz, sparse_vector_t<i_t, f_t>& out) const;
+  i_t nonzeros(const std::vector<f_t>& x) const;
+  f_t dot_product(i_t col, const std::vector<f_t>& x) const;
+  f_t dot_product(i_t col, const std::vector<i_t>& mark, const std::vector<f_t>& x) const;
+  void add_sparse_column(const csc_matrix_t<i_t, f_t>& S,
+                         i_t col,
+                         f_t theta,
+                         std::vector<f_t>& x) const;
+  void add_sparse_column(const csc_matrix_t<i_t, f_t>& S,
+                         i_t col,
+                         f_t theta,
+                         std::vector<i_t>& mark,
+                         i_t& nz,
+                         std::vector<f_t>& x) const;
+
+  void l_multiply(std::vector<f_t>& inout) const;
+  void l_transpose_multiply(std::vector<f_t>& inout) const;
+
+  i_t num_updates_;                    // Number of rank-1 updates to L0
+  i_t refactor_frequency_;             // Average updates before refactoring
+  mutable csc_matrix_t<i_t, f_t> L0_;  // Sparse lower triangular matrix from initial factorization
+  mutable csc_matrix_t<i_t, f_t> U0_;  // Sparse upper triangular matrix from initial factorization
+  std::vector<i_t> row_permutation_;   // Row permutation from initial factorization L*U = P*B
+  std::vector<i_t>
+    inverse_row_permutation_;       // Inverse row permutation from initial factorization L*U = P*B
+  std::vector<i_t> pivot_indices_;  // indicies for rank-1 updates to L
+  csc_matrix_t<i_t, f_t> S_;        // stores information about the rank-1 updates to L
+  std::vector<f_t> mu_values_;      // stores information about the rank-1 updates to L
+  std::vector<i_t> col_permutation_;          // symmetric permuation q used in U(q, q) represents Q
+  std::vector<i_t> inverse_col_permutation_;  // inverse permutation represents Q'
+  mutable std::vector<i_t> xi_workspace_;
+  mutable std::vector<f_t> x_workspace_;
+  mutable csc_matrix_t<i_t, f_t> U0_transpose_;  // Needed for sparse solves
+  mutable csc_matrix_t<i_t, f_t> L0_transpose_;  // Needed for sparse solves
+
+  mutable i_t total_sparse_L_transpose_;
+  mutable i_t total_dense_L_transpose_;
+  mutable i_t total_sparse_L_;
+  mutable i_t total_dense_L_;
+  mutable i_t total_sparse_U_transpose_;
+  mutable i_t total_dense_U_transpose_;
+  mutable i_t total_sparse_U_;
+  mutable i_t total_dense_U_;
+
+  mutable i_t num_calls_L_;
+  mutable i_t num_calls_U_;
+  mutable i_t num_calls_L_transpose_;
+  mutable i_t num_calls_U_transpose_;
+
+  mutable f_t sum_L_;
+  mutable f_t sum_U_;
+  mutable f_t sum_L_transpose_;
+  mutable f_t sum_U_transpose_;
+
+  f_t hypersparse_threshold_;
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
new file mode 100644
index 000000000..11753cbcb
--- /dev/null
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
@@ -0,0 +1,346 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dual_simplex/bound_flipping_ratio_test.hpp>
+
+#include <dual_simplex/tic_toc.hpp>
+
+#include <algorithm>
+#include <cmath>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_breakpoints(std::vector<i_t>& indicies,
+                                                               std::vector<f_t>& ratios)
+{
+  i_t n                  = n_;
+  i_t m                  = m_;
+  constexpr bool verbose = false;
+  f_t pivot_tol          = settings_.pivot_tol;
+  const f_t dual_tol     = settings_.dual_tol / 10;
+
+  i_t idx = 0;
+  while (idx == 0 && pivot_tol >= 1e-12) {
+    // for (i_t k = 0; k < n - m; ++k) {
+    //   const i_t j = nonbasic_list_[k];
+    for (i_t h = 0; h < delta_z_indices_.size(); ++h) {
+      const i_t j = delta_z_indices_[h];
+      const i_t k = nonbasic_mark_[j];
+      if (vstatus_[j] == variable_status_t::NONBASIC_FIXED) { continue; }
+      if (vstatus_[j] == variable_status_t::NONBASIC_LOWER && delta_z_[j] < -pivot_tol) {
+        indicies[idx] = k;
+        ratios[idx]   = std::max((-dual_tol - z_[j]) / delta_z_[j], 0.0);
+        if constexpr (verbose) { settings_.log.printf("ratios[%d] = %e\n", idx, ratios[idx]); }
+        idx++;
+      }
+      if (vstatus_[j] == variable_status_t::NONBASIC_UPPER && delta_z_[j] > pivot_tol) {
+        indicies[idx] = k;
+        ratios[idx]   = std::max((dual_tol - z_[j]) / delta_z_[j], 0.0);
+        if constexpr (verbose) { settings_.log.printf("ratios[%d] = %e\n", idx, ratios[idx]); }
+        idx++;
+      }
+    }
+    pivot_tol /= 10;
+  }
+  return idx;
+}
+
+template <typename i_t, typename f_t>
+i_t bound_flipping_ratio_test_t<i_t, f_t>::single_pass(i_t start,
+                                                       i_t end,
+                                                       const std::vector<i_t>& indicies,
+                                                       const std::vector<f_t>& ratios,
+                                                       f_t& slope,
+                                                       f_t& step_length,
+                                                       i_t& nonbasic_entering,
+                                                       i_t& entering_index)
+{
+  // Find the minimum ratio
+  f_t min_val    = inf;
+  entering_index = -1;
+  i_t candidate  = -1;
+  f_t zero_tol   = settings_.zero_tol;
+  i_t k_idx      = -1;
+  for (i_t k = start; k < end; ++k) {
+    if (ratios[k] < min_val) {
+      min_val   = ratios[k];
+      candidate = indicies[k];
+      k_idx     = k;
+    } else if (ratios[k] < min_val + zero_tol) {
+      // Use Harris to select variables with larger pivots
+      const i_t j = nonbasic_list_[indicies[k]];
+      if (std::abs(delta_z_[j]) > std::abs(delta_z_[candidate])) {
+        min_val   = ratios[k];
+        candidate = indicies[k];
+        k_idx     = k;
+      }
+    }
+  }
+  step_length       = min_val;
+  nonbasic_entering = candidate;
+  const i_t j = entering_index = nonbasic_list_[nonbasic_entering];
+
+  constexpr bool verbose = false;
+  if (bounded_variables_[j]) {
+    const f_t interval    = upper_[j] - lower_[j];
+    const f_t delta_slope = std::abs(delta_z_[j]) * interval;
+    if constexpr (verbose) {
+      settings_.log.printf("single pass delta slope %e slope %e after slope %e step length %e\n",
+                           delta_slope,
+                           slope,
+                           slope - delta_slope,
+                           step_length);
+    }
+    slope -= delta_slope;
+    return k_idx;  // we should see if we can continue to increase the step-length
+  }
+  return -1;  // we are done. do not increase the step-length further
+}
+
+template <typename i_t, typename f_t>
+i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_step_length(f_t& step_length,
+                                                               i_t& nonbasic_entering)
+{
+  const i_t m            = m_;
+  const i_t n            = n_;
+  const i_t nz           = delta_z_indices_.size();
+  constexpr bool verbose = false;
+
+  // Compute the initial set of breakpoints
+  std::vector<i_t> indicies(nz);
+  std::vector<f_t> ratios(nz);
+  i_t num_breakpoints = compute_breakpoints(indicies, ratios);
+  if constexpr (verbose) { settings_.log.printf("Initial breakpoints %d\n", num_breakpoints); }
+  if (num_breakpoints == 0) {
+    nonbasic_entering = -1;
+    return -1;
+  }
+
+  f_t slope          = slope_;
+  nonbasic_entering  = -1;
+  i_t entering_index = -1;
+
+  i_t k_idx = single_pass(
+    0, num_breakpoints, indicies, ratios, slope, step_length, nonbasic_entering, entering_index);
+  bool continue_search = k_idx >= 0 && num_breakpoints > 1 && slope > 0.0;
+  if (!continue_search) {
+    if constexpr (0) {
+      settings_.log.printf(
+        "BFRT stopping. No bound flips. Step length %e Nonbasic entering %d Entering %d pivot %e\n",
+        step_length,
+        nonbasic_entering,
+        entering_index,
+        std::abs(delta_z_[entering_index]));
+    }
+    return entering_index;
+  }
+
+  if constexpr (verbose) {
+    settings_.log.printf(
+      "Continuing past initial step length %e entering index %d nonbasic entering %d slope %e\n",
+      step_length,
+      entering_index,
+      nonbasic_entering,
+      slope);
+  }
+
+  // Continue the search using a heap to order the breakpoints
+  ratios[k_idx]   = ratios[num_breakpoints - 1];
+  indicies[k_idx] = indicies[num_breakpoints - 1];
+
+  constexpr bool use_bucket_pass = false;
+
+  if (use_bucket_pass) {
+    f_t max_ratio = 0.0;
+    for (i_t k = 0; k < num_breakpoints - 1; ++k) {
+      if (ratios[k] > max_ratio) { max_ratio = ratios[k]; }
+    }
+    settings_.log.printf(
+      "Starting heap passes. %d breakpoints max ratio %e\n", num_breakpoints - 1, max_ratio);
+    bucket_pass(
+      indicies, ratios, num_breakpoints - 1, slope, step_length, nonbasic_entering, entering_index);
+  }
+
+  heap_passes(
+    indicies, ratios, num_breakpoints - 1, slope, step_length, nonbasic_entering, entering_index);
+
+  if constexpr (verbose) {
+    settings_.log.printf("BFRT step length %e entering index %d non basic entering %d pivot %e\n",
+                         step_length,
+                         entering_index,
+                         nonbasic_entering,
+                         std::abs(delta_z_[entering_index]));
+  }
+  return entering_index;
+}
+
+template <typename i_t, typename f_t>
+void bound_flipping_ratio_test_t<i_t, f_t>::heap_passes(const std::vector<i_t>& current_indicies,
+                                                        const std::vector<f_t>& current_ratios,
+                                                        i_t num_breakpoints,
+                                                        f_t& slope,
+                                                        f_t& step_length,
+                                                        i_t& nonbasic_entering,
+                                                        i_t& entering_index)
+{
+  std::vector<i_t> bare_idx(num_breakpoints);
+  constexpr bool verbose                = false;
+  const f_t dual_tol                    = settings_.dual_tol;
+  const f_t zero_tol                    = settings_.zero_tol;
+  const std::vector<f_t>& delta_z       = delta_z_;
+  const std::vector<i_t>& nonbasic_list = nonbasic_list_;
+  const i_t N                           = num_breakpoints;
+  for (i_t k = 0; k < N; ++k) {
+    bare_idx[k] = k;
+    if constexpr (verbose) {
+      settings_.log.printf("Adding index %d ratio %e pivot %e to heap\n",
+                           current_indicies[k],
+                           current_ratios[k],
+                           std::abs(delta_z[nonbasic_list[current_indicies[k]]]));
+    }
+  }
+
+  auto compare = [zero_tol, &current_ratios, &current_indicies, &delta_z, &nonbasic_list](
+                   const i_t& a, const i_t& b) {
+    return (current_ratios[a] > current_ratios[b]) ||
+           (current_ratios[b] - current_ratios[a] < zero_tol &&
+            std::abs(delta_z[nonbasic_list[current_indicies[a]]]) >
+              std::abs(delta_z[nonbasic_list[current_indicies[b]]]));
+  };
+
+  std::make_heap(bare_idx.begin(), bare_idx.end(), compare);
+
+  while (bare_idx.size() > 0 && slope > 0) {
+    // Remove minimum ratio from the heap and rebalance
+    i_t heap_index = bare_idx.front();
+    std::pop_heap(bare_idx.begin(), bare_idx.end(), compare);
+    bare_idx.pop_back();
+
+    nonbasic_entering = current_indicies[heap_index];
+    const i_t j = entering_index = nonbasic_list_[nonbasic_entering];
+    step_length                  = current_ratios[heap_index];
+
+    if (bounded_variables_[j]) {
+      // We have a bounded variable
+      const f_t interval    = upper_[j] - lower_[j];
+      const f_t delta_slope = std::abs(delta_z_[j]) * interval;
+      const f_t pivot       = std::abs(delta_z[j]);
+      if constexpr (verbose) {
+        settings_.log.printf(
+          "heap %d step-length %.12e pivot %e nonbasic entering %d slope %e delta_slope %e new "
+          "slope %e\n",
+          bare_idx.size(),
+          current_ratios[heap_index],
+          pivot,
+          nonbasic_entering,
+          slope,
+          delta_slope,
+          slope - delta_slope);
+      }
+      slope -= delta_slope;
+    } else {
+      // The variable is not bounded. Stop the search.
+      break;
+    }
+
+    if (toc(start_time_) > settings_.time_limit) {
+      entering_index = -2;
+      return;
+    }
+    if (settings_.concurrent_halt != nullptr &&
+        settings_.concurrent_halt->load(std::memory_order_acquire) == 1) {
+      entering_index = -3;
+      return;
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+void bound_flipping_ratio_test_t<i_t, f_t>::bucket_pass(const std::vector<i_t>& current_indicies,
+                                                        const std::vector<f_t>& current_ratios,
+                                                        i_t num_breakpoints,
+                                                        f_t& slope,
+                                                        f_t& step_length,
+                                                        i_t& nonbasic_entering,
+                                                        i_t& entering_index)
+{
+  const f_t dual_tol                    = settings_.dual_tol;
+  const f_t zero_tol                    = settings_.zero_tol;
+  const std::vector<f_t>& delta_z       = delta_z_;
+  const std::vector<i_t>& nonbasic_list = nonbasic_list_;
+  const i_t N                           = num_breakpoints;
+
+  const i_t K = 400;  // 0, -16, -15, ...., 0, 1, ...., 400 - 18 = 382
+  std::vector<f_t> buckets(K, 0.0);
+  std::vector<i_t> bucket_count(K, 0);
+  for (i_t k = 0; k < N; ++k) {
+    const i_t idx          = current_indicies[k];
+    const f_t ratio        = current_ratios[k];
+    const f_t min_exponent = -16.0;
+    const f_t max_exponent = 382.0;
+    const f_t exponent     = std::max(min_exponent, std::min(max_exponent, std::log10(ratio)));
+    const i_t bucket_idx   = ratio == 0.0 ? 0 : static_cast<i_t>(exponent - min_exponent + 1);
+    // settings_.log.printf("Ratio %e exponent %e bucket_idx %d\n", ratio, exponent, bucket_idx);
+    const i_t j           = nonbasic_list[idx];
+    const f_t interval    = upper_[j] - lower_[j];
+    const f_t delta_slope = std::abs(delta_z_[j]) * interval;
+    buckets[bucket_idx] += delta_slope;
+    bucket_count[bucket_idx]++;
+  }
+
+  std::vector<f_t> cumulative_sum(K, 0.0);
+  cumulative_sum[0] = buckets[0];
+  if (cumulative_sum[0] > slope) {
+    settings_.log.printf(
+      "Bucket 0. Count in bucket %d. Slope %e. Cumulative sum %e. Bucket value %e\n",
+      bucket_count[0],
+      slope,
+      cumulative_sum[0],
+      buckets[0]);
+    return;
+  }
+  i_t k;
+  bool exceeded = false;
+  for (k = 1; k < K; ++k) {
+    cumulative_sum[k] = cumulative_sum[k - 1] + buckets[k];
+    if (cumulative_sum[k] > slope) {
+      exceeded = true;
+      break;
+    }
+  }
+
+  if (exceeded) {
+    settings_.log.printf(
+      "Value in bucket %d. Count in buckets %d. Slope %e. Cumulative sum %e. Next sum %e Bucket "
+      "value %e\n",
+      k,
+      bucket_count[k],
+      slope,
+      cumulative_sum[k - 1],
+      cumulative_sum[k],
+      buckets[k - 1]);
+  }
+}
+
+#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
+
+template class bound_flipping_ratio_test_t<int, double>;
+
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
new file mode 100644
index 000000000..1d741ba28
--- /dev/null
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
@@ -0,0 +1,107 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <dual_simplex/initial_basis.hpp>
+#include <dual_simplex/simplex_solver_settings.hpp>
+
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+class bound_flipping_ratio_test_t {
+ public:
+  bound_flipping_ratio_test_t(const simplex_solver_settings_t<i_t, f_t>& settings,
+                              f_t start_time,
+                              i_t m,
+                              i_t n,
+                              f_t initial_slope,
+                              const std::vector<f_t>& lower,
+                              const std::vector<f_t>& upper,
+                              const std::vector<uint8_t>& bounded_variables,
+                              const std::vector<variable_status_t>& vstatus,
+                              const std::vector<i_t>& nonbasic_list,
+                              const std::vector<f_t>& z,
+                              const std::vector<f_t>& delta_z,
+                              const std::vector<i_t>& delta_z_indices,
+                              const std::vector<i_t>& nonbasic_mark)
+    : settings_(settings),
+      start_time_(start_time),
+      m_(m),
+      n_(n),
+      slope_(initial_slope),
+      lower_(lower),
+      upper_(upper),
+      bounded_variables_(bounded_variables),
+      vstatus_(vstatus),
+      nonbasic_list_(nonbasic_list),
+      z_(z),
+      delta_z_(delta_z),
+      delta_z_indices_(delta_z_indices),
+      nonbasic_mark_(nonbasic_mark)
+  {
+  }
+
+  i_t compute_step_length(f_t& step_length, i_t& nonbasic_entering);
+
+ private:
+  i_t compute_breakpoints(std::vector<i_t>& indices, std::vector<f_t>& ratios);
+  i_t single_pass(i_t start,
+                  i_t end,
+                  const std::vector<i_t>& indices,
+                  const std::vector<f_t>& ratios,
+                  f_t& slope,
+                  f_t& step_length,
+                  i_t& nonbasic_entering,
+                  i_t& enetering_index);
+  void heap_passes(const std::vector<i_t>& current_indicies,
+                   const std::vector<f_t>& current_ratios,
+                   i_t num_breakpoints,
+                   f_t& slope,
+                   f_t& step_lenght,
+                   i_t& nonbasic_entering,
+                   i_t& entering_index);
+
+  void bucket_pass(const std::vector<i_t>& current_indicies,
+                   const std::vector<f_t>& current_ratios,
+                   i_t num_breakpoints,
+                   f_t& slope,
+                   f_t& step_length,
+                   i_t& nonbasic_entering,
+                   i_t& entering_index);
+
+  const std::vector<f_t>& lower_;
+  const std::vector<f_t>& upper_;
+  const std::vector<uint8_t>& bounded_variables_;
+  const std::vector<i_t>& nonbasic_list_;
+  const std::vector<variable_status_t>& vstatus_;
+  const std::vector<f_t>& z_;
+  const std::vector<f_t>& delta_z_;
+  const std::vector<i_t>& delta_z_indices_;
+  const std::vector<i_t>& nonbasic_mark_;
+
+  const simplex_solver_settings_t<i_t, f_t>& settings_;
+
+  f_t start_time_;
+  f_t slope_;
+
+  i_t n_;
+  i_t m_;
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index e141a71bf..906704012 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -205,6 +205,29 @@ void graphviz_edge(const simplex_solver_settings_t<i_t, f_t>& settings,
   }
 }
 
+dual::status_t convert_lp_status_to_dual_status(lp_status_t status)
+{
+  if (status == lp_status_t::OPTIMAL) {
+    return dual::status_t::OPTIMAL;
+  } else if (status == lp_status_t::INFEASIBLE) {
+    return dual::status_t::DUAL_UNBOUNDED;
+  } else if (status == lp_status_t::ITERATION_LIMIT) {
+    return dual::status_t::ITERATION_LIMIT;
+  } else if (status == lp_status_t::TIME_LIMIT) {
+    return dual::status_t::TIME_LIMIT;
+  } else if (status == lp_status_t::NUMERICAL_ISSUES) {
+    return dual::status_t::NUMERICAL;
+  } else if (status == lp_status_t::CUTOFF) {
+    return dual::status_t::CUTOFF;
+  } else if (status == lp_status_t::CONCURRENT_LIMIT) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  } else if (status == lp_status_t::UNSET) {
+    return dual::status_t::UNSET;
+  } else {
+    return dual::status_t::NUMERICAL;
+  }
+}
+
 }  // namespace
 
 template <typename f_t>
@@ -380,7 +403,7 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   : original_problem(user_problem), settings(solver_settings), original_lp(1, 1, 1)
 {
   start_time = tic();
-  convert_user_problem(original_problem, original_lp, new_slacks);
+  convert_user_problem(original_problem, settings, original_lp, new_slacks);
   full_variable_types(original_problem, original_lp, var_types);
 
   global_variables::mutex_upper.lock();
@@ -674,6 +697,12 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                                            leaf_solution,
                                            node_iter,
                                            leaf_edge_norms);
+    if (lp_status == dual::status_t::NUMERICAL) {
+      settings.log.printf("Numerical issue node %d. Resolving from scratch.\n", nodes_explored);
+      lp_status_t second_status = solve_linear_program_advanced(
+        leaf_problem, lp_start_time, lp_settings, leaf_solution, leaf_vstatus, leaf_edge_norms);
+      lp_status = convert_lp_status_to_dual_status(second_status);
+    }
     total_lp_solve_time += toc(lp_start_time);
     total_lp_iters += node_iter;
 
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index cbfc66a22..a83e3bf72 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -17,14 +17,17 @@
 
 #include <dual_simplex/basis_solves.hpp>
 #include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bound_flipping_ratio_test.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase1.hpp>
 #include <dual_simplex/phase2.hpp>
+#include <dual_simplex/random.hpp>
 #include <dual_simplex/solve.hpp>
 #include <dual_simplex/sparse_matrix.hpp>
 #include <dual_simplex/tic_toc.hpp>
 
 #include <cassert>
+#include <cmath>
 #include <cstdio>
 #include <iterator>
 #include <limits>
@@ -34,17 +37,467 @@ namespace cuopt::linear_programming::dual_simplex {
 
 namespace phase2 {
 
+// Computes vectors farkas_y, farkas_zl, farkas_zu that satisfy
+//
+// A'*farkas_y + farkas_zl - farkas_zu ~= 0
+// farkas_zl, farkas_zu >= 0,
+// b'*farkas_y + l'*farkas_zl - u'*farkas_zu = farkas_constant > 0
+//
+// This is a Farkas certificate for the infeasibility of the primal problem
+//
+// A*x = b, l <= x <= u
+template <typename i_t, typename f_t>
+void compute_farkas_certificate(const lp_problem_t<i_t, f_t>& lp,
+                                const simplex_solver_settings_t<i_t, f_t>& settings,
+                                const std::vector<variable_status_t>& vstatus,
+                                const std::vector<f_t>& x,
+                                const std::vector<f_t>& y,
+                                const std::vector<f_t>& z,
+                                const std::vector<f_t>& delta_y,
+                                const std::vector<f_t>& delta_z,
+                                i_t direction,
+                                i_t leaving_index,
+                                f_t obj_val,
+                                std::vector<f_t>& farkas_y,
+                                std::vector<f_t>& farkas_zl,
+                                std::vector<f_t>& farkas_zu,
+                                f_t& farkas_constant)
+{
+  const i_t m = lp.num_rows;
+  const i_t n = lp.num_cols;
+
+  std::vector<f_t> original_residual = z;
+  matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, original_residual);
+  for (i_t j = 0; j < n; ++j) {
+    original_residual[j] -= lp.objective[j];
+  }
+  const f_t original_residual_norm = vector_norm2<i_t, f_t>(original_residual);
+  settings.log.printf("|| A'*y + z - c || = %e\n", original_residual_norm);
+
+  std::vector<f_t> zl(n);
+  std::vector<f_t> zu(n);
+  for (i_t j = 0; j < n; ++j) {
+    zl[j] = std::max(0.0, z[j]);
+    zu[j] = -std::min(0.0, z[j]);
+  }
+
+  original_residual = zl;
+  matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, original_residual);
+  for (i_t j = 0; j < n; ++j) {
+    original_residual[j] -= (zu[j] + lp.objective[j]);
+  }
+  const f_t original_residual_2 = vector_norm2<i_t, f_t>(original_residual);
+  settings.log.printf("|| A'*y + zl - zu - c || = %e\n", original_residual_2);
+
+  std::vector<f_t> search_dir_residual = delta_z;
+  matrix_transpose_vector_multiply(lp.A, 1.0, delta_y, 1.0, search_dir_residual);
+  settings.log.printf("|| A'*delta_y + delta_z || = %e\n",
+                      vector_norm2<i_t, f_t>(search_dir_residual));
+
+  std::vector<f_t> y_bar(m);
+  for (i_t i = 0; i < m; ++i) {
+    y_bar[i] = y[i] + delta_y[i];
+  }
+  original_residual = z;
+  matrix_transpose_vector_multiply(lp.A, 1.0, y_bar, 1.0, original_residual);
+  for (i_t j = 0; j < n; ++j) {
+    original_residual[j] += (delta_z[j] - lp.objective[j]);
+  }
+  const f_t original_residual_3 = vector_norm2<i_t, f_t>(original_residual);
+  settings.log.printf("|| A'*(y + delta_y) + (z + delta_z) - c || = %e\n", original_residual_3);
+
+  farkas_y.resize(m);
+  farkas_zl.resize(n);
+  farkas_zu.resize(n);
+
+  f_t gamma = 0.0;
+  for (i_t j = 0; j < n; ++j) {
+    const f_t cj    = lp.objective[j];
+    const f_t lower = lp.lower[j];
+    const f_t upper = lp.upper[j];
+    if (lower > -inf) { gamma -= lower * std::min(0.0, cj); }
+    if (upper < inf) { gamma -= upper * std::max(0.0, cj); }
+  }
+  printf("gamma = %e\n", gamma);
+
+  const f_t threshold          = 1.0;
+  const f_t positive_threshold = std::max(-gamma, 0.0) + threshold;
+  printf("positive_threshold = %e\n", positive_threshold);
+
+  // We need to increase the dual objective to positive threshold
+  f_t alpha        = threshold;
+  const f_t infeas = (direction == 1) ? (lp.lower[leaving_index] - x[leaving_index])
+                                      : (x[leaving_index] - lp.upper[leaving_index]);
+  // We need the new objective to be at least positive_threshold
+  // positive_threshold = obj_val+ alpha * infeas
+  // infeas > 0, alpha > 0, positive_threshold > 0
+  printf("direction = %d\n", direction);
+  printf(
+    "lower %e x %e upper %d\n", lp.lower[leaving_index], x[leaving_index], lp.upper[leaving_index]);
+  printf("infeas = %e\n", infeas);
+  printf("obj_val = %e\n", obj_val);
+  alpha = std::max(threshold, (positive_threshold - obj_val) / infeas);
+  printf("alpha = %e\n", alpha);
+
+  std::vector<f_t> y_prime(m);
+  std::vector<f_t> zl_prime(n);
+  std::vector<f_t> zu_prime(n);
+
+  // farkas_y = y + alpha * delta_y
+  for (i_t i = 0; i < m; ++i) {
+    farkas_y[i] = y[i] + alpha * delta_y[i];
+    y_prime[i]  = y[i] + alpha * delta_y[i];
+  }
+  // farkas_zl = z + alpha * delta_z  - c-
+  for (i_t j = 0; j < n; ++j) {
+    const f_t cj        = lp.objective[j];
+    const f_t z_j       = z[j];
+    const f_t delta_z_j = delta_z[j];
+    farkas_zl[j] = std::max(0.0, z_j) + alpha * std::max(0.0, delta_z_j) + -std::min(0.0, cj);
+    zl_prime[j]  = zl[j] + alpha * std::max(0.0, delta_z_j);
+  }
+
+  // farkas_zu = z + alpha * delta_z + c+
+  for (i_t j = 0; j < n; ++j) {
+    const f_t cj        = lp.objective[j];
+    const f_t z_j       = z[j];
+    const f_t delta_z_j = delta_z[j];
+    farkas_zu[j] = -std::min(0.0, z_j) - alpha * std::min(0.0, delta_z_j) + std::max(0.0, cj);
+    zu_prime[j]  = zu[j] + alpha * (-std::min(0.0, delta_z_j));
+  }
+
+  // farkas_constant = b'*farkas_y + l'*farkas_zl - u'*farkas_zu
+  farkas_constant   = 0.0;
+  f_t test_constant = 0.0;
+  f_t test_3        = 0.0;
+  for (i_t i = 0; i < m; ++i) {
+    farkas_constant += lp.rhs[i] * farkas_y[i];
+    test_constant += lp.rhs[i] * y_prime[i];
+    test_3 += lp.rhs[i] * delta_y[i];
+  }
+  printf("b'*delta_y = %e\n", test_3);
+  printf("|| b || %e\n", vector_norm_inf<i_t, f_t>(lp.rhs));
+  printf("|| delta y || %e\n", vector_norm_inf<i_t, f_t>(delta_y));
+  for (i_t j = 0; j < n; ++j) {
+    const f_t lower = lp.lower[j];
+    const f_t upper = lp.upper[j];
+    if (lower > -inf) {
+      farkas_constant += lower * farkas_zl[j];
+      test_constant += lower * zl_prime[j];
+      const f_t delta_z_l_j = std::max(delta_z[j], 0.0);
+      test_3 += lower * delta_z_l_j;
+    }
+    if (upper < inf) {
+      farkas_constant -= upper * farkas_zu[j];
+      test_constant -= upper * zu_prime[j];
+      const f_t delta_z_u_j = -std::min(delta_z[j], 0.0);
+      test_3 -= upper * delta_z_u_j;
+    }
+  }
+
+  // Verify that the Farkas certificate is valid
+  std::vector<f_t> residual = farkas_zl;
+  matrix_transpose_vector_multiply(lp.A, 1.0, farkas_y, 1.0, residual);
+  for (i_t j = 0; j < n; ++j) {
+    residual[j] -= farkas_zu[j];
+  }
+  const f_t residual_norm = vector_norm2<i_t, f_t>(residual);
+
+  f_t zl_min = 0.0;
+  for (i_t j = 0; j < n; ++j) {
+    zl_min = std::min(zl_min, farkas_zl[j]);
+  }
+  settings.log.printf("farkas_zl_min = %e\n", zl_min);
+  f_t zu_min = 0.0;
+  for (i_t j = 0; j < n; ++j) {
+    zu_min = std::min(zu_min, farkas_zu[j]);
+  }
+  settings.log.printf("farkas_zu_min = %e\n", zu_min);
+
+  settings.log.printf("|| A'*farkas_y + farkas_zl - farkas_zu || = %e\n", residual_norm);
+  settings.log.printf("b'*farkas_y + l'*farkas_zl - u'*farkas_zu = %e\n", farkas_constant);
+
+  if (residual_norm < 1e-6 && farkas_constant > 0.0 && zl_min >= 0.0 && zu_min >= 0.0) {
+    settings.log.printf("Farkas certificate of infeasibility constructed\n");
+  }
+}
+
+template <typename i_t, typename f_t>
+void initial_perturbation(const lp_problem_t<i_t, f_t>& lp,
+                          const simplex_solver_settings_t<i_t, f_t>& settings,
+                          const std::vector<variable_status_t>& vstatus,
+                          std::vector<f_t>& objective)
+{
+  const i_t m           = lp.num_rows;
+  const i_t n           = lp.num_cols;
+  f_t max_abs_obj_coeff = 0.0;
+  for (i_t j = 0; j < n; ++j) {
+    max_abs_obj_coeff = std::max(max_abs_obj_coeff, std::abs(lp.objective[j]));
+  }
+
+  const f_t dual_tol = settings.dual_tol;
+
+  std::srand(static_cast<unsigned int>(std::time(nullptr)));
+
+  objective.resize(n);
+  f_t sum_perturb = 0.0;
+  i_t num_perturb = 0;
+
+  random_t<i_t, f_t> random(settings.seed);
+  for (i_t j = 0; j < n; ++j) {
+    f_t obj = objective[j] = lp.objective[j];
+
+    const f_t lower = lp.lower[j];
+    const f_t upper = lp.upper[j];
+    if (vstatus[j] == variable_status_t::NONBASIC_FIXED ||
+        vstatus[j] == variable_status_t::NONBASIC_FREE || lower == upper ||
+        lower == -inf && upper == inf) {
+      continue;
+    }
+
+    const f_t rand_val = random.random();
+    const f_t perturb =
+      (1e-5 * std::abs(obj) + 1e-7 * max_abs_obj_coeff + 10 * dual_tol) * (1.0 + rand_val);
+
+    if (vstatus[j] == variable_status_t::NONBASIC_LOWER || lower > -inf && upper < inf && obj > 0) {
+      objective[j] = obj + perturb;
+      sum_perturb += perturb;
+      num_perturb++;
+    } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER ||
+               lower > -inf && upper < inf && obj < 0) {
+      objective[j] = obj - perturb;
+      sum_perturb += perturb;
+      num_perturb++;
+    }
+  }
+
+  settings.log.printf("Applied initial perturbation of %e to %d/%d objective coefficients\n",
+                      sum_perturb,
+                      num_perturb,
+                      n);
+}
+
+template <typename i_t, typename f_t>
+void compute_reduced_cost_update(const lp_problem_t<i_t, f_t>& lp,
+                                 const std::vector<i_t>& basic_list,
+                                 const std::vector<i_t>& nonbasic_list,
+                                 const std::vector<f_t>& delta_y,
+                                 i_t leaving_index,
+                                 i_t direction,
+                                 std::vector<i_t>& delta_z_mark,
+                                 std::vector<i_t>& delta_z_indices,
+                                 std::vector<f_t>& delta_z)
+{
+  const i_t m = lp.num_rows;
+  const i_t n = lp.num_cols;
+
+  // delta_zB = sigma*ei
+  for (i_t k = 0; k < m; k++) {
+    const i_t j = basic_list[k];
+    delta_z[j]  = 0;
+  }
+  delta_z[leaving_index] = direction;
+  // delta_zN = -N'*delta_y
+  for (i_t k = 0; k < n - m; k++) {
+    const i_t j = nonbasic_list[k];
+    // z_j <- -A(:, j)'*delta_y
+    const i_t col_start = lp.A.col_start[j];
+    const i_t col_end   = lp.A.col_start[j + 1];
+    f_t dot             = 0.0;
+    for (i_t p = col_start; p < col_end; ++p) {
+      dot += lp.A.x[p] * delta_y[lp.A.i[p]];
+    }
+    delta_z[j] = -dot;
+    if (dot != 0.0) {
+      delta_z_indices.push_back(j);  // Note delta_z_indices has n elements reserved
+      delta_z_mark[j] = 1;
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+void compute_delta_z(const csc_matrix_t<i_t, f_t>& A_transpose,
+                     const sparse_vector_t<i_t, f_t>& delta_y,
+                     i_t leaving_index,
+                     i_t direction,
+                     std::vector<i_t>& nonbasic_mark,
+                     std::vector<i_t>& delta_z_mark,
+                     std::vector<i_t>& delta_z_indices,
+                     std::vector<f_t>& delta_z)
+{
+  // delta_zN = - N'*delta_y
+  const i_t nz_delta_y = delta_y.i.size();
+  for (i_t k = 0; k < nz_delta_y; k++) {
+    const i_t i         = delta_y.i[k];
+    const f_t delta_y_i = delta_y.x[k];
+    if (std::abs(delta_y_i) < 1e-12) { continue; }
+    const i_t row_start = A_transpose.col_start[i];
+    const i_t row_end   = A_transpose.col_start[i + 1];
+    for (i_t p = row_start; p < row_end; ++p) {
+      const i_t j = A_transpose.i[p];
+      if (nonbasic_mark[j] >= 0) {
+        delta_z[j] -= delta_y_i * A_transpose.x[p];
+        if (!delta_z_mark[j]) {
+          delta_z_mark[j] = 1;
+          delta_z_indices.push_back(j);  // Note delta_z_indices has n elements reserved
+        }
+      }
+    }
+  }
+
+  // delta_zB = sigma*ei
+  delta_z[leaving_index] = direction;
+
+#ifdef CHECK_CHANGE_IN_REDUCED_COST
+  delta_y_sparse.to_dense(delta_y);
+  std::vector<f_t> delta_z_check(n);
+  std::vector<i_t> delta_z_mark_check(n, 0);
+  std::vector<i_t> delta_z_indices_check;
+  phase2::compute_reduced_cost_update(lp,
+                                      basic_list,
+                                      nonbasic_list,
+                                      delta_y,
+                                      leaving_index,
+                                      direction,
+                                      delta_z_mark_check,
+                                      delta_z_indices_check,
+                                      delta_z_check);
+  f_t error_check = 0.0;
+  for (i_t k = 0; k < n; ++k) {
+    const f_t diff = std::abs(delta_z[k] - delta_z_check[k]);
+    if (diff > 1e-6) {
+      printf("delta_z error %d transpose %e no transpose %e diff %e\n",
+             k,
+             delta_z[k],
+             delta_z_check[k],
+             diff);
+    }
+    error_check = std::max(error_check, diff);
+  }
+  if (error_check > 1e-6) { printf("delta_z error %e\n", error_check); }
+#endif
+}
+
+template <typename i_t, typename f_t>
+void compute_reduced_costs(const std::vector<f_t>& objective,
+                           const csc_matrix_t<i_t, f_t>& A,
+                           const std::vector<f_t>& y,
+                           const std::vector<i_t>& basic_list,
+                           const std::vector<i_t>& nonbasic_list,
+                           std::vector<f_t>& z)
+{
+  const i_t m = A.m;
+  const i_t n = A.n;
+  // zN = cN - N'*y
+  for (i_t k = 0; k < n - m; k++) {
+    const i_t j = nonbasic_list[k];
+    // z_j <- c_j
+    z[j] = objective[j];
+
+    // z_j <- z_j - A(:, j)'*y
+    const i_t col_start = A.col_start[j];
+    const i_t col_end   = A.col_start[j + 1];
+    f_t dot             = 0.0;
+    for (i_t p = col_start; p < col_end; ++p) {
+      dot += A.x[p] * y[A.i[p]];
+    }
+    z[j] -= dot;
+  }
+  // zB = 0
+  for (i_t k = 0; k < m; ++k) {
+    z[basic_list[k]] = 0.0;
+  }
+}
+
+template <typename i_t, typename f_t>
+void compute_primal_variables(const basis_update_mpf_t<i_t, f_t>& ft,
+                              const std::vector<f_t>& lp_rhs,
+                              const csc_matrix_t<i_t, f_t>& A,
+                              const std::vector<i_t>& basic_list,
+                              const std::vector<i_t>& nonbasic_list,
+                              f_t tight_tol,
+                              std::vector<f_t>& x)
+{
+  const i_t m          = A.m;
+  const i_t n          = A.n;
+  std::vector<f_t> rhs = lp_rhs;
+  // rhs = b - sum_{j : x_j = l_j} A(:, j) * l(j)
+  //         - sum_{j : x_j = u_j} A(:, j) * u(j)
+  for (i_t k = 0; k < n - m; ++k) {
+    const i_t j         = nonbasic_list[k];
+    const i_t col_start = A.col_start[j];
+    const i_t col_end   = A.col_start[j + 1];
+    const f_t xj        = x[j];
+    if (std::abs(xj) < tight_tol * 10) continue;
+    for (i_t p = col_start; p < col_end; ++p) {
+      rhs[A.i[p]] -= xj * A.x[p];
+    }
+  }
+
+  std::vector<f_t> xB(m);
+  ft.b_solve(rhs, xB);
+
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j = basic_list[k];
+    x[j]        = xB[k];
+  }
+}
+
+template <typename i_t, typename f_t>
+void clear_delta_z(i_t entering_index,
+                   i_t leaving_index,
+                   std::vector<i_t>& delta_z_mark,
+                   std::vector<i_t>& delta_z_indices,
+                   std::vector<f_t>& delta_z)
+{
+  for (i_t k = 0; k < delta_z_indices.size(); k++) {
+    const i_t j     = delta_z_indices[k];
+    delta_z[j]      = 0.0;
+    delta_z_mark[j] = 0;
+  }
+  if (entering_index != -1) { delta_z[entering_index] = 0.0; }
+  delta_z[leaving_index] = 0.0;
+  delta_z_indices.clear();
+}
+
+template <typename i_t, typename f_t>
+void clear_delta_x(const std::vector<i_t>& basic_list,
+                   i_t entering_index,
+                   sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
+                   std::vector<f_t>& delta_x)
+{
+  const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size();
+  for (i_t k = 0; k < scaled_delta_xB_nz; ++k) {
+    const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
+    delta_x[j]  = 0.0;
+  }
+  // Leaving index already included above
+  delta_x[entering_index] = 0.0;
+  scaled_delta_xB_sparse.i.clear();
+  scaled_delta_xB_sparse.x.clear();
+}
+
+template <typename i_t, typename f_t>
+void compute_dual_residual(const csc_matrix_t<i_t, f_t>& A,
+                           const std::vector<f_t>& objective,
+                           const std::vector<f_t>& y,
+                           const std::vector<f_t>& z,
+                           std::vector<f_t>& dual_residual)
+{
+  dual_residual = z;
+  const i_t n   = A.n;
+  // r = A'*y + z  - c
+  for (i_t j = 0; j < n; ++j) {
+    dual_residual[j] -= objective[j];
+  }
+  matrix_transpose_vector_multiply(A, 1.0, y, 1.0, dual_residual);
+}
+
 template <typename i_t, typename f_t>
 f_t l2_dual_residual(const lp_problem_t<i_t, f_t>& lp, const lp_solution_t<i_t, f_t>& solution)
 {
-  std::vector<f_t> dual_residual = solution.z;
-  const i_t n                    = lp.num_cols;
-  // dual_residual <- z - c
-  for (i_t j = 0; j < n; j++) {
-    dual_residual[j] -= lp.objective[j];
-  }
-  // dual_residual <- 1.0*A'*y + 1.0*(z - c)
-  matrix_transpose_vector_multiply(lp.A, 1.0, solution.y, 1.0, dual_residual);
+  std::vector<f_t> dual_residual;
+  compute_dual_residual(lp.A, lp.objective, solution.y, solution.z, dual_residual);
   return vector_norm2<i_t, f_t>(dual_residual);
 }
 
@@ -56,9 +509,38 @@ f_t l2_primal_residual(const lp_problem_t<i_t, f_t>& lp, const lp_solution_t<i_t
   return vector_norm2<i_t, f_t>(primal_residual);
 }
 
+template <typename i_t, typename f_t>
+void vstatus_changes(const std::vector<variable_status_t>& vstatus,
+                     const std::vector<variable_status_t>& vstatus_old,
+                     const std::vector<f_t>& z,
+                     const std::vector<f_t>& z_old,
+                     i_t& num_vstatus_changes,
+                     i_t& num_z_changes)
+{
+  num_vstatus_changes = 0;
+  num_z_changes       = 0;
+  const i_t n         = vstatus.size();
+  for (i_t j = 0; j < n; ++j) {
+    if (vstatus[j] != vstatus_old[j]) { num_vstatus_changes++; }
+    if (std::abs(z[j] - z_old[j]) > 1e-6) { num_z_changes++; }
+  }
+}
+
+template <typename f_t>
+void compute_bounded_info(const std::vector<f_t>& lower,
+                          const std::vector<f_t>& upper,
+                          std::vector<uint8_t>& bounded_variables)
+{
+  const size_t n = lower.size();
+  for (size_t j = 0; j < n; j++) {
+    const bool bounded   = (lower[j] > -inf) && (upper[j] < inf) && (lower[j] != upper[j]);
+    bounded_variables[j] = static_cast<uint8_t>(bounded);
+  }
+}
+
 template <typename i_t, typename f_t>
 void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
-                                      basis_update_t<i_t, f_t>& ft,
+                                      basis_update_mpf_t<i_t, f_t>& ft,
                                       const std::vector<i_t>& basic_list,
                                       const std::vector<i_t>& nonbasic_list,
                                       std::vector<f_t>& y,
@@ -101,6 +583,217 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
   }
 }
 
+template <typename i_t, typename f_t>
+i_t compute_primal_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
+                                       basis_update_mpf_t<i_t, f_t>& ft,
+                                       const std::vector<i_t>& basic_list,
+                                       const std::vector<i_t>& nonbasic_list,
+                                       const std::vector<variable_status_t>& vstatus,
+                                       std::vector<f_t>& x)
+{
+  const i_t m          = lp.num_rows;
+  const i_t n          = lp.num_cols;
+  std::vector<f_t> rhs = lp.rhs;
+
+  for (i_t k = 0; k < n - m; ++k) {
+    const i_t j = nonbasic_list[k];
+    if (vstatus[j] == variable_status_t::NONBASIC_LOWER ||
+        vstatus[j] == variable_status_t::NONBASIC_FIXED) {
+      x[j] = lp.lower[j];
+    } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) {
+      x[j] = lp.upper[j];
+    } else if (vstatus[j] == variable_status_t::NONBASIC_FREE) {
+      x[j] = 0.0;
+    }
+  }
+
+  // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) *
+  // u(j)
+  for (i_t k = 0; k < n - m; ++k) {
+    const i_t j         = nonbasic_list[k];
+    const i_t col_start = lp.A.col_start[j];
+    const i_t col_end   = lp.A.col_start[j + 1];
+    const f_t xj        = x[j];
+    for (i_t p = col_start; p < col_end; ++p) {
+      rhs[lp.A.i[p]] -= xj * lp.A.x[p];
+    }
+  }
+
+  std::vector<f_t> xB(m);
+  ft.b_solve(rhs, xB);
+
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j = basic_list[k];
+    x[j]        = xB[k];
+  }
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
+                                           const simplex_solver_settings_t<i_t, f_t>& settings,
+                                           const std::vector<i_t>& basic_list,
+                                           const std::vector<f_t>& x,
+                                           std::vector<f_t>& squared_infeasibilities,
+                                           std::vector<i_t>& infeasibility_indices)
+{
+  const i_t m = lp.num_rows;
+  const i_t n = lp.num_cols;
+  squared_infeasibilities.resize(n, 0.0);
+  infeasibility_indices.reserve(n);
+  infeasibility_indices.clear();
+  f_t primal_inf = 0.0;
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j            = basic_list[k];
+    const f_t lower_infeas = lp.lower[j] - x[j];
+    const f_t upper_infeas = x[j] - lp.upper[j];
+    const f_t infeas       = std::max(lower_infeas, upper_infeas);
+    if (infeas > settings.primal_tol) {
+      const f_t square_infeas    = infeas * infeas;
+      squared_infeasibilities[j] = square_infeas;
+      infeasibility_indices.push_back(j);
+      primal_inf += square_infeas;
+    }
+  }
+  return primal_inf;
+}
+
+template <typename i_t, typename f_t>
+void update_single_primal_infeasibility(const std::vector<f_t>& lower,
+                                        const std::vector<f_t>& upper,
+                                        const std::vector<f_t>& x,
+                                        f_t primal_tol,
+                                        std::vector<f_t>& squared_infeasibilities,
+                                        std::vector<i_t>& infeasibility_indices,
+                                        i_t j,
+                                        f_t& primal_inf)
+{
+  const f_t old_val = squared_infeasibilities[j];
+  // x_j < l_j - epsilon => -x_j + l_j > epsilon
+  const f_t lower_infeas = lower[j] - x[j];
+  // x_j > u_j + epsilon => x_j - u_j > epsilon
+  const f_t upper_infeas = x[j] - upper[j];
+  const f_t infeas       = std::max(lower_infeas, upper_infeas);
+  const f_t new_val      = infeas * infeas;
+  if (infeas > primal_tol) {
+    primal_inf = std::max(0.0, primal_inf + (new_val - old_val));
+    // We are infeasible w.r.t the tolerance
+    if (old_val == 0.0) {
+      // This is a new infeasibility
+      // We need to add it to the list
+      infeasibility_indices.push_back(j);
+    } else {
+      // Already infeasible
+    }
+    squared_infeasibilities[j] = new_val;
+  } else {
+    // We are feasible w.r.t the tolerance
+    if (old_val != 0.0) {
+      // We were previously infeasible,
+      primal_inf                 = std::max(0.0, primal_inf - old_val);
+      squared_infeasibilities[j] = 0.0;
+    } else {
+      // Still feasible
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
+                                   const simplex_solver_settings_t<i_t, f_t>& settings,
+                                   const std::vector<i_t>& basic_list,
+                                   const std::vector<f_t>& x,
+                                   i_t entering_index,
+                                   i_t leaving_index,
+                                   std::vector<i_t>& basic_change_list,
+                                   std::vector<f_t>& squared_infeasibilities,
+                                   std::vector<i_t>& infeasibility_indices,
+                                   f_t& primal_inf)
+{
+  const f_t primal_tol = settings.primal_tol;
+  const i_t nz         = basic_change_list.size();
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t j = basic_list[basic_change_list[k]];
+    // The change list will contain the leaving variable,
+    // But not the entering variable.
+
+    if (j == leaving_index) {
+      // Force the leaving variable to be feasible
+      const f_t old_val          = squared_infeasibilities[j];
+      squared_infeasibilities[j] = 0.0;
+      primal_inf                 = std::max(0.0, primal_inf - old_val);
+      continue;
+    }
+    update_single_primal_infeasibility(lp.lower,
+                                       lp.upper,
+                                       x,
+                                       primal_tol,
+                                       squared_infeasibilities,
+                                       infeasibility_indices,
+                                       j,
+                                       primal_inf);
+  }
+}
+
+template <typename i_t, typename f_t>
+void clean_up_infeasibilities(std::vector<f_t>& squared_infeasibilities,
+                              std::vector<i_t>& infeasibility_indices)
+{
+  bool needs_clean_up = false;
+  for (i_t k = 0; k < infeasibility_indices.size(); ++k) {
+    const i_t j              = infeasibility_indices[k];
+    const f_t squared_infeas = squared_infeasibilities[j];
+    if (squared_infeas == 0.0) { needs_clean_up = true; }
+  }
+
+  if (needs_clean_up) {
+    for (i_t k = 0; k < infeasibility_indices.size(); ++k) {
+      const i_t j              = infeasibility_indices[k];
+      const f_t squared_infeas = squared_infeasibilities[j];
+      if (squared_infeas == 0.0) {
+        // Set to the last element
+        const i_t sz             = infeasibility_indices.size();
+        infeasibility_indices[k] = infeasibility_indices[sz - 1];
+        infeasibility_indices.pop_back();
+        i_t new_j = infeasibility_indices[k];
+        if (squared_infeasibilities[new_j] == 0.0) { k--; }
+      }
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
+                                               const simplex_solver_settings_t<i_t, f_t>& settings,
+                                               const std::vector<f_t>& x,
+                                               const std::vector<f_t>& dy_steepest_edge,
+                                               const std::vector<i_t>& basic_mark,
+                                               std::vector<f_t>& squared_infeasibilities,
+                                               std::vector<i_t>& infeasibility_indices,
+                                               i_t& direction,
+                                               i_t& basic_leaving,
+                                               f_t& max_val)
+{
+  max_val           = 0.0;
+  i_t leaving_index = -1;
+  const i_t nz      = infeasibility_indices.size();
+  for (i_t k = 0; k < nz; ++k) {
+    const i_t j              = infeasibility_indices[k];
+    const f_t squared_infeas = squared_infeasibilities[j];
+    const f_t val            = squared_infeas / dy_steepest_edge[j];
+    if (val > max_val || val == max_val && j > leaving_index) {
+      max_val                = val;
+      leaving_index          = j;
+      const f_t lower_infeas = lp.lower[j] - x[j];
+      const f_t upper_infeas = x[j] - lp.upper[j];
+      direction              = lower_infeas >= upper_infeas ? 1 : -1;
+    }
+  }
+
+  basic_leaving = leaving_index >= 0 ? basic_mark[leaving_index] : -1;
+  return leaving_index;
+}
+
 template <typename i_t, typename f_t>
 i_t steepest_edge_pricing(const lp_problem_t<i_t, f_t>& lp,
                           const simplex_solver_settings_t<i_t, f_t>& settings,
@@ -344,185 +1037,200 @@ i_t phase2_ratio_test(const lp_problem_t<i_t, f_t>& lp,
   return entering_index;
 }
 
-template <typename i_t, typename f_t>
-i_t bound_flipping_ratio_test(const lp_problem_t<i_t, f_t>& lp,
-                              const simplex_solver_settings_t<i_t, f_t>& settings,
-                              f_t start_time,
-                              const std::vector<variable_status_t>& vstatus,
-                              const std::vector<i_t>& nonbasic_list,
-                              const std::vector<f_t>& x,
-                              std::vector<f_t>& z,
-                              std::vector<f_t>& delta_z,
-                              i_t direction,
-                              i_t leaving_index,
-                              f_t& step_length,
-                              i_t& nonbasic_entering)
-{
-  const i_t n = lp.num_cols;
-  const i_t m = lp.num_rows;
-
-  f_t slope = direction == 1 ? (lp.lower[leaving_index] - x[leaving_index])
-                             : (x[leaving_index] - lp.upper[leaving_index]);
-  assert(slope > 0);
-
-  const f_t pivot_tol         = settings.pivot_tol;
-  const f_t relaxed_pivot_tol = settings.pivot_tol;
-  const f_t zero_tol          = settings.zero_tol;
-  std::list<i_t> q_pos;
-  assert(nonbasic_list.size() == n - m);
-  for (i_t k = 0; k < n - m; ++k) {
-    const i_t j = nonbasic_list[k];
-    if (vstatus[j] == variable_status_t::NONBASIC_FIXED) { continue; }
-    if (vstatus[j] == variable_status_t::NONBASIC_LOWER && delta_z[j] < -pivot_tol) {
-      q_pos.push_back(k);
-    } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && delta_z[j] > pivot_tol) {
-      q_pos.push_back(k);
-    }
-  }
-  i_t entering_index = -1;
-  step_length        = inf;
-  const f_t dual_tol = settings.dual_tol / 10;
-  while (q_pos.size() > 0 && slope > 0) {
-    // Find the minimum ratio for nonbasic variables in q_pos
-    f_t min_val = inf;
-    typename std::list<i_t>::iterator q_index;
-    i_t candidate = -1;
-    for (typename std::list<i_t>::iterator it = q_pos.begin(); it != q_pos.end(); ++it) {
-      const i_t k = *it;
-      const i_t j = nonbasic_list[k];
-      f_t ratio   = inf;
-      if (vstatus[j] == variable_status_t::NONBASIC_LOWER && delta_z[j] < -pivot_tol) {
-        ratio = (-dual_tol - z[j]) / delta_z[j];
-      } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && delta_z[j] > pivot_tol) {
-        ratio = (dual_tol - z[j]) / delta_z[j];
-      } else if (min_val != inf) {
-        // We've already found something just continue;
-      } else if (vstatus[j] == variable_status_t::NONBASIC_LOWER) {
-        ratio = (-dual_tol - z[j]) / delta_z[j];
-      } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) {
-        ratio = (dual_tol - z[j]) / delta_z[j];
-      } else {
-        assert(1 == 0);
-      }
-
-      ratio = std::max(ratio, 0.0);
-
-      if (ratio < min_val) {
-        min_val = ratio;
-        q_index = it;  // Save the iterator so we can remove the element it
-                       // points to from the q_pos list later (if it corresponds
-                       // to a bounded variable)
-        candidate = j;
-      } else if (ratio < min_val + zero_tol &&
-                 std::abs(delta_z[j]) > std::abs(delta_z[candidate])) {
-        min_val   = ratio;
-        q_index   = it;
-        candidate = j;
-      }
-    }
-    step_length       = min_val;  // Save the step length
-    nonbasic_entering = *q_index;
-    const i_t j = entering_index = nonbasic_list[nonbasic_entering];
-    if (lp.lower[j] > -inf && lp.upper[j] < inf && lp.lower[j] != lp.upper[j]) {
-      const f_t interval    = lp.upper[j] - lp.lower[j];
-      const f_t delta_slope = std::abs(delta_z[j]) * interval;
-#ifdef BOUND_FLIP_DEBUG
-      if (slope - delta_slope > 0) {
-        log.printf(
-          "Bound flip %d slope change %e prev slope %e slope %e. curr step "
-          "length %e\n",
-          j,
-          delta_slope,
-          slope,
-          slope - delta_slope,
-          step_length);
-      }
-#endif
-      slope -= delta_slope;
-      q_pos.erase(q_index);
-    } else {
-      // we hit a variable that is not bounded. Exit
-      break;
-    }
-
-    if (toc(start_time) > settings.time_limit) { return -2; }
-    if (settings.concurrent_halt != nullptr &&
-        settings.concurrent_halt->load(std::memory_order_acquire) == 1) {
-      return -3;
-    }
-  }
-  // step_length, nonbasic_entering, and entering_index are defined after the
-  // while loop
-  assert(step_length >= 0);
-
-  return entering_index;
-}
-
 template <typename i_t, typename f_t>
 i_t flip_bounds(const lp_problem_t<i_t, f_t>& lp,
                 const simplex_solver_settings_t<i_t, f_t>& settings,
+                const std::vector<uint8_t>& bounded_variables,
                 const std::vector<f_t>& objective,
                 const std::vector<f_t>& z,
+                const std::vector<i_t>& delta_z_indices,
                 const std::vector<i_t>& nonbasic_list,
                 i_t entering_index,
                 std::vector<variable_status_t>& vstatus,
                 std::vector<f_t>& delta_x,
-                std::vector<f_t>& atilde)
+                std::vector<i_t>& mark,
+                std::vector<f_t>& atilde,
+                std::vector<i_t>& atilde_index)
 {
-  f_t delta_obj = 0;
-  for (i_t j : nonbasic_list) {
+  i_t num_flipped = 0;
+  for (i_t j : delta_z_indices) {
     if (j == entering_index) { continue; }
-    const bool bounded =
-      (lp.lower[j] > -inf) && (lp.upper[j] < inf) && (lp.lower[j] != lp.upper[j]);
-    if (!bounded) { continue; }
+    if (!bounded_variables[j]) { continue; }
     // x_j is now a nonbasic bounded variable that will not enter the basis this
     // iteration
     const f_t dual_tol =
       settings.dual_tol;  // lower to 1e-7 or less will cause 25fv47 and d2q06c to cycle
     if (vstatus[j] == variable_status_t::NONBASIC_LOWER && z[j] < -dual_tol) {
       const f_t delta = lp.upper[j] - lp.lower[j];
-      scatter_dense(lp.A, j, -delta, atilde);
-      delta_obj += delta * objective[j];
+      scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index);
       delta_x[j] += delta;
       vstatus[j] = variable_status_t::NONBASIC_UPPER;
 #ifdef BOUND_FLIP_DEBUG
       settings.log.printf(
         "Flipping nonbasic %d from lo %e to up %e. z %e\n", j, lp.lower[j], lp.upper[j], z[j]);
 #endif
+      num_flipped++;
     } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && z[j] > dual_tol) {
       const f_t delta = lp.lower[j] - lp.upper[j];
-      scatter_dense(lp.A, j, -delta, atilde);
-      delta_obj += delta * objective[j];
+      scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index);
       delta_x[j] += delta;
       vstatus[j] = variable_status_t::NONBASIC_LOWER;
 #ifdef BOUND_FLIP_DEBUG
       settings.log.printf(
         "Flipping nonbasic %d from up %e to lo %e. z %e\n", j, lp.upper[j], lp.lower[j], z[j]);
 #endif
+      num_flipped++;
     }
   }
-  return 0;
+  return num_flipped;
+}
+
+template <typename i_t, typename f_t>
+void initialize_steepest_edge_norms_from_slack_basis(const std::vector<i_t>& basic_list,
+                                                     const std::vector<i_t>& nonbasic_list,
+                                                     std::vector<f_t>& delta_y_steepest_edge)
+{
+  const i_t m = basic_list.size();
+  const i_t n = delta_y_steepest_edge.size();
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j              = basic_list[k];
+    delta_y_steepest_edge[j] = 1.0;
+  }
+  const i_t n_minus_m = n - m;
+  for (i_t k = 0; k < n_minus_m; ++k) {
+    const i_t j              = nonbasic_list[k];
+    delta_y_steepest_edge[j] = 1e-4;
+  }
 }
 
 template <typename i_t, typename f_t>
-i_t initialize_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settings,
+i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
+                                   const simplex_solver_settings_t<i_t, f_t>& settings,
                                    const f_t start_time,
                                    const std::vector<i_t>& basic_list,
-                                   const basis_update_t<i_t, f_t>& ft,
+                                   basis_update_mpf_t<i_t, f_t>& ft,
                                    std::vector<f_t>& delta_y_steepest_edge)
 {
-  // TODO: Skip this initialization when starting from a slack basis
-  //       Or skip individual columns corresponding to slack variables
-  const i_t m  = basic_list.size();
+  const i_t m = basic_list.size();
+
+  // We want to compute B^T delta_y_i = -e_i
+  // If there is a column u of B^T such that B^T(:, u) = alpha * e_i than the
+  // solve delta_y_i = -1/alpha * e_u
+  // So we need to find columns of B^T (or rows of B) with only a single non-zero entry
+  f_t start_singleton_rows = tic();
+  std::vector<i_t> row_degree(m, 0);
+  std::vector<i_t> mapping(m, -1);
+  std::vector<f_t> coeff(m, 0.0);
+
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j         = basic_list[k];
+    const i_t col_start = lp.A.col_start[j];
+    const i_t col_end   = lp.A.col_start[j + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
+      const i_t i = lp.A.i[p];
+      row_degree[i]++;
+      // column j of A is column k of B
+      mapping[k] = i;
+      coeff[k]   = lp.A.x[p];
+    }
+  }
+
+#ifdef CHECK_SINGLETON_ROWS
+  csc_matrix_t<i_t, f_t> B(m, m, 0);
+  form_b(lp.A, basic_list, B);
+  csc_matrix_t<i_t, f_t> B_transpose(m, m, 0);
+  B.transpose(B_transpose);
+#endif
+
+  i_t num_singleton_rows = 0;
+  for (i_t i = 0; i < m; ++i) {
+    if (row_degree[i] == 1) {
+      num_singleton_rows++;
+#ifdef CHECK_SINGLETON_ROWS
+      const i_t col_start = B_transpose.col_start[i];
+      const i_t col_end   = B_transpose.col_start[i + 1];
+      if (col_end - col_start != 1) {
+        settings.log.printf("Singleton row %d has %d non-zero entries\n", i, col_end - col_start);
+      }
+#endif
+    }
+  }
+
+  if (num_singleton_rows > 0) {
+    settings.log.printf("Found %d singleton rows for steepest edge norms in %.2fs\n",
+                        num_singleton_rows,
+                        toc(start_singleton_rows));
+  }
+
   f_t last_log = tic();
   for (i_t k = 0; k < m; ++k) {
-    std::vector<f_t> ei(m);
-    std::vector<f_t> dy(m);
-    const i_t j = basic_list[k];
-    ei[k]       = -1.0;
-    ft.b_transpose_solve(ei, dy);
-    ei[k]          = 0.0;
-    const f_t init = vector_norm2_squared<i_t, f_t>(dy);
+    sparse_vector_t<i_t, f_t> sparse_ei(m, 1);
+    sparse_ei.x[0] = -1.0;
+    sparse_ei.i[0] = k;
+    const i_t j    = basic_list[k];
+    f_t init       = -1.0;
+    if (row_degree[mapping[k]] == 1) {
+      const i_t u     = mapping[k];
+      const f_t alpha = coeff[k];
+      // dy[u] = -1.0 / alpha;
+      f_t my_init = 1.0 / (alpha * alpha);
+      init        = my_init;
+#ifdef CHECK_HYPERSPARSE
+      std::vector<f_t> residual(m);
+      b_transpose_multiply(lp, basic_list, dy, residual);
+      float error = 0;
+      for (i_t h = 0; h < m; ++h) {
+        const f_t error_component = std::abs(residual[h] - ei[h]);
+        error += error_component;
+        if (error_component > 1e-12) {
+          settings.log.printf("Singleton row %d component %d error %e residual %e ei %e\n",
+                              k,
+                              h,
+                              error_component,
+                              residual[h],
+                              ei[h]);
+        }
+      }
+      if (error > 1e-12) { settings.log.printf("Singleton row %d error %e\n", k, error); }
+#endif
+
+#ifdef CHECK_HYPERSPARSE
+      dy[u] = 0.0;
+      ft.b_transpose_solve(ei, dy);
+      init = vector_norm2_squared<i_t, f_t>(dy);
+      if (init != my_init) {
+        settings.log.printf("Singleton row %d error %.16e init %.16e my_init %.16e\n",
+                            k,
+                            std::abs(init - my_init),
+                            init,
+                            my_init);
+      }
+#endif
+    } else {
+#if COMPARE_WITH_DENSE
+      ft.b_transpose_solve(ei, dy);
+      init = vector_norm2_squared<i_t, f_t>(dy);
+#else
+      sparse_vector_t<i_t, f_t> sparse_dy(m, 0);
+      ft.b_transpose_solve(sparse_ei, sparse_dy);
+      f_t my_init = 0.0;
+      for (i_t p = 0; p < sparse_dy.x.size(); ++p) {
+        my_init += sparse_dy.x[p] * sparse_dy.x[p];
+      }
+#endif
+#if COMPARE_WITH_DENSE
+      if (std::abs(init - my_init) > 1e-12) {
+        settings.log.printf("Singleton row %d error %.16e init %.16e my_init %.16e\n",
+                            k,
+                            std::abs(init - my_init),
+                            init,
+                            my_init);
+      }
+#endif
+      init = my_init;
+    }
+    // ei[k]          = 0.0;
+    // init = vector_norm2_squared<i_t, f_t>(dy);
     assert(init > 0);
     delta_y_steepest_edge[j] = init;
 
@@ -544,26 +1252,25 @@ i_t initialize_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& se
 template <typename i_t, typename f_t>
 i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settings,
                                const std::vector<i_t>& basic_list,
-                               const basis_update_t<i_t, f_t>& ft,
+                               const basis_update_mpf_t<i_t, f_t>& ft,
                                i_t direction,
-                               const std::vector<f_t>& delta_y,
-                               const std::vector<f_t>& scaled_delta_xB,
+                               const sparse_vector_t<i_t, f_t>& delta_y_sparse,
+                               f_t dy_norm_squared,
+                               const sparse_vector_t<i_t, f_t>& scaled_delta_xB,
                                i_t basic_leaving_index,
                                i_t entering_index,
+                               std::vector<f_t>& v,
                                std::vector<f_t>& delta_y_steepest_edge)
 {
-  i_t m = delta_y.size();
-  std::vector<f_t> v(m);
+  i_t m                = basic_list.size();
+  const i_t delta_y_nz = delta_y_sparse.i.size();
+  sparse_vector_t<i_t, f_t> v_sparse(m, 0);
   // B^T delta_y = - direction * e_basic_leaving_index
   // We want B v =  - B^{-T} e_basic_leaving_index
-  ft.b_solve(delta_y, v);
-  // if direction = -1 we need to scale v
-  if (direction == -1) {
-    for (i_t k = 0; k < m; ++k) {
-      v[k] *= -1;
-    }
-  }
-  const f_t dy_norm_squared      = vector_norm2_squared<i_t, f_t>(delta_y);
+  ft.b_solve(delta_y_sparse, v_sparse);
+  if (direction == -1) { v_sparse.negate(); }
+  v_sparse.scatter(v);
+
   const i_t leaving_index        = basic_list[basic_leaving_index];
   const f_t prev_dy_norm_squared = delta_y_steepest_edge[leaving_index];
 #ifdef STEEPEST_EDGE_DEBUG
@@ -580,17 +1287,18 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settin
 
   // B*w = A(:, leaving_index)
   // B*scaled_delta_xB = -A(:, leaving_index) so w = -scaled_delta_xB
-  const f_t wr = -scaled_delta_xB[basic_leaving_index];
+  const f_t wr = -scaled_delta_xB.find_coefficient(basic_leaving_index);
   if (wr == 0) { return -1; }
-  const f_t omegar = dy_norm_squared / (wr * wr);
-
-  for (i_t k = 0; k < m; ++k) {
+  const f_t omegar             = dy_norm_squared / (wr * wr);
+  const i_t scaled_delta_xB_nz = scaled_delta_xB.i.size();
+  for (i_t h = 0; h < scaled_delta_xB_nz; ++h) {
+    const i_t k = scaled_delta_xB.i[h];
     const i_t j = basic_list[k];
     if (k == basic_leaving_index) {
-      const f_t w_squared      = scaled_delta_xB[k] * scaled_delta_xB[k];
+      const f_t w_squared      = scaled_delta_xB.x[h] * scaled_delta_xB.x[h];
       delta_y_steepest_edge[j] = (1.0 / w_squared) * dy_norm_squared;
     } else {
-      const f_t wk = -scaled_delta_xB[k];
+      const f_t wk = -scaled_delta_xB.x[h];
       f_t new_val  = delta_y_steepest_edge[j] + wk * (2.0 * v[k] / wr + wk * omegar);
       new_val      = std::max(new_val, 1e-4);
 #ifdef STEEPEST_EDGE_DEBUG
@@ -611,23 +1319,30 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settin
     }
   }
 
+  const i_t v_nz = v_sparse.i.size();
+  for (i_t k = 0; k < v_nz; ++k) {
+    v[v_sparse.i[k]] = 0.0;
+  }
+
   return 0;
 }
 
 // Compute steepest edge info for entering variable
 template <typename i_t, typename f_t>
-i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t<i_t, f_t>& setttings,
+i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t<i_t, f_t>& settings,
                                         i_t m,
-                                        const basis_update_t<i_t, f_t>& ft,
+                                        const basis_update_mpf_t<i_t, f_t>& ft,
                                         i_t basic_leaving_index,
                                         i_t entering_index,
                                         std::vector<f_t>& steepest_edge_norms)
 {
-  std::vector<f_t> es(m);
-  es[basic_leaving_index] = -1.0;
-  std::vector<f_t> delta_ys(m);
-  ft.b_transpose_solve(es, delta_ys);
-  steepest_edge_norms[entering_index] = vector_norm2_squared<i_t, f_t>(delta_ys);
+  sparse_vector_t<i_t, f_t> es_sparse(m, 1);
+  es_sparse.i[0] = basic_leaving_index;
+  es_sparse.x[0] = -1.0;
+  sparse_vector_t<i_t, f_t> delta_ys_sparse(m, 0);
+  ft.b_transpose_solve(es_sparse, delta_ys_sparse);
+  steepest_edge_norms[entering_index] = delta_ys_sparse.norm2_squared();
+
 #ifdef STEEPEST_EDGE_DEBUG
   settings.log.printf("Steepest edge norm %e for entering j %d at i %d\n",
                       steepest_edge_norms[entering_index],
@@ -640,7 +1355,7 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t<i_t, f_t
 template <typename i_t, typename f_t>
 i_t check_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settings,
                               const std::vector<i_t>& basic_list,
-                              const basis_update_t<i_t, f_t>& ft,
+                              const basis_update_mpf_t<i_t, f_t>& ft,
                               const std::vector<f_t>& delta_y_steepest_edge)
 {
   const i_t m = basic_list.size();
@@ -664,6 +1379,7 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& setting
 template <typename i_t, typename f_t>
 i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
                          const simplex_solver_settings_t<i_t, f_t>& settings,
+                         const std::vector<i_t>& delta_z_indices,
                          std::vector<f_t>& z,
                          std::vector<f_t>& objective,
                          f_t& sum_perturb)
@@ -673,7 +1389,8 @@ i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
   const f_t tight_tol = settings.tight_tol;
   i_t num_perturb     = 0;
   sum_perturb         = 0.0;
-  for (i_t j = 0; j < n; ++j) {
+  for (i_t k = 0; k < delta_z_indices.size(); ++k) {
+    const i_t j = delta_z_indices[k];
     if (lp.upper[j] == inf && lp.lower[j] > -inf && z[j] < -tight_tol) {
       const f_t violation = -z[j];
       z[j] += violation;  // z[j] <- 0
@@ -708,6 +1425,233 @@ i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
   return 0;
 }
 
+template <typename i_t>
+void reset_basis_mark(const std::vector<i_t>& basic_list,
+                      const std::vector<i_t>& nonbasic_list,
+                      std::vector<i_t>& basic_mark,
+                      std::vector<i_t>& nonbasic_mark)
+{
+  const i_t m         = basic_list.size();
+  const i_t n         = nonbasic_mark.size();
+  const i_t n_minus_m = n - m;
+
+  for (i_t k = 0; k < n; k++) {
+    basic_mark[k] = -1;
+  }
+
+  for (i_t k = 0; k < n; k++) {
+    nonbasic_mark[k] = -1;
+  }
+
+  for (i_t k = 0; k < n_minus_m; k++) {
+    nonbasic_mark[nonbasic_list[k]] = k;
+  }
+
+  for (i_t k = 0; k < m; k++) {
+    basic_mark[basic_list[k]] = k;
+  }
+}
+
+template <typename i_t, typename f_t>
+void compute_delta_y(const basis_update_mpf_t<i_t, f_t>& ft,
+                     i_t basic_leaving_index,
+                     i_t direction,
+                     sparse_vector_t<i_t, f_t>& delta_y_sparse,
+                     sparse_vector_t<i_t, f_t>& UTsol_sparse)
+{
+  const i_t m = delta_y_sparse.n;
+  // BT*delta_y = -delta_zB = -sigma*ei
+  sparse_vector_t<i_t, f_t> ei_sparse(m, 1);
+  ei_sparse.i[0] = basic_leaving_index;
+  ei_sparse.x[0] = -direction;
+  ft.b_transpose_solve(ei_sparse, delta_y_sparse, UTsol_sparse);
+
+  if (direction != -1) {
+    // We solved BT*delta_y = -sigma*ei, but for the update we need
+    // UT*etilde = ei. So we need to flip the sign of the solution
+    // in the case that sigma == 1.
+    UTsol_sparse.negate();
+  }
+
+#ifdef CHECK_B_TRANSPOSE_SOLVE
+  std::vector<f_t> delta_y_sparse_vector_check(m);
+  delta_y_sparse.to_dense(delta_y_sparse_vector_check);
+  f_t error_check = 0.0;
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(delta_y[k] - delta_y_sparse_vector_check[k]) > 1e-6) {
+      settings.log.printf(
+        "\tBTranspose error %d %e %e\n", k, delta_y[k], delta_y_sparse_vector_check[k]);
+    }
+    error_check += std::abs(delta_y[k] - delta_y_sparse_vector_check[k]);
+  }
+  if (error_check > 1e-6) { settings.log.printf("BTranspose error %e\n", error_check); }
+  std::vector<f_t> residual(m);
+  b_transpose_multiply(lp, basic_list, delta_y_sparse_vector_check, residual);
+  for (i_t k = 0; k < m; ++k) {
+    if (std::abs(residual[k] - ei[k]) > 1e-6) {
+      settings.log.printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]);
+    }
+  }
+#endif
+}
+
+template <typename i_t, typename f_t>
+void update_dual_variables(const sparse_vector_t<i_t, f_t>& delta_y_sparse,
+                           const std::vector<i_t>& delta_z_indices,
+                           const std::vector<f_t>& delta_z,
+                           f_t step_length,
+                           i_t leaving_index,
+                           std::vector<f_t>& y,
+                           std::vector<f_t>& z)
+{
+  // Update dual variables
+  // y <- y + steplength * delta_y
+  const i_t delta_y_nz = delta_y_sparse.i.size();
+  for (i_t k = 0; k < delta_y_nz; ++k) {
+    const i_t i = delta_y_sparse.i[k];
+    y[i] += step_length * delta_y_sparse.x[k];
+  }
+  // z <- z + steplength * delta_z
+  const i_t delta_z_nz = delta_z_indices.size();
+  for (i_t k = 0; k < delta_z_nz; ++k) {
+    const i_t j = delta_z_indices[k];
+    z[j] += step_length * delta_z[j];
+  }
+  z[leaving_index] += step_length * delta_z[leaving_index];
+}
+
+template <typename i_t, typename f_t>
+void adjust_for_flips(const basis_update_mpf_t<i_t, f_t>& ft,
+                      const std::vector<i_t>& basic_list,
+                      const std::vector<i_t>& delta_z_indices,
+                      std::vector<i_t>& atilde_index,
+                      std::vector<f_t>& atilde,
+                      std::vector<i_t>& atilde_mark,
+                      sparse_vector_t<i_t, f_t>& delta_xB_0_sparse,
+                      std::vector<f_t>& delta_x_flip,
+                      std::vector<f_t>& x)
+{
+  const i_t m         = basic_list.size();
+  const i_t atilde_nz = atilde_index.size();
+  // B*delta_xB_0 = atilde
+  sparse_vector_t<i_t, f_t> atilde_sparse(m, atilde_nz);
+  for (i_t k = 0; k < atilde_nz; ++k) {
+    atilde_sparse.i[k] = atilde_index[k];
+    atilde_sparse.x[k] = atilde[atilde_index[k]];
+  }
+  ft.b_solve(atilde_sparse, delta_xB_0_sparse);
+  const i_t delta_xB_0_nz = delta_xB_0_sparse.i.size();
+  for (i_t k = 0; k < delta_xB_0_nz; ++k) {
+    const i_t j = basic_list[delta_xB_0_sparse.i[k]];
+    x[j] += delta_xB_0_sparse.x[k];
+  }
+
+  for (i_t j : delta_z_indices) {
+    x[j] += delta_x_flip[j];
+    delta_x_flip[j] = 0.0;
+  }
+
+  // Clear atilde
+  for (i_t k = 0; k < atilde_index.size(); ++k) {
+    atilde[atilde_index[k]] = 0.0;
+  }
+  // Clear atilde_mark
+  for (i_t k = 0; k < atilde_mark.size(); ++k) {
+    atilde_mark[k] = 0;
+  }
+  atilde_index.clear();
+}
+
+template <typename i_t, typename f_t>
+i_t compute_delta_x(const lp_problem_t<i_t, f_t>& lp,
+                    const basis_update_mpf_t<i_t, f_t>& ft,
+                    i_t entering_index,
+                    i_t leaving_index,
+                    i_t basic_leaving_index,
+                    i_t direction,
+                    const std::vector<i_t>& basic_list,
+                    const std::vector<f_t>& delta_x_flip,
+                    const sparse_vector_t<i_t, f_t>& rhs_sparse,
+                    const std::vector<f_t>& x,
+                    sparse_vector_t<i_t, f_t>& utilde_sparse,
+                    sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
+                    std::vector<f_t>& delta_x)
+{
+  f_t delta_x_leaving = direction == 1 ? lp.lower[leaving_index] - x[leaving_index]
+                                       : lp.upper[leaving_index] - x[leaving_index];
+  // B*w = -A(:, entering)
+  ft.b_solve(rhs_sparse, scaled_delta_xB_sparse, utilde_sparse);
+  scaled_delta_xB_sparse.negate();
+
+#ifdef CHECK_B_SOLVE
+  std::vector<f_t> scaled_delta_xB(m);
+  {
+    std::vector<f_t> residual_B(m);
+    b_multiply(lp, basic_list, scaled_delta_xB, residual_B);
+    f_t err_max = 0;
+    for (i_t k = 0; k < m; ++k) {
+      const f_t err = std::abs(rhs[k] + residual_B[k]);
+      if (err >= 1e-6) {
+        settings.log.printf(
+          "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]);
+      }
+      err_max = std::max(err_max, err);
+    }
+    if (err_max > 1e-6) { settings.log.printf("B multiply error %e\n", err_max); }
+  }
+#endif
+
+  f_t scale = scaled_delta_xB_sparse.find_coefficient(basic_leaving_index);
+  if (scale != scale) {
+    // We couldn't find a coefficient for the basic leaving index.
+    // Either this is a bug or the primal step length is inf.
+    return -1;
+  }
+  const f_t primal_step_length = delta_x_leaving / scale;
+  const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size();
+  for (i_t k = 0; k < scaled_delta_xB_nz; ++k) {
+    const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
+    delta_x[j]  = primal_step_length * scaled_delta_xB_sparse.x[k];
+  }
+  delta_x[leaving_index]  = delta_x_leaving;
+  delta_x[entering_index] = primal_step_length;
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+void update_primal_variables(const sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
+                             const std::vector<i_t>& basic_list,
+                             const std::vector<f_t>& delta_x,
+                             i_t entering_index,
+                             std::vector<f_t>& x)
+{
+  // x <- x + delta_x
+  const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size();
+  for (i_t k = 0; k < scaled_delta_xB_nz; ++k) {
+    const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
+    x[j] += delta_x[j];
+  }
+  // Leaving index already included above
+  x[entering_index] += delta_x[entering_index];
+}
+
+template <typename i_t, typename f_t>
+void update_objective(const std::vector<i_t>& basic_list,
+                      const std::vector<i_t>& changed_basic_indices,
+                      const std::vector<f_t>& objective,
+                      const std::vector<f_t>& delta_x,
+                      i_t entering_index,
+                      f_t& obj)
+{
+  const i_t changed_basic_nz = changed_basic_indices.size();
+  for (i_t k = 0; k < changed_basic_nz; ++k) {
+    const i_t j = basic_list[changed_basic_indices[k]];
+    obj += delta_x[j] * objective[j];
+  }
+  // Leaving index already included above
+  obj += delta_x[entering_index] * objective[entering_index];
+}
+
 template <typename i_t, typename f_t>
 f_t dual_infeasibility(const lp_problem_t<i_t, f_t>& lp,
                        const simplex_solver_settings_t<i_t, f_t>& settings,
@@ -833,6 +1777,103 @@ f_t primal_infeasibility(const lp_problem_t<i_t, f_t>& lp,
   return primal_inf;
 }
 
+template <typename i_t, typename f_t>
+void check_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                                  const std::vector<i_t>& basic_list,
+                                  const std::vector<f_t>& x,
+                                  const std::vector<f_t>& squared_infeasibilities,
+                                  const std::vector<i_t>& infeasibility_indices)
+{
+  const i_t m = basic_list.size();
+  for (i_t k = 0; k < m; ++k) {
+    const i_t j            = basic_list[k];
+    const f_t lower_infeas = lp.lower[j] - x[j];
+    const f_t upper_infeas = x[j] - lp.upper[j];
+    const f_t infeas       = std::max(lower_infeas, upper_infeas);
+    if (infeas > settings.primal_tol) {
+      const f_t square_infeas = infeas * infeas;
+      if (square_infeas != squared_infeasibilities[j]) {
+        settings.log.printf("Primal infeasibility mismatch %d %e != %e\n",
+                            j,
+                            square_infeas,
+                            squared_infeasibilities[j]);
+      }
+      bool found = false;
+      for (i_t h = 0; h < infeasibility_indices.size(); ++h) {
+        if (infeasibility_indices[h] == j) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) { settings.log.printf("Infeasibility index not found %d\n", j); }
+    }
+  }
+}
+
+template <typename i_t>
+void check_basic_infeasibilities(const std::vector<i_t>& basic_list,
+                                 const std::vector<i_t>& basic_mark,
+                                 const std::vector<i_t>& infeasibility_indices,
+                                 i_t info)
+{
+  for (i_t k = 0; k < infeasibility_indices.size(); ++k) {
+    const i_t j = infeasibility_indices[k];
+    if (basic_mark[j] < 0) { printf("%d basic_infeasibilities basic_mark[%d] < 0\n", info, j); }
+  }
+}
+
+template <typename i_t, typename f_t>
+void check_update(const lp_problem_t<i_t, f_t>& lp,
+                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                  const basis_update_t<i_t, f_t>& ft,
+                  const std::vector<i_t>& basic_list,
+                  const std::vector<i_t>& basic_leaving_index)
+{
+  const i_t m = basic_list.size();
+  csc_matrix_t<i_t, f_t> Btest(m, m, 1);
+  ft.multiply_lu(Btest);
+  {
+    csc_matrix_t<i_t, f_t> B(m, m, 1);
+    form_b(lp.A, basic_list, B);
+    csc_matrix_t<i_t, f_t> Diff(m, m, 1);
+    add(Btest, B, 1.0, -1.0, Diff);
+    const f_t err = Diff.norm1();
+    if (err > settings.primal_tol) { settings.log.printf("|| B - L*U || %e\n", Diff.norm1()); }
+    if (err > settings.primal_tol) {
+      for (i_t j = 0; j < m; ++j) {
+        for (i_t p = Diff.col_start[j]; p < Diff.col_start[j + 1]; ++p) {
+          const i_t i = Diff.i[p];
+          if (Diff.x[p] != 0.0) { settings.log.printf("Diff %d %d %e\n", j, i, Diff.x[p]); }
+        }
+      }
+    }
+    settings.log.printf("basic leaving index %d\n", basic_leaving_index);
+    assert(err < settings.primal_tol);
+  }
+}
+
+template <typename i_t, typename f_t>
+void check_basis_mark(const simplex_solver_settings_t<i_t, f_t>& settings,
+                      const std::vector<i_t>& basic_list,
+                      const std::vector<i_t>& nonbasic_list,
+                      const std::vector<i_t>& basic_mark,
+                      const std::vector<i_t>& nonbasic_mark)
+{
+  const i_t m = basic_list.size();
+  const i_t n = basic_mark.size();
+  for (i_t k = 0; k < m; k++) {
+    if (basic_mark[basic_list[k]] != k) {
+      settings.log.printf("Basic mark %d %d\n", basic_list[k], k);
+    }
+  }
+  for (i_t k = 0; k < n - m; k++) {
+    if (nonbasic_mark[nonbasic_list[k]] != k) {
+      settings.log.printf("Nonbasic mark %d %d\n", nonbasic_list[k], k);
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 void bound_info(const lp_problem_t<i_t, f_t>& lp,
                 const simplex_solver_settings_t<i_t, f_t>& settings)
@@ -953,10 +1994,32 @@ void set_primal_variables_on_bounds(const lp_problem_t<i_t, f_t>& lp,
   }
 }
 
+template <typename f_t>
+f_t compute_perturbed_objective(const std::vector<f_t>& objective, const std::vector<f_t>& x)
+{
+  const size_t n = objective.size();
+  f_t obj_val    = 0.0;
+  for (size_t j = 0; j < n; ++j) {
+    obj_val += objective[j] * x[j];
+  }
+  return obj_val;
+}
+
+template <typename i_t, typename f_t>
+f_t amount_of_perturbation(const lp_problem_t<i_t, f_t>& lp, const std::vector<f_t>& objective)
+{
+  f_t perturbation = 0.0;
+  const i_t n      = lp.num_cols;
+  for (i_t j = 0; j < n; ++j) {
+    perturbation += std::abs(lp.objective[j] - objective[j]);
+  }
+  return perturbation;
+}
+
 template <typename i_t, typename f_t>
 void prepare_optimality(const lp_problem_t<i_t, f_t>& lp,
                         const simplex_solver_settings_t<i_t, f_t>& settings,
-                        basis_update_t<i_t, f_t>& ft,
+                        basis_update_mpf_t<i_t, f_t>& ft,
                         const std::vector<f_t>& objective,
                         const std::vector<i_t>& basic_list,
                         const std::vector<i_t>& nonbasic_list,
@@ -975,11 +2038,7 @@ void prepare_optimality(const lp_problem_t<i_t, f_t>& lp,
 
   sol.objective      = compute_objective(lp, sol.x);
   sol.user_objective = compute_user_objective(lp, sol.objective);
-  f_t perturbation   = 0.0;
-  for (i_t j = 0; j < n; ++j) {
-    perturbation += std::abs(lp.objective[j] - objective[j]);
-  }
-
+  f_t perturbation   = phase2::amount_of_perturbation(lp, objective);
   if (perturbation > 1e-6 && phase == 2) {
     // Try to remove perturbation
     std::vector<f_t> unperturbed_y(m);
@@ -994,6 +2053,8 @@ void prepare_optimality(const lp_problem_t<i_t, f_t>& lp,
         z            = unperturbed_z;
         y            = unperturbed_y;
         perturbation = 0.0;
+      } else {
+        settings.log.printf("Failed to remove perturbation of %.2e.\n", perturbation);
       }
     }
   }
@@ -1015,7 +2076,6 @@ void prepare_optimality(const lp_problem_t<i_t, f_t>& lp,
       settings.log.printf("Primal infeasibility (abs): %.2e\n", primal_infeas);
       settings.log.printf("Dual infeasibility (abs):   %.2e\n", dual_infeas);
       settings.log.printf("Perturbation:               %.2e\n", perturbation);
-      settings.log.printf("Max steepest edge norm:     %.2e\n", max_val);
     } else {
       settings.log.printf("\n");
       settings.log.printf(
@@ -1026,6 +2086,81 @@ void prepare_optimality(const lp_problem_t<i_t, f_t>& lp,
   }
 }
 
+template <typename i_t, typename f_t>
+class phase2_timers_t {
+ public:
+  phase2_timers_t(bool should_time)
+    : record_time(should_time),
+      bfrt_time(0),
+      pricing_time(0),
+      btran_time(0),
+      ftran_time(0),
+      flip_time(0),
+      delta_z_time(0),
+      se_norms_time(0),
+      se_entering_time(0),
+      lu_update_time(0),
+      perturb_time(0),
+      vector_time(0),
+      objective_time(0),
+      update_infeasibility_time(0)
+  {
+  }
+
+  void start_timer()
+  {
+    if (!record_time) { return; }
+    start_time = tic();
+  }
+
+  f_t stop_timer()
+  {
+    if (!record_time) { return 0.0; }
+    return toc(start_time);
+  }
+
+  void print_timers(const simplex_solver_settings_t<i_t, f_t>& settings) const
+  {
+    if (!record_time) { return; }
+    const f_t total_time = bfrt_time + pricing_time + btran_time + ftran_time + flip_time +
+                           delta_z_time + lu_update_time + se_norms_time + se_entering_time +
+                           perturb_time + vector_time + objective_time + update_infeasibility_time;
+    // clang-format off
+    settings.log.printf("BFRT time       %.2fs %4.1f%\n", bfrt_time, 100.0 * bfrt_time / total_time);
+    settings.log.printf("Pricing time    %.2fs %4.1f%\n", pricing_time, 100.0 * pricing_time / total_time);
+    settings.log.printf("BTran time      %.2fs %4.1f%\n", btran_time, 100.0 * btran_time / total_time);
+    settings.log.printf("FTran time      %.2fs %4.1f%\n", ftran_time, 100.0 * ftran_time / total_time);
+    settings.log.printf("Flip time       %.2fs %4.1f%\n", flip_time, 100.0 * flip_time / total_time);
+    settings.log.printf("Delta_z time    %.2fs %4.1f%\n", delta_z_time, 100.0 * delta_z_time / total_time);
+    settings.log.printf("LU update time  %.2fs %4.1f%\n", lu_update_time, 100.0 * lu_update_time / total_time);
+    settings.log.printf("SE norms time   %.2fs %4.1f%\n", se_norms_time, 100.0 * se_norms_time / total_time);
+    settings.log.printf("SE enter time   %.2fs %4.1f%\n", se_entering_time, 100.0 * se_entering_time / total_time);
+    settings.log.printf("Perturb time    %.2fs %4.1f%\n", perturb_time, 100.0 * perturb_time / total_time);
+    settings.log.printf("Vector time     %.2fs %4.1f%\n", vector_time, 100.0 * vector_time / total_time);
+    settings.log.printf("Objective time  %.2fs %4.1f%\n", objective_time, 100.0 * objective_time / total_time);
+    settings.log.printf("Inf update time %.2fs %4.1f%\n", update_infeasibility_time, 100.0 * update_infeasibility_time / total_time);
+    settings.log.printf("Sum             %.2fs\n", total_time);
+    // clang-format on
+  }
+  f_t bfrt_time;
+  f_t pricing_time;
+  f_t btran_time;
+  f_t ftran_time;
+  f_t flip_time;
+  f_t delta_z_time;
+  f_t se_norms_time;
+  f_t se_entering_time;
+  f_t lu_update_time;
+  f_t perturb_time;
+  f_t vector_time;
+  f_t objective_time;
+  f_t update_infeasibility_time;
+
+ private:
+  f_t start_time;
+  bool record_time;
+};
+
 }  // namespace phase2
 
 template <typename i_t, typename f_t>
@@ -1093,7 +2228,7 @@ dual::status_t dual_phase2(i_t phase,
   if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
   assert(q.size() == m);
   reorder_basic_list(q, basic_list);
-  basis_update_t ft(L, U, p);
+  basis_update_mpf_t<i_t, f_t> ft(L, U, p, settings.refactor_frequency);
 
   std::vector<f_t> c_basic(m);
   for (i_t k = 0; k < m; ++k) {
@@ -1105,41 +2240,19 @@ dual::status_t dual_phase2(i_t phase,
   ft.b_transpose_solve(c_basic, y);
   if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
   constexpr bool print_norms = false;
-  if (print_norms) {
+  if constexpr (print_norms) {
     settings.log.printf(
       "|| y || %e || cB || %e\n", vector_norm_inf<i_t, f_t>(y), vector_norm_inf<i_t, f_t>(c_basic));
   }
 
-  // zN = cN - N'*y
-  for (i_t k = 0; k < n - m; k++) {
-    const i_t j = nonbasic_list[k];
-    // z_j <- c_j
-    z[j] = objective[j];
-
-    // z_j <- z_j - A(:, j)'*y
-    const i_t col_start = lp.A.col_start[j];
-    const i_t col_end   = lp.A.col_start[j + 1];
-    f_t dot             = 0.0;
-    for (i_t p = col_start; p < col_end; ++p) {
-      dot += lp.A.x[p] * y[lp.A.i[p]];
-    }
-    z[j] -= dot;
-  }
-  // zB = 0
-  for (i_t k = 0; k < m; ++k) {
-    z[basic_list[k]] = 0.0;
-  }
-  if (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf<i_t, f_t>(z)); }
+  phase2::compute_reduced_costs(objective, lp.A, y, basic_list, nonbasic_list, z);
+  if constexpr (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf<i_t, f_t>(z)); }
 
 #ifdef COMPUTE_DUAL_RESIDUAL
-  // || A'*y + z  - c||_inf
-  std::vector<f_t> dual_res1 = z;
-  for (i_t j = 0; j < n; ++j) {
-    dual_res1[j] -= objective[j];
-  }
-  matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, dual_res1);
+  std::vector<f_t> dual_res1;
+  compute_dual_residual(lp.A, objective, y, z, dual_res1);
   f_t dual_res_norm = vector_norm_inf<i_t, f_t>(dual_res1);
-  if (1 || dual_res_norm > settings.tight_tol) {
+  if (dual_res_norm > settings.tight_tol) {
     settings.log.printf("|| A'*y + z - c || %e\n", dual_res_norm);
   }
   assert(dual_res_norm < 1e-3);
@@ -1148,15 +2261,11 @@ dual::status_t dual_phase2(i_t phase,
   phase2::set_primal_variables_on_bounds(lp, settings, z, vstatus, x);
 
 #ifdef PRINT_VSTATUS_CHANGES
-  i_t num_vstatus_changes = 0;
-  i_t num_z_changes       = 0;
-  for (i_t j = 0; j < n; ++j) {
-    if (vstatus[j] != vstatus_old[j]) { num_vstatus_changes++; }
-    if (std::abs(z[j] - z_old[j]) > 1e-6) { num_z_changes++; }
-  }
-
-  printf("Number of vstatus changes %d\n", num_vstatus_changes);
-  printf("Number of z changes %d\n", num_z_changes);
+  i_t num_vstatus_changes;
+  i_t num_z_changes;
+  phase2::vstatus_changes(vstatus, vstatus_old, z, z_old, num_vstatus_changes, num_z_changes);
+  settings.log.printf("Number of vstatus changes %d\n", num_vstatus_changes);
+  settings.log.printf("Number of z changes %d\n", num_z_changes);
 #endif
 
   const f_t init_dual_inf =
@@ -1171,28 +2280,10 @@ dual::status_t dual_phase2(i_t phase,
     }
   }
 
-  std::vector<f_t> rhs = lp.rhs;
-  // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) *
-  // u(j)
-  for (i_t k = 0; k < n - m; ++k) {
-    const i_t j         = nonbasic_list[k];
-    const i_t col_start = lp.A.col_start[j];
-    const i_t col_end   = lp.A.col_start[j + 1];
-    const f_t xj        = x[j];
-    if (std::abs(xj) < settings.tight_tol * 10) continue;
-    for (i_t p = col_start; p < col_end; ++p) {
-      rhs[lp.A.i[p]] -= xj * lp.A.x[p];
-    }
-  }
+  phase2::compute_primal_variables(
+    ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x);
 
-  std::vector<f_t> xB(m);
-  ft.b_solve(rhs, xB);
   if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
-
-  for (i_t k = 0; k < m; ++k) {
-    const i_t j = basic_list[k];
-    x[j]        = xB[k];
-  }
   if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2<i_t, f_t>(x)); }
 
 #ifdef COMPUTE_PRIMAL_RESIDUAL
@@ -1207,18 +2298,12 @@ dual::status_t dual_phase2(i_t phase,
   if (delta_y_steepest_edge.size() == 0) {
     delta_y_steepest_edge.resize(n);
     if (slack_basis) {
-      for (i_t k = 0; k < m; ++k) {
-        const i_t j              = basic_list[k];
-        delta_y_steepest_edge[j] = 1.0;
-      }
-      for (i_t k = 0; k < n - m; ++k) {
-        const i_t j              = nonbasic_list[k];
-        delta_y_steepest_edge[j] = 1e-4;
-      }
+      phase2::initialize_steepest_edge_norms_from_slack_basis(
+        basic_list, nonbasic_list, delta_y_steepest_edge);
     } else {
       std::fill(delta_y_steepest_edge.begin(), delta_y_steepest_edge.end(), -1);
       if (phase2::initialize_steepest_edge_norms(
-            settings, start_time, basic_list, ft, delta_y_steepest_edge) == -1) {
+            lp, settings, start_time, basic_list, ft, delta_y_steepest_edge) == -1) {
         return dual::status_t::TIME_LIMIT;
       }
     }
@@ -1227,35 +2312,74 @@ dual::status_t dual_phase2(i_t phase,
                         vector_norm2<i_t, f_t>(delta_y_steepest_edge));
   }
 
-  if (phase == 2) { settings.log.printf(" Iter     Objective   Primal Infeas  Perturb  Time\n"); }
+  if (phase == 2) {
+    settings.log.printf(" Iter     Objective           Num Inf.  Sum Inf.     Perturb  Time\n");
+  }
 
   const i_t iter_limit = settings.iteration_limit;
-  std::vector<f_t> delta_y(m);
-  std::vector<f_t> delta_z(n);
-  std::vector<f_t> delta_x(n);
+  std::vector<f_t> delta_y(m, 0.0);
+  std::vector<f_t> delta_z(n, 0.0);
+  std::vector<f_t> delta_x(n, 0.0);
+  std::vector<f_t> delta_x_flip(n, 0.0);
+  std::vector<f_t> atilde(m, 0.0);
+  std::vector<i_t> atilde_mark(m, 0);
+  std::vector<i_t> atilde_index;
+  std::vector<i_t> nonbasic_mark(n);
+  std::vector<i_t> basic_mark(n);
+  std::vector<i_t> delta_z_mark(n, 0);
+  std::vector<i_t> delta_z_indices;
+  std::vector<f_t> v(m, 0.0);
+  std::vector<f_t> squared_infeasibilities;
+  std::vector<i_t> infeasibility_indices;
+
+  delta_z_indices.reserve(n);
+
+  phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark);
+
+  std::vector<uint8_t> bounded_variables(n, 0);
+  phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables);
+
+  f_t primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
+    lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+
+#ifdef CHECK_BASIC_INFEASIBILITIES
+  phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
+#endif
+
+  csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
+  lp.A.transpose(A_transpose);
+
+  f_t obj              = compute_objective(lp, x);
   const i_t start_iter = iter;
+
+  i_t sparse_delta_z = 0;
+  i_t dense_delta_z  = 0;
+  phase2::phase2_timers_t<i_t, f_t> timers(false);
+
   while (iter < iter_limit) {
     // Pricing
-    i_t direction;
-    i_t basic_leaving_index;
-    f_t primal_infeasibility;
-    i_t leaving_index = -1;
+    i_t direction           = 0;
+    i_t basic_leaving_index = -1;
+    i_t leaving_index       = -1;
     f_t max_val;
+    timers.start_timer();
     if (settings.use_steepest_edge_pricing) {
-      leaving_index = phase2::steepest_edge_pricing(lp,
-                                                    settings,
-                                                    x,
-                                                    delta_y_steepest_edge,
-                                                    basic_list,
-                                                    direction,
-                                                    basic_leaving_index,
-                                                    primal_infeasibility,
-                                                    max_val);
+      leaving_index = phase2::steepest_edge_pricing_with_infeasibilities(lp,
+                                                                         settings,
+                                                                         x,
+                                                                         delta_y_steepest_edge,
+                                                                         basic_mark,
+                                                                         squared_infeasibilities,
+                                                                         infeasibility_indices,
+                                                                         direction,
+                                                                         basic_leaving_index,
+                                                                         max_val);
     } else {
       // Max infeasibility pricing
       leaving_index = phase2::phase2_pricing(
         lp, settings, x, basic_list, direction, basic_leaving_index, primal_infeasibility);
     }
+    timers.pricing_time += timers.stop_timer();
     if (leaving_index == -1) {
       phase2::prepare_optimality(lp,
                                  settings,
@@ -1277,18 +2401,18 @@ dual::status_t dual_phase2(i_t phase,
     }
 
     // BTran
-    // TODO: replace with sparse solve.
-    std::vector<f_t> ei(m, 0.0);
-    std::vector<f_t> delta_y(m);
-    ei[basic_leaving_index] = -direction;
     // BT*delta_y = -delta_zB = -sigma*ei
-    ft.b_transpose_solve(ei, delta_y);
+    timers.start_timer();
+    sparse_vector_t<i_t, f_t> delta_y_sparse(m, 0);
+    sparse_vector_t<i_t, f_t> UTsol_sparse(m, 0);
+    phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse);
+    timers.btran_time += timers.stop_timer();
 
-    const f_t steepest_edge_norm_check = vector_norm2_squared<i_t, f_t>(delta_y);
+    const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared();
     if (delta_y_steepest_edge[leaving_index] <
         settings.steepest_edge_ratio * steepest_edge_norm_check) {
       constexpr bool verbose = false;
-      if (verbose) {
+      if constexpr (verbose) {
         settings.log.printf(
           "iteration restart due to steepest edge. Leaving %d. Actual %.2e "
           "from update %.2e\n",
@@ -1300,43 +2424,48 @@ dual::status_t dual_phase2(i_t phase,
       continue;
     }
 
-#ifdef COMPUTE_BTRANSPOSE_RESIDUAL
-    {
-      std::vector<f_t> res(m);
-      b_transpose_multiply(lp, basic_list, delta_y, res);
-      for (Int k = 0; k < m; k++) {
-        const f_t err = std::abs(res[k] - ei[k]);
-        if (err > 1e-4) { settings.log.printf("BT err %d %e\n", k, err); }
-        assert(err < 1e-4);
-      }
+    timers.start_timer();
+    i_t delta_y_nz0      = 0;
+    const i_t nz_delta_y = delta_y_sparse.i.size();
+    for (i_t k = 0; k < nz_delta_y; k++) {
+      if (std::abs(delta_y_sparse.x[k]) > 1e-12) { delta_y_nz0++; }
     }
-#endif
-
-    // delta_zB = sigma*ei
-    for (i_t k = 0; k < m; k++) {
-      const i_t j = basic_list[k];
-      delta_z[j]  = 0;
-    }
-    delta_z[leaving_index] = direction;
-    // delta_zN = -N'*delta_y
-    for (i_t k = 0; k < n - m; k++) {
-      const i_t j = nonbasic_list[k];
-      // z_j <- -A(:, j)'*delta_y
-      const i_t col_start = lp.A.col_start[j];
-      const i_t col_end   = lp.A.col_start[j + 1];
-      f_t dot             = 0.0;
-      for (i_t p = col_start; p < col_end; ++p) {
-        dot += lp.A.x[p] * delta_y[lp.A.i[p]];
-      }
-      delta_z[j] = -dot;
+    const f_t delta_y_nz_percentage = delta_y_nz0 / static_cast<f_t>(m) * 100.0;
+    const bool use_transpose        = delta_y_nz_percentage <= 30.0;
+    if (use_transpose) {
+      sparse_delta_z++;
+      phase2::compute_delta_z(A_transpose,
+                              delta_y_sparse,
+                              leaving_index,
+                              direction,
+                              nonbasic_mark,
+                              delta_z_mark,
+                              delta_z_indices,
+                              delta_z);
+    } else {
+      dense_delta_z++;
+      // delta_zB = sigma*ei
+      delta_y_sparse.to_dense(delta_y);
+      phase2::compute_reduced_cost_update(lp,
+                                          basic_list,
+                                          nonbasic_list,
+                                          delta_y,
+                                          leaving_index,
+                                          direction,
+                                          delta_z_mark,
+                                          delta_z_indices,
+                                          delta_z);
     }
+    timers.delta_z_time += timers.stop_timer();
 
 #ifdef COMPUTE_DUAL_RESIDUAL
-    std::vector<f_t> dual_residual = delta_z;
+    std::vector<f_t> dual_residual;
+    std::vector<f_t> zeros(n, 0.0);
+    phase2::compute_dual_residual(lp.A, zeros, delta_y, delta_z, dual_residual);
     // || A'*delta_y + delta_z ||_inf
-    matrix_transpose_vector_multiply(lp.A, 1.0, delta_y, 1.0, dual_residual);
     f_t dual_residual_norm = vector_norm_inf<i_t, f_t>(dual_residual);
-    settings.log.printf("|| A'*dy - dz || %e\n", dual_residual_norm);
+    settings.log.printf(
+      "|| A'*dy - dz || %e use transpose %d\n", dual_residual_norm, use_transpose);
 #endif
 
     // Ratio test
@@ -1356,18 +2485,25 @@ dual::status_t dual_phase2(i_t phase,
                                                    step_length,
                                                    nonbasic_entering_index);
     } else if (bound_flip_ratio) {
-      entering_index = phase2::bound_flipping_ratio_test(lp,
-                                                         settings,
-                                                         start_time,
-                                                         vstatus,
-                                                         nonbasic_list,
-                                                         x,
-                                                         z,
-                                                         delta_z,
-                                                         direction,
-                                                         leaving_index,
-                                                         step_length,
-                                                         nonbasic_entering_index);
+      timers.start_timer();
+      f_t slope = direction == 1 ? (lp.lower[leaving_index] - x[leaving_index])
+                                 : (x[leaving_index] - lp.upper[leaving_index]);
+      bound_flipping_ratio_test_t<i_t, f_t> bfrt(settings,
+                                                 start_time,
+                                                 m,
+                                                 n,
+                                                 slope,
+                                                 lp.lower,
+                                                 lp.upper,
+                                                 bounded_variables,
+                                                 vstatus,
+                                                 nonbasic_list,
+                                                 z,
+                                                 delta_z,
+                                                 delta_z_indices,
+                                                 nonbasic_mark);
+      entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index);
+      timers.bfrt_time += timers.stop_timer();
     } else {
       entering_index = phase2::phase2_ratio_test(
         lp, settings, vstatus, nonbasic_list, z, delta_z, step_length, nonbasic_entering_index);
@@ -1375,33 +2511,135 @@ dual::status_t dual_phase2(i_t phase,
     if (entering_index == -2) { return dual::status_t::TIME_LIMIT; }
     if (entering_index == -3) { return dual::status_t::CONCURRENT_LIMIT; }
     if (entering_index == -1) {
-      if (primal_infeasibility > settings.primal_tol &&
-          max_val < settings.steepest_edge_primal_tol) {
-        // We could be done
-        settings.log.printf("Exiting due to small primal infeasibility se %e\n", max_val);
-        phase2::prepare_optimality(lp,
-                                   settings,
-                                   ft,
-                                   objective,
-                                   basic_list,
-                                   nonbasic_list,
-                                   vstatus,
-                                   phase,
-                                   start_time,
-                                   max_val,
-                                   iter,
-                                   x,
-                                   y,
-                                   z,
-                                   sol);
-        status = dual::status_t::OPTIMAL;
-        break;
+      settings.log.printf("No entering variable found. Iter %d\n", iter);
+      settings.log.printf("Scaled infeasibility %e\n", max_val);
+      f_t perturbation = phase2::amount_of_perturbation(lp, objective);
+
+      if (perturbation > 0.0 && phase == 2) {
+        // Try to remove perturbation
+        std::vector<f_t> unperturbed_y(m);
+        std::vector<f_t> unperturbed_z(n);
+        phase2::compute_dual_solution_from_basis(
+          lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z);
+        {
+          const f_t dual_infeas = phase2::dual_infeasibility(
+            lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol);
+          settings.log.printf("Dual infeasibility after removing perturbation %e\n", dual_infeas);
+          if (dual_infeas <= settings.dual_tol) {
+            settings.log.printf("Removed perturbation of %.2e.\n", perturbation);
+            z            = unperturbed_z;
+            y            = unperturbed_y;
+            perturbation = 0.0;
+
+            std::vector<f_t> unperturbed_x(n);
+            phase2::compute_primal_solution_from_basis(
+              lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
+            x                    = unperturbed_x;
+            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
+              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+            settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility);
+
+            objective = lp.objective;
+            // Need to reset the objective value, since we have recomputed x
+            obj = phase2::compute_perturbed_objective(objective, x);
+            if (dual_infeas <= settings.dual_tol && primal_infeasibility <= settings.primal_tol) {
+              phase2::prepare_optimality(lp,
+                                         settings,
+                                         ft,
+                                         objective,
+                                         basic_list,
+                                         nonbasic_list,
+                                         vstatus,
+                                         phase,
+                                         start_time,
+                                         max_val,
+                                         iter,
+                                         x,
+                                         y,
+                                         z,
+                                         sol);
+              status = dual::status_t::OPTIMAL;
+              break;
+            }
+            settings.log.printf(
+              "Continuing with perturbation removed and steepest edge norms reset\n");
+            // Clear delta_z before restarting the iteration
+            phase2::clear_delta_z(
+              entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z);
+            continue;
+          } else {
+            std::vector<f_t> unperturbed_x(n);
+            phase2::compute_primal_solution_from_basis(
+              lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
+            x                    = unperturbed_x;
+            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
+              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+
+            const f_t orig_dual_infeas = phase2::dual_infeasibility(
+              lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
+
+            if (primal_infeasibility <= settings.primal_tol &&
+                orig_dual_infeas <= settings.dual_tol) {
+              phase2::prepare_optimality(lp,
+                                         settings,
+                                         ft,
+                                         objective,
+                                         basic_list,
+                                         nonbasic_list,
+                                         vstatus,
+                                         phase,
+                                         start_time,
+                                         max_val,
+                                         iter,
+                                         x,
+                                         y,
+                                         z,
+                                         sol);
+              status = dual::status_t::OPTIMAL;
+              break;
+            }
+            settings.log.printf("Failed to remove perturbation of %.2e.\n", perturbation);
+          }
+        }
+      }
+
+      if (perturbation == 0.0 && phase == 2) {
+        constexpr bool use_farkas = false;
+        if constexpr (use_farkas) {
+          std::vector<f_t> farkas_y;
+          std::vector<f_t> farkas_zl;
+          std::vector<f_t> farkas_zu;
+          f_t farkas_constant;
+          std::vector<f_t> my_delta_y;
+          delta_y_sparse.to_dense(my_delta_y);
+
+          // TODO(CMM): Do I use the perturbed or unperturbed objective?
+          const f_t obj_val = phase2::compute_perturbed_objective(objective, x);
+          phase2::compute_farkas_certificate(lp,
+                                             settings,
+                                             vstatus,
+                                             x,
+                                             y,
+                                             z,
+                                             my_delta_y,
+                                             delta_z,
+                                             direction,
+                                             leaving_index,
+                                             obj_val,
+                                             farkas_y,
+                                             farkas_zl,
+                                             farkas_zu,
+                                             farkas_constant);
+        }
       }
+
       const f_t dual_infeas =
         phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
       settings.log.printf("Dual infeasibility %e\n", dual_infeas);
       const f_t primal_inf = phase2::primal_infeasibility(lp, settings, vstatus, x);
       settings.log.printf("Primal infeasibility %e\n", primal_inf);
+      settings.log.printf("Updates %d\n", ft.num_updates());
+      settings.log.printf("Steepest edge %e\n", max_val);
       if (dual_infeas > settings.dual_tol) {
         settings.log.printf(
           "Numerical issues encountered. No entering variable found with large infeasibility.\n");
@@ -1410,121 +2648,111 @@ dual::status_t dual_phase2(i_t phase,
       return dual::status_t::DUAL_UNBOUNDED;
     }
 
+    timers.start_timer();
     // Update dual variables
     // y <- y + steplength * delta_y
-    for (i_t i = 0; i < m; ++i) {
-      y[i] += step_length * delta_y[i];
-    }
-
     // z <- z + steplength * delta_z
-    for (i_t j = 0; j < n; ++j) {
-      z[j] += step_length * delta_z[j];
-    }
+    phase2::update_dual_variables(
+      delta_y_sparse, delta_z_indices, delta_z, step_length, leaving_index, y, z);
+    timers.vector_time += timers.stop_timer();
 
 #ifdef COMPUTE_DUAL_RESIDUAL
-    dual_res1 = z;
-    for (i_t j = 0; j < n; ++j) {
-      dual_res1[j] -= objective[j];
-    }
-    matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, dual_res1);
+    phase2::compute_dual_residual(lp.A, objective, y, z, dual_res1);
     f_t dual_res_norm = vector_norm_inf<i_t, f_t>(dual_res1);
     if (dual_res_norm > settings.dual_tol) {
       settings.log.printf("|| A'*y + z - c || %e steplength %e\n", dual_res_norm, step_length);
     }
 #endif
 
+    timers.start_timer();
     // Update primal variable
-    std::vector<f_t> atilde(m);
-    std::vector<f_t> delta_x_flip(n);
-    phase2::flip_bounds(
-      lp, settings, objective, z, nonbasic_list, entering_index, vstatus, delta_x_flip, atilde);
-
-    // B*delta_xB_0 = atilde
-    std::vector<f_t> delta_xB_0(m);
-    ft.b_solve(atilde, delta_xB_0);
-    for (i_t k = 0; k < m; ++k) {
-      const i_t j = basic_list[k];
-      x[j] += delta_xB_0[k];
-    }
-    for (i_t k = 0; k < n - m; ++k) {
-      const i_t j = nonbasic_list[k];
-      x[j] += delta_x_flip[j];
-    }
-
-    f_t delta_x_leaving;
-    if (direction == 1) {
-      delta_x_leaving = lp.lower[leaving_index] - x[leaving_index];
-    } else {
-      delta_x_leaving = lp.upper[leaving_index] - x[leaving_index];
-    }
-    // B*w = -A(:, entering)
-    std::vector<f_t> scaled_delta_xB(m);
-    std::fill(rhs.begin(), rhs.end(), 0.0);
-    lp.A.load_a_column(entering_index, rhs);
-    std::vector<f_t> utilde(m);
-    ft.b_solve(rhs, scaled_delta_xB, utilde);
-    for (i_t i = 0; i < m; ++i) {
-      scaled_delta_xB[i] *= -1.0;
+    const i_t num_flipped = phase2::flip_bounds(lp,
+                                                settings,
+                                                bounded_variables,
+                                                objective,
+                                                z,
+                                                delta_z_indices,
+                                                nonbasic_list,
+                                                entering_index,
+                                                vstatus,
+                                                delta_x_flip,
+                                                atilde_mark,
+                                                atilde,
+                                                atilde_index);
+
+    timers.flip_time += timers.stop_timer();
+
+    sparse_vector_t<i_t, f_t> delta_xB_0_sparse(m, 0);
+    if (num_flipped > 0) {
+      timers.start_timer();
+      phase2::adjust_for_flips(ft,
+                               basic_list,
+                               delta_z_indices,
+                               atilde_index,
+                               atilde,
+                               atilde_mark,
+                               delta_xB_0_sparse,
+                               delta_x_flip,
+                               x);
+      timers.ftran_time += timers.stop_timer();
     }
 
-#ifdef COMPUTE_BSOLVE_RESIDUAL
-    {
-      std::vector<f_t> residual_B(m);
-      b_multiply(lp, basic_list, scaled_delta_xB, residual_B);
-      f_t err_max = 0;
-      for (Int k = 0; k < m; ++k) {
-        const f_t err = std::abs(rhs[k] - residual_B[k]);
-        if (err >= 1e-5) {
-          settings.log.printf(
-            "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]);
-        }
-        err_max = std::max(err_max, err);
-      }
-      assert(err_max < 1e-4);
+    timers.start_timer();
+    sparse_vector_t<i_t, f_t> utilde_sparse(m, 0);
+    sparse_vector_t<i_t, f_t> scaled_delta_xB_sparse(m, 0);
+    sparse_vector_t<i_t, f_t> rhs_sparse(lp.A, entering_index);
+    if (phase2::compute_delta_x(lp,
+                                ft,
+                                entering_index,
+                                leaving_index,
+                                basic_leaving_index,
+                                direction,
+                                basic_list,
+                                delta_x_flip,
+                                rhs_sparse,
+                                x,
+                                utilde_sparse,
+                                scaled_delta_xB_sparse,
+                                delta_x) == -1) {
+      settings.log.printf("Failed to compute delta_x. Iter %d\n", iter);
+      return dual::status_t::NUMERICAL;
     }
-#endif
 
-    f_t primal_step_length = delta_x_leaving / scaled_delta_xB[basic_leaving_index];
-    for (i_t k = 0; k < m; ++k) {
-      const i_t j = basic_list[k];
-      delta_x[j]  = primal_step_length * scaled_delta_xB[k];
-    }
-    delta_x[leaving_index] = delta_x_leaving;
-    for (i_t k = 0; k < n - m; k++) {
-      const i_t j = nonbasic_list[k];
-      delta_x[j]  = 0.0;
-    }
-    delta_x[entering_index] = primal_step_length;
+    timers.ftran_time += timers.stop_timer();
 
-#ifdef COMPUTE_PRIMAL_STEP_RESIDUAL
+#ifdef CHECK_PRIMAL_STEP
+    std::vector<f_t> residual(m);
     matrix_vector_multiply(lp.A, 1.0, delta_x, 1.0, residual);
-    f_t primal_step_err = vector_norm_inf(residual);
+    f_t primal_step_err = vector_norm_inf<i_t, f_t>(residual);
     if (primal_step_err > 1e-4) { settings.log.printf("|| A * dx || %e\n", primal_step_err); }
 #endif
 
+    timers.start_timer();
     const i_t steepest_edge_status = phase2::update_steepest_edge_norms(settings,
                                                                         basic_list,
                                                                         ft,
                                                                         direction,
-                                                                        delta_y,
-                                                                        scaled_delta_xB,
+                                                                        delta_y_sparse,
+                                                                        steepest_edge_norm_check,
+                                                                        scaled_delta_xB_sparse,
                                                                         basic_leaving_index,
                                                                         entering_index,
+                                                                        v,
                                                                         delta_y_steepest_edge);
 #ifdef STEEPEST_EDGE_DEBUG
     if (steepest_edge_status == -1) {
       settings.log.printf("Num updates %d\n", ft.num_updates());
-      settings.log.printf(" Primal step length %e\n", primal_step_length);
-      settings.log.printf("|| delta_xB || %e\n", vector_norm_inf(scaled_delta_xB));
       settings.log.printf("|| rhs || %e\n", vector_norm_inf(rhs));
     }
 #endif
     assert(steepest_edge_status == 0);
+    timers.se_norms_time += timers.stop_timer();
 
+    timers.start_timer();
     // x <- x + delta_x
-    for (i_t j = 0; j < n; ++j) {
-      x[j] += delta_x[j];
-    }
+    phase2::update_primal_variables(scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x);
+    timers.vector_time += timers.stop_timer();
+
 #ifdef COMPUTE_PRIMAL_RESIDUAL
     residual = lp.rhs;
     matrix_vector_multiply(lp.A, 1.0, x, -1.0, residual);
@@ -1534,10 +2762,68 @@ dual::status_t dual_phase2(i_t phase,
     }
 #endif
 
+    timers.start_timer();
+    // TODO(CMM): Do I also need to update the objective due to the bound flips?
+    // TODO(CMM): I'm using the unperturbed objective here, should this be the perturbed objective?
+    phase2::update_objective(
+      basic_list, scaled_delta_xB_sparse.i, lp.objective, delta_x, entering_index, obj);
+    timers.objective_time += timers.stop_timer();
+
+    timers.start_timer();
+    // Update primal infeasibilities due to changes in basic variables
+    // from flipping bounds
+#ifdef CHECK_BASIC_INFEASIBILITIES
+    phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 2);
+#endif
+    phase2::update_primal_infeasibilities(lp,
+                                          settings,
+                                          basic_list,
+                                          x,
+                                          entering_index,
+                                          leaving_index,
+                                          delta_xB_0_sparse.i,
+                                          squared_infeasibilities,
+                                          infeasibility_indices,
+                                          primal_infeasibility);
+    // Update primal infeasibilities due to changes in basic variables
+    // from the leaving and entering variables
+    phase2::update_primal_infeasibilities(lp,
+                                          settings,
+                                          basic_list,
+                                          x,
+                                          entering_index,
+                                          leaving_index,
+                                          scaled_delta_xB_sparse.i,
+                                          squared_infeasibilities,
+                                          infeasibility_indices,
+                                          primal_infeasibility);
+    // Update the entering variable
+    phase2::update_single_primal_infeasibility(lp.lower,
+                                               lp.upper,
+                                               x,
+                                               settings.primal_tol,
+                                               squared_infeasibilities,
+                                               infeasibility_indices,
+                                               entering_index,
+                                               primal_infeasibility);
+
+    phase2::clean_up_infeasibilities(squared_infeasibilities, infeasibility_indices);
+
+#if CHECK_PRIMAL_INFEASIBILITIES
+    phase2::check_primal_infeasibilities(
+      lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+#endif
+    timers.update_infeasibility_time += timers.stop_timer();
+
+    // Clear delta_x
+    phase2::clear_delta_x(basic_list, entering_index, scaled_delta_xB_sparse, delta_x);
+
+    timers.start_timer();
     f_t sum_perturb = 0.0;
-    phase2::compute_perturbation(lp, settings, z, objective, sum_perturb);
+    phase2::compute_perturbation(lp, settings, delta_z_indices, z, objective, sum_perturb);
+    timers.perturb_time += timers.stop_timer();
 
-    // Update basis
+    // Update basis information
     vstatus[entering_index] = variable_status_t::BASIC;
     if (lp.lower[leaving_index] != lp.upper[leaving_index]) {
       vstatus[leaving_index] = static_cast<variable_status_t>(-direction);
@@ -1546,66 +2832,89 @@ dual::status_t dual_phase2(i_t phase,
     }
     basic_list[basic_leaving_index]        = entering_index;
     nonbasic_list[nonbasic_entering_index] = leaving_index;
+    nonbasic_mark[entering_index]          = -1;
+    nonbasic_mark[leaving_index]           = nonbasic_entering_index;
+    basic_mark[leaving_index]              = -1;
+    basic_mark[entering_index]             = basic_leaving_index;
+
+#ifdef CHECK_BASIC_INFEASIBILITIES
+    phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 5);
+#endif
 
-    // Refactor or Update
+    timers.start_timer();
+    // Refactor or update the basis factorization
     bool should_refactor = ft.num_updates() > settings.refactor_frequency;
     if (!should_refactor) {
-      i_t recommend_refactor = ft.update(utilde, basic_leaving_index);
-#ifdef CHECK_FT
-      {
-        csc_matrix_t Btest(m, m, 1);
-        ft.multiply_lu(Btest);
-        {
-          csc_matrix_t B(m, m, 1);
-          form_b(lp, basic_list, B);
-          csc_matrix_t Diff(m, m, 1);
-          add(Btest, B, 1.0, -1.0, Diff);
-          const f_t err = Diff.norm1();
-          if (err > settings.primal_tol) {
-            settings.log.printf("|| B - L*U || %e\n", Diff.norm1());
-          }
-          assert(err < settings.primal_tol);
-        }
-      }
+      i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index);
+#ifdef CHECK_UPDATE
+      phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index);
 #endif
       should_refactor = recommend_refactor == 1;
     }
 
+#ifdef CHECK_BASIC_INFEASIBILITIES
+    phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6);
+#endif
     if (should_refactor) {
       if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) ==
           -1) {
+        if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
         basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
-        if (factorize_basis(
-              lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
-          return dual::status_t::NUMERICAL;
+        i_t count = 0;
+        while (factorize_basis(
+                 lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
+          settings.log.printf("Failed to repair basis. %d deficient columns.\n",
+                              static_cast<int>(deficient.size()));
+          if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
+          settings.threshold_partial_pivoting_tol = 1.0;
+          count++;
+          if (count > 100) { return dual::status_t::NUMERICAL; }
+          basis_repair(
+            lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
         }
       }
       reorder_basic_list(q, basic_list);
       ft.reset(L, U, p);
+      phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark);
+      phase2::compute_initial_primal_infeasibilities(
+        lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
     }
+#ifdef CHECK_BASIC_INFEASIBILITIES
+    phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7);
+#endif
+    timers.lu_update_time += timers.stop_timer();
 
+    timers.start_timer();
     phase2::compute_steepest_edge_norm_entering(
       settings, m, ft, basic_leaving_index, entering_index, delta_y_steepest_edge);
+    timers.se_entering_time += timers.stop_timer();
 
 #ifdef STEEPEST_EDGE_DEBUG
     if (iter < 100 || iter % 100 == 0))
-        {
-            phase2::check_steepest_edge_norms(settings, basic_list, ft, delta_y_steepest_edge);
-        }
+    {
+      phase2::check_steepest_edge_norms(settings, basic_list, ft, delta_y_steepest_edge);
+    }
+#endif
+
+#ifdef CHECK_BASIS_MARK
+    phase2::check_basis_mark(settings, basic_list, nonbasic_list, basic_mark, nonbasic_mark);
 #endif
 
     iter++;
 
-    const f_t obj = compute_objective(lp, x);
-    f_t now       = toc(start_time);
+    // Clear delta_z
+    phase2::clear_delta_z(entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z);
+
+    f_t now = toc(start_time);
     if ((iter - start_iter) < settings.first_iteration_log ||
         (iter % settings.iteration_log_frequency) == 0) {
       if (phase == 1 && iter == 1) {
-        settings.log.printf(" Iter     Objective   Primal Infeas  Perturb  Time\n");
+        settings.log.printf(" Iter     Objective           Num Inf.  Sum Inf.     Perturb  Time\n");
       }
-      settings.log.printf("%5d %+.8e %.8e %.2e %.2f\n",
+      settings.log.printf("%5d %+.16e %7d %.8e %.2e %.2f\n",
                           iter,
                           compute_user_objective(lp, obj),
+                          infeasibility_indices.size(),
                           primal_infeasibility,
                           sum_perturb,
                           now);
@@ -1624,6 +2933,20 @@ dual::status_t dual_phase2(i_t phase,
     }
   }
   if (iter >= iter_limit) { status = dual::status_t::ITERATION_LIMIT; }
+
+  if (phase == 2) {
+    timers.print_timers(settings);
+    constexpr bool print_stats = false;
+    if constexpr (print_stats) {
+      settings.log.printf("Sparse delta_z %8d %8.2f%\n",
+                          sparse_delta_z,
+                          100.0 * sparse_delta_z / (sparse_delta_z + dense_delta_z));
+      settings.log.printf("Dense delta_z  %8d %8.2f%\n",
+                          dense_delta_z,
+                          100.0 * dense_delta_z / (sparse_delta_z + dense_delta_z));
+      ft.print_stats();
+    }
+  }
   return status;
 }
 
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index d94fd0f6c..68043e06a 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -23,6 +23,128 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
+template <typename i_t, typename f_t>
+void bound_strengthening(const std::vector<char>& row_sense,
+                         const simplex_solver_settings_t<i_t, f_t>& settings,
+                         lp_problem_t<i_t, f_t>& problem)
+{
+  const i_t m = problem.num_rows;
+  const i_t n = problem.num_cols;
+
+  std::vector<f_t> constraint_lower(m);
+  std::vector<i_t> num_lower_infinity(m);
+  std::vector<i_t> num_upper_infinity(m);
+
+  csc_matrix_t<i_t, f_t> Arow(1, 1, 1);
+  problem.A.transpose(Arow);
+
+  std::vector<i_t> less_rows;
+  less_rows.reserve(m);
+
+  for (i_t i = 0; i < m; ++i) {
+    if (row_sense[i] == 'L') { less_rows.push_back(i); }
+  }
+
+  std::vector<f_t> lower = problem.lower;
+  std::vector<f_t> upper = problem.upper;
+
+  std::vector<i_t> updated_variables_list;
+  updated_variables_list.reserve(n);
+  std::vector<i_t> updated_variables_mark(n, 0);
+
+  i_t iter                         = 0;
+  const i_t iter_limit             = 10;
+  i_t total_strengthened_variables = 0;
+  settings.log.printf("Less equal rows %d\n", less_rows.size());
+  while (iter < iter_limit && less_rows.size() > 0) {
+    // Derive bounds on the constraints
+    settings.log.printf("Running bound strengthening on %d rows\n",
+                        static_cast<i_t>(less_rows.size()));
+    for (i_t i : less_rows) {
+      const i_t row_start   = Arow.col_start[i];
+      const i_t row_end     = Arow.col_start[i + 1];
+      num_lower_infinity[i] = 0;
+      num_upper_infinity[i] = 0;
+
+      f_t lower_limit = 0.0;
+      for (i_t p = row_start; p < row_end; ++p) {
+        const i_t j    = Arow.i[p];
+        const f_t a_ij = Arow.x[p];
+        if (a_ij > 0) {
+          lower_limit += a_ij * lower[j];
+        } else if (a_ij < 0) {
+          lower_limit += a_ij * upper[j];
+        }
+        if (lower[j] == -inf && a_ij > 0) {
+          num_lower_infinity[i]++;
+          lower_limit = -inf;
+        }
+        if (upper[j] == inf && a_ij < 0) {
+          num_lower_infinity[i]++;
+          lower_limit = -inf;
+        }
+      }
+      constraint_lower[i] = lower_limit;
+    }
+
+    // Use the constraint bounds to derive new bounds on the variables
+    for (i_t i : less_rows) {
+      if (std::isfinite(constraint_lower[i]) && num_lower_infinity[i] == 0) {
+        const i_t row_start = Arow.col_start[i];
+        const i_t row_end   = Arow.col_start[i + 1];
+        for (i_t p = row_start; p < row_end; ++p) {
+          const i_t k    = Arow.i[p];
+          const f_t a_ik = Arow.x[p];
+          if (a_ik > 0) {
+            const f_t new_upper = lower[k] + (problem.rhs[i] - constraint_lower[i]) / a_ik;
+            if (new_upper < upper[k]) {
+              upper[k] = new_upper;
+              if (lower[k] > upper[k]) {
+                settings.log.printf(
+                  "\t INFEASIBLE!!!!!!!!!!!!!!!!! constraint_lower %e lower %e rhs %e\n",
+                  constraint_lower[i],
+                  lower[k],
+                  problem.rhs[i]);
+              }
+              if (!updated_variables_mark[k]) { updated_variables_list.push_back(k); }
+            }
+          } else if (a_ik < 0) {
+            const f_t new_lower = upper[k] + (problem.rhs[i] - constraint_lower[i]) / a_ik;
+            if (new_lower > lower[k]) {
+              lower[k] = new_lower;
+              if (lower[k] > upper[k]) {
+                settings.log.printf("\t INFEASIBLE !!!!!!!!!!!!!!!!!!1\n");
+              }
+              if (!updated_variables_mark[k]) { updated_variables_list.push_back(k); }
+            }
+          }
+        }
+      }
+    }
+    less_rows.clear();
+
+    // Update the bounds on the constraints
+    settings.log.printf("Round %d: Strengthend %d variables\n",
+                        iter,
+                        static_cast<i_t>(updated_variables_list.size()));
+    total_strengthened_variables += updated_variables_list.size();
+    for (i_t j : updated_variables_list) {
+      updated_variables_mark[j] = 0;
+      const i_t col_start       = problem.A.col_start[j];
+      const i_t col_end         = problem.A.col_start[j + 1];
+      for (i_t p = col_start; p < col_end; ++p) {
+        const i_t i = problem.A.i[p];
+        less_rows.push_back(i);
+      }
+    }
+    updated_variables_list.clear();
+    iter++;
+  }
+  settings.log.printf("Total strengthened variables %d\n", total_strengthened_variables);
+  problem.lower = lower;
+  problem.upper = upper;
+}
+
 template <typename i_t, typename f_t>
 i_t remove_empty_cols(lp_problem_t<i_t, f_t>& problem,
                       i_t& num_empty_cols,
@@ -500,6 +622,7 @@ i_t add_artifical_variables(lp_problem_t<i_t, f_t>& problem,
 
 template <typename i_t, typename f_t>
 void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
+                          const simplex_solver_settings_t<i_t, f_t>& settings,
                           lp_problem_t<i_t, f_t>& problem,
                           std::vector<i_t>& new_slacks)
 {
@@ -559,6 +682,14 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
     convert_greater_to_less(user_problem, row_sense, problem, greater_rows, less_rows);
   }
 
+  // At this point the problem representation is in the form: A*x {<=, =} b
+  // This is the time to run bound strengthening
+  constexpr bool run_bound_strengthening = false;
+  if constexpr (run_bound_strengthening) {
+    settings.log.printf("Running bound strengthening\n");
+    bound_strengthening(row_sense, settings, problem);
+  }
+
   // The original problem may have a variable without a lower bound
   // but a finite upper bound
   // -inf < x_j <= u_j
@@ -669,7 +800,8 @@ void convert_user_lp_with_guess(const user_problem_t<i_t, f_t>& user_problem,
                                 lp_solution_t<i_t, f_t>& converted_solution)
 {
   std::vector<i_t> new_slacks;
-  convert_user_problem(user_problem, problem, new_slacks);
+  simplex_solver_settings_t<i_t, f_t> settings;
+  convert_user_problem(user_problem, settings, problem, new_slacks);
   crush_primal_solution_with_slack(
     user_problem, problem, initial_solution.x, initial_slack, new_slacks, converted_solution.x);
   crush_dual_solution(user_problem,
@@ -900,9 +1032,11 @@ void uncrush_solution(const presolve_info_t<i_t, f_t>& presolve_info,
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
 
-template void convert_user_problem<int, double>(const user_problem_t<int, double>& user_problem,
-                                                lp_problem_t<int, double>& problem,
-                                                std::vector<int>& new_slacks);
+template void convert_user_problem<int, double>(
+  const user_problem_t<int, double>& user_problem,
+  const simplex_solver_settings_t<int, double>& settings,
+  lp_problem_t<int, double>& problem,
+  std::vector<int>& new_slacks);
 
 template void convert_user_lp_with_guess<int, double>(
   const user_problem_t<int, double>& user_problem,
diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp
index 947c637cb..7a307e6f7 100644
--- a/cpp/src/dual_simplex/presolve.hpp
+++ b/cpp/src/dual_simplex/presolve.hpp
@@ -63,6 +63,7 @@ struct presolve_info_t {
 
 template <typename i_t, typename f_t>
 void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
+                          const simplex_solver_settings_t<i_t, f_t>& settings,
                           lp_problem_t<i_t, f_t>& problem,
                           std::vector<i_t>& new_slacks);
 
diff --git a/cpp/src/dual_simplex/random.hpp b/cpp/src/dual_simplex/random.hpp
index e1ad01fef..dfc60dbd5 100644
--- a/cpp/src/dual_simplex/random.hpp
+++ b/cpp/src/dual_simplex/random.hpp
@@ -21,7 +21,7 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
-template <typename i_t>
+template <typename i_t, typename f_t>
 class random_t {
  public:
   random_t(i_t seed) : gen(seed) {}
@@ -34,6 +34,12 @@ class random_t {
     return distrib(gen);
   }
 
+  f_t random()
+  {
+    std::uniform_real_distribution<> distrib(0.0, 1.0);
+    return distrib(gen);
+  }
+
  private:
   std::mt19937 gen;
 };
diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 57eb9b01d..7fe276ec6 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -571,12 +571,12 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
     // Find pivot that satisfies
     // abs(pivot) >= abstol,
     // abs(pivot) >= threshold_tol * max abs[pivot column]
-    i_t pivot_i                 = -1;
-    i_t pivot_j                 = -1;
-    i_t pivot_p                 = kNone;
-    constexpr f_t pivot_tol     = 1e-11;
-    constexpr f_t drop_tol      = 1e-13;
-    constexpr f_t threshold_tol = 1.0 / 10.0;
+    i_t pivot_i             = -1;
+    i_t pivot_j             = -1;
+    i_t pivot_p             = kNone;
+    constexpr f_t pivot_tol = 1e-11;
+    const f_t drop_tol      = tol == 1.0 ? 0.0 : 1e-13;
+    const f_t threshold_tol = tol;
     markowitz_search(Cdegree,
                      Rdegree,
                      col_count,
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index df5e4e1d0..a51ed19bc 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -46,6 +46,8 @@ struct simplex_solver_settings_t {
       cut_off(std::numeric_limits<f_t>::infinity()),
       steepest_edge_ratio(0.5),
       steepest_edge_primal_tol(1e-9),
+      hypersparse_threshold(0.05),
+      threshold_partial_pivoting_tol(1.0 / 10.0),
       use_steepest_edge_pricing(true),
       use_harris_ratio(false),
       use_bound_flip_ratio(true),
@@ -86,7 +88,9 @@ struct simplex_solver_settings_t {
   f_t cut_off;               // If the dual objective is greater than the cutoff we stop
   f_t
     steepest_edge_ratio;  // the ratio of computed steepest edge mismatch from updated steepest edge
-  f_t steepest_edge_primal_tol;    // Primal tolerance divided by steepest edge norm
+  f_t steepest_edge_primal_tol;  // Primal tolerance divided by steepest edge norm
+  f_t hypersparse_threshold;
+  mutable f_t threshold_partial_pivoting_tol;
   bool use_steepest_edge_pricing;  // true if using steepest edge pricing, false if using max
                                    // infeasibility pricing
   bool use_harris_ratio;           // true if using the harris ratio test
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
index 464bd7047..e665bae97 100644
--- a/cpp/src/dual_simplex/solve.cpp
+++ b/cpp/src/dual_simplex/solve.cpp
@@ -244,7 +244,7 @@ lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
   f_t start_time = tic();
   lp_problem_t<i_t, f_t> original_lp(1, 1, 1);
   std::vector<i_t> new_slacks;
-  convert_user_problem(user_problem, original_lp, new_slacks);
+  convert_user_problem(user_problem, settings, original_lp, new_slacks);
   solution.resize(user_problem.num_rows, user_problem.num_cols);
   lp_solution_t<i_t, f_t> lp_solution(original_lp.num_rows, original_lp.num_cols);
   std::vector<variable_status_t> vstatus;
@@ -283,7 +283,7 @@ i_t solve(const user_problem_t<i_t, f_t>& problem,
     lp_problem_t<i_t, f_t> original_lp(
       problem.num_rows, problem.num_cols, problem.A.col_start[problem.A.n]);
     std::vector<i_t> new_slacks;
-    convert_user_problem(problem, original_lp, new_slacks);
+    convert_user_problem(problem, settings, original_lp, new_slacks);
     lp_solution_t<i_t, f_t> solution(original_lp.num_rows, original_lp.num_cols);
     std::vector<variable_status_t> vstatus;
     std::vector<f_t> edge_norms;
diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp
index 830838bf5..dc4df3990 100644
--- a/cpp/src/dual_simplex/sparse_matrix.cpp
+++ b/cpp/src/dual_simplex/sparse_matrix.cpp
@@ -16,6 +16,7 @@
  */
 
 #include <dual_simplex/sparse_matrix.hpp>
+#include <dual_simplex/sparse_vector.hpp>
 
 #include <dual_simplex/types.hpp>
 
@@ -148,6 +149,61 @@ i_t csc_matrix_t<i_t, f_t>::load_a_column(i_t j, std::vector<f_t>& Aj) const
   return (col_end - col_start);
 }
 
+template <typename i_t, typename f_t>
+void csc_matrix_t<i_t, f_t>::append_column(const std::vector<f_t>& x)
+{
+  const i_t m = this->m;
+  assert(x.size() == m);
+  const i_t xsz = x.size();
+  i_t nz        = this->col_start[this->n];
+  for (i_t j = 0; j < xsz; ++j) {
+    if (x[j] != 0.0) {
+      this->i[nz] = j;
+      this->x[nz] = x[j];
+      nz++;
+    }
+  }
+  this->col_start[this->n + 1] = nz;
+  this->n++;
+}
+
+template <typename i_t, typename f_t>
+void csc_matrix_t<i_t, f_t>::append_column(const sparse_vector_t<i_t, f_t>& x)
+{
+  const i_t m = this->m;
+  assert(x.n == m);
+  i_t nz        = this->col_start[this->n];
+  const i_t xnz = x.i.size();
+  for (i_t k = 0; k < xnz; ++k) {
+    const i_t i     = x.i[k];
+    const f_t x_val = x.x[k];
+    if (x_val != 0.0) {
+      this->i[nz] = i;
+      this->x[nz] = x_val;
+      nz++;
+    }
+  }
+  this->col_start[this->n + 1] = nz;
+  this->n++;
+}
+
+template <typename i_t, typename f_t>
+void csc_matrix_t<i_t, f_t>::append_column(i_t x_nz, i_t* i, f_t* x)
+{
+  i_t nz = this->col_start[this->n];
+  for (i_t k = 0; k < x_nz; ++k) {
+    const i_t i_val = i[k];
+    const f_t x_val = x[i_val];
+    if (x_val != 0.0) {
+      this->i[nz] = i_val;
+      this->x[nz] = x_val;
+      nz++;
+    }
+  }
+  this->col_start[this->n + 1] = nz;
+  this->n++;
+}
+
 template <typename i_t, typename f_t>
 i_t csc_matrix_t<i_t, f_t>::transpose(csc_matrix_t<i_t, f_t>& AT) const
 {
@@ -360,6 +416,28 @@ void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, std::vecto
   }
 }
 
+// x <- x + alpha * A(:, j)
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A,
+                   i_t j,
+                   f_t alpha,
+                   std::vector<f_t>& x,
+                   std::vector<i_t>& mark,
+                   std::vector<i_t>& indices)
+{
+  const i_t col_start = A.col_start[j];
+  const i_t col_end   = A.col_start[j + 1];
+  for (i_t p = col_start; p < col_end; ++p) {
+    const i_t i  = A.i[p];
+    const f_t ax = A.x[p];
+    x[i] += alpha * ax;
+    if (!mark[i]) {
+      mark[i] = 1;
+      indices.push_back(i);
+    }
+  }
+}
+
 // Compute C = A*B where C is m x n, A is m x k, and B = k x n
 // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j)
 template <typename i_t, typename f_t>
@@ -695,6 +773,13 @@ template void scatter_dense<int, double>(const csc_matrix_t<int, double>& A,
                                          double alpha,
                                          std::vector<double>& x);
 
+template void scatter_dense<int, double>(const csc_matrix_t<int, double>& A,
+                                         int j,
+                                         double alpha,
+                                         std::vector<double>& x,
+                                         std::vector<int>& mark,
+                                         std::vector<int>& indices);
+
 template int multiply<int, double>(const csc_matrix_t<int, double>& A,
                                    const csc_matrix_t<int, double>& B,
                                    csc_matrix_t<int, double>& C);
diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp
index 29e6a0cf4..9cc3d6380 100644
--- a/cpp/src/dual_simplex/sparse_matrix.hpp
+++ b/cpp/src/dual_simplex/sparse_matrix.hpp
@@ -20,6 +20,7 @@
 #include <dual_simplex/types.hpp>
 #include <dual_simplex/vector_math.hpp>
 
+#include <algorithm>
 #include <cassert>
 #include <cstdio>
 #include <vector>
@@ -29,6 +30,9 @@ namespace cuopt::linear_programming::dual_simplex {
 template <typename i_t, typename f_t>
 class csr_matrix_t;  // Forward declaration of CSR matrix needed to define CSC matrix
 
+template <typename i_t, typename f_t>
+class sparse_vector_t;  // Forward declaration of sparse vector needed to define CSC matrix
+
 // A sparse matrix stored in compressed sparse column format
 template <typename i_t, typename f_t>
 class csc_matrix_t {
@@ -59,6 +63,15 @@ class csc_matrix_t {
   // Compute the transpose of A
   i_t transpose(csc_matrix_t<i_t, f_t>& AT) const;
 
+  // Append a dense column to the matrix. Assumes the matrix has already been resized accordingly
+  void append_column(const std::vector<f_t>& x);
+
+  // Append a sparse column to the matrix. Assumes the matrix has already been resized accordingly
+  void append_column(const sparse_vector_t<i_t, f_t>& x);
+
+  // Append a sparse column to the matrix. Assumes the matrix has already been resized accordingly
+  void append_column(i_t nz, i_t* i, f_t* x);
+
   // Remove columns from the matrix
   i_t remove_columns(const std::vector<i_t>& cols_to_remove);
 
@@ -131,6 +144,14 @@ i_t scatter(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, std::vector<f_t>& x);
 
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A,
+                   i_t j,
+                   f_t alpha,
+                   std::vector<f_t>& x,
+                   std::vector<i_t>& mark,
+                   std::vector<i_t>& indices);
+
 // Compute C = A*B where C is m x n, A is m x k, and B = k x n
 // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j)
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/sparse_vector.cpp b/cpp/src/dual_simplex/sparse_vector.cpp
new file mode 100644
index 000000000..73a0c0a8f
--- /dev/null
+++ b/cpp/src/dual_simplex/sparse_vector.cpp
@@ -0,0 +1,224 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dual_simplex/sparse_vector.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+sparse_vector_t<i_t, f_t>::sparse_vector_t(const csc_matrix_t<i_t, f_t>& A, i_t col)
+{
+  const i_t col_start = A.col_start[col];
+  const i_t col_end   = A.col_start[col + 1];
+  n                   = A.m;
+  const i_t nz        = col_end - col_start;
+  i.reserve(nz);
+  x.reserve(nz);
+  for (i_t k = col_start; k < col_end; ++k) {
+    i.push_back(A.i[k]);
+    x.push_back(A.x[k]);
+  }
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::from_dense(const std::vector<f_t>& in)
+{
+  i.clear();
+  x.clear();
+  n = in.size();
+  i.reserve(n);
+  x.reserve(n);
+  for (i_t k = 0; k < n; ++k) {
+    if (in[k] != 0) {
+      i.push_back(k);
+      x.push_back(in[k]);
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::to_csc(csc_matrix_t<i_t, f_t>& A) const
+{
+  A.m      = n;
+  A.n      = 1;
+  A.nz_max = i.size();
+  A.col_start.resize(2);
+  A.col_start[0] = 0;
+  A.col_start[1] = i.size();
+  A.i            = i;
+  A.x            = x;
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::to_dense(std::vector<f_t>& x_dense) const
+{
+  x_dense.clear();
+  x_dense.resize(n, 0.0);
+  const i_t nz = i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    x_dense[i[k]] = x[k];
+  }
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::scatter(std::vector<f_t>& x_dense) const
+{
+  // Assumes x_dense is already cleared
+  const i_t nz = i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    x_dense[i[k]] += x[k];
+  }
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::inverse_permute_vector(const std::vector<i_t>& p)
+{
+  assert(p.size() == n);
+  i_t nz = i.size();
+  std::vector<i_t> i_perm(nz);
+  for (i_t k = 0; k < nz; ++k) {
+    i_perm[k] = p[i[k]];
+  }
+  i = i_perm;
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::inverse_permute_vector(const std::vector<i_t>& p,
+                                                       sparse_vector_t<i_t, f_t>& y) const
+{
+  i_t m = p.size();
+  assert(n == m);
+  i_t nz = i.size();
+  y.n    = n;
+  y.x    = x;
+  std::vector<i_t> i_perm(nz);
+  for (i_t k = 0; k < nz; ++k) {
+    i_perm[k] = p[i[k]];
+  }
+  y.i = i_perm;
+}
+
+template <typename i_t, typename f_t>
+f_t sparse_vector_t<i_t, f_t>::sparse_dot(const csc_matrix_t<i_t, f_t>& Y, i_t y_col) const
+{
+  const i_t col_start = Y.col_start[y_col];
+  const i_t col_end   = Y.col_start[y_col + 1];
+  const i_t ny        = col_end - col_start;
+  const i_t nx        = i.size();
+  f_t dot             = 0.0;
+  for (i_t h = 0, k = col_start; h < nx && k < col_end;) {
+    const i_t p = i[h];
+    const i_t q = Y.i[k];
+    if (p == q) {
+      dot += Y.x[k] * x[h];
+      h++;
+      k++;
+    } else if (p < q) {
+      h++;
+    } else if (q < p) {
+      k++;
+    }
+  }
+  return dot;
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::sort()
+{
+  if (i.size() == 1) { return; }
+  // If the number of nonzeros is large, use a O(n) bucket sort
+  if (i.size() > 0.3 * n) {
+    std::vector<f_t> bucket(n, 0.0);
+    const i_t nz = i.size();
+    for (i_t k = 0; k < nz; ++k) {
+      bucket[i[k]] = x[k];
+    }
+    i.clear();
+    i.reserve(nz);
+    x.clear();
+    x.reserve(nz);
+    for (i_t k = 0; k < n; ++k) {
+      if (bucket[k] != 0.0) {
+        i.push_back(k);
+        x.push_back(bucket[k]);
+      }
+    }
+  } else {
+    // Use a n log n sort
+    const i_t nz = i.size();
+    std::vector<i_t> i_sorted(nz);
+    std::vector<f_t> x_sorted(nz);
+    std::vector<i_t> perm(nz);
+    for (i_t k = 0; k < nz; ++k) {
+      perm[k] = k;
+    }
+    std::vector<i_t>& iunsorted = i;
+    std::sort(
+      perm.begin(), perm.end(), [&iunsorted](i_t a, i_t b) { return iunsorted[a] < iunsorted[b]; });
+    for (i_t k = 0; k < nz; ++k) {
+      i_sorted[k] = i[perm[k]];
+      x_sorted[k] = x[perm[k]];
+    }
+    i = i_sorted;
+    x = x_sorted;
+  }
+
+  // Check
+#ifdef CHECK_SORT
+  if (!std::is_sorted(i.begin(), i.end())) { printf("Sort error\n"); }
+#endif
+}
+
+template <typename i_t, typename f_t>
+f_t sparse_vector_t<i_t, f_t>::norm2_squared() const
+{
+  f_t dot      = 0.0;
+  const i_t nz = i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    dot += x[k] * x[k];
+  }
+  return dot;
+}
+
+template <typename i_t, typename f_t>
+void sparse_vector_t<i_t, f_t>::negate()
+{
+  const i_t nz = x.size();
+  for (i_t k = 0; k < nz; ++k) {
+    x[k] *= -1.0;
+  }
+}
+
+template <typename i_t, typename f_t>
+f_t sparse_vector_t<i_t, f_t>::find_coefficient(i_t index) const
+{
+  const i_t nz = i.size();
+  for (i_t k = 0; k < nz; ++k) {
+    if (i[k] == index) { return x[k]; }
+  }
+  return std::numeric_limits<f_t>::quiet_NaN();
+}
+
+#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
+template class sparse_vector_t<int, double>;
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/sparse_vector.hpp b/cpp/src/dual_simplex/sparse_vector.hpp
new file mode 100644
index 000000000..cf970acda
--- /dev/null
+++ b/cpp/src/dual_simplex/sparse_vector.hpp
@@ -0,0 +1,64 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <dual_simplex/sparse_matrix.hpp>
+#include <dual_simplex/types.hpp>
+
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+// A sparse vector stored as a list of nonzero coefficients and their indices
+template <typename i_t, typename f_t>
+class sparse_vector_t {
+ public:
+  // Construct a sparse vector of dimension n with nz nonzero coefficients
+  sparse_vector_t(i_t n, i_t nz) : n(n), i(nz), x(nz) {}
+  // Construct a sparse vector from a dense vector.
+  sparse_vector_t(const std::vector<f_t>& in) { from_dense(in); }
+  // Construct a sparse vector from a column of a CSC matrix
+  sparse_vector_t(const csc_matrix_t<i_t, f_t>& A, i_t col);
+  // gather a dense vector into a sparse vector
+  void from_dense(const std::vector<f_t>& in);
+  // convert a sparse vector into a CSC matrix with a single column
+  void to_csc(csc_matrix_t<i_t, f_t>& A) const;
+  // convert a sparse vector into a dense vector. Dense vector is cleared and resized.
+  void to_dense(std::vector<f_t>& x_dense) const;
+  // scatter a sparse vector into a dense vector. Assumes x_dense is already cleared or
+  // preinitialized
+  void scatter(std::vector<f_t>& x_dense) const;
+  // inverse permute the current sparse vector
+  void inverse_permute_vector(const std::vector<i_t>& p);
+  // inverse permute a sparse vector into another sparse vector
+  void inverse_permute_vector(const std::vector<i_t>& p, sparse_vector_t<i_t, f_t>& y) const;
+  // compute the dot product of a sparse vector with a column of a CSC matrix
+  f_t sparse_dot(const csc_matrix_t<i_t, f_t>& Y, i_t y_col) const;
+  // ensure the coefficients in the sparse vectory are sorted in terms of increasing index
+  void sort();
+  // compute the squared 2-norm of the sparse vector
+  f_t norm2_squared() const;
+  void negate();
+  f_t find_coefficient(i_t index) const;
+
+  i_t n;
+  std::vector<i_t> i;
+  std::vector<f_t> x;
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/triangle_solve.cpp b/cpp/src/dual_simplex/triangle_solve.cpp
index eddf04843..13a42d2f9 100644
--- a/cpp/src/dual_simplex/triangle_solve.cpp
+++ b/cpp/src/dual_simplex/triangle_solve.cpp
@@ -89,26 +89,25 @@ i_t upper_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& U, std::vecto
   return 0;
 }
 
-// \brief Reach computes the reach of b=B(:, col) in the graph of G
-// \param[in] B - Sparse CSC matrix containing rhs
-// \param[in] col - column of B
+// \brief Reach computes the reach of b in the graph of G
+// \param[in] b - Sparse vector containing the rhs
 // \param[in] pinv - inverse permuation vector
 // \param[in, out] G - Sparse CSC matrix G. The column pointers of G are
 // modified (but restored) during this call \param[out] xi  - stack of size 2*n.
 // xi[top] .. xi[n-1] contains the reachable indicies \returns top - the size of
 // the stack
 template <typename i_t, typename f_t>
-i_t reach(const csc_matrix_t<i_t, f_t>& B,
-          i_t col,
+i_t reach(const sparse_vector_t<i_t, f_t>& b,
           const std::optional<std::vector<i_t>>& pinv,
           csc_matrix_t<i_t, f_t>& G,
           std::vector<i_t>& xi)
 {
-  const i_t m = G.m;
-  i_t top     = m;
-  for (i_t p = B.col_start[col]; p < B.col_start[col + 1]; ++p) {
-    if (!MARKED(G.col_start, B.i[p])) {  // start a DFS at unmarked node i
-      top = depth_first_search(B.i[p], pinv, G, top, xi, xi.begin() + m);
+  const i_t m   = G.m;
+  i_t top       = m;
+  const i_t bnz = b.i.size();
+  for (i_t p = 0; p < bnz; ++p) {
+    if (!MARKED(G.col_start, b.i[p])) {  // start a DFS at unmarked node i
+      top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m);
     }
   }
   for (i_t p = top; p < m; ++p) {  // restore G
@@ -152,7 +151,7 @@ i_t depth_first_search(i_t j,
     }
     done   = 1;  // Node j is done if no unvisited neighbors
     i_t p2 = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]);
-    for (i_t p = pstack[head]; p < p2; ++p) {  // Examin all neighbors of j
+    for (i_t p = pstack[head]; p < p2; ++p) {  // Examine all neighbors of j
       i_t i = G.i[p];                          // Consider neighbor i
       if (MARKED(G.col_start, i)) {
         continue;  // skip visited node i
@@ -163,29 +162,31 @@ i_t depth_first_search(i_t j,
       break;             // break to start dfs at node i
     }
     if (done) {
-      head--;         // remove j from the recursion stack
-      xi[--top] = j;  // and place it the output stack
+      pstack[head] = 0;  // restore pstack so it can be used again in other routines
+      xi[head]     = 0;  // restore xi so it can be used again in other routines
+      head--;            // remove j from the recursion stack
+      xi[--top] = j;     // and place it the output stack
     }
   }
   return top;
 }
 
 template <typename i_t, typename f_t, bool lo>
-i_t sparse_triangle_solve(const csc_matrix_t<i_t, f_t>& B,
-                          i_t col,
+i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
                           const std::optional<std::vector<i_t>>& pinv,
                           std::vector<i_t>& xi,
                           csc_matrix_t<i_t, f_t>& G,
                           f_t* x)
 {
   i_t m = G.m;
-  assert(B.m == m);
-  i_t top = reach(B, col, pinv, G, xi);
+  assert(b.n == m);
+  i_t top = reach(b, pinv, G, xi);
   for (i_t p = top; p < m; ++p) {
     x[xi[p]] = 0;  // Clear x vector
   }
-  for (i_t p = B.col_start[col]; p < B.col_start[col + 1]; ++p) {
-    x[B.i[p]] = B.x[p];  // Scatter b
+  const i_t bnz = b.i.size();
+  for (i_t p = 0; p < bnz; ++p) {
+    x[b.i[p]] = b.x[p];  // Scatter b
   }
   for (i_t px = top; px < m; ++px) {
     i_t j = xi[px];                   // x(j) is nonzero
@@ -225,8 +226,7 @@ template int upper_triangular_solve<int, double>(const csc_matrix_t<int, double>
 template int upper_triangular_transpose_solve<int, double>(const csc_matrix_t<int, double>& U,
                                                            std::vector<double>& x);
 
-template int reach<int, double>(const csc_matrix_t<int, double>& B,
-                                int col,
+template int reach<int, double>(const sparse_vector_t<int, double>& b,
                                 const std::optional<std::vector<int>>& pinv,
                                 csc_matrix_t<int, double>& G,
                                 std::vector<int>& xi);
@@ -238,12 +238,17 @@ template int depth_first_search<int, double>(int j,
                                              std::vector<int>& xi,
                                              std::vector<int>::iterator pstack);
 
-template int sparse_triangle_solve<int, double, true>(const csc_matrix_t<int, double>& B,
-                                                      int col,
+template int sparse_triangle_solve<int, double, true>(const sparse_vector_t<int, double>& b,
                                                       const std::optional<std::vector<int>>& pinv,
                                                       std::vector<int>& xi,
                                                       csc_matrix_t<int, double>& G,
                                                       double* x);
+
+template int sparse_triangle_solve<int, double, false>(const sparse_vector_t<int, double>& b,
+                                                       const std::optional<std::vector<int>>& pinv,
+                                                       std::vector<int>& xi,
+                                                       csc_matrix_t<int, double>& G,
+                                                       double* x);
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/triangle_solve.hpp b/cpp/src/dual_simplex/triangle_solve.hpp
index fc01613c7..5016332da 100644
--- a/cpp/src/dual_simplex/triangle_solve.hpp
+++ b/cpp/src/dual_simplex/triangle_solve.hpp
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <dual_simplex/sparse_matrix.hpp>
+#include <dual_simplex/sparse_vector.hpp>
 #include <dual_simplex/types.hpp>
 
 #include <optional>
@@ -52,17 +53,15 @@ i_t upper_triangular_solve(const csc_matrix_t<i_t, f_t>& U, std::vector<f_t>& x)
 template <typename i_t, typename f_t>
 i_t upper_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& U, std::vector<f_t>& x);
 
-// \brief Reach computes the reach of b=B(:, col) in the graph of G
-// \param[in] B - Sparse CSC matrix containing rhs
-// \param[in] col - column of B
+// \brief Reach computes the reach of b in the graph of G
+// \param[in] b - sparse vector containing the rhs
 // \param[in] pinv - inverse permuation vector
 // \param[in, out] G - Sparse CSC matrix G. The column pointers of G are
 // modified (but restored) during this call \param[out] xi  - stack of size 2*n.
 // xi[top] .. xi[n-1] contains the reachable indicies \returns top - the size of
 // the stack
 template <typename i_t, typename f_t>
-i_t reach(const csc_matrix_t<i_t, f_t>& B,
-          i_t col,
+i_t reach(const sparse_vector_t<i_t, f_t>& b,
           const std::optional<std::vector<i_t>>& pinv,
           csc_matrix_t<i_t, f_t>& G,
           std::vector<i_t>& xi);
@@ -95,8 +94,7 @@ i_t depth_first_search(i_t j,
 //        and U is a sparse upper triangular matrix, and b is a sparse
 //        right-hand side. The vector b is obtained from the column of a sparse
 //        matrix.
-// \param[in] B - Sparse CSC matrix contain the rhs
-// \param[in] col - the column of B to use as b. b = B(:, col)
+// \param[in] b - Sparse vector contain the rhs
 // \param[in] pinv - optional inverse permutation vector
 // \param[in, out] xi - An array of size 2*m, on output it contains the non-zero
 // pattern of x in xi[top] through xi[m-1]
@@ -104,8 +102,7 @@ i_t depth_first_search(i_t j,
 //                     G.col_start is marked and restored during the algorithm
 // \param[out] - The solution vector xi_t
 template <typename i_t, typename f_t, bool lo>
-i_t sparse_triangle_solve(const csc_matrix_t<i_t, f_t>& B,
-                          i_t col,
+i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
                           const std::optional<std::vector<i_t>>& pinv,
                           std::vector<i_t>& xi,
                           csc_matrix_t<i_t, f_t>& G,
diff --git a/cpp/src/dual_simplex/vector_math.cpp b/cpp/src/dual_simplex/vector_math.cpp
index aa05d5743..239848ac3 100644
--- a/cpp/src/dual_simplex/vector_math.cpp
+++ b/cpp/src/dual_simplex/vector_math.cpp
@@ -67,6 +67,47 @@ f_t dot(const std::vector<f_t>& x, const std::vector<f_t>& y)
   return dot;
 }
 
+template <typename i_t, typename f_t>
+f_t sparse_dot(
+  i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val)
+{
+  f_t dot = 0.0;
+  for (i_t i = 0, j = 0; i < nx && j < ny;) {
+    const i_t p = xind[i];
+    const i_t q = yind[j];
+    if (p == q) {
+      dot += xval[i] * y_scatter_val[q];
+      i++;
+      j++;
+    } else if (p < q) {
+      i++;
+    } else if (q < p) {
+      j++;
+    }
+  }
+  return dot;
+}
+
+template <typename i_t, typename f_t>
+f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny)
+{
+  f_t dot = 0.0;
+  for (i_t i = 0, j = 0; i < nx && j < ny;) {
+    const i_t p = xind[i];
+    const i_t q = yind[j];
+    if (p == q) {
+      dot += xval[i] * yval[j];
+      i++;
+      j++;
+    } else if (p < q) {
+      i++;
+    } else if (q < p) {
+      j++;
+    }
+  }
+  return dot;
+}
+
 template <typename i_t, typename f_t>
 f_t sparse_dot(const std::vector<i_t>& xind,
                const std::vector<f_t>& xval,
@@ -146,6 +187,16 @@ template double sparse_dot<int, double>(const std::vector<int>& xind,
                                         const std::vector<int>& yind,
                                         const std::vector<double>& yval);
 
+template double sparse_dot<int, double>(int const* xind,
+                                        double const* xval,
+                                        int nx,
+                                        int const* yind,
+                                        int ny,
+                                        double const* y_scatter_val);
+
+template double sparse_dot<int, double>(
+  int* xind, double* xval, int nx, int* yind, double* yval, int ny);
+
 template int permute_vector<int, double>(const std::vector<int>& p,
                                          const std::vector<double>& b,
                                          std::vector<double>& x);
diff --git a/cpp/src/dual_simplex/vector_math.hpp b/cpp/src/dual_simplex/vector_math.hpp
index 962b21743..c5bd12863 100644
--- a/cpp/src/dual_simplex/vector_math.hpp
+++ b/cpp/src/dual_simplex/vector_math.hpp
@@ -44,6 +44,13 @@ f_t sparse_dot(const std::vector<i_t>& xind,
                const std::vector<i_t>& yind,
                const std::vector<f_t>& yval);
 
+template <typename i_t, typename f_t>
+f_t sparse_dot(
+  i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val);
+
+template <typename i_t, typename f_t>
+f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny);
+
 // Computes x = P*b or x=b(p) in MATLAB.
 template <typename i_t, typename f_t>
 i_t permute_vector(const std::vector<i_t>& p, const std::vector<f_t>& b, std::vector<f_t>& x);
diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp
index e675a3d5d..bfd100946 100644
--- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp
+++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp
@@ -57,7 +57,7 @@ TEST_P(TimeLimitTestFixture, time_limit)
                            method),
             CUOPT_SUCCESS);
   EXPECT_EQ(termination_status, CUOPT_TERIMINATION_STATUS_TIME_LIMIT);
-  EXPECT_NEAR(solve_time, target_solve_time, 0.1);
+  EXPECT_NEAR(solve_time, target_solve_time, 1.0);
 }
 INSTANTIATE_TEST_SUITE_P(
   c_api,