diff --git a/ci/build_wheel_cuopt.sh b/ci/build_wheel_cuopt.sh
index cd49b6ffb..0a9a8edff 100755
--- a/ci/build_wheel_cuopt.sh
+++ b/ci/build_wheel_cuopt.sh
@@ -20,6 +20,9 @@ set -euo pipefail
 
 source rapids-init-pip
 
+# Install cudss
+bash ci/utils/install_cudss.sh
+
 package_dir="python/cuopt"
 export SKBUILD_CMAKE_ARGS="-DCUOPT_BUILD_WHEELS=ON;-DDISABLE_DEPRECATION_WARNINGS=ON";
 
diff --git a/ci/build_wheel_libcuopt.sh b/ci/build_wheel_libcuopt.sh
index 7651b52f8..88d6ebf01 100755
--- a/ci/build_wheel_libcuopt.sh
+++ b/ci/build_wheel_libcuopt.sh
@@ -34,6 +34,24 @@ else
     echo "Building in release mode"
 fi
 
+# Install cudss
+bash ci/utils/install_cudss.sh
+
+# Clean metadata & install cudss
+if command -v dnf &> /dev/null; then
+    dnf clean all
+    # Adding static library just to please CMAKE requirements
+    dnf -y install libcudss0-static-cuda-12 libcudss0-devel-cuda-12 libcudss0-cuda-12
+elif command -v apt-get &> /dev/null; then
+    apt-get update
+    apt-get install -y libcudss-devel
+else
+    echo "Neither dnf nor apt-get found. Cannot install cudss dependencies."
+    exit 1
+fi
+# dnf -y install libcudss0-devel-cuda-12
+
+
 rapids-logger "Generating build requirements"
 
 CUOPT_MPS_PARSER_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cuopt_mps_parser" rapids-download-wheels-from-github python)
@@ -62,6 +80,7 @@ EXCLUDE_ARGS=(
   --exclude "libraft.so"
   --exclude "libcublas.so.*"
   --exclude "libcublasLt.so.*"
+  --exclude "libcudss.so.*"
   --exclude "libcurand.so.*"
   --exclude "libcusolver.so.*"
   --exclude "libcusparse.so.*"
diff --git a/ci/utils/install_cudss.sh b/ci/utils/install_cudss.sh
new file mode 100755
index 000000000..d885ee385
--- /dev/null
+++ b/ci/utils/install_cudss.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Clean metadata & install cudss
+if command -v dnf &> /dev/null; then
+    dnf clean all
+    # Adding static library just to please CMAKE requirements
+    dnf -y install libcudss0-static-cuda-12 libcudss0-devel-cuda-12 libcudss0-cuda-12
+elif command -v apt-get &> /dev/null; then
+    apt-get update
+    apt-get install -y libcudss-devel
+else
+    echo "Neither dnf nor apt-get found. Cannot install cudss dependencies."
+    exit 1
+fi
diff --git a/cmake/FindCUDSS.cmake b/cmake/FindCUDSS.cmake
new file mode 100644
index 000000000..9e1e110cb
--- /dev/null
+++ b/cmake/FindCUDSS.cmake
@@ -0,0 +1,42 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+if (CUDSS_INCLUDE AND CUDSS_LIBRARIES)
+  set(CUDSS_FIND_QUIETLY TRUE)
+endif (CUDSS_INCLUDE AND CUDSS_LIBRARIES)
+
+message(STATUS "CUDSS DIR = $ENV{CUDSS_DIR}")
+
+
+find_path(CUDSS_INCLUDE
+  NAMES
+  cudss.h
+  PATHS
+  $ENV{CUDSS_DIR}/include
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  cudss
+)
+
+
+find_library(CUDSS_LIBRARIES cudss PATHS $ENV{CUDSS_DIR}/lib ${LIB_INSTALL_DIR})
+
+message(STATUS "CUDSS_INCLUDE = ${CUDSS_INCLUDE}")
+message(STATUS "CUDSS_LIBRARIES = ${CUDSS_LIBRARIES}")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CHOLMOD DEFAULT_MSG
+                                  CUDSS_INCLUDE CUDSS_LIBRARIES)
+
+mark_as_advanced(CUDSS_INCLUDE CUDSS_LIBRARIES)
diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml
index ba6b6945e..da85d2c05 100644
--- a/conda/environments/all_cuda-128_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-128_arch-aarch64.yaml
@@ -35,6 +35,7 @@ dependencies:
 - httpx
 - ipython
 - jsonref==1.1.0
+- libcudss-dev
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml
index f91c22957..ec173cfa6 100644
--- a/conda/environments/all_cuda-128_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -35,6 +35,7 @@ dependencies:
 - httpx
 - ipython
 - jsonref==1.1.0
+- libcudss-dev
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml
index 9924107d2..d8701f645 100644
--- a/conda/recipes/libcuopt/recipe.yaml
+++ b/conda/recipes/libcuopt/recipe.yaml
@@ -66,6 +66,7 @@ cache:
       - librmm =${{ dep_minor_version }}
       - rapids-logger =0.1
       - cuda-nvtx-dev
+      - libcudss-dev
       - libcurand-dev
       - libcusparse-dev
       - cuda-cudart-dev
@@ -133,6 +134,7 @@ outputs:
         - rapids-logger =0.1
         - librmm =${{ dep_minor_version }}
         - cuda-cudart-dev
+        - libcudss-dev
         - libcublas
         - libcusparse-dev
       run:
@@ -183,6 +185,7 @@ outputs:
         - gmock ${{ gtest_version }}
         - gtest ${{ gtest_version }}
         - cuda-cudart-dev
+        - libcudss-dev
         - libcublas
         - libcusparse-dev
       run:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 53071afae..f59825e7c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -30,6 +30,9 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUOPT)
 
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
+message(STATUS "CMAKE_MODULE_PATH = ${CMAKE_MODULE_PATH}")
+
 project(
   CUOPT
   VERSION 25.10.00
@@ -180,6 +183,8 @@ include(${rapids-cmake-dir}/cpm/rapids_logger.cmake)
 rapids_cpm_rapids_logger(BUILD_EXPORT_SET cuopt-exports INSTALL_EXPORT_SET cuopt-exports)
 create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt)
 
+find_package(CUDSS REQUIRED)
+
 if(BUILD_TESTS)
   include(cmake/thirdparty/get_gtest.cmake)
 endif()
@@ -222,6 +227,7 @@ target_link_options(cuopt PRIVATE "${CUOPT_BINARY_DIR}/fatbin.ld")
 add_library(cuopt::cuopt ALIAS cuopt)
 # ##################################################################################################
 # - include paths ---------------------------------------------------------------------------------
+message(STATUS "target include directories CUDSS_INCLUDES = ${CUDSS_INCLUDE}")
 
 target_include_directories(cuopt SYSTEM PRIVATE "${papilo_SOURCE_DIR}/src" "${papilo_BINARY_DIR}")
 
@@ -229,12 +235,14 @@ target_include_directories(cuopt
   PRIVATE
   "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty"
   "${CMAKE_CURRENT_SOURCE_DIR}/src"
+  "${CUDSS_INCLUDE}"
   PUBLIC
   "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
   "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
   "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser/include>"
   INTERFACE
   "$<INSTALL_INTERFACE:include>"
+  ${CUDSS_INCLUDE}
 )
 
 # ##################################################################################################
@@ -250,6 +258,23 @@ list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser)
 set(CMAKE_LIBRARY_PATH ${CMAKE_CURRENT_BINARY_DIR}/libmps_parser/)
 
+find_package(cudss REQUIRED CONFIG)
+
+# Print all details of the cudss package
+message(STATUS "cudss package details:")
+message(STATUS "  cudss_FOUND: ${cudss_FOUND}")
+message(STATUS "  cudss_VERSION: ${cudss_VERSION}")
+message(STATUS "  cudss_INCLUDE_DIRS: ${cudss_INCLUDE_DIRS}")
+message(STATUS "  cudss_LIBRARY_DIR: ${cudss_LIBRARY_DIR}")
+message(STATUS "  cudss_LIBRARIES: ${cudss_LIBRARIES}")
+message(STATUS "  cudss_DEFINITIONS: ${cudss_DEFINITIONS}")
+message(STATUS "  cudss_COMPILE_OPTIONS: ${cudss_COMPILE_OPTIONS}")
+message(STATUS "  cudss_LINK_OPTIONS: ${cudss_LINK_OPTIONS}")
+
+# Use the specific cudss library version to avoid symlink issues
+set(CUDSS_LIB_FILE "${cudss_LIBRARY_DIR}/libcudss.so.0")
+message(STATUS "Using cudss library: ${CUDSS_LIB_FILE}")
+
 
 target_link_libraries(cuopt
   PUBLIC
@@ -260,6 +285,7 @@ target_link_libraries(cuopt
   CCCL::CCCL
   raft::raft
   cuopt::mps_parser
+  ${CUDSS_LIB_FILE}
   PRIVATE
   ${CUOPT_PRIVATE_CUDA_LIBS}
   )
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 72d05a6ad..ab1552387 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -50,6 +50,7 @@
 #define CUOPT_LOG_FILE                    "log_file"
 #define CUOPT_LOG_TO_CONSOLE              "log_to_console"
 #define CUOPT_CROSSOVER                   "crossover"
+#define CUOPT_USE_CUDSS                   "use_cudss"
 #define CUOPT_PRESOLVE                    "presolve"
 #define CUOPT_MIP_ABSOLUTE_TOLERANCE      "mip_absolute_tolerance"
 #define CUOPT_MIP_RELATIVE_TOLERANCE      "mip_relative_tolerance"
@@ -105,6 +106,7 @@
 #define CUOPT_METHOD_CONCURRENT   0
 #define CUOPT_METHOD_PDLP         1
 #define CUOPT_METHOD_DUAL_SIMPLEX 2
+#define CUOPT_METHOD_BARRIER      3
 
 /* @brief Status codes constants */
 #define CUOPT_SUCCESS          0
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index f4e515288..9e9d6a93b 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -65,7 +65,8 @@ enum pdlp_solver_mode_t : int {
 enum method_t : int {
   Concurrent  = CUOPT_METHOD_CONCURRENT,
   PDLP        = CUOPT_METHOD_PDLP,
-  DualSimplex = CUOPT_METHOD_DUAL_SIMPLEX
+  DualSimplex = CUOPT_METHOD_DUAL_SIMPLEX,
+  Barrier     = CUOPT_METHOD_BARRIER
 };
 
 template <typename i_t, typename f_t>
@@ -207,6 +208,7 @@ class pdlp_solver_settings_t {
   std::string user_problem_file{""};
   bool per_constraint_residual{false};
   bool crossover{false};
+  bool use_cudss{false};
   bool save_best_primal_so_far{false};
   bool first_primal_feasible{false};
   bool presolve{false};
diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt
index d85471f9b..e3586e5f8 100644
--- a/cpp/src/dual_simplex/CMakeLists.txt
+++ b/cpp/src/dual_simplex/CMakeLists.txt
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 set(DUAL_SIMPLEX_SRC_FILES
+  ${CMAKE_CURRENT_SOURCE_DIR}/barrier.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/basis_solves.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/basis_updates.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/bound_flipping_ratio_test.cpp
@@ -36,5 +37,7 @@ set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp)
 
+set_source_files_properties(${DUAL_SIMPLEX_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES}
   ${DUAL_SIMPLEX_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/dual_simplex/barrier.cpp b/cpp/src/dual_simplex/barrier.cpp
new file mode 100644
index 000000000..caa2218ef
--- /dev/null
+++ b/cpp/src/dual_simplex/barrier.cpp
@@ -0,0 +1,1298 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dual_simplex/barrier.hpp>
+
+#include <dual_simplex/presolve.hpp>
+#include <dual_simplex/sparse_matrix.hpp>
+#include <dual_simplex/tic_toc.hpp>
+#include <dual_simplex/types.hpp>
+
+#include <numeric>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+class iteration_data_t {
+ public:
+  iteration_data_t(const lp_problem_t<i_t, f_t>& lp,
+                   i_t num_upper_bounds,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+    : upper_bounds(num_upper_bounds),
+      c(lp.objective),
+      b(lp.rhs),
+      w(num_upper_bounds),
+      x(lp.num_cols),
+      y(lp.num_rows),
+      v(num_upper_bounds),
+      z(lp.num_cols),
+      w_save(num_upper_bounds),
+      x_save(lp.num_cols),
+      y_save(lp.num_rows),
+      v_save(num_upper_bounds),
+      z_save(lp.num_cols),
+      diag(lp.num_cols),
+      inv_diag(lp.num_cols),
+      inv_sqrt_diag(lp.num_cols),
+      AD(lp.num_cols, lp.num_rows, 0),
+      AT(lp.num_rows, lp.num_cols, 0),
+      ADAT(lp.num_rows, lp.num_rows, 0),
+      primal_residual(lp.num_rows),
+      bound_residual(num_upper_bounds),
+      dual_residual(lp.num_cols),
+      complementarity_xz_residual(lp.num_cols),
+      complementarity_wv_residual(num_upper_bounds),
+      primal_rhs(lp.num_rows),
+      bound_rhs(num_upper_bounds),
+      dual_rhs(lp.num_cols),
+      complementarity_xz_rhs(lp.num_cols),
+      complementarity_wv_rhs(num_upper_bounds),
+      dw_aff(num_upper_bounds),
+      dx_aff(lp.num_cols),
+      dy_aff(lp.num_rows),
+      dv_aff(num_upper_bounds),
+      dz_aff(lp.num_cols),
+      dw(num_upper_bounds),
+      dx(lp.num_cols),
+      dy(lp.num_rows),
+      dv(num_upper_bounds),
+      dz(lp.num_cols),
+      has_factorization(false)
+  {
+    // Create the upper bounds vector
+    n_upper_bounds = 0;
+    for (i_t j = 0; j < lp.num_cols; j++) {
+      if (lp.upper[j] < inf) { upper_bounds[n_upper_bounds++] = j; }
+    }
+    settings.log.printf("n_upper_bounds %d\n", n_upper_bounds);
+    // Form A*Dinv*A'
+    diag.set_scalar(1.0);
+    if (n_upper_bounds > 0) {
+      for (i_t k = 0; k < n_upper_bounds; k++) {
+        i_t j   = upper_bounds[k];
+        diag[j] = 2.0;
+      }
+    }
+    inv_diag.set_scalar(1.0);
+    if (n_upper_bounds > 0) { diag.inverse(inv_diag); }
+    inv_sqrt_diag.set_scalar(1.0);
+    if (n_upper_bounds > 0) { inv_diag.sqrt(inv_sqrt_diag); }
+
+    column_density(lp.A, settings);
+
+    // Copy A into AD
+    AD = lp.A;
+    // Perform column scaling on A to get AD^(-1)
+    AD.scale_columns(inv_diag);
+
+    // Form A*Dinv*A'
+    float64_t start_form_aat = tic();
+    lp.A.transpose(AT);
+    multiply(AD, AT, ADAT);
+
+    float64_t aat_time = toc(start_form_aat);
+    settings.log.printf("AAT time %.2fs\n", aat_time);
+    settings.log.printf("AAT nonzeros %e\n", static_cast<float64_t>(ADAT.col_start[lp.num_rows]));
+
+    chol = std::make_unique<sparse_cholesky_cudss_t<i_t, f_t>>(settings, lp.num_rows);
+    chol->set_positive_definite(false);
+
+    // Perform symbolic analysis
+    chol->analyze(ADAT);
+  }
+
+  void to_solution(const lp_problem_t<i_t, f_t>& lp,
+                   i_t iterations,
+                   f_t objective,
+                   f_t user_objective,
+                   f_t primal_residual,
+                   f_t dual_residual,
+                   lp_solution_t<i_t, f_t>& solution)
+  {
+    solution.x = x;
+    solution.y = y;
+    dense_vector_t<i_t, f_t> z_tilde(z.size());
+    scatter_upper_bounds(v, z_tilde);
+    z_tilde.axpy(1.0, z, -1.0);
+    solution.z = z_tilde;
+
+    dense_vector_t<i_t, f_t> dual_res = z_tilde;
+    dual_res.axpy(-1.0, lp.objective, 1.0);
+    matrix_transpose_vector_multiply(lp.A, 1.0, solution.y, 1.0, dual_res);
+    f_t dual_residual_norm = vector_norm_inf<i_t, f_t>(dual_res);
+    printf("Solution Dual residual: %e\n", dual_residual_norm);
+
+    solution.iterations         = iterations;
+    solution.objective          = objective;
+    solution.user_objective     = user_objective;
+    solution.l2_primal_residual = primal_residual;
+    solution.l2_dual_residual   = dual_residual;
+  }
+
+  void column_density(const csc_matrix_t<i_t, f_t>& A,
+                      const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    const i_t m = A.m;
+    const i_t n = A.n;
+    if (n < 2) { return; }
+    std::vector<f_t> density(n);
+    for (i_t j = 0; j < n; j++) {
+      const i_t nnz = A.col_start[j + 1] - A.col_start[j];
+      density[j]    = static_cast<f_t>(nnz);
+    }
+
+    std::vector<i_t> permutation(n);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::sort(permutation.begin(), permutation.end(), [&density](i_t i, i_t j) {
+      return density[i] < density[j];
+    });
+
+    std::vector<i_t> columns_to_keep;
+    columns_to_keep.reserve(n);
+
+    f_t nnz = density[permutation[0]];
+    columns_to_keep.push_back(permutation[0]);
+    for (i_t i = 1; i < n; i++) {
+      const i_t j         = permutation[i];
+      const f_t density_j = density[j];
+
+      if (i > n - 5 && density_j > 10.0) {
+        settings.log.printf("Dense column %6d nz %6d density %.2e nnz %.2e\n",
+                            j,
+                            static_cast<i_t>(density_j),
+                            density_j / m,
+                            nnz + density_j * density_j);
+      }
+      nnz += density_j * density_j;
+
+      columns_to_keep.push_back(j);
+    }
+  }
+
+  void scatter_upper_bounds(const dense_vector_t<i_t, f_t>& y, dense_vector_t<i_t, f_t>& z)
+  {
+    if (n_upper_bounds > 0) {
+      for (i_t k = 0; k < n_upper_bounds; k++) {
+        i_t j = upper_bounds[k];
+        z[j]  = y[k];
+      }
+    }
+  }
+
+  void gather_upper_bounds(const dense_vector_t<i_t, f_t>& z, dense_vector_t<i_t, f_t>& y)
+  {
+    if (n_upper_bounds > 0) {
+      for (i_t k = 0; k < n_upper_bounds; k++) {
+        i_t j = upper_bounds[k];
+        y[k]  = z[j];
+      }
+    }
+  }
+
+  void preconditioned_conjugate_gradient(const simplex_solver_settings_t<i_t, f_t>& settings,
+                                         const dense_vector_t<i_t, f_t>& b,
+                                         f_t tolerance,
+                                         dense_vector_t<i_t, f_t>& xinout) const
+  {
+    dense_vector_t<i_t, f_t> residual = b;
+    dense_vector_t<i_t, f_t> y(ADAT.n);
+
+    dense_vector_t<i_t, f_t> x = xinout;
+
+    const csc_matrix_t<i_t, f_t>& A = ADAT;
+
+    // r = A*x - b
+    matrix_vector_multiply(A, 1.0, x, -1.0, residual);
+
+    // Solve M y = r for y
+    chol->solve(residual, y);
+
+    dense_vector_t<i_t, f_t> p = y;
+    p.multiply_scalar(-1.0);
+
+    dense_vector_t<i_t, f_t> Ap(A.n);
+    i_t iter                  = 0;
+    f_t norm_residual         = vector_norm2<i_t, f_t>(residual);
+    f_t initial_norm_residual = norm_residual;
+    settings.log.printf("PCG initial residual 2-norm %e inf-norm %e\n",
+                        norm_residual,
+                        vector_norm_inf<i_t, f_t>(residual));
+
+    f_t rTy = residual.inner_product(y);
+
+    while (norm_residual > tolerance && iter < 100) {
+      // Compute Ap = A * p
+      matrix_vector_multiply(A, 1.0, p, 0.0, Ap);
+
+      // Compute alpha = (r^T * y) / (p^T * Ap)
+      f_t alpha = rTy / p.inner_product(Ap);
+
+      // Update residual = residual + alpha * Ap
+      residual.axpy(alpha, Ap, 1.0);
+
+      f_t new_residual = vector_norm2<i_t, f_t>(residual);
+      if (new_residual > 1.1 * norm_residual || new_residual > 1.1 * initial_norm_residual) {
+        settings.log.printf(
+          "PCG residual increased by more than 10%%. New %e > %e\n", new_residual, norm_residual);
+        break;
+      }
+      norm_residual = new_residual;
+
+      // Update x = x + alpha * p
+      x.axpy(alpha, p, 1.0);
+
+      // residual = A*x - b
+      residual = b;
+      matrix_vector_multiply(A, 1.0, x, -1.0, residual);
+      norm_residual = vector_norm2<i_t, f_t>(residual);
+
+      // Solve M y = r for y
+      chol->solve(residual, y);
+
+      // Compute beta = (r_+^T y_+) / (r^T y)
+      f_t r1Ty1 = residual.inner_product(y);
+      f_t beta  = r1Ty1 / rTy;
+
+      rTy = r1Ty1;
+
+      // Update p = -y + beta * p
+      p.axpy(-1.0, y, beta);
+
+      iter++;
+      settings.log.printf("PCG iter %3d 2-norm_residual %.2e inf-norm_residual %.2e\n",
+                          iter,
+                          norm_residual,
+                          vector_norm_inf<i_t, f_t>(residual));
+    }
+
+    residual = b;
+    matrix_vector_multiply(A, 1.0, x, -1.0, residual);
+    norm_residual = vector_norm2<i_t, f_t>(residual);
+    if (norm_residual < initial_norm_residual) {
+      settings.log.printf("PCG improved residual 2-norm %.2e/%.2e in %d iterations\n",
+                          norm_residual,
+                          initial_norm_residual,
+                          iter);
+      xinout = x;
+    } else {
+      settings.log.printf("Rejecting PCG solution\n");
+    }
+  }
+
+  i_t n_upper_bounds;
+  dense_vector_t<i_t, f_t> upper_bounds;
+  dense_vector_t<i_t, f_t> c;
+  dense_vector_t<i_t, f_t> b;
+
+  dense_vector_t<i_t, f_t> w;
+  dense_vector_t<i_t, f_t> x;
+  dense_vector_t<i_t, f_t> y;
+  dense_vector_t<i_t, f_t> v;
+  dense_vector_t<i_t, f_t> z;
+
+  dense_vector_t<i_t, f_t> w_save;
+  dense_vector_t<i_t, f_t> x_save;
+  dense_vector_t<i_t, f_t> y_save;
+  dense_vector_t<i_t, f_t> v_save;
+  dense_vector_t<i_t, f_t> z_save;
+
+  dense_vector_t<i_t, f_t> diag;
+  dense_vector_t<i_t, f_t> inv_diag;
+  dense_vector_t<i_t, f_t> inv_sqrt_diag;
+
+  csc_matrix_t<i_t, f_t> AD;
+  csc_matrix_t<i_t, f_t> AT;
+  csc_matrix_t<i_t, f_t> ADAT;
+
+  std::unique_ptr<sparse_cholesky_base_t<i_t, f_t>> chol;
+
+  bool has_factorization;
+
+  dense_vector_t<i_t, f_t> primal_residual;
+  dense_vector_t<i_t, f_t> bound_residual;
+  dense_vector_t<i_t, f_t> dual_residual;
+  dense_vector_t<i_t, f_t> complementarity_xz_residual;
+  dense_vector_t<i_t, f_t> complementarity_wv_residual;
+
+  dense_vector_t<i_t, f_t> primal_rhs;
+  dense_vector_t<i_t, f_t> bound_rhs;
+  dense_vector_t<i_t, f_t> dual_rhs;
+  dense_vector_t<i_t, f_t> complementarity_xz_rhs;
+  dense_vector_t<i_t, f_t> complementarity_wv_rhs;
+
+  dense_vector_t<i_t, f_t> dw_aff;
+  dense_vector_t<i_t, f_t> dx_aff;
+  dense_vector_t<i_t, f_t> dy_aff;
+  dense_vector_t<i_t, f_t> dv_aff;
+  dense_vector_t<i_t, f_t> dz_aff;
+
+  dense_vector_t<i_t, f_t> dw;
+  dense_vector_t<i_t, f_t> dx;
+  dense_vector_t<i_t, f_t> dy;
+  dense_vector_t<i_t, f_t> dv;
+  dense_vector_t<i_t, f_t> dz;
+};
+
+template <typename i_t, typename f_t>
+void barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
+{
+  // Perform a numerical factorization
+  i_t status = data.chol->factorize(data.ADAT);
+  if (status != 0) {
+    settings.log.printf("Initial factorization failed\n");
+    return;
+  }
+
+  // rhs_x <- b
+  dense_vector_t<i_t, f_t> rhs_x(lp.rhs);
+
+  dense_vector_t<i_t, f_t> Fu(lp.num_cols);
+  data.gather_upper_bounds(lp.upper, Fu);
+
+  dense_vector_t<i_t, f_t> DinvFu(lp.num_cols);  // DinvFu = Dinv * Fu
+  data.inv_diag.pairwise_product(Fu, DinvFu);
+
+  // rhs_x <-  A * Dinv * F * u  - b
+  matrix_vector_multiply(lp.A, 1.0, DinvFu, -1.0, rhs_x);
+
+  // Solve A*Dinv*A'*q = A*Dinv*F*u - b
+  dense_vector_t<i_t, f_t> q(lp.num_rows);
+  settings.log.printf("||rhs_x|| = %e\n", vector_norm2<i_t, f_t>(rhs_x));
+  i_t solve_status = data.chol->solve(rhs_x, q);
+  settings.log.printf("Initial solve status %d\n", solve_status);
+  settings.log.printf("||q|| = %e\n", vector_norm2<i_t, f_t>(q));
+
+  // rhs_x <- A*Dinv*A'*q - rhs_x
+  matrix_vector_multiply(data.ADAT, 1.0, q, -1.0, rhs_x);
+  settings.log.printf("|| A*Dinv*A'*q - (A*Dinv*F*u - b) || = %e\n", vector_norm2<i_t, f_t>(rhs_x));
+
+  // x = Dinv*(F*u - A'*q)
+  data.inv_diag.pairwise_product(Fu, data.x);
+  // Fu <- -1.0 * A' * q + 1.0 * Fu
+  matrix_transpose_vector_multiply(lp.A, -1.0, q, 1.0, Fu);
+
+  // x <- Dinv * (F*u - A'*q)
+  data.inv_diag.pairwise_product(Fu, data.x);
+
+  // w <- E'*u - E'*x
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j     = data.upper_bounds[k];
+      data.w[k] = lp.upper[j] - data.x[j];
+    }
+  }
+
+  // Verify A*x = b
+  data.primal_residual = lp.rhs;
+  matrix_vector_multiply(lp.A, 1.0, data.x, -1.0, data.primal_residual);
+  settings.log.printf("||b - A * x||: %e\n", vector_norm2<i_t, f_t>(data.primal_residual));
+
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j                  = data.upper_bounds[k];
+      data.bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j];
+    }
+    settings.log.printf("|| u - w - x||: %e\n", vector_norm2<i_t, f_t>(data.bound_residual));
+  }
+
+  // First compute rhs = A*Dinv*c
+  dense_vector_t<i_t, f_t> rhs(lp.num_rows);
+  dense_vector_t<i_t, f_t> Dinvc(lp.num_cols);
+  data.inv_diag.pairwise_product(lp.objective, Dinvc);
+  // rhs = 1.0 * A * Dinv * c
+  matrix_vector_multiply(lp.A, 1.0, Dinvc, 0.0, rhs);
+
+  // Solve A*Dinv*A'*q = A*Dinv*c
+  data.chol->solve(rhs, data.y);
+
+  // z = Dinv*(c - A'*y)
+  dense_vector_t<i_t, f_t> cmATy = data.c;
+  matrix_transpose_vector_multiply(lp.A, -1.0, data.y, 1.0, cmATy);
+  // z <- Dinv * (c - A'*y)
+  data.inv_diag.pairwise_product(cmATy, data.z);
+
+  // v = -E'*z
+  data.gather_upper_bounds(data.z, data.v);
+  data.v.multiply_scalar(-1.0);
+
+  // Verify A'*y + z - E*v = c
+  data.z.pairwise_subtract(data.c, data.dual_residual);
+  matrix_transpose_vector_multiply(lp.A, 1.0, data.y, 1.0, data.dual_residual);
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j = data.upper_bounds[k];
+      data.dual_residual[j] -= data.v[k];
+    }
+  }
+  settings.log.printf("||A^T y + z - E*v - c ||: %e\n", vector_norm2<i_t, f_t>(data.dual_residual));
+
+  // Make sure (w, x, v, z) > 0
+  float64_t epsilon_adjust = 10.0;
+  data.w.ensure_positive(epsilon_adjust);
+  data.x.ensure_positive(epsilon_adjust);
+  data.v.ensure_positive(epsilon_adjust);
+  data.z.ensure_positive(epsilon_adjust);
+}
+
+template <typename i_t, typename f_t>
+void barrier_solver_t<i_t, f_t>::compute_residuals(const dense_vector_t<i_t, f_t>& w,
+                                                   const dense_vector_t<i_t, f_t>& x,
+                                                   const dense_vector_t<i_t, f_t>& y,
+                                                   const dense_vector_t<i_t, f_t>& v,
+                                                   const dense_vector_t<i_t, f_t>& z,
+                                                   iteration_data_t<i_t, f_t>& data)
+{
+  // Compute primal_residual = b - A*x
+  data.primal_residual = lp.rhs;
+  matrix_vector_multiply(lp.A, -1.0, x, 1.0, data.primal_residual);
+
+  // Compute bound_residual = E'*u - w - E'*x
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j                  = data.upper_bounds[k];
+      data.bound_residual[k] = lp.upper[j] - w[k] - x[j];
+    }
+  }
+
+  // Compute dual_residual = c - A'*y - z + E*v
+  data.c.pairwise_subtract(z, data.dual_residual);
+  matrix_transpose_vector_multiply(lp.A, -1.0, y, 1.0, data.dual_residual);
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j = data.upper_bounds[k];
+      data.dual_residual[j] += v[k];
+    }
+  }
+
+  // Compute complementarity_xz_residual = x.*z
+  x.pairwise_product(z, data.complementarity_xz_residual);
+
+  // Compute complementarity_wv_residual = w.*v
+  w.pairwise_product(v, data.complementarity_wv_residual);
+}
+
+template <typename i_t, typename f_t>
+void barrier_solver_t<i_t, f_t>::compute_residual_norms(const dense_vector_t<i_t, f_t>& w,
+                                                        const dense_vector_t<i_t, f_t>& x,
+                                                        const dense_vector_t<i_t, f_t>& y,
+                                                        const dense_vector_t<i_t, f_t>& v,
+                                                        const dense_vector_t<i_t, f_t>& z,
+                                                        iteration_data_t<i_t, f_t>& data,
+                                                        f_t& primal_residual_norm,
+                                                        f_t& dual_residual_norm,
+                                                        f_t& complementarity_residual_norm)
+{
+  compute_residuals(w, x, y, v, z, data);
+  primal_residual_norm = std::max(vector_norm_inf<i_t, f_t>(data.primal_residual),
+                                  vector_norm_inf<i_t, f_t>(data.bound_residual));
+  dual_residual_norm   = vector_norm_inf<i_t, f_t>(data.dual_residual);
+  complementarity_residual_norm =
+    std::max(vector_norm_inf<i_t, f_t>(data.complementarity_xz_residual),
+             vector_norm_inf<i_t, f_t>(data.complementarity_wv_residual));
+}
+
+template <typename i_t, typename f_t>
+f_t barrier_solver_t<i_t, f_t>::max_step_to_boundary(const dense_vector_t<i_t, f_t>& x,
+                                                     const dense_vector_t<i_t, f_t>& dx,
+                                                     i_t& index) const
+{
+  float64_t max_step = 1.0;
+  index              = -1;
+  for (i_t i = 0; i < x.size(); i++) {
+    // x_i + alpha * dx_i >= 0, x_i >= 0, alpha >= 0
+    // We only need to worry about the case where dx_i < 0
+    // alpha * dx_i >= -x_i => alpha <= -x_i / dx_i
+    if (dx[i] < 0.0) {
+      const f_t ratio = -x[i] / dx[i];
+      if (ratio < max_step) {
+        max_step = ratio;
+        index    = i;
+      }
+    }
+  }
+  return max_step;
+}
+
+template <typename i_t, typename f_t>
+f_t barrier_solver_t<i_t, f_t>::max_step_to_boundary(const dense_vector_t<i_t, f_t>& x,
+                                                     const dense_vector_t<i_t, f_t>& dx) const
+{
+  i_t index;
+  return max_step_to_boundary(x, dx, index);
+}
+
+template <typename i_t, typename f_t>
+i_t barrier_solver_t<i_t, f_t>::compute_search_direction(iteration_data_t<i_t, f_t>& data,
+                                                         dense_vector_t<i_t, f_t>& dw,
+                                                         dense_vector_t<i_t, f_t>& dx,
+                                                         dense_vector_t<i_t, f_t>& dy,
+                                                         dense_vector_t<i_t, f_t>& dv,
+                                                         dense_vector_t<i_t, f_t>& dz,
+                                                         f_t& max_residual)
+{
+  const bool debug = true;
+  // Solves the linear system
+  //
+  //  dw dx dy dv dz
+  // [ 0 A  0   0  0 ] [ dw ] = [ rp  ]
+  // [ I E' 0   0  0 ] [ dx ]   [ rw  ]
+  // [ 0 0  A' -E  I ] [ dy ]   [ rd  ]
+  // [ 0 Z  0   0  X ] [ dv ]   [ rxz ]
+  // [ V 0  0   W  0 ] [ dz ]   [ rwv ]
+
+  max_residual = 0.0;
+  // diag = z ./ x
+  data.z.pairwise_divide(data.x, data.diag);
+  // diag = z ./ x + E * (v ./ w) * E'
+  if (data.n_upper_bounds > 0) {
+    for (i_t k = 0; k < data.n_upper_bounds; k++) {
+      i_t j = data.upper_bounds[k];
+      data.diag[j] += data.v[k] / data.w[k];
+    }
+  }
+
+  // inv_diag = 1.0 ./ diag
+  data.diag.inverse(data.inv_diag);
+
+  // inv_sqrt_diag <- sqrt(inv_diag)
+  data.inv_diag.sqrt(data.inv_sqrt_diag);
+
+  // Form A*D*A'
+  if (!data.has_factorization) {
+    data.AD = lp.A;
+    data.AD.scale_columns(data.inv_diag);
+    // compute ADAT = A Dinv * A^T
+    multiply(data.AD, data.AT, data.ADAT);
+
+    // factorize
+    i_t status = data.chol->factorize(data.ADAT);
+    if (status < 0) {
+      settings.log.printf("Factorization failed.\n");
+      return -1;
+    }
+    data.has_factorization = true;
+  }
+
+  // Compute h = primal_rhs + A*inv_diag*(dual_rhs - complementarity_xz_rhs ./ x +
+  // E*((complementarity_wv_rhs - v .* bound_rhs) ./ w) )
+  dense_vector_t<i_t, f_t> tmp1(data.n_upper_bounds);
+  dense_vector_t<i_t, f_t> tmp2(data.n_upper_bounds);
+  dense_vector_t<i_t, f_t> tmp3(lp.num_cols);
+  dense_vector_t<i_t, f_t> tmp4(lp.num_cols);
+
+  // tmp2 <- v .* bound_rhs
+  data.v.pairwise_product(data.bound_rhs, tmp2);
+  // tmp2 <- complementarity_wv_rhs - v .* bound_rhs
+  tmp2.axpy(1.0, data.complementarity_wv_rhs, -1.0);
+  // tmp1 <- (complementarity_wv_rhs - v .* bound_rhs) ./ w
+  tmp2.pairwise_divide(data.w, tmp1);
+  tmp3.set_scalar(0.0);
+  // tmp3 <- E * tmp1
+  data.scatter_upper_bounds(tmp1, tmp3);
+
+  // tmp4 <- complementarity_xz_rhs ./ x
+  data.complementarity_xz_rhs.pairwise_divide(data.x, tmp4);
+  // tmp3 <- -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs) ./ w)
+  tmp3.axpy(-1.0, tmp4, 1.0);
+  // tmp3 <- dual_rhs -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs)
+  // ./ w)
+  tmp3.axpy(1.0, data.dual_rhs, 1.0);
+  // r1 <- dual_rhs -complementarity_xz_rhs ./ x +  E * ((complementarity_wv_rhs - v .* bound_rhs)
+  // ./ w)
+  dense_vector_t<i_t, f_t> r1       = tmp3;
+  dense_vector_t<i_t, f_t> r1_prime = r1;
+  // tmp4 <- inv_diag * ( dual_rhs -complementarity_xz_rhs ./ x + E *((complementarity_wv_rhs - v .*
+  // bound_rhs) ./ w))
+  data.inv_diag.pairwise_product(tmp3, tmp4);
+
+  dense_vector_t<i_t, f_t> h = data.primal_rhs;
+  // h <- 1.0 * A * tmp4 + primal_rhs
+  matrix_vector_multiply(lp.A, 1.0, tmp4, 1.0, h);
+
+  // Solve A D^{-1} A^T dy = h
+  i_t solve_status = data.chol->solve(h, dy);
+  if (solve_status < 0) {
+    settings.log.printf("Linear solve failed\n");
+    return -1;
+  }
+
+  // y_residual <- ADAT*dy - h
+  dense_vector_t<i_t, f_t> y_residual = h;
+  matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, y_residual);
+  const f_t y_residual_norm = vector_norm_inf<i_t, f_t>(y_residual);
+  max_residual              = std::max(max_residual, y_residual_norm);
+  if (y_residual_norm > 1e-2) {
+    settings.log.printf("|| h || = %.2e\n", vector_norm_inf<i_t, f_t>(h));
+    settings.log.printf("||ADAT*dy - h|| = %.2e\n", y_residual_norm);
+  }
+  if (y_residual_norm > 10) { return -1; }
+
+  if (0 && y_residual_norm > 1e-10) {
+    settings.log.printf("Running PCG to improve residual 2-norm %e inf-norm %e\n",
+                        vector_norm2<i_t, f_t>(y_residual),
+                        y_residual_norm);
+    data.preconditioned_conjugate_gradient(
+      settings, h, std::min(y_residual_norm / 100.0, 1e-6), dy);
+    y_residual = h;
+    matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, y_residual);
+    const f_t y_residual_norm_pcg = vector_norm_inf<i_t, f_t>(y_residual);
+    settings.log.printf("PCG improved residual || ADAT * dy - h || = %.2e\n", y_residual_norm_pcg);
+  }
+
+  // dx = dinv .* (A'*dy - dual_rhs + complementarity_xz_rhs ./ x  - E *((complementarity_wv_rhs - v
+  // .* bound_rhs) ./ w))
+  // r1 <- A'*dy - r1
+  matrix_transpose_vector_multiply(lp.A, 1.0, dy, -1.0, r1);
+  // dx <- dinv .* r1
+  data.inv_diag.pairwise_product(r1, dx);
+
+  // dx_residual <- D * dx - A'*dy - r1
+  dense_vector_t<i_t, f_t> dx_residual(lp.num_cols);
+  // dx_residual <- D*dx
+  data.diag.pairwise_product(dx, dx_residual);
+  // dx_residual <- -A'*dy + D*dx
+  matrix_transpose_vector_multiply(lp.A, -1.0, dy, 1.0, dx_residual);
+  // dx_residual <- D*dx - A'*dy + r1
+  dx_residual.axpy(1.0, r1_prime, 1.0);
+  if (debug) {
+    const f_t dx_residual_norm = vector_norm_inf<i_t, f_t>(dx_residual);
+    max_residual               = std::max(max_residual, dx_residual_norm);
+    if (dx_residual_norm > 1e-2) {
+      settings.log.printf("|| D * dx - A'*y + r1 || = %.2e\n", dx_residual_norm);
+    }
+  }
+
+  dense_vector_t<i_t, f_t> dx_residual_2(lp.num_cols);
+  // dx_residual_2 <- D^-1 * (A'*dy - r1)
+  data.inv_diag.pairwise_product(r1, dx_residual_2);
+  // dx_residual_2 <- D^-1 * (A'*dy - r1) - dx
+  dx_residual_2.axpy(-1.0, dx, 1.0);
+  // dx_residual_2 <- D^-1 * (A'*dy - r1) - dx
+  const f_t dx_residual_2_norm = vector_norm_inf<i_t, f_t>(dx_residual_2);
+  max_residual                 = std::max(max_residual, dx_residual_2_norm);
+  if (dx_residual_2_norm > 1e-2) {
+    settings.log.printf("|| D^-1 (A'*dy - r1) - dx || = %.2e\n", dx_residual_2_norm);
+  }
+
+  dense_vector_t<i_t, f_t> dx_residual_5(lp.num_cols);
+  dense_vector_t<i_t, f_t> dx_residual_6(lp.num_rows);
+  // dx_residual_5 <- D^-1 * (A'*dy - r1)
+  data.inv_diag.pairwise_product(r1, dx_residual_5);
+  // dx_residual_6 <- A * D^-1 (A'*dy - r1)
+  matrix_vector_multiply(lp.A, 1.0, dx_residual_5, 0.0, dx_residual_6);
+  // dx_residual_6 <- A * D^-1 (A'*dy - r1) - A * dx
+  matrix_vector_multiply(lp.A, -1.0, dx, 1.0, dx_residual_6);
+  const f_t dx_residual_6_norm = vector_norm_inf<i_t, f_t>(dx_residual_6);
+  max_residual                 = std::max(max_residual, dx_residual_6_norm);
+  if (dx_residual_6_norm > 1e-2) {
+    settings.log.printf("|| A * D^-1 (A'*dy - r1) - A * dx || = %.2e\n", dx_residual_6_norm);
+  }
+
+  dense_vector_t<i_t, f_t> dx_residual_3(lp.num_cols);
+  // dx_residual_3 <- D^-1 * r1
+  data.inv_diag.pairwise_product(r1_prime, dx_residual_3);
+  dense_vector_t<i_t, f_t> dx_residual_4(lp.num_rows);
+  // dx_residual_4 <- A * D^-1 * r1
+  matrix_vector_multiply(lp.A, 1.0, dx_residual_3, 0.0, dx_residual_4);
+  // dx_residual_4 <-  A * D^-1 * r1 + A * dx
+  matrix_vector_multiply(lp.A, 1.0, dx, 1.0, dx_residual_4);
+  // dx_residual_4 <- - A * D^-1 * r1 - A * dx + ADAT * dy
+
+#if CHECK_FORM_ADAT
+  csc_matrix_t<i_t, f_t> ADinv = lp.A;
+  ADinv.scale_columns(data.inv_diag);
+  csc_matrix_t<i_t, f_t> ADinvAT(lp.num_rows, lp.num_rows, 1);
+  csc_matrix_t<i_t, f_t> Atranspose(1, 1, 0);
+  lp.A.transpose(Atranspose);
+  multiply(ADinv, Atranspose, ADinvAT);
+  matrix_vector_multiply(ADinvAT, 1.0, dy, -1.0, dx_residual_4);
+  const f_t dx_residual_4_norm = vector_norm_inf<i_t, f_t>(dx_residual_4);
+  max_residual                 = std::max(max_residual, dx_residual_4_norm);
+  if (dx_residual_4_norm > 1e-2) {
+    settings.log.printf("|| ADAT * dy - A * D^-1 * r1 - A * dx || = %.2e\n", dx_residual_4_norm);
+  }
+
+  csc_matrix_t<i_t, f_t> C(lp.num_rows, lp.num_rows, 1);
+  add(ADinvAT, data.ADAT, 1.0, -1.0, C);
+  const f_t matrix_residual = C.norm1();
+  max_residual              = std::max(max_residual, matrix_residual);
+  if (matrix_residual > 1e-2) {
+    settings.log.printf("|| AD^{-1/2} D^{-1} A^T + E - A D^{-1} A^T|| = %.2e\n", matrix_residual);
+  }
+#endif
+
+  dense_vector_t<i_t, f_t> dx_residual_7 = h;
+  matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, dx_residual_7);
+  const f_t dx_residual_7_norm = vector_norm_inf<i_t, f_t>(dx_residual_7);
+  max_residual                 = std::max(max_residual, dx_residual_7_norm);
+  if (dx_residual_7_norm > 1e-2) {
+    settings.log.printf("|| A D^{-1} A^T * dy - h || = %.2e\n", dx_residual_7_norm);
+  }
+
+  // x_residual <- A * dx - primal_rhs
+  dense_vector_t<i_t, f_t> x_residual = data.primal_rhs;
+  matrix_vector_multiply(lp.A, 1.0, dx, -1.0, x_residual);
+  if (debug) {
+    const f_t x_residual_norm = vector_norm_inf<i_t, f_t>(x_residual);
+    max_residual              = std::max(max_residual, x_residual_norm);
+    if (x_residual_norm > 1e-2) {
+      settings.log.printf("|| A * dx - rp || = %.2e\n", x_residual_norm);
+    }
+  }
+
+  // dz = (complementarity_xz_rhs - z.* dx) ./ x;
+  // tmp3 <- z .* dx
+  data.z.pairwise_product(dx, tmp3);
+  // tmp3 <- 1.0 * complementarity_xz_rhs - tmp3
+  tmp3.axpy(1.0, data.complementarity_xz_rhs, -1.0);
+  // dz <- tmp3 ./ x
+  tmp3.pairwise_divide(data.x, dz);
+
+  // xz_residual <- z .* dx + x .* dz - complementarity_xz_rhs
+  dense_vector_t<i_t, f_t> xz_residual = data.complementarity_xz_rhs;
+  dense_vector_t<i_t, f_t> zdx(lp.num_cols);
+  dense_vector_t<i_t, f_t> xdz(lp.num_cols);
+  data.z.pairwise_product(dx, zdx);
+  data.x.pairwise_product(dz, xdz);
+  xz_residual.axpy(1.0, zdx, -1.0);
+  xz_residual.axpy(1.0, xdz, 1.0);
+  if (debug) {
+    const f_t xz_residual_norm = vector_norm_inf<i_t, f_t>(xz_residual);
+    max_residual               = std::max(max_residual, xz_residual_norm);
+    if (xz_residual_norm > 1e-2) {
+      settings.log.printf("|| Z dx + X dz - rxz || = %.2e\n", xz_residual_norm);
+    }
+  }
+
+  // dv = (v .* E' dx + complementarity_wv_rhs - v .* bound_rhs) ./ w
+  // tmp1 <- E' * dx
+  data.gather_upper_bounds(dx, tmp1);
+  // tmp2 <- v .* E' * dx
+  data.v.pairwise_product(tmp1, tmp2);
+  // tmp1 <- v .* bound_rhs
+  data.v.pairwise_product(data.bound_rhs, tmp1);
+  // tmp1 <- v .* E' * dx - v . * bound_rhs
+  tmp1.axpy(1.0, tmp2, -1.0);
+  // tmp1 <- v .* E' * dx + complementarity_wv_rhs - v.* bound_rhs
+  tmp1.axpy(1.0, data.complementarity_wv_rhs, 1.0);
+  // dv <- tmp1 ./ w
+  tmp1.pairwise_divide(data.w, dv);
+
+  dense_vector_t<i_t, f_t> dv_residual(data.n_upper_bounds);
+  dense_vector_t<i_t, f_t> dv_residual_2(data.n_upper_bounds);
+  // dv_residual <- E'*dx
+  data.gather_upper_bounds(dx, dv_residual);
+  // dv_residual_2 <- v .* E' * dx
+  data.v.pairwise_product(dv_residual, dv_residual_2);
+  // dv_residual_2 <- -v .* E' * dx
+  dv_residual_2.multiply_scalar(-1.0);
+  // dv_residual_ <- W .* dv
+  data.w.pairwise_product(dv, dv_residual);
+  // dv_residual <- -v .* E' * dx + w .* dv
+  dv_residual.axpy(1.0, dv_residual_2, 1.0);
+  // dv_residual <- -v .* E' * dx + w .* dv - complementarity_wv_rhs
+  dv_residual.axpy(-1.0, data.complementarity_wv_rhs, 1.0);
+  // dv_residual_2 <- V * bound_rhs
+  data.v.pairwise_product(data.bound_rhs, dv_residual_2);
+  // dv_residual <- -v .* E' * dx + w .* dv - complementarity_wv_rhs + v .* bound_rhs
+  dv_residual.axpy(1.0, dv_residual_2, 1.0);
+  if (debug) {
+    const f_t dv_residual_norm = vector_norm_inf<i_t, f_t>(dv_residual);
+    max_residual               = std::max(max_residual, dv_residual_norm);
+    if (dv_residual_norm > 1e-2) {
+      settings.log.printf(
+        "|| -v .* E' * dx + w .* dv - complementarity_wv_rhs - v .* bound_rhs || = %.2e\n",
+        dv_residual_norm);
+    }
+  }
+
+  // dual_residual <- A' * dy - E * dv  + dz -  dual_rhs
+  dense_vector_t<i_t, f_t> dual_residual(lp.num_cols);
+  // dual_residual <- E * dv
+  data.scatter_upper_bounds(dv, dual_residual);
+  // dual_residual <- A' * dy - E * dv
+  matrix_transpose_vector_multiply(lp.A, 1.0, dy, -1.0, dual_residual);
+  // dual_residual <- A' * dy - E * dv + dz
+  dual_residual.axpy(1.0, dz, 1.0);
+  // dual_residual <- A' * dy - E * dv + dz - dual_rhs
+  dual_residual.axpy(-1.0, data.dual_rhs, 1.0);
+  if (debug) {
+    const f_t dual_residual_norm = vector_norm_inf<i_t, f_t>(dual_residual);
+    max_residual                 = std::max(max_residual, dual_residual_norm);
+    if (dual_residual_norm > 1e-2) {
+      settings.log.printf("|| A' * dy - E * dv  + dz -  dual_rhs || = %.2e\n", dual_residual_norm);
+    }
+  }
+
+  // dw = bound_rhs - E'*dx
+  data.gather_upper_bounds(dx, tmp1);
+  dw = data.bound_rhs;
+  dw.axpy(-1.0, tmp1, 1.0);
+  // dw_residual <- dw + E'*dx - bound_rhs
+  dense_vector_t<i_t, f_t> dw_residual(data.n_upper_bounds);
+  data.gather_upper_bounds(dx, dw_residual);
+  dw_residual.axpy(1.0, dw, 1.0);
+  dw_residual.axpy(-1.0, data.bound_rhs, 1.0);
+  if (debug) {
+    const f_t dw_residual_norm = vector_norm_inf<i_t, f_t>(dw_residual);
+    max_residual               = std::max(max_residual, dw_residual_norm);
+    if (dw_residual_norm > 1e-2) {
+      settings.log.printf("|| dw + E'*dx - bound_rhs || = %.2e\n", dw_residual_norm);
+    }
+  }
+
+  // wv_residual <- v .* dw + w .* dv - complementarity_wv_rhs
+  dense_vector_t<i_t, f_t> wv_residual = data.complementarity_wv_rhs;
+  dense_vector_t<i_t, f_t> vdw(data.n_upper_bounds);
+  dense_vector_t<i_t, f_t> wdv(data.n_upper_bounds);
+  data.v.pairwise_product(dw, vdw);
+  data.w.pairwise_product(dv, wdv);
+  wv_residual.axpy(1.0, vdw, -1.0);
+  wv_residual.axpy(1.0, wdv, 1.0);
+  if (debug) {
+    const f_t wv_residual_norm = vector_norm_inf<i_t, f_t>(wv_residual);
+    max_residual               = std::max(max_residual, wv_residual_norm);
+    if (wv_residual_norm > 1e-2) {
+      settings.log.printf("|| V dw + W dv - rwv || = %.2e\n", wv_residual_norm);
+    }
+  }
+
+  return 0;
+}
+
+template <typename i_t, typename f_t>
+lp_status_t barrier_solver_t<i_t, f_t>::check_for_suboptimal_solution(
+  const barrier_solver_settings_t<i_t, f_t>& options,
+  iteration_data_t<i_t, f_t>& data,
+  i_t iter,
+  f_t norm_b,
+  f_t norm_c,
+  f_t& primal_objective,
+  f_t& relative_primal_residual,
+  f_t& relative_dual_residual,
+  f_t& relative_complementarity_residual,
+  lp_solution_t<i_t, f_t>& solution)
+{
+  if (relative_primal_residual < 100 * options.feasibility_tol &&
+      relative_dual_residual < 100 * options.optimality_tol &&
+      relative_complementarity_residual < 100 * options.complementarity_tol) {
+    settings.log.printf("Suboptimal solution found\n");
+    data.to_solution(lp,
+                     iter,
+                     primal_objective,
+                     primal_objective + lp.obj_constant,
+                     vector_norm2<i_t, f_t>(data.primal_residual),
+                     vector_norm2<i_t, f_t>(data.dual_residual),
+                     solution);
+    return lp_status_t::OPTIMAL;  // TODO: Barrier should probably have a separate suboptimal status
+  }
+  f_t primal_residual_norm;
+  f_t dual_residual_norm;
+  f_t complementarity_residual_norm;
+  compute_residual_norms(data.w_save,
+                         data.x_save,
+                         data.y_save,
+                         data.v_save,
+                         data.z_save,
+                         data,
+                         primal_residual_norm,
+                         dual_residual_norm,
+                         complementarity_residual_norm);
+  primal_objective         = data.c.inner_product(data.x_save);
+  relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
+  relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
+  relative_complementarity_residual =
+    complementarity_residual_norm / (1.0 + std::abs(primal_objective));
+
+  if (relative_primal_residual < 100 * options.feasibility_tol &&
+      relative_dual_residual < 100 * options.optimality_tol &&
+      relative_complementarity_residual < 100 * options.complementarity_tol) {
+    settings.log.printf("Restoring previous solution: primal %.2e dual %.2e complementarity %.2e\n",
+                        primal_residual_norm,
+                        dual_residual_norm,
+                        complementarity_residual_norm);
+    data.to_solution(lp,
+                     iter,
+                     primal_objective,
+                     primal_objective + lp.obj_constant,
+                     vector_norm2<i_t, f_t>(data.primal_residual),
+                     vector_norm2<i_t, f_t>(data.dual_residual),
+                     solution);
+    settings.log.printf("Suboptimal solution found\n");
+    return lp_status_t::OPTIMAL;  // TODO: Barrier should probably have a separate suboptimal status
+  } else {
+    settings.log.printf("Primal residual %.2e dual residual %.2e complementarity residual %.2e\n",
+                        relative_primal_residual,
+                        relative_dual_residual,
+                        relative_complementarity_residual);
+  }
+  settings.log.printf("Search direction computation failed\n");
+  return lp_status_t::NUMERICAL_ISSUES;
+}
+
+template <typename i_t, typename f_t>
+lp_status_t barrier_solver_t<i_t, f_t>::solve(const barrier_solver_settings_t<i_t, f_t>& options,
+                                              lp_solution_t<i_t, f_t>& solution)
+{
+  float64_t start_time = tic();
+  lp_status_t status   = lp_status_t::UNSET;
+
+  i_t n = lp.num_cols;
+  i_t m = lp.num_rows;
+
+  solution.resize(m, n);
+  settings.log.printf(
+    "Barrier solver: %d constraints, %d variables, %ld nonzeros\n", m, n, lp.A.col_start[n]);
+
+  // Compute the number of free variables
+  i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2;
+  settings.log.printf("%d free variables\n", num_free_variables);
+
+  // Compute the number of upper bounds
+  i_t num_upper_bounds = 0;
+  for (i_t j = 0; j < n; j++) {
+    if (lp.upper[j] < inf) { num_upper_bounds++; }
+  }
+  iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, settings);
+  if (toc(start_time) > settings.time_limit) {
+    settings.log.printf("Barrier time limit exceeded\n");
+    return lp_status_t::TIME_LIMIT;
+  }
+  settings.log.printf("%d finite upper bounds\n", num_upper_bounds);
+
+  initial_point(data);
+  if (toc(start_time) > settings.time_limit) {
+    settings.log.printf("Barrier time limit exceeded\n");
+    return lp_status_t::TIME_LIMIT;
+  }
+  if (settings.concurrent_halt != nullptr &&
+      settings.concurrent_halt->load(std::memory_order_acquire) == 1) {
+    settings.log.printf("Barrier solver halted\n");
+    return lp_status_t::CONCURRENT_LIMIT;
+  }
+  compute_residuals(data.w, data.x, data.y, data.v, data.z, data);
+
+  f_t primal_residual_norm = std::max(vector_norm_inf<i_t, f_t>(data.primal_residual),
+                                      vector_norm_inf<i_t, f_t>(data.bound_residual));
+  f_t dual_residual_norm   = vector_norm_inf<i_t, f_t>(data.dual_residual);
+  f_t complementarity_residual_norm =
+    std::max(vector_norm_inf<i_t, f_t>(data.complementarity_xz_residual),
+             vector_norm_inf<i_t, f_t>(data.complementarity_wv_residual));
+  f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) /
+           (static_cast<f_t>(n) + static_cast<f_t>(num_upper_bounds));
+
+  f_t norm_b = vector_norm_inf<i_t, f_t>(data.b);
+  f_t norm_c = vector_norm_inf<i_t, f_t>(data.c);
+
+  f_t primal_objective = data.c.inner_product(data.x);
+
+  f_t relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
+  f_t relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
+  f_t relative_complementarity_residual =
+    complementarity_residual_norm / (1.0 + std::abs(primal_objective));
+
+  dense_vector_t<i_t, f_t> restrict_u(num_upper_bounds);
+  dense_vector_t<i_t, f_t> upper(lp.upper);
+  data.gather_upper_bounds(upper, restrict_u);
+  f_t dual_objective = data.b.inner_product(data.y) - restrict_u.inner_product(data.v);
+
+  i_t iter = 0;
+  settings.log.printf(
+    "         Objective                                  Residual             Step-Length     "
+    "Time\n");
+  settings.log.printf(
+    "Iter    Primal               Dual            Primal   Dual    Compl.     Primal Dual     "
+    "Elapsed\n");
+  float64_t elapsed_time = toc(start_time);
+  settings.log.printf("%2d   %+.12e %+.12e %.2e %.2e %.2e                   %.1f\n",
+                      iter,
+                      primal_objective,
+                      dual_objective,
+                      primal_residual_norm,
+                      dual_residual_norm,
+                      complementarity_residual_norm,
+                      elapsed_time);
+
+  bool converged = primal_residual_norm < options.feasibility_tol &&
+                   dual_residual_norm < options.optimality_tol &&
+                   complementarity_residual_norm < options.complementarity_tol;
+
+  data.w_save = data.w;
+  data.x_save = data.x;
+  data.y_save = data.y;
+  data.v_save = data.v;
+  data.z_save = data.z;
+
+  const i_t iteration_limit = std::min(options.iteration_limit, settings.iteration_limit);
+
+  while (iter < iteration_limit) {
+    if (toc(start_time) > settings.time_limit) {
+      settings.log.printf("Barrier time limit exceeded\n");
+      return lp_status_t::TIME_LIMIT;
+    }
+    if (settings.concurrent_halt != nullptr &&
+        settings.concurrent_halt->load(std::memory_order_acquire) == 1) {
+      settings.log.printf("Barrier solver halted\n");
+      return lp_status_t::CONCURRENT_LIMIT;
+    }
+    // Compute the affine step
+    data.primal_rhs             = data.primal_residual;
+    data.bound_rhs              = data.bound_residual;
+    data.dual_rhs               = data.dual_residual;
+    data.complementarity_xz_rhs = data.complementarity_xz_residual;
+    data.complementarity_wv_rhs = data.complementarity_wv_residual;
+    // x.*z ->  -x .* z
+    data.complementarity_xz_rhs.multiply_scalar(-1.0);
+    // w.*v -> -w .* v
+    data.complementarity_wv_rhs.multiply_scalar(-1.0);
+
+    f_t max_affine_residual = 0.0;
+    i_t status              = compute_search_direction(
+      data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual);
+    if (status < 0) {
+      return check_for_suboptimal_solution(options,
+                                           data,
+                                           iter,
+                                           norm_b,
+                                           norm_c,
+                                           primal_objective,
+                                           relative_primal_residual,
+                                           relative_dual_residual,
+                                           relative_complementarity_residual,
+                                           solution);
+    }
+    if (toc(start_time) > settings.time_limit) {
+      settings.log.printf("Barrier time limit exceeded\n");
+      return lp_status_t::TIME_LIMIT;
+    }
+    if (settings.concurrent_halt != nullptr &&
+        settings.concurrent_halt->load(std::memory_order_acquire) == 1) {
+      settings.log.printf("Barrier solver halted\n");
+      return lp_status_t::CONCURRENT_LIMIT;
+    }
+
+    f_t step_primal_aff = std::min(max_step_to_boundary(data.w, data.dw_aff),
+                                   max_step_to_boundary(data.x, data.dx_aff));
+    f_t step_dual_aff   = std::min(max_step_to_boundary(data.v, data.dv_aff),
+                                 max_step_to_boundary(data.z, data.dz_aff));
+
+    // w_aff = w + step_primal_aff * dw_aff
+    // x_aff = x + step_primal_aff * dx_aff
+    // v_aff = v + step_dual_aff * dv_aff
+    // z_aff = z + step_dual_aff * dz_aff
+    dense_vector_t<i_t, f_t> w_aff = data.w;
+    dense_vector_t<i_t, f_t> x_aff = data.x;
+    dense_vector_t<i_t, f_t> v_aff = data.v;
+    dense_vector_t<i_t, f_t> z_aff = data.z;
+    w_aff.axpy(step_primal_aff, data.dw_aff, 1.0);
+    x_aff.axpy(step_primal_aff, data.dx_aff, 1.0);
+    v_aff.axpy(step_dual_aff, data.dv_aff, 1.0);
+    z_aff.axpy(step_dual_aff, data.dz_aff, 1.0);
+
+    dense_vector_t<i_t, f_t> complementarity_xz_aff(lp.num_cols);
+    dense_vector_t<i_t, f_t> complementarity_wv_aff(num_upper_bounds);
+    x_aff.pairwise_product(z_aff, complementarity_xz_aff);
+    w_aff.pairwise_product(v_aff, complementarity_wv_aff);
+
+    f_t complementarity_aff_sum = complementarity_xz_aff.sum() + complementarity_wv_aff.sum();
+    f_t mu_aff =
+      (complementarity_aff_sum) / (static_cast<f_t>(n) + static_cast<f_t>(num_upper_bounds));
+    f_t sigma  = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0)));
+    f_t new_mu = sigma * mu_aff;
+
+    // Compute the combined centering corrector step
+    data.primal_rhs.set_scalar(0.0);
+    data.bound_rhs.set_scalar(0.0);
+    data.dual_rhs.set_scalar(0.0);
+    // complementarity_xz_rhs = -dx_aff .* dz_aff + sigma * mu
+    data.dx_aff.pairwise_product(data.dz_aff, data.complementarity_xz_rhs);
+    data.complementarity_xz_rhs.multiply_scalar(-1.0);
+    data.complementarity_xz_rhs.add_scalar(new_mu);
+
+    // complementarity_wv_rhs = -dw_aff .* dv_aff + sigma * mu
+    data.dw_aff.pairwise_product(data.dv_aff, data.complementarity_wv_rhs);
+    data.complementarity_wv_rhs.multiply_scalar(-1.0);
+    data.complementarity_wv_rhs.add_scalar(new_mu);
+
+    f_t max_corrector_residual = 0.0;
+    status                     = compute_search_direction(
+      data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual);
+    if (status < 0) {
+      return check_for_suboptimal_solution(options,
+                                           data,
+                                           iter,
+                                           norm_b,
+                                           norm_c,
+                                           primal_objective,
+                                           relative_primal_residual,
+                                           relative_dual_residual,
+                                           relative_complementarity_residual,
+                                           solution);
+    }
+    data.has_factorization = false;
+    if (toc(start_time) > settings.time_limit) {
+      settings.log.printf("Barrier time limit exceeded\n");
+      return lp_status_t::TIME_LIMIT;
+    }
+    if (settings.concurrent_halt != nullptr &&
+        settings.concurrent_halt->load(std::memory_order_acquire) == 1) {
+      settings.log.printf("Barrier solver halted\n");
+      return lp_status_t::CONCURRENT_LIMIT;
+    }
+
+    // dw = dw_aff + dw_cc
+    // dx = dx_aff + dx_cc
+    // dy = dy_aff + dy_cc
+    // dv = dv_aff + dv_cc
+    // dz = dz_aff + dz_cc
+    // Note: dw_cc - dz_cc are stored in dw - dz
+    data.dw.axpy(1.0, data.dw_aff, 1.0);
+    data.dx.axpy(1.0, data.dx_aff, 1.0);
+    data.dy.axpy(1.0, data.dy_aff, 1.0);
+    data.dv.axpy(1.0, data.dv_aff, 1.0);
+    data.dz.axpy(1.0, data.dz_aff, 1.0);
+
+    f_t max_step_primal =
+      std::min(max_step_to_boundary(data.w, data.dw), max_step_to_boundary(data.x, data.dx));
+    f_t max_step_dual =
+      std::min(max_step_to_boundary(data.v, data.dv), max_step_to_boundary(data.z, data.dz));
+
+    f_t step_primal = options.step_scale * max_step_primal;
+    f_t step_dual   = options.step_scale * max_step_dual;
+
+    data.w.axpy(step_primal, data.dw, 1.0);
+    data.x.axpy(step_primal, data.dx, 1.0);
+    data.y.axpy(step_dual, data.dy, 1.0);
+    data.v.axpy(step_dual, data.dv, 1.0);
+    data.z.axpy(step_dual, data.dz, 1.0);
+
+    // Handle free variables
+    if (num_free_variables > 0) {
+      for (i_t k = 0; k < 2 * num_free_variables; k += 2) {
+        i_t u       = presolve_info.free_variable_pairs[k];
+        i_t v       = presolve_info.free_variable_pairs[k + 1];
+        f_t u_val   = data.x[u];
+        f_t v_val   = data.x[v];
+        f_t min_val = std::min(u_val, v_val);
+        f_t eta     = options.step_scale * min_val;
+        data.x[u] -= eta;
+        data.x[v] -= eta;
+      }
+    }
+
+    compute_residual_norms(data.w,
+                           data.x,
+                           data.y,
+                           data.v,
+                           data.z,
+                           data,
+                           primal_residual_norm,
+                           dual_residual_norm,
+                           complementarity_residual_norm);
+
+    mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) /
+         (static_cast<f_t>(n) + static_cast<f_t>(num_upper_bounds));
+
+    primal_objective = data.c.inner_product(data.x);
+    dual_objective   = data.b.inner_product(data.y) - restrict_u.inner_product(data.v);
+
+    relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
+    relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
+    relative_complementarity_residual =
+      complementarity_residual_norm / (1.0 + std::abs(primal_objective));
+
+    if (relative_primal_residual < 100 * options.feasibility_tol &&
+        relative_dual_residual < 100 * options.optimality_tol &&
+        relative_complementarity_residual < 100 * options.complementarity_tol) {
+      data.w_save = data.w;
+      data.x_save = data.x;
+      data.y_save = data.y;
+      data.v_save = data.v;
+      data.z_save = data.z;
+    }
+
+    iter++;
+    elapsed_time = toc(start_time);
+
+    if (primal_objective != primal_objective || dual_objective != dual_objective) {
+      settings.log.printf("Numerical error in objective\n");
+      return lp_status_t::NUMERICAL_ISSUES;
+    }
+
+    settings.log.printf("%2d   %+.12e %+.12e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.1f\n",
+                        iter,
+                        primal_objective + lp.obj_constant,
+                        dual_objective + lp.obj_constant,
+                        relative_primal_residual,
+                        relative_dual_residual,
+                        relative_complementarity_residual,
+                        step_primal,
+                        step_dual,
+                        std::min(data.complementarity_xz_residual.minimum(),
+                                 data.complementarity_wv_residual.minimum()),
+                        mu,
+                        std::max(max_affine_residual, max_corrector_residual),
+                        elapsed_time);
+
+    bool primal_feasible = relative_primal_residual < options.feasibility_tol;
+    bool dual_feasible   = relative_dual_residual < options.optimality_tol;
+    bool small_gap       = relative_complementarity_residual < options.complementarity_tol;
+
+    converged = primal_feasible && dual_feasible && small_gap;
+
+    if (converged) {
+      settings.log.printf(
+        "Optimal solution found in %d iterations and %.2fs\n", iter, toc(start_time));
+      settings.log.printf("Objective %+.8e\n", primal_objective + lp.obj_constant);
+      settings.log.printf("Primal infeasibility (abs/rel): %8.2e/%8.2e\n",
+                          primal_residual_norm,
+                          relative_primal_residual);
+      settings.log.printf("Dual infeasibility   (abs/rel): %8.2e/%8.2e\n",
+                          dual_residual_norm,
+                          relative_dual_residual);
+      settings.log.printf("Complementarity gap  (abs/rel): %8.2e/%8.2e\n",
+                          complementarity_residual_norm,
+                          relative_complementarity_residual);
+      data.to_solution(lp,
+                       iter,
+                       primal_objective,
+                       primal_objective + lp.obj_constant,
+                       primal_residual_norm,
+                       dual_residual_norm,
+                       solution);
+      return lp_status_t::OPTIMAL;
+    }
+  }
+  data.to_solution(lp,
+                   iter,
+                   primal_objective,
+                   primal_objective + lp.obj_constant,
+                   vector_norm2<i_t, f_t>(data.primal_residual),
+                   vector_norm2<i_t, f_t>(data.dual_residual),
+                   solution);
+  return lp_status_t::ITERATION_LIMIT;
+}
+
+#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
+template class barrier_solver_t<int, double>;
+template class sparse_cholesky_base_t<int, double>;
+template class sparse_cholesky_cudss_t<int, double>;
+template class iteration_data_t<int, double>;
+template class barrier_solver_settings_t<int, double>;
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/barrier.hpp b/cpp/src/dual_simplex/barrier.hpp
new file mode 100644
index 000000000..b8ec5fed6
--- /dev/null
+++ b/cpp/src/dual_simplex/barrier.hpp
@@ -0,0 +1,100 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "dual_simplex/dense_vector.hpp"
+#include "dual_simplex/presolve.hpp"
+#include "dual_simplex/simplex_solver_settings.hpp"
+#include "dual_simplex/solution.hpp"
+#include "dual_simplex/solve.hpp"
+#include "dual_simplex/sparse_cholesky.hpp"
+#include "dual_simplex/sparse_matrix.hpp"
+#include "dual_simplex/tic_toc.hpp"
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+struct barrier_solver_settings_t {
+  f_t feasibility_tol     = 1e-8;
+  f_t optimality_tol      = 1e-8;
+  f_t complementarity_tol = 1e-8;
+  i_t iteration_limit     = 1000;
+  f_t step_scale          = 0.9;
+};
+
+template <typename i_t, typename f_t>
+class iteration_data_t;  // Forward declare
+
+template <typename i_t, typename f_t>
+class barrier_solver_t {
+ public:
+  barrier_solver_t(const lp_problem_t<i_t, f_t>& lp,
+                   const presolve_info_t<i_t, f_t>& presolve,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+    : lp(lp), settings(settings), presolve_info(presolve)
+  {
+  }
+  lp_status_t solve(const barrier_solver_settings_t<i_t, f_t>& options,
+                    lp_solution_t<i_t, f_t>& solution);
+
+ private:
+  void initial_point(iteration_data_t<i_t, f_t>& data);
+  void compute_residual_norms(const dense_vector_t<i_t, f_t>& w,
+                              const dense_vector_t<i_t, f_t>& x,
+                              const dense_vector_t<i_t, f_t>& y,
+                              const dense_vector_t<i_t, f_t>& v,
+                              const dense_vector_t<i_t, f_t>& z,
+                              iteration_data_t<i_t, f_t>& data,
+                              f_t& primal_residual_norm,
+                              f_t& dual_residual_norm,
+                              f_t& complementarity_residual_norm);
+  void compute_residuals(const dense_vector_t<i_t, f_t>& w,
+                         const dense_vector_t<i_t, f_t>& x,
+                         const dense_vector_t<i_t, f_t>& y,
+                         const dense_vector_t<i_t, f_t>& v,
+                         const dense_vector_t<i_t, f_t>& z,
+                         iteration_data_t<i_t, f_t>& data);
+  f_t max_step_to_boundary(const dense_vector_t<i_t, f_t>& x,
+                           const dense_vector_t<i_t, f_t>& dx,
+                           i_t& index) const;
+  f_t max_step_to_boundary(const dense_vector_t<i_t, f_t>& x,
+                           const dense_vector_t<i_t, f_t>& dx) const;
+  i_t compute_search_direction(iteration_data_t<i_t, f_t>& data,
+                               dense_vector_t<i_t, f_t>& dw,
+                               dense_vector_t<i_t, f_t>& dx,
+                               dense_vector_t<i_t, f_t>& dy,
+                               dense_vector_t<i_t, f_t>& dv,
+                               dense_vector_t<i_t, f_t>& dz,
+                               f_t& max_residual);
+
+  lp_status_t check_for_suboptimal_solution(const barrier_solver_settings_t<i_t, f_t>& options,
+                                            iteration_data_t<i_t, f_t>& data,
+                                            i_t iter,
+                                            f_t norm_b,
+                                            f_t norm_c,
+                                            f_t& primal_objective,
+                                            f_t& relative_primal_residual,
+                                            f_t& relative_dual_residual,
+                                            f_t& relative_complementarity_residual,
+                                            lp_solution_t<i_t, f_t>& solution);
+
+  const lp_problem_t<i_t, f_t>& lp;
+  const simplex_solver_settings_t<i_t, f_t>& settings;
+  const presolve_info_t<i_t, f_t>& presolve_info;
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/dense_vector.hpp b/cpp/src/dual_simplex/dense_vector.hpp
new file mode 100644
index 000000000..b8e5cf114
--- /dev/null
+++ b/cpp/src/dual_simplex/dense_vector.hpp
@@ -0,0 +1,181 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "dual_simplex/types.hpp"
+
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+class dense_vector_t : public std::vector<f_t> {
+ public:
+  dense_vector_t(i_t n) { this->resize(n, 0.0); }
+  dense_vector_t(const std::vector<f_t>& in)
+  {
+    this->resize(in.size());
+    for (i_t i = 0; i < in.size(); i++) {
+      (*this)[i] = in[i];
+    }
+  }
+
+  f_t minimum() const
+  {
+    const i_t n = this->size();
+    f_t min_x   = inf;
+    for (i_t i = 0; i < n; i++) {
+      min_x = std::min(min_x, (*this)[i]);
+    }
+    return min_x;
+  }
+
+  f_t maximum() const
+  {
+    const i_t n = this->size();
+    f_t max_x   = -inf;
+    for (i_t i = 0; i < n; i++) {
+      max_x = std::max(max_x, (*this)[i]);
+    }
+    return max_x;
+  }
+
+  // b <- sqrt(a)
+  void sqrt(dense_vector_t<i_t, f_t>& b) const
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      b[i] = std::sqrt((*this)[i]);
+    }
+  }
+  // a <- a + alpha
+  void add_scalar(f_t alpha)
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      (*this)[i] += alpha;
+    }
+  }
+  // a <- alpha * e, e = (1, 1, ..., 1)
+  void set_scalar(f_t alpha)
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      (*this)[i] = alpha;
+    }
+  }
+  // a <- alpha * a
+  void multiply_scalar(f_t alpha)
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      (*this)[i] *= alpha;
+    }
+  }
+  f_t sum() const
+  {
+    f_t sum     = 0.0;
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; ++i) {
+      sum += (*this)[i];
+    }
+    return sum;
+  }
+
+  f_t inner_product(dense_vector_t<i_t, f_t>& b) const
+  {
+    f_t dot     = 0.0;
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      dot += (*this)[i] * b[i];
+    }
+    return dot;
+  }
+
+  // c <- a .* b
+  void pairwise_product(const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& c) const
+  {
+    const i_t n = this->size();
+    if (b.size() != n) {
+      printf("Error: b.size() %d != n %d\n", b.size(), n);
+      exit(1);
+    }
+    if (c.size() != n) {
+      printf("Error: c.size() %d != n %d\n", c.size(), n);
+      exit(1);
+    }
+    for (i_t i = 0; i < n; i++) {
+      c[i] = (*this)[i] * b[i];
+    }
+  }
+  // c <- a ./ b
+  void pairwise_divide(const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& c) const
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      c[i] = (*this)[i] / b[i];
+    }
+  }
+
+  // c <- a - b
+  void pairwise_subtract(const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& c) const
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      c[i] = (*this)[i] - b[i];
+    }
+  }
+
+  // y <- alpha * x + beta * y
+  void axpy(f_t alpha, const dense_vector_t<i_t, f_t>& x, f_t beta)
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      (*this)[i] = alpha * x[i] + beta * (*this)[i];
+    }
+  }
+
+  void ensure_positive(f_t epsilon_adjust)
+  {
+    const f_t mix_x = minimum();
+    if (mix_x <= 0.0) {
+      const f_t delta_x = -mix_x + epsilon_adjust;
+      add_scalar(delta_x);
+    }
+  }
+
+  void bound_away_from_zero(f_t epsilon_adjust)
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      if ((*this)[i] < epsilon_adjust) { (*this)[i] = epsilon_adjust; }
+    }
+  }
+
+  // b <- 1.0 /a
+  void inverse(dense_vector_t<i_t, f_t>& b) const
+  {
+    const i_t n = this->size();
+    for (i_t i = 0; i < n; i++) {
+      b[i] = 1.0 / (*this)[i];
+    }
+  }
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index 457da9113..bb113e4ba 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -18,6 +18,7 @@
 #include <dual_simplex/presolve.hpp>
 
 #include <dual_simplex/right_looking_lu.hpp>
+#include <dual_simplex/tic_toc.hpp>
 
 #include <cmath>
 
@@ -219,9 +220,10 @@ template <typename i_t, typename f_t>
 i_t remove_rows(lp_problem_t<i_t, f_t>& problem,
                 const std::vector<char>& row_sense,
                 csr_matrix_t<i_t, f_t>& Arow,
-                std::vector<i_t>& row_marker)
+                std::vector<i_t>& row_marker,
+                bool error_on_nonzero_rhs)
 {
-  constexpr bool verbose = false;
+  constexpr bool verbose = true;
   if (verbose) { printf("Removing rows %d %ld\n", Arow.m, row_marker.size()); }
   csr_matrix_t<i_t, f_t> Aout;
   Arow.remove_rows(row_marker, Aout);
@@ -236,7 +238,7 @@ i_t remove_rows(lp_problem_t<i_t, f_t>& problem,
       new_rhs[row_count]       = problem.rhs[i];
       row_count++;
     } else {
-      if (problem.rhs[i] != 0.0) {
+      if (error_on_nonzero_rhs && problem.rhs[i] != 0.0) {
         if (verbose) {
           printf(
             "Error nonzero rhs %e for zero row %d sense %c\n", problem.rhs[i], i, row_sense[i]);
@@ -273,7 +275,7 @@ i_t remove_empty_rows(lp_problem_t<i_t, f_t>& problem,
       row_marker[i] = 0;
     }
   }
-  const i_t retval = remove_rows(problem, row_sense, Arow, row_marker);
+  const i_t retval = remove_rows(problem, row_sense, Arow, row_marker, true);
   return retval;
 }
 
@@ -522,14 +524,11 @@ i_t find_dependent_rows(lp_problem_t<i_t, f_t>& problem,
   csc_matrix_t<i_t, f_t> U(m, m, nz);
   std::vector<i_t> pinv(n);
   std::vector<i_t> q(m);
-  for (i_t i = 0; i < m; ++i) {
-    q[i] = i;
-  }
-  std::optional<std::vector<i_t>> optional_q = q;
-  // TODO: Replace with right looking LU in crossover PR
-  // i_t pivots = left_looking_lu(C, settings, 1e-13, optional_q, L, U, pinv);
-  i_t pivots = 0;
+
+  i_t pivots = right_looking_lu_row_permutation_only(C, settings, 1e-13, tic(), q, pinv);
+
   if (pivots < m) {
+    settings.log.printf("Found %d dependent rows\n", m - pivots);
     const i_t num_dependent = m - pivots;
     std::vector<f_t> independent_rhs(pivots);
     std::vector<f_t> dependent_rhs(num_dependent);
@@ -537,7 +536,7 @@ i_t find_dependent_rows(lp_problem_t<i_t, f_t>& problem,
     i_t ind_count = 0;
     i_t dep_count = 0;
     for (i_t i = 0; i < m; ++i) {
-      i_t row = (*optional_q)[i];
+      i_t row = q[i];
       if (i < pivots) {
         dependent_rows[row]          = 0;
         independent_rhs[ind_count++] = problem.rhs[row];
@@ -548,6 +547,7 @@ i_t find_dependent_rows(lp_problem_t<i_t, f_t>& problem,
       }
     }
 
+#if 0
     std::vector<f_t> z = independent_rhs;
     // Solve U1^T z = independent_rhs
     for (i_t k = 0; k < pivots; ++k) {
@@ -579,6 +579,9 @@ i_t find_dependent_rows(lp_problem_t<i_t, f_t>& problem,
         problem.rhs[dependent_row_list[k]] = 0.0;
       }
     }
+#endif
+  } else {
+    settings.log.printf("No dependent rows found\n");
   }
   return pivots;
 }
@@ -694,27 +697,12 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
     bound_strengthening(row_sense, settings, problem);
   }
 
-  // The original problem may have a variable without a lower bound
-  // but a finite upper bound
-  // -inf < x_j <= u_j
-  i_t no_lower_bound = 0;
-  for (i_t j = 0; j < problem.num_cols; j++) {
-    if (problem.lower[j] == -INFINITY && problem.upper[j] < INFINITY) { no_lower_bound++; }
-  }
-
-  // The original problem may have nonzero lower bounds
-  // 0 != l_j <= x_j <= u_j
-  i_t nonzero_lower_bounds = 0;
-  for (i_t j = 0; j < problem.num_cols; j++) {
-    if (problem.lower[j] != 0.0 && problem.lower[j] > -INFINITY) { nonzero_lower_bounds++; }
-  }
-
   if (less_rows > 0) {
     convert_less_than_to_equal(user_problem, row_sense, problem, less_rows, new_slacks);
   }
 
   // Add artifical variables
-  add_artifical_variables(problem, equality_rows, new_slacks);
+  if (!settings.barrier_presolve) { add_artifical_variables(problem, equality_rows, new_slacks); }
 }
 
 template <typename i_t, typename f_t>
@@ -726,11 +714,115 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original,
   problem = original;
   std::vector<char> row_sense(problem.num_rows, '=');
 
+  // The original problem may have a variable without a lower bound
+  // but a finite upper bound
+  // -inf < x_j <= u_j
+  i_t no_lower_bound = 0;
+  for (i_t j = 0; j < problem.num_cols; j++) {
+    if (problem.lower[j] == -inf && problem.upper[j] < inf) { no_lower_bound++; }
+  }
+  settings.log.printf("%d variables with no lower bound\n", no_lower_bound);
+
+  // The original problem may have nonzero lower bounds
+  // 0 != l_j <= x_j <= u_j
+  i_t nonzero_lower_bounds = 0;
+  for (i_t j = 0; j < problem.num_cols; j++) {
+    if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; }
+  }
+  if (settings.barrier_presolve && nonzero_lower_bounds > 0) {
+    settings.log.printf("Transforming %ld nonzero lower bound\n", nonzero_lower_bounds);
+    presolve_info.removed_lower_bounds.resize(problem.num_cols);
+    // We can construct a new variable: x'_j = x_j - l_j or x_j = x'_j + l_j
+    // than we have 0 <= x'_j <= u_j - l_j
+    // Constraints in the form:
+    //  sum_{k != j} a_ik x_k + a_ij * x_j {=, <=} beta_i
+    //  become
+    //  sum_{k != j} a_ik x_k + a_ij * (x'_j + l_j) {=, <=} beta_i
+    //  or
+    //  sum_{k != j} a_ik x_k + a_ij * x'_j {=, <=} beta_i - a_{ij} l_j
+    //
+    // the cost function
+    // sum_{k != j} c_k x_k + c_j * x_j
+    // becomes
+    // sum_{k != j} c_k x_k + c_j (x'_j + l_j)
+    //
+    // so we get the constant term c_j * l_j
+    for (i_t j = 0; j < problem.num_cols; j++) {
+      if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) {
+        for (i_t p = problem.A.col_start[j]; p < problem.A.col_start[j + 1]; p++) {
+          i_t i   = problem.A.i[p];
+          f_t aij = problem.A.x[p];
+          problem.rhs[i] -= aij * problem.lower[j];
+        }
+        problem.obj_constant += problem.objective[j] * problem.lower[j];
+        problem.upper[j] -= problem.lower[j];
+        presolve_info.removed_lower_bounds[j] = problem.lower[j];
+        problem.lower[j]                      = 0.0;
+      }
+    }
+  }
+
+  // Check for free variables
   i_t free_variables = 0;
   for (i_t j = 0; j < problem.num_cols; j++) {
-    if (problem.lower[j] == -INFINITY && problem.upper[j] == INFINITY) { free_variables++; }
+    if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; }
+  }
+  if (free_variables > 0) {
+    settings.log.printf("%d free variables\n", free_variables);
+
+    // We have a variable x_j: with -inf < x_j < inf
+    // we create new variables v and w with 0 <= v, w and x_j = v - w
+    // Constraints
+    // sum_{k != j} a_ik x_k + a_ij x_j {=, <=} beta
+    // become
+    // sum_{k != j} a_ik x_k + aij v - a_ij w {=, <=} beta
+    //
+    // The cost function
+    // sum_{k != j} c_k x_k + c_j x_j
+    // becomes
+    // sum_{k != j} c_k x_k + c_j v - c_j w
+
+    i_t num_cols = problem.num_cols + free_variables;
+    i_t nnz      = problem.A.col_start[problem.num_cols];
+    for (i_t j = 0; j < problem.num_cols; j++) {
+      if (problem.lower[j] == -inf && problem.upper[j] == inf) {
+        nnz += (problem.A.col_start[j + 1] - problem.A.col_start[j]);
+      }
+    }
+
+    problem.A.col_start.resize(num_cols + 1);
+    problem.A.i.resize(nnz);
+    problem.A.x.resize(nnz);
+    problem.lower.resize(num_cols);
+    problem.upper.resize(num_cols);
+    problem.objective.resize(num_cols);
+
+    presolve_info.free_variable_pairs.resize(free_variables * 2);
+    i_t pair_count = 0;
+    i_t q          = problem.A.col_start[problem.num_cols];
+    i_t col        = problem.num_cols;
+    for (i_t j = 0; j < problem.num_cols; j++) {
+      if (problem.lower[j] == -inf && problem.upper[j] == inf) {
+        for (i_t p = problem.A.col_start[j]; p < problem.A.col_start[j + 1]; p++) {
+          i_t i          = problem.A.i[p];
+          f_t aij        = problem.A.x[p];
+          problem.A.i[q] = i;
+          problem.A.x[q] = -aij;
+          q++;
+        }
+        problem.lower[col]                              = 0.0;
+        problem.upper[col]                              = inf;
+        problem.objective[col]                          = -problem.objective[j];
+        presolve_info.free_variable_pairs[pair_count++] = j;
+        presolve_info.free_variable_pairs[pair_count++] = col;
+        problem.A.col_start[++col]                      = q;
+        problem.lower[j]                                = 0.0;
+      }
+    }
+    assert(problem.A.col_start[num_cols] == nnz);
+    problem.A.n      = num_cols;
+    problem.num_cols = num_cols;
   }
-  if (free_variables > 0) { settings.log.printf("%d free variables\n", free_variables); }
 
   // Check for empty rows
   i_t num_empty_rows = 0;
@@ -760,11 +852,12 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original,
   }
 
   // Check for dependent rows
-  constexpr bool check_dependent_rows = false;
+  bool check_dependent_rows = false;  // settings.barrier;
   if (check_dependent_rows) {
     std::vector<i_t> dependent_rows;
     constexpr i_t kOk = -1;
     i_t infeasible;
+    f_t dependent_row_start    = tic();
     const i_t independent_rows = find_dependent_rows(problem, settings, dependent_rows, infeasible);
     if (infeasible != kOk) {
       settings.log.printf("Found problem infeasible in presolve\n");
@@ -775,8 +868,9 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original,
       settings.log.printf("%d dependent rows\n", num_dependent_rows);
       csr_matrix_t<i_t, f_t> Arow;
       problem.A.to_compressed_row(Arow);
-      remove_rows(problem, row_sense, Arow, dependent_rows);
+      remove_rows(problem, row_sense, Arow, dependent_rows, false);
     }
+    settings.log.printf("Dependent row check in %.2fs\n", toc(dependent_row_start));
   }
   assert(problem.num_rows == problem.A.m);
   assert(problem.num_cols == problem.A.n);
@@ -1012,25 +1106,50 @@ void uncrush_solution(const presolve_info_t<i_t, f_t>& presolve_info,
   if (presolve_info.removed_variables.size() == 0) {
     uncrushed_x = crushed_x;
     uncrushed_z = crushed_z;
-    return;
-  }
+  } else {
+    printf("Presolve info removed variables %d\n", presolve_info.removed_variables.size());
+    // We removed some variables, so we need to map the crushed solution back to the original
+    // variables
+    const i_t n = presolve_info.removed_variables.size() + presolve_info.remaining_variables.size();
+    uncrushed_x.resize(n);
+    uncrushed_z.resize(n);
+
+    i_t k = 0;
+    for (const i_t j : presolve_info.remaining_variables) {
+      uncrushed_x[j] = crushed_x[k];
+      uncrushed_z[j] = crushed_z[k];
+      k++;
+    }
 
-  const i_t n = presolve_info.removed_variables.size() + presolve_info.remaining_variables.size();
-  uncrushed_x.resize(n);
-  uncrushed_z.resize(n);
+    k = 0;
+    for (const i_t j : presolve_info.removed_variables) {
+      uncrushed_x[j] = presolve_info.removed_values[k];
+      uncrushed_z[j] = presolve_info.removed_reduced_costs[k];
+      k++;
+    }
+  }
 
-  i_t k = 0;
-  for (const i_t j : presolve_info.remaining_variables) {
-    uncrushed_x[j] = crushed_x[k];
-    uncrushed_z[j] = crushed_z[k];
-    k++;
+  const i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2;
+  if (num_free_variables > 0) {
+    printf("Presolve info free variables %d\n", num_free_variables);
+    // We added free variables so we need to map the crushed solution back to the original variables
+    for (i_t k = 0; k < 2 * num_free_variables; k += 2) {
+      const i_t u = presolve_info.free_variable_pairs[k];
+      const i_t v = presolve_info.free_variable_pairs[k + 1];
+      uncrushed_x[u] -= uncrushed_x[v];
+    }
+    const i_t n = uncrushed_x.size();
+    uncrushed_x.resize(n - num_free_variables);
+    uncrushed_z.resize(n - num_free_variables);
   }
 
-  k = 0;
-  for (const i_t j : presolve_info.removed_variables) {
-    uncrushed_x[j] = presolve_info.removed_values[k];
-    uncrushed_z[j] = presolve_info.removed_reduced_costs[k];
-    k++;
+  if (presolve_info.removed_lower_bounds.size() > 0) {
+    printf("Presolve info removed lower bounds %d\n", presolve_info.removed_lower_bounds.size());
+    // We removed some lower bounds so we need to map the crushed solution back to the original
+    // variables
+    for (i_t j = 0; j < uncrushed_x.size(); j++) {
+      uncrushed_x[j] += presolve_info.removed_lower_bounds[j];
+    }
   }
 }
 
diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp
index 7a307e6f7..b528578de 100644
--- a/cpp/src/dual_simplex/presolve.hpp
+++ b/cpp/src/dual_simplex/presolve.hpp
@@ -59,6 +59,10 @@ struct presolve_info_t {
   std::vector<f_t> removed_values;
   // values of the removed reduced costs
   std::vector<f_t> removed_reduced_costs;
+  // Free variable pairs
+  std::vector<i_t> free_variable_pairs;
+  // Removed lower bounds
+  std::vector<f_t> removed_lower_bounds;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index a51ed19bc..c14d09ab1 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -56,6 +56,10 @@ struct simplex_solver_settings_t {
       use_left_looking_lu(false),
       eliminate_singletons(true),
       print_presolve_stats(true),
+      barrier_presolve(false),
+      use_cudss(false),
+      barrier(false),
+      crossover(false),
       refactor_frequency(100),
       iteration_log_frequency(1000),
       first_iteration_log(2),
@@ -99,8 +103,12 @@ struct simplex_solver_settings_t {
   bool relaxation;                 // true to only solve the LP relaxation of a MIP
   bool
     use_left_looking_lu;  // true to use left looking LU factorization, false to use right looking
-  bool eliminate_singletons;    // true to eliminate singletons from the basis
-  bool print_presolve_stats;    // true to print presolve stats
+  bool eliminate_singletons;  // true to eliminate singletons from the basis
+  bool print_presolve_stats;  // true to print presolve stats
+  bool barrier_presolve;      // true to use barrier presolve
+  bool use_cudss;  // true to use cuDSS for sparse Cholesky factorization, false to use cholmod
+  bool barrier;    // true to use barrier method, false to use dual simplex method
+  bool crossover;  // true to do crossover, false to not
   i_t refactor_frequency;       // number of basis updates before refactorization
   i_t iteration_log_frequency;  // number of iterations between log updates
   i_t first_iteration_log;      // number of iterations to log at beginning of solve
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
index e665bae97..e3a43307c 100644
--- a/cpp/src/dual_simplex/solve.cpp
+++ b/cpp/src/dual_simplex/solve.cpp
@@ -17,7 +17,9 @@
 
 #include <dual_simplex/solve.hpp>
 
+#include <dual_simplex/barrier.hpp>
 #include <dual_simplex/branch_and_bound.hpp>
+#include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase1.hpp>
 #include <dual_simplex/phase2.hpp>
@@ -236,6 +238,177 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t<i_t, f_t>& original
   return lp_status;
 }
 
+template <typename i_t, typename f_t>
+lp_status_t solve_linear_program_with_barrier(const user_problem_t<i_t, f_t>& user_problem,
+                                              const simplex_solver_settings_t<i_t, f_t>& settings,
+                                              lp_solution_t<i_t, f_t>& solution)
+{
+  f_t start_time = tic();
+  lp_problem_t<i_t, f_t> original_lp(1, 1, 1);
+
+  // Convert the user problem to a linear program with only equality constraints
+  std::vector<i_t> new_slacks;
+  simplex_solver_settings_t<i_t, f_t> barrier_settings = settings;
+  barrier_settings.barrier_presolve                    = true;
+  convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks);
+  lp_solution_t<i_t, f_t> lp_solution(original_lp.num_rows, original_lp.num_cols);
+
+  // Presolve the linear program
+  presolve_info_t<i_t, f_t> presolve_info;
+  lp_problem_t<i_t, f_t> presolved_lp(1, 1, 1);
+  const i_t ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info);
+  if (ok == -1) { return lp_status_t::INFEASIBLE; }
+
+  // Apply columns scaling to the presolve LP
+  lp_problem_t<i_t, f_t> barrier_lp(
+    presolved_lp.num_rows, presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]);
+  std::vector<f_t> column_scales;
+  column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales);
+
+  // Solve using barrier
+  lp_solution_t<i_t, f_t> barrier_solution(barrier_lp.num_rows, barrier_lp.num_cols);
+  barrier_solver_t<i_t, f_t> barrier_solver(barrier_lp, presolve_info, barrier_settings);
+  barrier_solver_settings_t<i_t, f_t> barrier_solver_settings;
+  lp_status_t barrier_status = barrier_solver.solve(barrier_solver_settings, barrier_solution);
+  if (barrier_status == lp_status_t::OPTIMAL) {
+    std::vector<f_t> scaled_residual = barrier_lp.rhs;
+    matrix_vector_multiply(barrier_lp.A, 1.0, barrier_solution.x, -1.0, scaled_residual);
+    f_t scaled_primal_residual = vector_norm_inf<i_t, f_t>(scaled_residual);
+    settings.log.printf("Scaled Primal residual: %e\n", scaled_primal_residual);
+
+    std::vector<f_t> scaled_dual_residual = barrier_solution.z;
+    for (i_t j = 0; j < scaled_dual_residual.size(); ++j) {
+      scaled_dual_residual[j] -= barrier_lp.objective[j];
+    }
+    matrix_transpose_vector_multiply(
+      barrier_lp.A, 1.0, barrier_solution.y, 1.0, scaled_dual_residual);
+    f_t scaled_dual_residual_norm = vector_norm_inf<i_t, f_t>(scaled_dual_residual);
+    settings.log.printf("Scaled Dual residual: %e\n", scaled_dual_residual_norm);
+
+    // Unscale the solution
+    std::vector<f_t> unscaled_x(barrier_lp.num_cols);
+    std::vector<f_t> unscaled_z(barrier_lp.num_cols);
+    unscale_solution<i_t, f_t>(
+      column_scales, barrier_solution.x, barrier_solution.z, unscaled_x, unscaled_z);
+
+    std::vector<f_t> residual = presolved_lp.rhs;
+    matrix_vector_multiply(presolved_lp.A, 1.0, unscaled_x, -1.0, residual);
+    f_t primal_residual = vector_norm_inf<i_t, f_t>(residual);
+    settings.log.printf("Unscaled Primal residual: %e\n", primal_residual);
+
+    std::vector<f_t> unscaled_dual_residual = unscaled_z;
+    for (i_t j = 0; j < unscaled_dual_residual.size(); ++j) {
+      unscaled_dual_residual[j] -= presolved_lp.objective[j];
+    }
+    matrix_transpose_vector_multiply(
+      presolved_lp.A, 1.0, barrier_solution.y, 1.0, unscaled_dual_residual);
+    f_t unscaled_dual_residual_norm = vector_norm_inf<i_t, f_t>(unscaled_dual_residual);
+    settings.log.printf("Unscaled Dual residual: %e\n", unscaled_dual_residual_norm);
+
+    // Undo presolve
+    lp_solution.y = barrier_solution.y;
+    uncrush_solution(presolve_info, unscaled_x, unscaled_z, lp_solution.x, lp_solution.z);
+
+    std::vector<f_t> post_solve_residual = original_lp.rhs;
+    matrix_vector_multiply(original_lp.A, 1.0, lp_solution.x, -1.0, post_solve_residual);
+    f_t post_solve_primal_residual = vector_norm_inf<i_t, f_t>(post_solve_residual);
+    settings.log.printf("Post-solve Primal residual: %e\n", post_solve_primal_residual);
+
+    std::vector<f_t> post_solve_dual_residual = lp_solution.z;
+    for (i_t j = 0; j < post_solve_dual_residual.size(); ++j) {
+      post_solve_dual_residual[j] -= original_lp.objective[j];
+    }
+    matrix_transpose_vector_multiply(
+      original_lp.A, 1.0, lp_solution.y, 1.0, post_solve_dual_residual);
+    f_t post_solve_dual_residual_norm = vector_norm_inf<i_t, f_t>(post_solve_dual_residual);
+    settings.log.printf("Post-solve Dual residual: %e\n", post_solve_dual_residual_norm);
+
+    uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x);
+    uncrush_dual_solution(
+      user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z);
+    solution.objective          = barrier_solution.objective;
+    solution.user_objective     = barrier_solution.user_objective;
+    solution.l2_primal_residual = barrier_solution.l2_primal_residual;
+    solution.l2_dual_residual   = barrier_solution.l2_dual_residual;
+    solution.iterations         = barrier_solution.iterations;
+  }
+
+  // If we aren't doing crossover, we're done
+  if (!settings.crossover) { return barrier_status; }
+
+  if (settings.crossover && barrier_status == lp_status_t::OPTIMAL) {
+    // Check to see if we need to add artifical variables
+    csc_matrix_t<i_t, f_t> Atranspose(1, 1, 1);
+    original_lp.A.transpose(Atranspose);
+    std::vector<i_t> artificial_variables;
+    artificial_variables.reserve(original_lp.num_rows);
+    for (i_t i = 0; i < original_lp.num_rows; ++i) {
+      const i_t row_start = Atranspose.col_start[i];
+      const i_t row_end   = Atranspose.col_start[i + 1];
+      bool found_slack    = false;
+      for (i_t p = row_start; p < row_end; ++p) {
+        const i_t j      = Atranspose.i[p];
+        const i_t nz_col = original_lp.A.col_start[j + 1] - original_lp.A.col_start[j];
+        if (nz_col == 1) {
+          found_slack = true;
+          break;
+        }
+      }
+      if (!found_slack) {
+        // settings.log.printf("No slack found for row %d\n", i);
+        artificial_variables.push_back(i);
+      }
+    }
+    if (artificial_variables.size() > 0) {
+      settings.log.printf("Adding %ld artificial variables\n", artificial_variables.size());
+      const i_t additional_vars = artificial_variables.size();
+      const i_t new_cols        = original_lp.num_cols + additional_vars;
+      i_t col                   = original_lp.num_cols;
+      i_t nz                    = original_lp.A.col_start[col];
+      const i_t new_nz          = nz + additional_vars;
+      original_lp.A.col_start.resize(new_cols + 1);
+      original_lp.A.x.resize(new_nz);
+      original_lp.A.i.resize(new_nz);
+      original_lp.objective.resize(new_cols);
+      original_lp.lower.resize(new_cols);
+      original_lp.upper.resize(new_cols);
+      lp_solution.x.resize(new_cols);
+      lp_solution.z.resize(new_cols);
+      for (i_t i : artificial_variables) {
+        original_lp.A.x[nz]        = 1.0;
+        original_lp.A.i[nz]        = i;
+        original_lp.objective[col] = 0.0;
+        original_lp.lower[col]     = 0.0;
+        original_lp.upper[col]     = 0.0;
+        lp_solution.x[col]         = 0.0;
+        lp_solution.z[col]         = 0.0;
+        nz++;
+        col++;
+        original_lp.A.col_start[col] = nz;
+      }
+      original_lp.A.n      = new_cols;
+      original_lp.num_cols = new_cols;
+      printf("nz %d =? new_nz %d =? Acol %d, num_cols %d =? new_cols %d x size %ld z size %ld\n",
+             nz,
+             new_nz,
+             original_lp.A.col_start[original_lp.num_cols],
+             original_lp.num_cols,
+             new_cols,
+             lp_solution.x.size(),
+             lp_solution.z.size());
+    }
+
+    // Run crossover
+    lp_solution_t<i_t, f_t> crossover_solution(original_lp.num_rows, original_lp.num_cols);
+    std::vector<variable_status_t> vstatus(original_lp.num_cols);
+    crossover_status_t crossover_status = crossover(
+      original_lp, barrier_settings, lp_solution, start_time, crossover_solution, vstatus);
+    settings.log.printf("Crossover status: %d\n", crossover_status);
+    if (crossover_status == crossover_status_t::OPTIMAL) { barrier_status = lp_status_t::OPTIMAL; }
+  }
+  return barrier_status;
+}
+
 template <typename i_t, typename f_t>
 lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
                                  const simplex_solver_settings_t<i_t, f_t>& settings,
@@ -342,6 +515,11 @@ template lp_status_t solve_linear_program_advanced(
   std::vector<variable_status_t>& vstatus,
   std::vector<double>& edge_norms);
 
+template lp_status_t solve_linear_program_with_barrier(
+  const user_problem_t<int, double>& user_problem,
+  const simplex_solver_settings_t<int, double>& settings,
+  lp_solution_t<int, double>& solution);
+
 template lp_status_t solve_linear_program(const user_problem_t<int, double>& user_problem,
                                           const simplex_solver_settings_t<int, double>& settings,
                                           lp_solution_t<int, double>& solution);
diff --git a/cpp/src/dual_simplex/solve.hpp b/cpp/src/dual_simplex/solve.hpp
index f20844d6e..a54acf697 100644
--- a/cpp/src/dual_simplex/solve.hpp
+++ b/cpp/src/dual_simplex/solve.hpp
@@ -56,6 +56,11 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t<i_t, f_t>& original
                                           std::vector<variable_status_t>& vstatus,
                                           std::vector<f_t>& edge_norms);
 
+template <typename i_t, typename f_t>
+lp_status_t solve_linear_program_with_barrier(const user_problem_t<i_t, f_t>& user_problem,
+                                              const simplex_solver_settings_t<i_t, f_t>& settings,
+                                              lp_solution_t<i_t, f_t>& solution);
+
 template <typename i_t, typename f_t>
 lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
                                  const simplex_solver_settings_t<i_t, f_t>& settings,
diff --git a/cpp/src/dual_simplex/sparse_cholesky.hpp b/cpp/src/dual_simplex/sparse_cholesky.hpp
new file mode 100644
index 000000000..4b777584f
--- /dev/null
+++ b/cpp/src/dual_simplex/sparse_cholesky.hpp
@@ -0,0 +1,372 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "dual_simplex/dense_vector.hpp"
+#include "dual_simplex/simplex_solver_settings.hpp"
+#include "dual_simplex/sparse_matrix.hpp"
+#include "dual_simplex/tic_toc.hpp"
+
+#include <cuda_runtime.h>
+
+#include "cudss.h"
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+class sparse_cholesky_base_t {
+ public:
+  virtual ~sparse_cholesky_base_t()                                                 = default;
+  virtual i_t analyze(const csc_matrix_t<i_t, f_t>& A_in)                           = 0;
+  virtual i_t factorize(const csc_matrix_t<i_t, f_t>& A_in)                         = 0;
+  virtual i_t solve(const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& x) = 0;
+  virtual void set_positive_definite(bool positive_definite)                        = 0;
+};
+
+#define CUDSS_EXAMPLE_FREE \
+  do {                     \
+  } while (0)
+
+#define CUDA_CALL_AND_CHECK(call, msg)                                                 \
+  do {                                                                                 \
+    cuda_error = call;                                                                 \
+    if (cuda_error != cudaSuccess) {                                                   \
+      printf("FAILED: CUDA API returned error = %d, details: " #msg "\n", cuda_error); \
+      CUDSS_EXAMPLE_FREE;                                                              \
+      return -1;                                                                       \
+    }                                                                                  \
+  } while (0);
+
+#define CUDA_CALL_AND_CHECK_EXIT(call, msg)                                            \
+  do {                                                                                 \
+    cuda_error = call;                                                                 \
+    if (cuda_error != cudaSuccess) {                                                   \
+      printf("FAILED: CUDA API returned error = %d, details: " #msg "\n", cuda_error); \
+      CUDSS_EXAMPLE_FREE;                                                              \
+      exit(-1);                                                                        \
+    }                                                                                  \
+  } while (0);
+
+#define CUDSS_CALL_AND_CHECK(call, status, msg)                                               \
+  do {                                                                                        \
+    status = call;                                                                            \
+    if (status != CUDSS_STATUS_SUCCESS) {                                                     \
+      printf("FAILED: CUDSS call ended unsuccessfully with status = %d, details: " #msg "\n", \
+             status);                                                                         \
+      CUDSS_EXAMPLE_FREE;                                                                     \
+      return -2;                                                                              \
+    }                                                                                         \
+  } while (0);
+
+#define CUDSS_CALL_AND_CHECK_EXIT(call, status, msg)                                          \
+  do {                                                                                        \
+    status = call;                                                                            \
+    if (status != CUDSS_STATUS_SUCCESS) {                                                     \
+      printf("FAILED: CUDSS call ended unsuccessfully with status = %d, details: " #msg "\n", \
+             status);                                                                         \
+      CUDSS_EXAMPLE_FREE;                                                                     \
+      exit(-2);                                                                               \
+    }                                                                                         \
+  } while (0);
+
+template <typename i_t, typename f_t>
+class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
+ public:
+  sparse_cholesky_cudss_t(const simplex_solver_settings_t<i_t, f_t>& settings, i_t size)
+    : n(size), nnz(-1), first_factor(true), positive_definite(true), settings_(settings)
+  {
+    int major, minor, patch;
+    cudssGetProperty(MAJOR_VERSION, &major);
+    cudssGetProperty(MINOR_VERSION, &minor);
+    cudssGetProperty(PATCH_LEVEL, &patch);
+    settings.log.printf("CUDSS Version %d.%d.%d\n", major, minor, patch);
+
+    cuda_error = cudaSuccess;
+    status     = CUDSS_STATUS_SUCCESS;
+    CUDA_CALL_AND_CHECK_EXIT(cudaStreamCreate(&stream), "cudaStreamCreate");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssCreate(&handle), status, "cudssCreate");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssConfigCreate(&solverConfig), status, "cudssConfigCreate");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssDataCreate(handle, &solverData), status, "cudssDataCreate");
+
+#ifdef USE_AMD
+    // Tell cuDSS to use AMD
+    cudssAlgType_t reorder_alg = CUDSS_ALG_3;
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssConfigSet(
+        solverConfig, CUDSS_CONFIG_REORDERING_ALG, &reorder_alg, sizeof(cudssAlgType_t)),
+      status,
+      "cudssConfigSet for reordering alg");
+#endif
+
+#if 0
+        int32_t ir_n_steps = 2;
+        CUDSS_CALL_AND_CHECK_EXIT(cudssConfigSet(solverConfig, CUDSS_CONFIG_IR_N_STEPS,
+                                          &ir_n_steps, sizeof(int32_t)), status, "cudssConfigSet for ir n steps");
+#endif
+
+    // Device pointers
+    csr_offset_d  = nullptr;
+    csr_columns_d = nullptr;
+    csr_values_d  = nullptr;
+    x_values_d    = nullptr;
+    b_values_d    = nullptr;
+    CUDA_CALL_AND_CHECK_EXIT(cudaMalloc(&x_values_d, n * sizeof(f_t)), "cudaMalloc for x_values");
+    CUDA_CALL_AND_CHECK_EXIT(cudaMalloc(&b_values_d, n * sizeof(f_t)), "cudaMalloc for b_values");
+
+    i_t ldb = n;
+    i_t ldx = n;
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixCreateDn(&cudss_b, n, 1, ldb, b_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR),
+      status,
+      "cudssMatrixCreateDn for b");
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixCreateDn(&cudss_x, n, 1, ldx, x_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR),
+      status,
+      "cudssMatrixCreateDn for x");
+  }
+  ~sparse_cholesky_cudss_t() override
+  {
+    cudaFree(csr_values_d);
+    cudaFree(csr_columns_d);
+    cudaFree(csr_offset_d);
+
+    cudaFree(x_values_d);
+    cudaFree(b_values_d);
+
+    CUDSS_CALL_AND_CHECK_EXIT(cudssMatrixDestroy(A), status, "cudssMatrixDestroy for A");
+
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixDestroy(cudss_x), status, "cudssMatrixDestroy for cudss_x");
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixDestroy(cudss_b), status, "cudssMatrixDestroy for cudss_b");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssDataDestroy(handle, solverData), status, "cudssDataDestroy");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
+    CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
+    CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
+    CUDA_CALL_AND_CHECK_EXIT(cudaStreamDestroy(stream), "cudaStreamDestroy");
+  }
+  i_t analyze(const csc_matrix_t<i_t, f_t>& A_in) override
+  {
+    csr_matrix_t<i_t, f_t> Arow;
+#ifdef WRITE_MATRIX_MARKET
+    FILE* fid = fopen("A.mtx", "w");
+    A_in.write_matrix_market(fid);
+    fclose(fid);
+#endif
+    A_in.to_compressed_row(Arow);
+
+    nnz = A_in.col_start[A_in.n];
+
+    CUDA_CALL_AND_CHECK(cudaMalloc(&csr_offset_d, (n + 1) * sizeof(i_t)),
+                        "cudaMalloc for csr_offset");
+    CUDA_CALL_AND_CHECK(cudaMalloc(&csr_columns_d, nnz * sizeof(i_t)),
+                        "cudaMalloc for csr_columns");
+    CUDA_CALL_AND_CHECK(cudaMalloc(&csr_values_d, nnz * sizeof(f_t)), "cudaMalloc for csr_values");
+
+    CUDA_CALL_AND_CHECK(
+      cudaMemcpy(
+        csr_offset_d, Arow.row_start.data(), (n + 1) * sizeof(i_t), cudaMemcpyHostToDevice),
+      "cudaMemcpy for csr_offset");
+    CUDA_CALL_AND_CHECK(
+      cudaMemcpy(csr_columns_d, Arow.j.data(), nnz * sizeof(i_t), cudaMemcpyHostToDevice),
+      "cudaMemcpy for csr_columns");
+    CUDA_CALL_AND_CHECK(
+      cudaMemcpy(csr_values_d, Arow.x.data(), nnz * sizeof(f_t), cudaMemcpyHostToDevice),
+      "cudaMemcpy for csr_values");
+
+    if (!first_factor) {
+      CUDSS_CALL_AND_CHECK(cudssMatrixDestroy(A), status, "cudssMatrixDestroy for A");
+    }
+
+    CUDSS_CALL_AND_CHECK(
+      cudssMatrixCreateCsr(&A,
+                           n,
+                           n,
+                           nnz,
+                           csr_offset_d,
+                           nullptr,
+                           csr_columns_d,
+                           csr_values_d,
+                           CUDA_R_32I,
+                           CUDA_R_64F,
+                           positive_definite ? CUDSS_MTYPE_SPD : CUDSS_MTYPE_SYMMETRIC,
+                           CUDSS_MVIEW_FULL,
+                           CUDSS_BASE_ZERO),
+      status,
+      "cudssMatrixCreateCsr");
+
+    // Perform symbolic analysis
+    f_t start_analysis = tic();
+
+    CUDSS_CALL_AND_CHECK(
+      cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b),
+      status,
+      "cudssExecute for reordering");
+
+    f_t reorder_time = toc(start_analysis);
+    settings_.log.printf("Reordering time %.2fs\n", reorder_time);
+
+    f_t start_symbolic = tic();
+
+    CUDSS_CALL_AND_CHECK(
+      cudssExecute(
+        handle, CUDSS_PHASE_SYMBOLIC_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b),
+      status,
+      "cudssExecute for symbolic factorization");
+
+    f_t symbolic_time = toc(start_symbolic);
+    f_t analysis_time = toc(start_analysis);
+    settings_.log.printf("Symbolic factorization time %.2fs\n", symbolic_time);
+    settings_.log.printf("Symbolic time %.2fs\n", analysis_time);
+    int64_t lu_nz       = 0;
+    size_t size_written = 0;
+    CUDSS_CALL_AND_CHECK(
+      cudssDataGet(handle, solverData, CUDSS_DATA_LU_NNZ, &lu_nz, sizeof(int64_t), &size_written),
+      status,
+      "cudssDataGet for LU_NNZ");
+    settings_.log.printf("Symbolic nonzeros in factor %e\n", static_cast<f_t>(lu_nz) / 2.0);
+    // TODO: Is there any way to get nonzeros in the factors?
+    // TODO: Is there any way to get flops for the factorization?
+
+    return 0;
+  }
+  i_t factorize(const csc_matrix_t<i_t, f_t>& A_in) override
+  {
+    csr_matrix_t<i_t, f_t> Arow;
+    A_in.to_compressed_row(Arow);
+
+    if (nnz != A_in.col_start[A_in.n]) {
+      settings_.log.printf(
+        "Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, A_in.col_start[A_in.n]);
+      exit(1);
+    }
+
+    CUDA_CALL_AND_CHECK(
+      cudaMemcpy(csr_values_d, Arow.x.data(), nnz * sizeof(f_t), cudaMemcpyHostToDevice),
+      "cudaMemcpy for csr_values");
+
+    CUDSS_CALL_AND_CHECK(
+      cudssMatrixSetValues(A, csr_values_d), status, "cudssMatrixSetValues for A");
+
+    f_t start_numeric = tic();
+    CUDSS_CALL_AND_CHECK(
+      cudssExecute(
+        handle, CUDSS_PHASE_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b),
+      status,
+      "cudssExecute for factorization");
+
+    f_t numeric_time = toc(start_numeric);
+
+    int info;
+    size_t sizeWritten = 0;
+    CUDSS_CALL_AND_CHECK(
+      cudssDataGet(handle, solverData, CUDSS_DATA_INFO, &info, sizeof(info), &sizeWritten),
+      status,
+      "cudssDataGet for info");
+    if (info != 0) {
+      settings_.log.printf("Factorization failed info %d\n", info);
+      return -1;
+    }
+
+    if (first_factor) {
+      settings_.log.printf("Factor time %.2fs\n", numeric_time);
+      first_factor = false;
+    }
+    if (status != CUDSS_STATUS_SUCCESS) {
+      settings_.log.printf("cuDSS Factorization failed\n");
+      return -1;
+    }
+    return 0;
+  }
+
+  i_t solve(const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& x) override
+  {
+    if (b.size() != n) {
+      settings_.log.printf("Error: b.size() %d != n %d\n", b.size(), n);
+      exit(1);
+    }
+    if (x.size() != n) {
+      settings_.log.printf("Error: x.size() %d != n %d\n", x.size(), n);
+      exit(1);
+    }
+    CUDA_CALL_AND_CHECK(cudaMemcpy(b_values_d, b.data(), n * sizeof(f_t), cudaMemcpyHostToDevice),
+                        "cudaMemcpy for b_values");
+    CUDA_CALL_AND_CHECK(cudaMemcpy(x_values_d, x.data(), n * sizeof(f_t), cudaMemcpyHostToDevice),
+                        "cudaMemcpy for x_values");
+
+    CUDSS_CALL_AND_CHECK(
+      cudssMatrixSetValues(cudss_b, b_values_d), status, "cudssMatrixSetValues for b");
+    CUDSS_CALL_AND_CHECK(
+      cudssMatrixSetValues(cudss_x, x_values_d), status, "cudssMatrixSetValues for x");
+
+    i_t ldb = n;
+    i_t ldx = n;
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixCreateDn(&cudss_b, n, 1, ldb, b_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR),
+      status,
+      "cudssMatrixCreateDn for b");
+    CUDSS_CALL_AND_CHECK_EXIT(
+      cudssMatrixCreateDn(&cudss_x, n, 1, ldx, x_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR),
+      status,
+      "cudssMatrixCreateDn for x");
+
+    CUDSS_CALL_AND_CHECK(
+      cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig, solverData, A, cudss_x, cudss_b),
+      status,
+      "cudssExecute for solve");
+
+    CUDA_CALL_AND_CHECK(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
+
+    CUDA_CALL_AND_CHECK(cudaMemcpy(x.data(), x_values_d, n * sizeof(f_t), cudaMemcpyDeviceToHost),
+                        "cudaMemcpy for x");
+
+    for (i_t i = 0; i < n; i++) {
+      if (x[i] != x[i]) { return -1; }
+    }
+
+    return 0;
+  }
+
+  void set_positive_definite(bool positive_definite) override
+  {
+    this->positive_definite = positive_definite;
+  }
+
+ private:
+  i_t n;
+  i_t nnz;
+  bool first_factor;
+  bool positive_definite;
+  cudaError_t cuda_error;
+  cudssStatus_t status;
+  cudaStream_t stream;
+  cudssHandle_t handle;
+  cudssConfig_t solverConfig;
+  cudssData_t solverData;
+  cudssMatrix_t A;
+  cudssMatrix_t cudss_x;
+  cudssMatrix_t cudss_b;
+  i_t* csr_offset_d;
+  i_t* csr_columns_d;
+  f_t* csr_values_d;
+  f_t* x_values_d;
+  f_t* b_values_d;
+
+  const simplex_solver_settings_t<i_t, f_t>& settings_;
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp
index dc4df3990..22d286cda 100644
--- a/cpp/src/dual_simplex/sparse_matrix.cpp
+++ b/cpp/src/dual_simplex/sparse_matrix.cpp
@@ -372,12 +372,40 @@ void csc_matrix_t<i_t, f_t>::print_matrix(FILE* fid) const
   fprintf(fid, "A = sparse(ijx(:, 1), ijx(:, 2), ijx(:, 3), %d, %d);\n", this->m, this->n);
 }
 
+template <typename i_t, typename f_t>
+void csc_matrix_t<i_t, f_t>::write_matrix_market(FILE* fid) const
+{
+  fprintf(fid, "%%%%MatrixMarket matrix coordinate real general\n");
+  fprintf(fid, "%d %d %d\n", this->m, this->n, this->col_start[this->n]);
+  for (i_t j = 0; j < this->n; ++j) {
+    const i_t col_beg = this->col_start[j];
+    const i_t col_end = this->col_start[j + 1];
+    for (i_t p = col_beg; p < col_end; ++p) {
+      fprintf(fid, "%d %d %.16e\n", this->i[p] + 1, j + 1, this->x[p]);
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 void csc_matrix_t<i_t, f_t>::print_matrix() const
 {
   this->print_matrix(stdout);
 }
 
+template <typename i_t, typename f_t>
+void csc_matrix_t<i_t, f_t>::scale_columns(const std::vector<f_t>& scale)
+{
+  const i_t n = this->n;
+  assert(scale.size() == n);
+  for (i_t j = 0; j < n; ++j) {
+    const i_t col_start = this->col_start[j];
+    const i_t col_end   = this->col_start[j + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
+      this->x[p] *= scale[j];
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 i_t scatter(const csc_matrix_t<i_t, f_t>& A,
             i_t j,
diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp
index 1fa898b61..d09a88381 100644
--- a/cpp/src/dual_simplex/sparse_matrix.hpp
+++ b/cpp/src/dual_simplex/sparse_matrix.hpp
@@ -87,9 +87,15 @@ class csc_matrix_t {
   // Prints the matrix to a file
   void print_matrix(FILE* fid) const;
 
+  // Writes the matrix to a file in Matrix Market format
+  void write_matrix_market(FILE* fid) const;
+
   // Compute || A ||_1 = max_j (sum {i = 1 to m} | A(i, j) | )
   f_t norm1() const;
 
+  // Perform column scaling of the matrix
+  void scale_columns(const std::vector<f_t>& scale);
+
   i_t m;                       // number of rows
   i_t n;                       // number of columns
   i_t nz_max;                  // maximum number of entries
diff --git a/cpp/src/linear_programming/solve.cu b/cpp/src/linear_programming/solve.cu
index 504d87bd3..ebe20f4b2 100644
--- a/cpp/src/linear_programming/solve.cu
+++ b/cpp/src/linear_programming/solve.cu
@@ -287,13 +287,84 @@ optimization_problem_solution_t<i_t, f_t> convert_dual_simplex_sol(
   if (termination_status != pdlp_termination_status_t::Optimal &&
       termination_status != pdlp_termination_status_t::TimeLimit &&
       termination_status != pdlp_termination_status_t::ConcurrentLimit) {
-    CUOPT_LOG_INFO("Dual simplex status %s", sol.get_termination_status_string().c_str());
+    CUOPT_LOG_INFO("Solve status %s", sol.get_termination_status_string().c_str());
   }
 
   problem.handle_ptr->sync_stream();
   return sol;
 }
 
+template <typename i_t, typename f_t>
+std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>
+run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
+            pdlp_solver_settings_t<i_t, f_t> const& settings)
+{
+  auto start_solver = std::chrono::high_resolution_clock::now();
+
+  f_t norm_user_objective = dual_simplex::vector_norm2<i_t, f_t>(user_problem.objective);
+  f_t norm_rhs            = dual_simplex::vector_norm2<i_t, f_t>(user_problem.rhs);
+
+  dual_simplex::simplex_solver_settings_t<i_t, f_t> dual_simplex_settings;
+  dual_simplex_settings.time_limit      = settings.time_limit;
+  dual_simplex_settings.iteration_limit = settings.iteration_limit;
+  dual_simplex_settings.concurrent_halt = settings.concurrent_halt;
+  dual_simplex_settings.use_cudss       = settings.use_cudss;
+  dual_simplex_settings.barrier         = true;
+  dual_simplex_settings.crossover       = settings.crossover;
+  if (dual_simplex_settings.concurrent_halt != nullptr) {
+    // Don't show the dual simplex log in concurrent mode. Show the PDLP log instead
+    dual_simplex_settings.log.log = false;
+  }
+
+  dual_simplex::lp_solution_t<i_t, f_t> solution(user_problem.num_rows, user_problem.num_cols);
+  auto status = dual_simplex::solve_linear_program_with_barrier<i_t, f_t>(
+    user_problem, dual_simplex_settings, solution);
+
+  auto end      = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_solver);
+
+  CUOPT_LOG_INFO("Barrier finished in %.2f seconds", duration.count() / 1000.0);
+
+  if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL ||
+                                              status == dual_simplex::lp_status_t::UNBOUNDED ||
+                                              status == dual_simplex::lp_status_t::INFEASIBLE)) {
+    // We finished. Tell PDLP to stop if it is still running.
+    settings.concurrent_halt->store(1, std::memory_order_release);
+  }
+
+  return {std::move(solution), status, duration.count() / 1000.0, norm_user_objective, norm_rhs};
+}
+
+template <typename i_t, typename f_t>
+optimization_problem_solution_t<i_t, f_t> run_barrier(
+  detail::problem_t<i_t, f_t>& problem, pdlp_solver_settings_t<i_t, f_t> const& settings)
+{
+  // Convert data structures to dual simplex format and back
+  dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
+    cuopt_problem_to_simplex_problem<i_t, f_t>(problem);
+  auto sol_dual_simplex = run_barrier(dual_simplex_problem, settings);
+  return convert_dual_simplex_sol(problem,
+                                  std::get<0>(sol_dual_simplex),
+                                  std::get<1>(sol_dual_simplex),
+                                  std::get<2>(sol_dual_simplex),
+                                  std::get<3>(sol_dual_simplex),
+                                  std::get<4>(sol_dual_simplex));
+}
+
+template <typename i_t, typename f_t>
+void run_barrier_thread(
+  dual_simplex::user_problem_t<i_t, f_t>& problem,
+  pdlp_solver_settings_t<i_t, f_t> const& settings,
+  std::unique_ptr<
+    std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>&
+    sol_ptr)
+{
+  // We will return the solution from the thread as a unique_ptr
+  sol_ptr = std::make_unique<
+    std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>(
+    run_barrier(problem, settings));
+}
+
 template <typename i_t, typename f_t>
 std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>
 run_dual_simplex(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
@@ -499,12 +570,25 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
                                   std::ref(settings_pdlp),
                                   std::ref(sol_dual_simplex_ptr));
 
+  dual_simplex::user_problem_t<i_t, f_t> barrier_problem = dual_simplex_problem;
+  // Create a thread for barrier
+  std::unique_ptr<
+    std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>
+    sol_barrier_ptr;
+  std::thread barrier_thread(run_barrier_thread<i_t, f_t>,
+                             std::ref(barrier_problem),
+                             std::ref(settings_pdlp),
+                             std::ref(sol_barrier_ptr));
+
   // Run pdlp in the main thread
   auto sol_pdlp = run_pdlp(problem, settings_pdlp, is_batch_mode);
 
   // Wait for dual simplex thread to finish
   dual_simplex_thread.join();
 
+  // Wait for barrier thread to finish
+  barrier_thread.join();
+
   // copy the dual simplex solution to the device
   auto sol_dual_simplex = convert_dual_simplex_sol(problem,
                                                    std::get<0>(*sol_dual_simplex_ptr),
@@ -513,6 +597,14 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
                                                    std::get<3>(*sol_dual_simplex_ptr),
                                                    std::get<4>(*sol_dual_simplex_ptr));
 
+  // copy the barrier solution to the device
+  auto sol_barrier = convert_dual_simplex_sol(problem,
+                                              std::get<0>(*sol_barrier_ptr),
+                                              std::get<1>(*sol_barrier_ptr),
+                                              std::get<2>(*sol_barrier_ptr),
+                                              std::get<3>(*sol_barrier_ptr),
+                                              std::get<4>(*sol_barrier_ptr));
+
   f_t end_time = dual_simplex::toc(start_time);
   CUOPT_LOG_INFO("Concurrent time:  %.3fs", end_time);
   // Check status to see if we should return the pdlp solution or the dual simplex solution
@@ -528,6 +620,16 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
                    sol_pdlp.get_additional_termination_information().number_of_steps_taken,
                    end_time);
     return sol_pdlp;
+  } else if (sol_barrier.get_termination_status() == pdlp_termination_status_t::Optimal) {
+    CUOPT_LOG_INFO("Solved with barrier");
+    sol_pdlp.copy_from(op_problem.get_handle_ptr(), sol_barrier);
+    sol_pdlp.set_solve_time(end_time);
+    CUOPT_LOG_INFO("Status: %s   Objective: %.8e  Iterations: %d  Time: %.3fs",
+                   sol_pdlp.get_termination_status_string().c_str(),
+                   sol_pdlp.get_objective_value(),
+                   sol_pdlp.get_additional_termination_information().number_of_steps_taken,
+                   end_time);
+    return sol_pdlp;
   } else if (sol_pdlp.get_termination_status() == pdlp_termination_status_t::Optimal) {
     CUOPT_LOG_INFO("Solved with PDLP");
     return sol_pdlp;
@@ -549,6 +651,8 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_with_method(
 {
   if (settings.method == method_t::DualSimplex) {
     return run_dual_simplex(problem, settings);
+  } else if (settings.method == method_t::Barrier) {
+    return run_barrier(problem, settings);
   } else if (settings.method == method_t::Concurrent) {
     return run_concurrent(op_problem, problem, settings, is_batch_mode);
   } else {
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index a049d0d09..c097958dd 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -89,7 +89,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
   int_parameters = {
     {CUOPT_ITERATION_LIMIT, &pdlp_settings.iteration_limit, 0, std::numeric_limits<i_t>::max(), std::numeric_limits<i_t>::max()},
     {CUOPT_PDLP_SOLVER_MODE, reinterpret_cast<int*>(&pdlp_settings.pdlp_solver_mode), CUOPT_PDLP_SOLVER_MODE_STABLE1, CUOPT_PDLP_SOLVER_MODE_FAST1, CUOPT_PDLP_SOLVER_MODE_STABLE2},
-    {CUOPT_METHOD, reinterpret_cast<int*>(&pdlp_settings.method), CUOPT_METHOD_CONCURRENT, CUOPT_METHOD_DUAL_SIMPLEX, CUOPT_METHOD_CONCURRENT},
+    {CUOPT_METHOD, reinterpret_cast<int*>(&pdlp_settings.method), CUOPT_METHOD_CONCURRENT, CUOPT_METHOD_BARRIER, CUOPT_METHOD_CONCURRENT},
     {CUOPT_NUM_CPU_THREADS, &mip_settings.num_cpu_threads, -1, std::numeric_limits<i_t>::max(), -1}
   };
 
@@ -105,6 +105,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_LOG_TO_CONSOLE, &pdlp_settings.log_to_console, true},
     {CUOPT_LOG_TO_CONSOLE, &mip_settings.log_to_console, true},
     {CUOPT_CROSSOVER, &pdlp_settings.crossover, false},
+    {CUOPT_USE_CUDSS, &pdlp_settings.use_cudss, false},
     {CUOPT_PRESOLVE, &pdlp_settings.presolve, false},
     {CUOPT_PRESOLVE, &mip_settings.presolve, true}
   };
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 0f2117991..44f6e8083 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -33,15 +33,25 @@
 #include <raft/sparse/detail/cusparse_macros.h>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 
+#include <cudss.h>
+
 #include <future>
 #include <memory>
 #include <thread>
 
 namespace cuopt::linear_programming::detail {
 
+void force_link_cudss()
+{
+  cudssHandle_t handle;
+  cudssCreate(&handle);
+  cudssDestroy(handle);
+}
+
 // This serves as both a warm up but also a mandatory initial call to setup cuSparse and cuBLAS
 static void init_handler(const raft::handle_t* handle_ptr)
 {
+  force_link_cudss();
   // Init cuBlas / cuSparse context here to avoid having it during solving time
   RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
     handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
diff --git a/dependencies.yaml b/dependencies.yaml
index 07de52533..1da064fed 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -691,6 +691,7 @@ dependencies:
               - libcurand-dev
               - libcusolver-dev
               - libcusparse-dev
+              - libcudss-dev
               - cuda-nvtx-dev
           - matrix:
               cuda: "12.*"
@@ -698,6 +699,7 @@ dependencies:
               - libcurand-dev
               - libcusolver-dev
               - libcusparse-dev
+              - libcudss-dev
               - cuda-nvtx-dev
               - cuda-nvvm
               - cuda-crt
@@ -714,6 +716,7 @@ dependencies:
               - nvidia-curand-cu12
               - nvidia-cusparse-cu12
               - nvidia-cusolver-cu12
+              - nvidia-cudss-cu12
               - nvidia-nvtx-cu12
           # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
           # (e.g. for DLFW and pip devcontainers)
@@ -728,6 +731,7 @@ dependencies:
               - nvidia-curand
               - nvidia-cusparse
               - nvidia-cusolver
+              - nvidia-cudss
               - nvidia-nvtx
   develop:
     common:
diff --git a/python/libcuopt/CMakeLists.txt b/python/libcuopt/CMakeLists.txt
index b99770887..5e76ce87d 100644
--- a/python/libcuopt/CMakeLists.txt
+++ b/python/libcuopt/CMakeLists.txt
@@ -69,6 +69,7 @@ set(rpaths
   "$ORIGIN/../../rapids_logger/lib64"
   "$ORIGIN/../../librmm/lib64"
   "$ORIGIN/../../nvidia/cublas/lib"
+  "$ORIGIN/../../nvidia/cu12/lib"
   "$ORIGIN/../../nvidia/curand/lib"
   "$ORIGIN/../../nvidia/cusolver/lib"
   "$ORIGIN/../../nvidia/cusparse/lib"
diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml
index 908d9e28c..7e768fdd4 100644
--- a/python/libcuopt/pyproject.toml
+++ b/python/libcuopt/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
     "cuopt-mps-parser==25.10.*,>=0.0.0a0",
     "librmm==25.10.*,>=0.0.0a0",
     "nvidia-cublas",
+    "nvidia-cudss",
     "nvidia-curand",
     "nvidia-cusolver",
     "nvidia-cusparse",