diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c0ac2cc45..55e164f6a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,6 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 with: - matrix_filter: map(select((.CUDA_VER | startswith("12")))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -56,7 +55,6 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 with: - matrix_filter: map(select((.CUDA_VER | startswith("12")) and .PY_VER != "3.13")) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -80,7 +78,6 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} script: ci/build_wheel_cuopt_mps_parser.sh - matrix_filter: map(select((.CUDA_VER | startswith("12")))) package-name: cuopt_mps_parser package-type: python append-cuda-suffix: false @@ -107,7 +104,7 @@ jobs: script: ci/build_wheel_libcuopt.sh package-name: libcuopt package-type: cpp - matrix_filter: map(select((.CUDA_VER | startswith("12")) and .PY_VER == "3.12")) + matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) wheel-publish-libcuopt: needs: wheel-build-libcuopt secrets: inherit @@ -124,7 +121,6 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: - matrix_filter: map(select((.CUDA_VER | startswith("12")))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} @@ -148,7 +144,6 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: - matrix_filter: map(select((.CUDA_VER | startswith("12")))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ 
inputs.sha }} @@ -192,7 +187,6 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} script: ci/build_wheel_cuopt_sh_client.sh - matrix_filter: map(select((.CUDA_VER | startswith("12")))) package-name: cuopt_sh_client package-type: python append-cuda-suffix: false diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7ee94cfb2..92c2f0434 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -122,7 +122,6 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 with: build_type: pull-request - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] @@ -131,7 +130,6 @@ jobs: #if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_cpp.sh conda-python-build: needs: conda-cpp-build @@ -139,7 +137,6 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 with: build_type: pull-request - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] @@ -149,7 +146,6 @@ jobs: with: run_codecov: false build_type: pull-request - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_python.sh docs-build: needs: conda-python-build @@ -163,17 +159,6 @@ jobs: artifact-name: "cuopt_docs" container_image: "rapidsai/ci-conda:25.08-latest" script: "ci/build_docs.sh" - #conda-notebook-tests: - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 - # #if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks - # with: - # build_type: pull-request - # node_type: "gpu-l4-latest-1" - # arch: "amd64" - # container_image: 
"rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10" - # run_script: "ci/test_notebooks.sh" wheel-build-cuopt-mps-parser: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 @@ -183,14 +168,13 @@ jobs: package-name: cuopt_mps_parser package-type: python append-cuda-suffix: false - matrix_filter: map(select((.CUDA_VER | startswith("12")))) wheel-build-libcuopt: needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: # build for every combination of arch and CUDA version, but only for the latest Python - matrix_filter: map(select((.CUDA_VER | startswith("12")) and .PY_VER == "3.12")) + matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) package-type: cpp package-name: libcuopt build_type: pull-request @@ -204,16 +188,14 @@ jobs: script: ci/build_wheel_cuopt.sh package-name: cuopt package-type: python - matrix_filter: map(select((.CUDA_VER | startswith("12")))) wheel-tests-cuopt: - needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, changed-files] + needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 #if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_cuopt with: build_type: pull-request script: ci/test_wheel_cuopt.sh - matrix_filter: map(select((.CUDA_VER | startswith("12")))) wheel-build-cuopt-server: needs: wheel-build-cuopt secrets: inherit @@ -223,14 +205,12 @@ jobs: script: ci/build_wheel_cuopt_server.sh package-name: cuopt_server package-type: python - matrix_filter: map(select((.CUDA_VER | startswith("12")))) wheel-build-cuopt-sh-client: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: build_type: pull-request script: 
ci/build_wheel_cuopt_sh_client.sh - matrix_filter: map(select((.CUDA_VER | startswith("12")))) package-name: cuopt_sh_client package-type: python append-cuda-suffix: false @@ -242,7 +222,6 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cuopt_server.sh - matrix_filter: map(select((.CUDA_VER | startswith("12")))) test-self-hosted-server: needs: [wheel-build-cuopt-server, changed-files] secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6cd251332..203eee65a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -40,7 +40,6 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_cpp.sh conda-python-tests: secrets: inherit @@ -51,7 +50,6 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_python.sh wheel-tests-cuopt: secrets: inherit @@ -61,7 +59,6 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_wheel_cuopt.sh wheel-tests-cuopt-server: secrets: inherit @@ -71,5 +68,4 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - matrix_filter: map(select((.CUDA_VER | startswith("12")))) script: ci/test_wheel_cuopt_server.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3129e7190..c141d3162 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -143,6 +143,15 @@ cd $CUOPT_HOME ./build.sh --help ``` +#### Deb package + +`libcuopt.so` can be packaged as a deb package with the `deb` option. This is a beta feature, and the dependencies of libcuopt need to be installed manually when installing from the deb package. +This is only available when building from source; libcuopt is not officially released as a deb package anywhere. 
+ +```bash +./build.sh libmps_parser libcuopt deb +``` + #### Building for development To build all libraries and tests, simply run diff --git a/build.sh b/build.sh index f12986676..746ab0835 100755 --- a/build.sh +++ b/build.sh @@ -27,7 +27,7 @@ REPODIR=$(cd "$(dirname "$0")"; pwd) LIBCUOPT_BUILD_DIR=${LIBCUOPT_BUILD_DIR:=${REPODIR}/cpp/build} LIBMPS_PARSER_BUILD_DIR=${LIBMPS_PARSER_BUILD_DIR:=${REPODIR}/cpp/libmps_parser/build} -VALIDARGS="clean libcuopt libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs -a -b -g -v -l= --verbose-pdlp [--cmake-args=\\\"\\\"] [--cache-tool=] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help" +VALIDARGS="clean libcuopt libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs deb -a -b -g -v -l= --verbose-pdlp [--cmake-args=\\\"\\\"] [--cache-tool=] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -38,6 +38,7 @@ HELP="$0 [ ...] [ ...] cuopt_server - build the cuopt_server Python package cuopt_sh_client - build cuopt self host client docs - build the docs + deb - build deb package (requires libcuopt to be built first) and is: -v - verbose build mode -g - build for debug @@ -268,13 +269,8 @@ if [ ${BUILD_ALL_GPU_ARCH} -eq 1 ]; then echo "Building for *ALL* supported GPU architectures..." else if [ ${BUILD_CI_ONLY} -eq 1 ]; then - if [[ ${CUDA_VERSION} == 11* ]]; then - CUOPT_CMAKE_CUDA_ARCHITECTURES="70-real;80" - echo "Building for Volta and Ampere architectures..." - else - CUOPT_CMAKE_CUDA_ARCHITECTURES="RAPIDS" - echo "Building for Volta, Ampere and Hopper architectures..." - fi + CUOPT_CMAKE_CUDA_ARCHITECTURES="RAPIDS" + echo "Building for RAPIDS supported architectures..." else CUOPT_CMAKE_CUDA_ARCHITECTURES="NATIVE" echo "Building for the architecture of the GPU in the system..." 
@@ -321,6 +317,21 @@ if buildAll || hasArg libcuopt; then fi fi +################################################################################ +# Build deb package +if hasArg deb; then + # Check if libcuopt has been built + if [ ! -d "${LIBCUOPT_BUILD_DIR}" ]; then + echo "Error: libcuopt must be built before creating deb package. Run with 'libcuopt' target first." + exit 1 + fi + + echo "Building deb package..." + cd "${LIBCUOPT_BUILD_DIR}" + cpack -G DEB + echo "Deb package created in ${LIBCUOPT_BUILD_DIR}" +fi + # Build and install the cuopt Python package if buildAll || hasArg cuopt; then diff --git a/ci/test_self_hosted_service.sh b/ci/test_self_hosted_service.sh index 81126c836..a5285ec1c 100755 --- a/ci/test_self_hosted_service.sh +++ b/ci/test_self_hosted_service.sh @@ -217,6 +217,10 @@ if [ "$doservertest" -eq 1 ]; then # Test for message on absolute path, bad directory run_cli_test "Absolute path '/nohay' does not exist" cuopt_sh -s -c "$CLIENT_CERT" -p $CUOPT_SERVER_PORT -f /nohay/nada + # Set all current and deprecated solver_config values and make sure the service does not reject the dataset + # This is a smoketest against parameter name misalignment + run_cli_test "'status': 'Optimal'" cuopt_sh -s -c $CLIENT_CERT -p $CUOPT_SERVER_PORT ../../datasets/cuopt_service_data/lpmip_configs.json + rapids-logger "Running cuopt_self_hosted Python tests" pytest tests diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index 3c78c29f0..7a16db43f 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -17,19 +17,33 @@ set -euo pipefail +# sets up a constraints file for 'pip' and puts its location in an exported variable PIP_EXPORT, +# so those constraints will affect all future 'pip install' calls source rapids-init-pip # Download the packages built in the previous step RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" CUOPT_MPS_PARSER_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cuopt_mps_parser" 
rapids-download-wheels-from-github python) +CUOPT_SH_CLIENT_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cuopt_sh_client" rapids-download-wheels-from-github python) CUOPT_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python) LIBCUOPT_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libcuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) +# update pip constraints.txt to ensure all future 'pip install' (including those in ci/thirdparty-testing) +# use these wheels for cuopt packages +cat > "${PIP_CONSTRAINT}" <=2.0.2' \ + "$(echo ./dist/cvxpy*.whl)[CUOPT,testing]" + +# ensure that environment is still consistent (i.e. cvxpy requirements do not conflict with cuopt's) +pip check + +echo "running 'cvxpy' tests" +timeout 3m python -m pytest \ + --verbose \ + --capture=no \ + --error-for-skips \ + -k "TestCUOPT" \ + ./cvxpy/tests/test_conic_solvers.py diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml index fbdcfd5e8..b507a35fb 100644 --- a/conda/recipes/libcuopt/recipe.yaml +++ b/conda/recipes/libcuopt/recipe.yaml @@ -29,7 +29,7 @@ cache: export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') set +x - ./build.sh -n -v -a libmps_parser libcuopt --ci-only-arch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" + ./build.sh -n -v -a libmps_parser libcuopt deb --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" secrets: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b7da821a8..0266c4b33 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -33,11 +33,6 @@ project( LANGUAGES CXX CUDA C ) -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 12.0.0) - list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "90") - list(APPEND CMAKE_CUDA_ARCHITECTURES "80-virtual") -endif() - set(DEPENDENT_LIB_MAJOR_VERSION "25") set(DEPENDENT_LIB_MINOR_VERSION "08") @@ -139,6 +134,12 @@ elseif(CMAKE_CUDA_LINEINFO) set(CMAKE_CUDA_FLAGS_RELEASE 
"${CMAKE_CUDA_FLAGS_RELEASE} -lineinfo") endif(CMAKE_BUILD_TYPE MATCHES Debug) +# Undefine NDEBUG if assert mode is on +if(DEFINE_ASSERT) + message(STATUS "Undefining NDEBUG with assert mode enabled") + add_definitions(-UNDEBUG) +endif() + # ################################################################################################## # - find CPM based dependencies ------------------------------------------------------------------ @@ -222,9 +223,7 @@ set(CUOPT_PRIVATE_CUDA_LIBS CUDA::cusolver OpenMP::OpenMP_CXX) -if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.4) - list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) -endif() +list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser) set(CMAKE_LIBRARY_PATH ${CMAKE_CURRENT_BINARY_DIR}/libmps_parser/) @@ -252,16 +251,49 @@ endif(BUILD_TESTS) # ################################################################################################## # - install targets ------------------------------------------------------------------------------- + +# allows for CPack component builds and install location +set(CPACK_DEB_COMPONENT_INSTALL ON) +set(CPACK_COMPONENTS_ALL runtime dev) +set(CPACK_PACKAGING_INSTALL_PREFIX "/usr/local") + +#If using cpack to create a deb package +if(CPACK_GENERATOR STREQUAL "DEB") + set(_BIN_DEST "bin") + set(_LIB_DEST "lib") + set(_INCLUDE_DEST "lib/cuopt") + +#If building locally use the Default install paths(e.g. 
for local development or other package types) +else() + set(_BIN_DEST "${CMAKE_INSTALL_BINDIR}") + set(_LIB_DEST "${lib_dir}") + set(_INCLUDE_DEST include/cuopt/) +endif() + +# adds the .so files to the runtime deb package install(TARGETS cuopt mps_parser - DESTINATION ${lib_dir} - EXPORT cuopt-exports) + DESTINATION ${_LIB_DEST} + COMPONENT runtime + EXPORT cuopt-exports +) +# adds the .so files to the development deb package +install(TARGETS cuopt mps_parser + DESTINATION ${_LIB_DEST} + COMPONENT dev +) + +# adds the header files to the development deb package install(DIRECTORY include/cuopt/ - DESTINATION include/cuopt) + DESTINATION ${_INCLUDE_DEST} + COMPONENT dev +) +# adds the version header file to the development deb package install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cuopt/version_config.hpp - DESTINATION include/cuopt) - + DESTINATION ${_INCLUDE_DEST} + COMPONENT dev +) # ############################################################################################### # - install export ------------------------------------------------------------------------------- set(doc_string @@ -306,8 +338,6 @@ if(Doxygen_FOUND) endif() - -list(APPEND CUOPT_CXX_FLAGS -g -O0) add_executable(cuopt_cli cuopt_cli.cpp) target_compile_options(cuopt_cli PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" @@ -330,10 +360,11 @@ target_link_libraries(cuopt_cli ) set_property(TARGET cuopt_cli PROPERTY INSTALL_RPATH "$ORIGIN/../${lib_dir}") -# FIXME:: Is this the right way? 
+# adds the cuopt_cli executable to the runtime deb package install(TARGETS cuopt_cli - DESTINATION ${CMAKE_INSTALL_BINDIR}) - + COMPONENT runtime + RUNTIME DESTINATION ${_BIN_DEST} +) option(BUILD_BENCHMARKS "Build benchmarks" ON) if(BUILD_BENCHMARKS) @@ -348,3 +379,38 @@ if(BUILD_BENCHMARKS) OpenMP::OpenMP_CXX ) endif() + + +# ################################################################################################## +# - CPack has to be the last item in the cmake file------------------------------------------------- +# Used to create an installable deb package for cuOpt + +set(CPACK_GENERATOR "DEB") + +# Runtime package metadata +execute_process(COMMAND dpkg --print-architecture OUTPUT_VARIABLE DEB_ARCH OUTPUT_STRIP_TRAILING_WHITESPACE) + +# general package metadata +set(CPACK_DEBIAN_PACKAGE_NAME "cuOpt") +set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}") +set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Nvidia") +set(CPACK_PACKAGE_FILE_NAME "cuOpt_${CPACK_PACKAGE_VERSION}_${DEB_ARCH}") + +# runtime package metadata +set(CPACK_COMPONENT_RUNTIME_DESCRIPTION "cuOpt runtime components (binaries and shared libraries)") +set(CPACK_COMPONENT_RUNTIME_DISPLAY_NAME "cuOpt Runtime") +set(CPACK_COMPONENT_RUNTIME_GROUP "Runtime") +set(CPACK_DEBIAN_RUNTIME_PACKAGE_MAINTAINER "NVIDIA") +set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "cuopt") +set(CPACK_DEBIAN_RUNTIME_PACKAGE_FILE_NAME "cuopt_${PROJECT_VERSION}_${DEB_ARCH}") + +# Dev package metadata +set(CPACK_COMPONENT_DEV_DESCRIPTION "cuOpt development files (headers, symlinks, etc.)") +set(CPACK_COMPONENT_DEV_DISPLAY_NAME "cuOpt Development") +set(CPACK_COMPONENT_DEV_GROUP "Development") +set(CPACK_DEBIAN_DEV_PACKAGE_MAINTAINER "NVIDIA") +set(CPACK_DEBIAN_DEV_PACKAGE_NAME "cuopt-dev") +set(CPACK_DEBIAN_DEV_PACKAGE_FILE_NAME "cuopt-dev_${PROJECT_VERSION}_${DEB_ARCH}") + +# MUST BE THE LAST ITEM IN THE CMAKE FILE!!! 
+include(CPack) diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt index 16ee502f8..d85471f9b 100644 --- a/cpp/src/dual_simplex/CMakeLists.txt +++ b/cpp/src/dual_simplex/CMakeLists.txt @@ -16,6 +16,7 @@ set(DUAL_SIMPLEX_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/basis_solves.cpp ${CMAKE_CURRENT_SOURCE_DIR}/basis_updates.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bound_flipping_ratio_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crossover.cpp ${CMAKE_CURRENT_SOURCE_DIR}/initial_basis.cpp @@ -30,6 +31,7 @@ set(DUAL_SIMPLEX_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/singletons.cpp ${CMAKE_CURRENT_SOURCE_DIR}/solve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sparse_matrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sparse_vector.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tic_toc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp) diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp index d363c3ee8..a2243f540 100644 --- a/cpp/src/dual_simplex/basis_solves.cpp +++ b/cpp/src/dual_simplex/basis_solves.cpp @@ -364,9 +364,11 @@ i_t factorize_basis(const csc_matrix_t& A, for (i_t h = 0; h < Sdim; ++h) { identity[h] = h; } - Srank = right_looking_lu(S, medium_tol, identity, S_col_perm, SL, SU, S_perm_inv); + Srank = right_looking_lu( + S, settings.threshold_partial_pivoting_tol, identity, S_col_perm, SL, SU, S_perm_inv); if (Srank != Sdim) { // Get the rank deficient columns + deficient.clear(); deficient.resize(Sdim - Srank); for (i_t h = Srank; h < Sdim; ++h) { deficient[h - Srank] = col_perm[num_singletons + S_col_perm[h]]; diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 78737c167..3c9ef3274 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -30,6 +30,14 @@ i_t basis_update_t::b_solve(const std::vector& rhs, std::vector +i_t basis_update_t::b_solve(const sparse_vector_t& 
rhs, + sparse_vector_t& solution) const +{ + sparse_vector_t Lsol(rhs.n, 0); + return b_solve(rhs, solution, Lsol); +} + template i_t basis_update_t::b_solve(const std::vector& rhs, std::vector& solution, @@ -55,6 +63,33 @@ i_t basis_update_t::b_solve(const std::vector& rhs, return 0; } +template +i_t basis_update_t::b_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& Lsol) const +{ + const i_t m = L0_.m; + assert(row_permutation_.size() == m); + assert(rhs.n == m); + assert(solution.n == m); + assert(Lsol.n == m); + + // P*B = L*U + // B*x = b + // P*B*x = P*b = b' + solution = rhs; + solution.inverse_permute_vector(inverse_row_permutation_); + + // L*U*x = b' + // Solve for v such that L*v = b' + l_solve(solution); + Lsol = solution; + + // Solve for x such that U*x = v + u_solve(solution); + return 0; +} + template i_t basis_update_t::b_transpose_solve(const std::vector& rhs, std::vector& solution) const @@ -87,6 +122,112 @@ i_t basis_update_t::b_transpose_solve(const std::vector& rhs, return 0; } +template +i_t basis_update_t::b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution) const +{ + // Observe that + // P*B = L*U + // B'*P' = U'*L' + // We want to solve + // B'*y = c + // Let y = P'*w + // B'*y = B'*P'*w = U'*L'*w = c + // 1. Solve U'*r = c for r + // 2. Solve L'*w = r for w + // 3. 
Compute y = P'*w + + const i_t m = L0_.m; + assert(rhs.n == m); + assert(solution.n == m); + + // Solve for r such that U'*r = c + // Actually Q*U'*Q'*r = c + sparse_vector_t r = rhs; + u_transpose_solve(r); + +#ifdef CHECK_U_TRANSPOSE_SOLVE + std::vector residual; + rhs.to_dense(residual); + std::vector r_dense; + r.to_dense(r_dense); + std::vector product(m); + // Q * U' * Q' * r_dense - c + + std::vector r_dense_permuted(m); + inverse_permute_vector(col_permutation_, r_dense, r_dense_permuted); + + // product = U' * Q' * r_dense + matrix_transpose_vector_multiply(U_, 1.0, r_dense_permuted, 0.0, product); + std::vector product_permuted(m); + permute_vector(col_permutation_, product, product_permuted); + // residual = product_permuted - c + for (i_t k = 0; k < m; ++k) { + residual[k] -= product_permuted[k]; + } + + const f_t Ut_error = vector_norm_inf(residual); + if (Ut_error > 1e-6) { + printf("|| U' * r - c || %e\n", Ut_error); + for (i_t k = 0; k < m; ++k) { + if (std::abs(residual[k]) > 1e-6) { printf("%d residual %e\n", k, residual[k]); } + } + printf("rhs nz %d\n", rhs.i.size()); + } +#endif + + // Solve for w such that L'*w = r + l_transpose_solve(r); + + // y = P'*w + r.inverse_permute_vector(row_permutation_, solution); + +#ifdef CHECK_PERMUTATION + std::vector r_dense2; + r.to_dense(r_dense2); + std::vector solution_dense_permuted(m); + permute_vector(inverse_row_permutation_, r_dense2, solution_dense_permuted); + std::vector solution_dense; + solution.to_dense(solution_dense); + bool found_error = false; + for (i_t k = 0; k < m; ++k) { + if (std::abs(solution_dense[k] - solution_dense_permuted[k]) > 1e-6) { + printf("B transpose inverse permutation error %d %e %e\n", + k, + solution_dense[k], + solution_dense_permuted[k]); + found_error = true; + } + } + if (found_error) { + for (i_t k = 0; k < m; ++k) { + printf("%d (sparse -> permuted -> dense) %e (sparse -> dense -> permuted)%e\n", + k, + solution_dense[k], + solution_dense_permuted[k]); + } + for 
(i_t k = 0; k < solution.i.size(); ++k) { + printf("%d solution sparse %d %e\n", k, solution.i[k], solution.x[k]); + } + for (i_t k = 0; k < m; ++k) { + if (solution_dense[k] != 0.0) { printf("%d solution dense %e\n", k, solution_dense[k]); } + } + for (i_t k = 0; k < m; ++k) { + printf("inv permutation %d %d\n", k, inverse_row_permutation_[k]); + } + for (i_t k = 0; k < m; ++k) { + if (r_dense2[k] != 0.0) { printf("%d r dense %e\n", k, r_dense2[k]); } + } + for (i_t k = 0; k < m; ++k) { + if (solution_dense_permuted[k] != 0.0) { + printf("%d solution dense permuted %e\n", k, solution_dense_permuted[k]); + } + } + } +#endif + return 0; +} + // Solve for x such that L*x = b template i_t basis_update_t::l_solve(std::vector& rhs) const @@ -101,7 +242,6 @@ i_t basis_update_t::l_solve(std::vector& rhs) const #endif // First solve // L0*x0 = b - // TODO: Handle a sparse rhs dual_simplex::lower_triangular_solve(L0_, rhs); #ifdef CHECK_LOWER_SOLVE { @@ -129,6 +269,92 @@ i_t basis_update_t::l_solve(std::vector& rhs) const return 0; } +template +i_t basis_update_t::l_solve(sparse_vector_t& rhs) const +{ + // L = L0 * R1^{-1} * R2^{-1} * ... 
* Rk^{-1} + // + // where Ri = I + e_r d^T + + // First solve + // L0*x0 = b + const i_t m = L0_.m; + + i_t top = sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs + +#ifdef CHECK_L_SOLVE + std::vector residual(m, 0.0); + const i_t col_start = B.col_start[0]; + const i_t col_end = B.col_start[1]; + for (i_t p = col_start; p < col_end; ++p) { + residual[B.i[p]] = B.x[p]; + } + + std::vector x0; + rhs.to_dense(x0); + matrix_vector_multiply(L0_, 1.0, x0, -1.0, residual); + const f_t L0_solve_error = vector_norm_inf(residual); + if (L0_solve_error > 1e-10) { printf("L0 solve error %e\n", L0_solve_error); } +#endif + + // then solve R1^{-1}*x1 = x0 -> x1 = R1*x0 + // then solve R2^{-1}*x2 = x1 -> x2 = R2*x1 + // until we get to + // Rk^{-1}*x = xk-1 -> x = Rk*xk-1 + // Rk = (I + e_rk dk^T) + // x = Rk*xk-1 = xk-1 + erk (dk^T xk-1) + +#ifdef CHECK_MULTIPLY + std::vector multiply; + rhs.to_dense(multiply); +#endif + + i_t nz = scatter_into_workspace(rhs); + + for (i_t k = 0; k < num_updates_; ++k) { + const i_t r = pivot_indices_[k]; + f_t dot = 0.0; + const i_t col_start = S_.col_start[k]; + const i_t col_end = S_.col_start[k + 1]; + for (i_t p = col_start; p < col_end; ++p) { + if (xi_workspace_[S_.i[p]]) { dot += S_.x[p] * x_workspace_[S_.i[p]]; } + } + if (!xi_workspace_[r]) { + xi_workspace_[r] = 1; + xi_workspace_[m + nz] = r; + nz++; + } + x_workspace_[r] += dot; + +#ifdef CHECK_MULTIPLY + f_t dot2 = 0.0; + for (i_t p = col_start; p < col_end; ++p) { + dot2 += S_.x[p] * multiply[S_.i[p]]; + } + multiply[r] += dot2; +#endif + } + + // Gather the solution into rhs + gather_into_sparse_vector(nz, rhs); + + rhs.sort(); + +#ifdef CHECK_MULTIPLY + std::vector rhs_dense; + rhs.to_dense(rhs_dense); + for (i_t k = 0; k < m; ++k) { + if (std::abs(rhs_dense[k] - multiply[k]) > 1e-10) { + printf("l_solve rhs dense/multiply error %d %e %e\n", k, 
rhs_dense[k], multiply[k]); + } + } +#endif + + return 0; +} + // Solve for y such that L'*y = c template i_t basis_update_t::l_transpose_solve(std::vector& rhs) const @@ -153,6 +379,149 @@ i_t basis_update_t::l_transpose_solve(std::vector& rhs) const return 0; } +template +i_t basis_update_t::scatter_into_workspace(const sparse_vector_t& in) const +{ + const i_t m = L0_.m; + // scatter pattern into xi_workspace_ + i_t nz = in.i.size(); + for (i_t k = 0; k < nz; ++k) { + const i_t i = in.i[k]; + xi_workspace_[i] = 1; + xi_workspace_[m + k] = i; + } + // scatter values into x_workspace_ + for (i_t k = 0; k < nz; ++k) { + x_workspace_[in.i[k]] = in.x[k]; + } + return nz; +} + +template +void basis_update_t::gather_into_sparse_vector(i_t nz, + sparse_vector_t& out) const +{ + const i_t m = L0_.m; + out.i.clear(); + out.x.clear(); + out.i.resize(nz); + out.x.resize(nz); + for (i_t k = 0; k < nz; ++k) { + const i_t i = xi_workspace_[m + k]; + out.i[k] = i; + out.x[k] = x_workspace_[i]; + xi_workspace_[m + k] = 0; + xi_workspace_[i] = 0; + x_workspace_[i] = 0.0; + } +} + +template +void basis_update_t::solve_to_sparse_vector(i_t top, sparse_vector_t& out) const +{ + const i_t m = L0_.m; + out.n = m; + out.i.clear(); + out.x.clear(); + const i_t nz = m - top; + out.x.resize(nz); + out.i.resize(nz); + i_t k = 0; + for (i_t p = top; p < m; ++p) { + const i_t i = xi_workspace_[p]; + out.i[k] = i; + out.x[k] = x_workspace_[i]; + x_workspace_[i] = 0.0; + xi_workspace_[p] = 0; + k++; + } +} + +template +i_t basis_update_t::l_transpose_solve(sparse_vector_t& rhs) const +{ + // L = L0*R1^{-1}* ... * Rk^{-1} + // L' = Rk^{-T} * Rk-1^{-T} * ... * R2^{-T} * R1^{-T} * L0^T + // L'*y = c + // Rk^{-T} * Rk-1^{-T} * ... * R2^{-T} * R1^{-T} * L0^T * y = c + // L0^T * y = cprime = R1^1 * ... 
* Rk^T * c + const i_t m = L0_.m; + + i_t nz = 0; + +#ifdef CHECK_UPDATES + std::vector multiply; + rhs.to_dense(multiply); + for (i_t k = 0; k < 2 * m; ++k) { + if (xi_workspace_[k]) { printf("xi workspace %d %d\n", k, xi_workspace_[k]); } + } +#endif + + if (num_updates_ > 0) { nz = scatter_into_workspace(rhs); } + + for (i_t k = num_updates_ - 1; k >= 0; --k) { + const i_t r = pivot_indices_[k]; + assert(r < m); + const i_t col_start = S_.col_start[k]; + const i_t col_end = S_.col_start[k + 1]; + if (xi_workspace_[r]) { + for (i_t p = col_start; p < col_end; ++p) { + // rhs.x[S_.i[p]] += rhs.x[r] * S_.x[p]; + if (!xi_workspace_[S_.i[p]]) { + xi_workspace_[S_.i[p]] = 1; + xi_workspace_[m + nz] = S_.i[p]; + nz++; + } + x_workspace_[S_.i[p]] += x_workspace_[r] * S_.x[p]; + } + } +#ifdef CHECK_UPDATES + for (i_t p = col_start; p < col_end; ++p) { + multiply[S_.i[p]] += multiply[r] * S_.x[p]; + } +#endif + } + + // Gather into rhs + if (num_updates_ > 0) { + gather_into_sparse_vector(nz, rhs); + + rhs.sort(); + +#ifdef CHECK_UPDATES + std::vector rhs_dense; + rhs.to_dense(rhs_dense); + for (i_t k = 0; k < m; ++k) { + if (std::abs(rhs_dense[k] - multiply[k]) > 1e-6) { + printf("rhs dense/multiply error %d %e %e\n", k, rhs_dense[k], multiply[k]); + } + } +#endif + } + + // L0^T * y = cprime +#ifdef CHECK_LOWER_TRANSPOSE_SOLVE + std::vector cprime_dense; + rhs.to_dense(cprime_dense); +#endif + + i_t top = sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs + +#ifdef CHECK_LOWER_TRANSPOSE_SOLVE + std::vector y_dense; + rhs.to_dense(y_dense); + + std::vector residual = cprime_dense; + matrix_transpose_vector_multiply(L0_, 1.0, y_dense, -1.0, residual); + const f_t L0_solve_error = vector_norm_inf(residual); + if (L0_solve_error > 1e-6) { printf("L0 solve error %e\n", L0_solve_error); } + +#endif + return 0; +} + template f_t 
basis_update_t::update_lower(const std::vector& sind, const std::vector& sval, @@ -205,6 +574,28 @@ i_t basis_update_t::u_solve(std::vector& x) const return 0; } +template +i_t basis_update_t::u_solve(sparse_vector_t& rhs) const +{ + // Solve Q*U*Q'*x = b + // Multiplying by Q' we have U*Q'*x = Q'*b = bprime + // Let y = Q'*x so U*y = bprime + // 1. Compute bprime = Q'*b + // 2. Solve for y such that U*y = bprime + // 3. Compute Q*y = x + const i_t m = U_.m; + sparse_vector_t bprime(m, 0); + rhs.inverse_permute_vector(col_permutation_, bprime); + + i_t top = sparse_triangle_solve( + bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs + + rhs.inverse_permute_vector(inverse_col_permutation_); + + return 0; +} + // x = U'(q,q)\b template i_t basis_update_t::u_transpose_solve(std::vector& x) const @@ -223,6 +614,113 @@ i_t basis_update_t::u_transpose_solve(std::vector& x) const return 0; } +template +i_t basis_update_t::u_transpose_solve(sparse_vector_t& rhs) const +{ + // Solve Q*U'*Q'*x = b + // Multiplying by Q' we have U'*Q'*x = Q'*b = bprime + // Let y = Q'*x so U'*y = bprime + // 1. Compute bprime = Q'*b + // 2. Solve for y such that U'*y = bprime + // 3. 
Compute Q*y = x + const i_t m = U_.m; + sparse_vector_t bprime(1, 0); +#ifdef CHECK_PERMUTATION + std::vector rhs_dense(m); + rhs.to_dense(rhs_dense); +#endif + rhs.inverse_permute_vector(col_permutation_, bprime); +#ifdef CHECK_PERMUTATION + std::vector bprime_dense; + bprime.to_dense(bprime_dense); + std::vector rhs_dense_permuted(m); + inverse_permute_vector(col_permutation_, rhs_dense, rhs_dense_permuted); + for (i_t k = 0; k < m; ++k) { + if (std::abs(bprime_dense[k] - rhs_dense_permuted[k]) > 1e-6) { + printf("u_transpose inverse permutation error %d %e %e\n", + k, + bprime_dense[k], + rhs_dense_permuted[k]); + } + } +#endif + +#ifdef CHECK_WORKSPACE + for (i_t k = 0; k < 2 * m; ++k) { + if (xi_workspace_[k]) { + printf("before Utranspose m %d solve xi workspace %d %d\n", m, k, xi_workspace_[k]); + } + } +#endif + + // U'*y = bprime + i_t top = sparse_triangle_solve( + bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs + +#ifdef CHECK_WORKSPACE + for (i_t k = 0; k < 2 * m; ++k) { + if (xi_workspace_[k]) { + printf( + "after Utranspose m %d top %d solve xi workspace %d %d\n", m, top, k, xi_workspace_[k]); + } + } +#endif + +#ifdef CHECK_PERMUTATION + std::vector rhs_dense2; + rhs.to_dense(rhs_dense2); +#endif + + // Q*y = x + rhs.inverse_permute_vector(inverse_col_permutation_); +#ifdef CHECK_PERMUTATION + rhs.to_dense(rhs_dense_permuted); + std::vector rhs_dense_permuted2(m); + permute_vector(col_permutation_, rhs_dense2, rhs_dense_permuted2); + bool found_error = false; + for (i_t k = 0; k < m; ++k) { + if (std::abs(rhs_dense_permuted[k] - rhs_dense_permuted2[k]) > 1e-6) { + printf("u_transpose2 permutation error %d %e %e\n", + k, + rhs_dense_permuted[k], + rhs_dense_permuted2[k]); + found_error = true; + } + } + if (found_error) { + for (i_t k = 0; k < m; ++k) { + printf("%d (sparse -> permuted -> dense) %e (sparse -> dense -> permuted)%e\n", + k, + 
rhs_dense_permuted[k], + rhs_dense_permuted2[k]); + } + for (i_t k = 0; k < rhs.i.size(); ++k) { + printf("%d rhs sparse %d %e\n", k, rhs.i[k], rhs.x[k]); + } + for (i_t k = 0; k < m; ++k) { + if (rhs_dense_permuted[k] != 0.0) { + printf("%d rhs dense permuted %e\n", k, rhs_dense_permuted[k]); + } + } + for (i_t k = 0; k < m; ++k) { + if (rhs_dense2[k] != 0.0) { printf("%d rhs dense2 %e\n", k, rhs_dense2[k]); } + } + printf("col permutation %d rhs dense 2 %d rhs dense permuted %d\n", + col_permutation_.size(), + rhs_dense2.size(), + rhs_dense_permuted.size()); + for (i_t k = 0; k < col_permutation_.size(); ++k) { + printf("%d col permutation %d\n", k, col_permutation_[k]); + } + for (i_t k = 0; k < m; ++k) { + printf("%d col permutation inverse %d\n", k, inverse_col_permutation_[k]); + } + } +#endif + return 0; +} + template i_t basis_update_t::index_map(i_t r) const { @@ -334,6 +832,7 @@ i_t basis_update_t::update_upper(const std::vector& ind, U_.col_start[n] = new_nz; // Check to ensure that U remains upper triangular +#ifdef CHECK_UPPER_TRIANGULAR for (i_t k = 0; k < n; ++k) { const i_t col_start = U_.col_start[k]; const i_t col_end = U_.col_start[k + 1]; @@ -341,6 +840,10 @@ i_t basis_update_t::update_upper(const std::vector& ind, assert(U_.i[p] <= k); } } +#endif + + // Update U transpose + U_.transpose(U_transpose_); return 0; } @@ -436,7 +939,7 @@ i_t basis_update_t::update(std::vector& utilde, i_t leaving_index norm_s = update_lower(sind, sval, leaving_index); } -#ifdef PARANOID +#ifdef CHECK_ABAR { sparse_matrix_t abar_test(m, 1, 1); const Int nz = lower_triangular_multiply(U_, m - 1, abar_test, 1); @@ -473,21 +976,19 @@ i_t basis_update_t::multiply_lu(csc_matrix_t& out) out.col_start.resize(m + 1); assert(out.m == m); const i_t nz_estimate = L0_.col_start[m] + U_.col_start[m]; -#if 0 - printf("Nz estimate %d m %d num updates %d\n", nz_estimate, m, num_updates_); - printf("q = ["); - for (Int k = 0; k < m; ++k) - { - printf("%d ", col_permutation_[k]); - } 
- printf("];\n"); - //PrintMatrix(L0_); - printf("p = ["); - for (Int k = 0; k < m; ++k) - { - printf("%d ", row_permutation_[k]); - } - printf("];\n"); +#ifdef PRINT_PERMUTATIONS + printf("Nz estimate %d m %d num updates %d\n", nz_estimate, m, num_updates_); + printf("q = ["); + for (i_t k = 0; k < m; ++k) { + printf("%d ", col_permutation_[k]); + } + printf("];\n"); + // PrintMatrix(L0_); + printf("p = ["); + for (i_t k = 0; k < m; ++k) { + printf("%d ", row_permutation_[k]); + } + printf("];\n"); #endif out.reallocate(nz_estimate); @@ -547,16 +1048,14 @@ i_t basis_update_t::lower_triangular_multiply(const csc_matrix_t::lower_triangular_multiply(const csc_matrix_t +void basis_update_mpf_t::gather_into_sparse_vector(i_t nz, + sparse_vector_t& out) const +{ + const i_t m = L0_.m; + out.i.clear(); + out.x.clear(); + out.i.reserve(nz); + out.x.reserve(nz); + const f_t zero_tol = 1e-13; + for (i_t k = 0; k < nz; ++k) { + const i_t i = xi_workspace_[m + k]; + if (std::abs(x_workspace_[i]) > zero_tol) { + out.i.push_back(i); + out.x.push_back(x_workspace_[i]); + } + xi_workspace_[m + k] = 0; + xi_workspace_[i] = 0; + x_workspace_[i] = 0.0; + } +} + +template +void basis_update_mpf_t::solve_to_workspace(i_t top) const +{ + const i_t m = L0_.m; + i_t nz = 0; + for (i_t p = top; p < m; ++p) { + const i_t i = xi_workspace_[p]; + xi_workspace_[m + nz] = i; + xi_workspace_[p] = 0; + nz++; + } + for (i_t k = 0; k < nz; ++k) { + const i_t i = xi_workspace_[m + k]; + xi_workspace_[i] = 1; + } +} + +template +void basis_update_mpf_t::solve_to_sparse_vector(i_t top, + sparse_vector_t& out) const +{ + const i_t m = L0_.m; + out.n = m; + const i_t nz = m - top; + out.x.clear(); + out.i.clear(); + out.x.reserve(nz); + out.i.reserve(nz); + i_t k = 0; + const f_t zero_tol = 1e-13; + for (i_t p = top; p < m; ++p) { + const i_t i = xi_workspace_[p]; + if (std::abs(x_workspace_[i]) > zero_tol) { + out.i.push_back(i); + out.x.push_back(x_workspace_[i]); + } + x_workspace_[i] = 0.0; + 
xi_workspace_[p] = 0; + k++; + } +} + +template +i_t basis_update_mpf_t::scatter_into_workspace(const sparse_vector_t& in) const +{ + const i_t m = L0_.m; + // scatter pattern into xi_workspace_ + i_t nz = in.i.size(); + for (i_t k = 0; k < nz; ++k) { + const i_t i = in.i[k]; + xi_workspace_[i] = 1; + xi_workspace_[m + k] = i; + } + // scatter values into x_workspace_ + for (i_t k = 0; k < nz; ++k) { + x_workspace_[in.i[k]] = in.x[k]; + } + return nz; +} + +template +void basis_update_mpf_t::grow_storage(i_t nz, i_t& S_start, i_t& S_nz) +{ + const i_t last_S_col = num_updates_ * 2; + const i_t new_last_S_col = last_S_col + 2; + if (new_last_S_col >= S_.col_start.size()) { + S_.col_start.resize(new_last_S_col + refactor_frequency_); + } + S_nz = S_.col_start[last_S_col]; + if (S_nz + nz > S_.i.size()) { + S_.i.resize(std::max(2 * S_nz, S_nz + nz)); + S_.x.resize(std::max(2 * S_nz, S_nz + nz)); + } + S_start = last_S_col; +} + +template +i_t basis_update_mpf_t::nonzeros(const std::vector& x) const +{ + i_t nz = 0; + const i_t xsz = x.size(); + for (i_t i = 0; i < xsz; ++i) { + if (x[i] != 0.0) { nz++; } + } + return nz; +} + +// dot = S(:, col)' * x +template +f_t basis_update_mpf_t::dot_product(i_t col, const std::vector& x) const +{ + f_t dot = 0.0; + const i_t col_start = S_.col_start[col]; + const i_t col_end = S_.col_start[col + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = S_.i[p]; + dot += S_.x[p] * x[i]; + } + return dot; +} + +// dot = S(:, col)' * x +template +f_t basis_update_mpf_t::dot_product(i_t col, + const std::vector& mark, + const std::vector& x) const +{ + f_t dot = 0.0; + const i_t col_start = S_.col_start[col]; + const i_t col_end = S_.col_start[col + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = S_.i[p]; + if (mark[i]) { dot += S_.x[p] * x[i]; } + } + return dot; +} + +// x <- x + theta * S(:, col) +template +void basis_update_mpf_t::add_sparse_column(const csc_matrix_t& S, + i_t col, + f_t theta, + 
std::vector& x) const +{ + const i_t col_start = S.col_start[col]; + const i_t col_end = S.col_start[col + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = S.i[p]; + x[i] += theta * S.x[p]; + } +} + +template +void basis_update_mpf_t::add_sparse_column(const csc_matrix_t& S, + i_t col, + f_t theta, + std::vector& mark, + i_t& nz, + std::vector& x) const +{ + const i_t m = L0_.m; + const i_t col_start = S.col_start[col]; + const i_t col_end = S.col_start[col + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = S.i[p]; + if (!mark[i]) { + // Fill occured + mark[i] = 1; + mark[m + nz] = i; + nz++; + } + x[i] += theta * S.x[p]; + } +} + +template +i_t basis_update_mpf_t::b_transpose_solve(const std::vector& rhs, + std::vector& solution) const +{ + std::vector UTsol; + return b_transpose_solve(rhs, solution, UTsol); +} + +template +i_t basis_update_mpf_t::b_transpose_solve(const std::vector& rhs, + std::vector& solution, + std::vector& UTsol) const +{ + const i_t m = L0_.m; + // P*B = L*U + // B'*P' = U'*L' + // We want to solve + // B'*y = c + // Let y = P'*w + // B'*y = B'*P'*w = U'*L'*w = c + // 1. Solve U'*r = c for r + // 2. Solve L'*w = r for w + // 3. 
Compute y = P'*w + + // Solve for r such that U'*r = c + std::vector r = rhs; + u_transpose_solve(r); + UTsol = r; + + // Solve for w such that L'*w = r + l_transpose_solve(r); + + // Compute y = P'*w + inverse_permute_vector(row_permutation_, r, solution); + + return 0; +} + +template +i_t basis_update_mpf_t::b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution) const +{ + sparse_vector_t UTsol(1, 0); + return b_transpose_solve(rhs, solution, UTsol); +} + +template +i_t basis_update_mpf_t::b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& UTsol) const +{ + // Solve for r such that U'*r = c + + bool use_hypersparse = false; + const f_t input_size = static_cast(rhs.i.size()); + estimate_solution_density(input_size, sum_U_transpose_, num_calls_U_transpose_, use_hypersparse); + if (use_hypersparse) { + solution = rhs; + u_transpose_solve(solution); + } else { + std::vector solution_dense; + rhs.to_dense(solution_dense); + u_transpose_solve(solution_dense); + solution.from_dense(solution_dense); + } + UTsol = solution; + sum_U_transpose_ += static_cast(solution.i.size()) / input_size; + +#ifdef CHECK_U_TRANSPOSE_SOLVE + std::vector UTsol_dense; + UTsol.to_dense(UTsol_dense); + std::vector rhs_dense; + rhs.to_dense(rhs_dense); + + matrix_transpose_vector_multiply(U0_, 1.0, UTsol_dense, -1.0, rhs_dense); + if (vector_norm_inf(rhs_dense) > 1e-10) { + printf("B transpose solve U transpose residual %e\n", vector_norm_inf(rhs_dense)); + } +#endif + + // Solve for w such that L'*w = r +#ifdef CHECK_L_TRANSPOSE_SOLVE + std::vector r_dense; + solution.to_dense(r_dense); +#endif + const f_t rhs_size = static_cast(solution.i.size()); + estimate_solution_density(rhs_size, sum_L_transpose_, num_calls_L_transpose_, use_hypersparse); + if (use_hypersparse) { + l_transpose_solve(solution); + } else { + std::vector solution_dense; + solution.to_dense(solution_dense); + l_transpose_solve(solution_dense); + 
solution.from_dense(solution_dense); + } + sum_L_transpose_ += static_cast(solution.i.size()) / rhs_size; + +#ifdef CHECK_L_TRANSPOSE_SOLVE + std::vector solution_dense; + solution.to_dense(solution_dense); + l_transpose_multiply(solution_dense); + f_t max_error = 0.0; + for (i_t k = 0; k < L0_.m; ++k) { + if (std::abs(solution_dense[k] - r_dense[k]) > 1e-4) { + printf( + "B transpose solve L transpose solve error %e: index %d multiply %e rhs %e. update %d. use " + "hypersparse %d\n", + std::abs(solution_dense[k] - r_dense[k]), + k, + solution_dense[k], + r_dense[k], + num_updates_, + use_hypersparse); + } + + max_error = std::max(max_error, std::abs(solution_dense[k] - r_dense[k])); + } + if (max_error > 1e-4) { printf("B transpose solve L transpose solve residual %e\n", max_error); } +#endif + // Compute y = P'*w + solution.inverse_permute_vector(row_permutation_); + return 0; +} + +template +i_t basis_update_mpf_t::u_transpose_solve(std::vector& rhs) const +{ + total_dense_U_transpose_++; + dual_simplex::upper_triangular_transpose_solve(U0_, rhs); + return 0; +} + +template +i_t basis_update_mpf_t::u_transpose_solve(sparse_vector_t& rhs) const +{ + total_sparse_U_transpose_++; + // U0'*x = y + // Solve U0'*x0 = y + i_t top = dual_simplex::sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); + return 0; +} + +template +i_t basis_update_mpf_t::l_transpose_solve(std::vector& rhs) const +{ + total_dense_L_transpose_++; + // L = L0 * T0 * T1 * ... * T_{num_updates_ - 1} + // L' = T_{num_updates_ - 1}^T * T_{num_updates_ - 2}^T * ... * T0^T * L0^T + // L'*x = b + // L0^T *x = T_0^-T * T_1^-T * ... 
* T_{num_updates_ - 1}^-T * b = b' + + const f_t zero_tol = 1e-13; + // Compute b' + for (i_t k = num_updates_ - 1; k >= 0; --k) { + // T_k^{-T} = ( I - v u^T/(1 + u^T v)) + // T_k^{-T} * b = b - v * (u^T * b) / (1 + u^T * v) = b - theta * v, theta = u^T b / mu + + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + const f_t mu = mu_values_[k]; + + // dot = u^T * b + f_t dot = dot_product(u_col, rhs); + const f_t theta = dot / mu; + + if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, -theta, rhs); } + } + + // Solve for x such that L0^T * x = b' + dual_simplex::lower_triangular_transpose_solve(L0_, rhs); + + return 0; +} + +template +i_t basis_update_mpf_t::l_transpose_solve(sparse_vector_t& rhs) const +{ + total_sparse_L_transpose_++; + const i_t m = L0_.m; + // L'*x = b + // L0^T * x = T_0^-T * T_1^-T * ... * T_{num_updates_ - 1}^-T * b = b' + + scatter_into_workspace(rhs); + i_t nz = rhs.i.size(); + +#ifdef CHECK_MULTIPLY + std::vector rhs_dense_0; + rhs.to_dense(rhs_dense_0); +#endif + const f_t zero_tol = 1e-13; + // Compute b' + for (i_t k = num_updates_ - 1; k >= 0; --k) { + // T_k^{-T} = ( I - v u^T/(1 + u^T v)) + // T_k^{-T} * b = b - v * (u^T * b) / (1 + u^T * v) = b - theta * v, theta = u^T b / mu + + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + const f_t mu = mu_values_[k]; + + // dot = u^T * b + f_t dot = dot_product(u_col, xi_workspace_, x_workspace_); + +#ifdef CHECK_MULTIPLY + f_t dot_check = 0.0; + for (i_t p = S_.col_start[u_col]; p < S_.col_start[u_col + 1]; ++p) { + const i_t i = S_.i[p]; + dot_check += S_.x[p] * rhs_dense_0[i]; + } + if (std::abs(dot - dot_check) > 1e-10) { + printf("L transpose solve dot erorr: index %d dot %e dot check %e\n", k, dot, dot_check); + } +#endif + + const f_t theta = dot / mu; + if (std::abs(theta) > zero_tol) { + add_sparse_column(S_, v_col, -theta, xi_workspace_, nz, x_workspace_); + } + +#ifdef CHECK_MULTIPLY + for (i_t p = S_.col_start[v_col]; p < S_.col_start[v_col + 1]; ++p) 
{ + const i_t i = S_.i[p]; + rhs_dense_0[i] -= theta * S_.x[p]; + } +#endif + } + +#ifdef CHECK_MULTIPLY + for (i_t i = 0; i < m; ++i) { + if (std::abs(rhs_dense_0[i] - x_workspace_[i]) > 1e-9) { + printf("L transpose solve multiply error %e index %d sparse %e dense %e\n", + std::abs(rhs_dense_0[i] - x_workspace_[i]), + i, + x_workspace_[i], + rhs_dense_0[i]); + } + } +#endif + + sparse_vector_t b(m, nz); + gather_into_sparse_vector(nz, b); + i_t top = dual_simplex::sparse_triangle_solve( + b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); + +#ifdef CHECK_SPARSE_SOLVE + std::vector rhs_dense; + rhs.to_dense(rhs_dense); + + std::vector b_dense(m, 0.0); + for (i_t p = 0; p < nz; ++p) { + const i_t i = b.i[p]; + b_dense[i] = b.x[p]; + } + matrix_vector_multiply(L0_transpose_, 1.0, rhs_dense, -1.0, b_dense); + if (vector_norm_inf(b_dense) > 1e-9) { + printf("L0 transpose solve residual %e\n", vector_norm_inf(b_dense)); + } +#endif + + return 0; +} + +template +i_t basis_update_mpf_t::b_solve(const std::vector& rhs, + std::vector& solution) const +{ + const i_t m = L0_.m; + std::vector Lsol(m); + return b_solve(rhs, solution, Lsol); +} + +// Solve for x such that B*x = y +template +i_t basis_update_mpf_t::b_solve(const std::vector& rhs, + std::vector& solution, + std::vector& Lsol, + bool need_Lsol) const +{ + const i_t m = L0_.m; + // P*B = L*U + // B*x = b + // P*B*x = P*b + + permute_vector(row_permutation_, rhs, solution); + + // L*U*x = b' + // Solve for v such that L*v = b' +#ifdef CHECK_L_SOLVE + std::vector rhs_permuted = solution; +#endif + l_solve(solution); + if (need_Lsol) { Lsol = solution; } + +#ifdef CHECK_L_SOLVE + std::vector Lsol_check = Lsol; + l_multiply(Lsol_check); + f_t max_lsol_err = 0.0; + for (i_t k = 0; k < m; ++k) { + const f_t err = std::abs(Lsol_check[k] - rhs_permuted[k]); + max_lsol_err = std::max(max_lsol_err, err); + } + printf("B solve L multiply error %e\n", max_lsol_err); +#endif 
+ + // Solve for x such that U*x = v + u_solve(solution); + +#ifdef CHECK_U_SOLVE + std::vector residual = Lsol; + matrix_vector_multiply(U0_, 1.0, solution, -1.0, residual); + f_t max_err = vector_norm_inf(residual); + printf("B solve U solve residual %e\n", max_err); +#endif + return 0; +} + +template +i_t basis_update_mpf_t::b_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution) const +{ + sparse_vector_t Lsol(1, 0); + return b_solve(rhs, solution, Lsol, false); +} + +template +i_t basis_update_mpf_t::b_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& Lsol, + bool need_Lsol) const +{ + const i_t m = L0_.m; + solution = rhs; + solution.inverse_permute_vector(inverse_row_permutation_); + +#ifdef CHECK_PERMUTATION + std::vector permuation_rhs; + rhs.to_dense(permuation_rhs); + std::vector finish_perm(m); + permute_vector(row_permutation_, permuation_rhs, finish_perm); + + std::vector solution_dense2; + solution.to_dense(solution_dense2); + for (i_t k = 0; k < m; ++k) { + if (finish_perm[k] != solution_dense2[k]) { + printf("B solve permutation error %e %e %d\n", finish_perm[k], solution_dense2[k], k); + } + } +#endif + +#ifdef CHECK_L_SOLVE + std::vector l_solve_rhs; + solution.to_dense(l_solve_rhs); +#endif + + bool use_hypersparse; + const f_t input_size = static_cast(rhs.i.size()); + estimate_solution_density(input_size, sum_L_, num_calls_L_, use_hypersparse); + if (use_hypersparse) { + l_solve(solution); + } else { + std::vector solution_dense; + solution.to_dense(solution_dense); + l_solve(solution_dense); + solution.from_dense(solution_dense); + } + if (need_Lsol) { Lsol = solution; } + sum_L_ += static_cast(solution.i.size()) / input_size; + +#ifdef CHECK_L_SOLVE + std::vector l_solve_dense; + Lsol.to_dense(l_solve_dense); + + l_multiply(l_solve_dense); + f_t max_err_l_solve = 0.0; + for (i_t k = 0; k < m; ++k) { + const f_t err = std::abs(l_solve_dense[k] - l_solve_rhs[k]); + max_err_l_solve = 
std::max(max_err_l_solve, err); + } + if (max_err_l_solve > 1e-9) { printf("B solve L solve residual %e\n", max_err_l_solve); } +#endif + +#ifdef CHECK_U_SOLVE + std::vector rhs_dense; + solution.to_dense(rhs_dense); +#endif + + const f_t rhs_size = static_cast(solution.i.size()); + estimate_solution_density(rhs_size, sum_U_, num_calls_U_, use_hypersparse); + if (use_hypersparse) { + u_solve(solution); + } else { + std::vector solution_dense; + solution.to_dense(solution_dense); + u_solve(solution_dense); + solution.from_dense(solution_dense); + } + sum_U_ += static_cast(solution.i.size()) / rhs_size; + +#ifdef CHECK_U_SOLVE + std::vector solution_dense; + solution.to_dense(solution_dense); + + matrix_vector_multiply(U0_, 1.0, solution_dense, -1.0, rhs_dense); + + const f_t max_err = vector_norm_inf(rhs_dense); + if (max_err > 1e-9) { printf("B solve U0 solve residual %e\n", max_err); } +#endif + return 0; +} + +// Solve for x such that U*x = y +template +i_t basis_update_mpf_t::u_solve(std::vector& rhs) const +{ + total_dense_U_++; + const i_t m = L0_.m; + // U*x = y + dual_simplex::upper_triangular_solve(U0_, rhs); + return 0; +} + +template +i_t basis_update_mpf_t::u_solve(sparse_vector_t& rhs) const +{ + total_sparse_U_++; + const i_t m = L0_.m; + // U*x = y + + // Solve U0*x = y + i_t top = dual_simplex::sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data()); + solve_to_sparse_vector(top, rhs); + + return 0; +} +// Solve for x such that L*x = y +template +i_t basis_update_mpf_t::l_solve(std::vector& rhs) const +{ + total_dense_L_++; + const i_t m = L0_.m; + // L*x = y + // L0 * T0 * T1 * ... 
* T_{num_updates_ - 1} * x = y + + // First solve L0*x0 = y +#ifdef CHECK_L0_SOLVE + std::vector residual = rhs; +#endif +#ifdef CHECK_L_SOLVE + std::vector rhs_check = rhs; +#endif + dual_simplex::lower_triangular_solve(L0_, rhs); + +#ifdef CHECK_L0_SOLVE + matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual); + f_t max_err = vector_norm_inf(residual); + printf("L solve: L0 solve residual %e\n", max_err); +#endif + + // Then T0 * T1 * ... * T_{num_updates_ - 1} * x = x0 + // Or x = T_{num_updates}^{-1} * T_1^{-1} * T_0^{-1} x0 + const f_t zero_tol = 1e-16; // Any higher and pilot_ja fails + for (i_t k = 0; k < num_updates_; ++k) { + // T = I + u*v^T + // T^{-1} = I - u*v^T / (1 + v^T*u) + // T^{-1} * x = x - u*v^T * x / (1 + v^T*u) = x - theta * u, theta = v^T * x / (1 + v^T*u) = v^T + // x / mu + const f_t mu = mu_values_[k]; + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + f_t dot = dot_product(v_col, rhs); + const f_t theta = dot / mu; + + if (std::abs(theta) > zero_tol) { add_sparse_column(S_, u_col, -theta, rhs); } + } + +#ifdef CHECK_L_SOLVE + std::vector inout = rhs; + l_multiply(inout); + f_t err_max = 0.0; + for (i_t k = 0; k < m; ++k) { + const f_t err = std::abs(inout[k] - rhs_check[k]); + err_max = std::max(err_max, err); + } + printf("L solve residual %e\n", err_max); +#endif + + return 0; +} + +template +i_t basis_update_mpf_t::l_solve(sparse_vector_t& rhs) const +{ + total_sparse_L_++; + const i_t m = L0_.m; + // L*x = y + // L0 * T0 * T1 * ... * T_{num_updates_ - 1} * x = y + + // First solve L0*x0 = y + i_t top = dual_simplex::sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data()); + solve_to_workspace(top); // Uses xi_workspace_ and x_workspace_ to fill rhs + i_t nz = m - top; + // Then T0 * T1 * ... 
* T_{num_updates_ - 1} * x = x0 + // Or x = T_{num_updates}^{-1} * T_1^{-1} * T_0^{-1} x0 + const f_t zero_tol = 1e-13; + for (i_t k = 0; k < num_updates_; ++k) { + // T = I + u*v^T + // T^{-1} = I - u*v^T / (1 + v^T*u) + // T^{-1} * x = x - u*v^T * x / (1 + v^T*u) = x - theta * u, theta = v^T * x / (1 + v^T*u) = v^T + // x / mu + const f_t mu = mu_values_[k]; + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + + // dot = v^T * x + f_t dot = dot_product(v_col, xi_workspace_, x_workspace_); + + const f_t theta = dot / mu; + if (std::abs(theta) > zero_tol) { + add_sparse_column(S_, u_col, -theta, xi_workspace_, nz, x_workspace_); + } + } + + gather_into_sparse_vector(nz, rhs); + + return 0; +} + +// Takes in utilde such that L*utilde = abar, where abar is the column to add to the basis +// and etilde such that U'*etilde = e_leaving +template +i_t basis_update_mpf_t::update(const std::vector& utilde, + const std::vector& etilde, + i_t leaving_index) +{ + const i_t m = L0_.m; +#ifdef PRINT_NUM_UPDATES + printf("Update: num_updates_ %d\n", num_updates_); +#endif + + // We are going to create a new matrix T = I + u*v^T + const i_t col_start = U0_.col_start[leaving_index]; + const i_t col_end = U0_.col_start[leaving_index + 1]; + std::vector u = utilde; + // u = utilde - U0(:, leaving_index) + add_sparse_column(U0_, leaving_index, -1.0, u); + + i_t u_nz = nonzeros(u); + + // v = etilde + i_t v_nz = nonzeros(etilde); + + i_t nz = u_nz + v_nz; + i_t S_start; + i_t S_nz; + grow_storage(nz, S_start, S_nz); +#ifdef PRINT_NZ_INFO + printf("Update: S_start %d S_nz %d num updates %d S.n %d\n", S_start, S_nz, num_updates_, S_.n); +#endif + + i_t S_nz_start = S_nz; + + // Scatter u into S + S_.append_column(u); + + // Scatter v into S + S_.append_column(etilde); + + // Compute mu = 1 + v^T * u + const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start], + S_.x.data() + S_.col_start[S_start], + S_.col_start[S_start + 1] - S_.col_start[S_start], + S_.i.data() + 
S_.col_start[S_start + 1], + S_.x.data() + S_.col_start[S_start + 1], + v_nz); + + if (std::abs(mu) < 1e-13) { + // Force a refactor. Otherwise we will get numerical issues when dividing by mu. + return 1; + } + +#ifdef CHECK_MU + const f_t mu_check = 1.0 + dot(etilde, u); + printf("Update: mu %e mu_check %e diff %e\n", mu, mu_check, std::abs(mu - mu_check)); +#endif + mu_values_.push_back(mu); + +#ifdef PRINT_MU_INFO + printf("Update mu %e u nz %d v nz %d\n", + mu_values_.back(), + S_.col_start[S_start + 1] - S_.col_start[S_start], + S_.col_start[S_start + 2] - S_.col_start[S_start + 1]); +#endif + num_updates_++; + + return 0; +} + +// Takes in utilde such that L*utilde = abar, where abar is the column to add to the basis +template +i_t basis_update_mpf_t::update(const sparse_vector_t& utilde, + sparse_vector_t& etilde, + i_t leaving_index) +{ + const i_t m = L0_.m; +#ifdef PRINT_NUM_UPDATES + printf("Update: num_updates_ %d\n", num_updates_); +#endif + + // We are going to create a new matrix T = I + u*v^T + // where u = utilde - U0(:, p) and v = etilde + + // Scatter utilde into the workspace + i_t nz = scatter_into_workspace(utilde); + + // Subtract the column of U0 corresponding to the leaving index + add_sparse_column(U0_, leaving_index, -1.0, xi_workspace_, nz, x_workspace_); + + // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect. + std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less()); + + // Gather the workspace into a column of S + i_t S_start; + i_t S_nz; + grow_storage(nz + etilde.i.size(), S_start, S_nz); + + S_.append_column(nz, xi_workspace_.data() + m, x_workspace_.data()); + + // Gather etilde into a column of S + etilde.sort(); // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input? 
+ S_.append_column(etilde); + + // Compute mu = 1 + v^T * u + const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start], + S_.x.data() + S_.col_start[S_start], + S_.col_start[S_start + 1] - S_.col_start[S_start], + S_.i.data() + S_.col_start[S_start + 1], + S_.x.data() + S_.col_start[S_start + 1], + S_.col_start[S_start + 2] - S_.col_start[S_start + 1]); + if (std::abs(mu) < 1e-13) { + // Force a refactor. Otherwise we will get numerical issues when dividing by mu. + return 1; + } + mu_values_.push_back(mu); + // Clear the workspace + for (i_t k = 0; k < nz; ++k) { + const i_t i = xi_workspace_[m + k]; + xi_workspace_[i] = 0; + x_workspace_[i] = 0.0; + xi_workspace_[m + k] = 0; + } + +#ifdef PRINT_MU_INFO + printf("Update mu %e u nz %d v nz %d\n", + mu_values_.back(), + S_.col_start[S_start + 1] - S_.col_start[S_start], + S_.col_start[S_start + 2] - S_.col_start[S_start + 1]); +#endif + + num_updates_++; + + return 0; +} + +template +void basis_update_mpf_t::l_multiply(std::vector& inout) const +{ + const i_t m = L0_.m; + // L*x = y + // L0 * T0 * T1 * ... 
* T_{num_updates_ - 1} * x = y + + for (i_t k = num_updates_ - 1; k >= 0; --k) { + // T_k = ( I + u v^T) + // T_k * b = b + u * (v^T * b) = b + theta * u, theta = v^T b + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + const f_t mu = mu_values_[k]; + + // dot = v^T b + f_t dot = dot_product(v_col, inout); + const f_t theta = dot; + add_sparse_column(S_, u_col, theta, inout); + } + + std::vector out(m, 0.0); + matrix_vector_multiply(L0_, 1.0, inout, 0.0, out); + inout = out; +} + +template +void basis_update_mpf_t::l_transpose_multiply(std::vector& inout) const +{ + const i_t m = L0_.m; + std::vector out(m, 0.0); + matrix_vector_multiply(L0_transpose_, 1.0, inout, 0.0, out); + + inout = out; + + const f_t zero_tol = 1e-13; + for (i_t k = 0; k < num_updates_; ++k) { + const i_t u_col = 2 * k; + const i_t v_col = 2 * k + 1; + const f_t mu = mu_values_[k]; + + // T_k = ( I + u v^T) + // T_k^T = ( I + v u^T) + // T_k^T * b = b + v * (u^T * b) = b + theta * v, theta = u^T * b + f_t dot = dot_product(u_col, inout); + const f_t theta = dot; + if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, theta, inout); } + } +} + +template +void basis_update_mpf_t::multiply_lu(csc_matrix_t& out) const +{ + // P*B = L*U + // B = P'*L*U + const i_t m = L0_.m; + + out.col_start.resize(m + 1); + out.col_start[0] = 0; + out.i.clear(); + out.x.clear(); + + i_t B_nz = 0; + + for (i_t j = 0; j < m; ++j) { + // B(:, j) = L*U(:, j) + out.col_start[j] = B_nz; + + std::vector Uj(m, 0.0); + U0_.load_a_column(j, Uj); + l_multiply(Uj); + for (i_t i = 0; i < m; ++i) { + if (Uj[i] != 0.0) { + out.i.push_back(row_permutation_[i]); + out.x.push_back(Uj[i]); + B_nz++; + } + } + } + out.col_start[m] = B_nz; + + out.m = m; + out.n = m; + out.nz_max = B_nz; +} + #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class basis_update_t; +template class basis_update_mpf_t; #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/basis_updates.hpp 
b/cpp/src/dual_simplex/basis_updates.hpp index 73bec6a5d..73592e180 100644 --- a/cpp/src/dual_simplex/basis_updates.hpp +++ b/cpp/src/dual_simplex/basis_updates.hpp @@ -18,8 +18,11 @@ #pragma once #include +#include #include +#include + namespace cuopt::linear_programming::dual_simplex { // Forrest-Tomlin update to the LU factorization of a basis matrix B @@ -32,11 +35,18 @@ class basis_update_t { : L0_(Linit), U_(Uinit), row_permutation_(p), + inverse_row_permutation_(p.size()), S_(Linit.m, 1, 0), col_permutation_(Linit.m), - inverse_col_permutation_(Linit.m) + inverse_col_permutation_(Linit.m), + xi_workspace_(2 * Linit.m, 0), + x_workspace_(Linit.m, 0.0), + U_transpose_(1, 1, 1), + L0_transpose_(1, 1, 1) { + inverse_permutation(row_permutation_, inverse_row_permutation_); clear(); + compute_transposes(); } i_t reset(const csc_matrix_t& Linit, @@ -47,34 +57,60 @@ class basis_update_t { U_ = Uinit; assert(p.size() == Linit.m); row_permutation_ = p; + inverse_permutation(row_permutation_, inverse_row_permutation_); clear(); + compute_transposes(); return 0; } // Solves for x such that B*x = b, where B is the basis matrix i_t b_solve(const std::vector& rhs, std::vector& solution) const; + // Solves for x such that B*x = b, where B is the basis matrix + i_t b_solve(const sparse_vector_t& rhs, sparse_vector_t& solution) const; + // Solves for x such that B*x = b, where B is the basis matrix, also returns L*v = P*b // This is useful for avoiding an extra solve with the update i_t b_solve(const std::vector& rhs, std::vector& solution, std::vector& Lsol) const; + // Solves for x such that B*x = b, where B is the basis matrix, also returns L*v = P*b + // This is useful for avoiding an extra solve with the update + i_t b_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& Lsol) const; + // Solves for y such that B'*y = c, where B is the basis matrix i_t b_transpose_solve(const std::vector& rhs, std::vector& solution) const; + i_t 
b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution) const; + // Solve for x such that L*x = y i_t l_solve(std::vector& rhs) const; + // Solve for x such that L*x = y + i_t l_solve(sparse_vector_t& rhs) const; + // Solve for x such that L'*x = y i_t l_transpose_solve(std::vector& rhs) const; + // Solve for x such that L'*x = y + i_t l_transpose_solve(sparse_vector_t& rhs) const; + // Solve for x such that U*x = y i_t u_solve(std::vector& rhs) const; + // Solve for x such that U*x = y + i_t u_solve(sparse_vector_t& rhs) const; + // Solve for x such that U'*x = y i_t u_transpose_solve(std::vector& rhs) const; + // Solve for x such that U'*x = y + i_t u_transpose_solve(sparse_vector_t& rhs) const; + // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde // = abar i_t update(std::vector& utilde, i_t leaving_index); @@ -85,6 +121,12 @@ class basis_update_t { const std::vector& row_permutation() const { return row_permutation_; } + void compute_transposes() + { + L0_.transpose(L0_transpose_); + U_.transpose(U_transpose_); + } + private: void clear() { @@ -110,14 +152,271 @@ class basis_update_t { csc_matrix_t& out, i_t out_col) const; - i_t num_updates_; // Number of rank-1 updates to L0 - csc_matrix_t L0_; // Sparse lower triangular matrix from initial factorization - csc_matrix_t U_; // Sparse upper triangular matrix. 
Is modified by updates - std::vector row_permutation_; // Row permutation from initial factorization L*U = P*B - std::vector pivot_indices_; // indicies for rank-1 updates to L - csc_matrix_t S_; // stores the pivot elements for rank-1 updates to L - std::vector col_permutation_; // symmetric permuation q used in U(q, q) represents Q + void solve_to_sparse_vector(i_t top, sparse_vector_t& out) const; + i_t scatter_into_workspace(const sparse_vector_t& in) const; + void gather_into_sparse_vector(i_t nz, sparse_vector_t& out) const; + + i_t num_updates_; // Number of rank-1 updates to L0 + mutable csc_matrix_t L0_; // Sparse lower triangular matrix from initial factorization + mutable csc_matrix_t U_; // Sparse upper triangular matrix. Is modified by updates + std::vector row_permutation_; // Row permutation from initial factorization L*U = P*B + std::vector + inverse_row_permutation_; // Inverse row permutation from initial factorization L*U = P*B + std::vector pivot_indices_; // indicies for rank-1 updates to L + csc_matrix_t S_; // stores the pivot elements for rank-1 updates to L + std::vector col_permutation_; // symmetric permuation q used in U(q, q) represents Q std::vector inverse_col_permutation_; // inverse permutation represents Q' + mutable std::vector xi_workspace_; + mutable std::vector x_workspace_; + mutable csc_matrix_t U_transpose_; // Needed for sparse solves + mutable csc_matrix_t L0_transpose_; // Needed for sparse solves +}; + +// Middle product form update to the LU factorization of a basis matrix B +template +class basis_update_mpf_t { + public: + basis_update_mpf_t(const csc_matrix_t& Linit, + const csc_matrix_t& Uinit, + const std::vector& p, + const i_t refactor_frequency) + : L0_(Linit), + U0_(Uinit), + row_permutation_(p), + inverse_row_permutation_(p.size()), + S_(Linit.m, 0, 0), + col_permutation_(Linit.m), + inverse_col_permutation_(Linit.m), + xi_workspace_(2 * Linit.m, 0), + x_workspace_(Linit.m, 0.0), + U0_transpose_(1, 1, 1), + 
L0_transpose_(1, 1, 1), + refactor_frequency_(refactor_frequency), + total_sparse_L_transpose_(0), + total_dense_L_transpose_(0), + total_sparse_L_(0), + total_dense_L_(0), + total_sparse_U_transpose_(0), + total_dense_U_transpose_(0), + total_sparse_U_(0), + total_dense_U_(0), + hypersparse_threshold_(0.05) + { + inverse_permutation(row_permutation_, inverse_row_permutation_); + clear(); + compute_transposes(); + reset_stas(); + } + + void print_stats() const + { + i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_; + i_t total_U_transpose_calls = total_sparse_U_transpose_ + total_dense_U_transpose_; + i_t total_L_calls = total_sparse_L_ + total_dense_L_; + i_t total_U_calls = total_sparse_U_ + total_dense_U_; + // clang-format off + printf("sparse L transpose %8d %8.2f%\n", total_sparse_L_transpose_, 100.0 * total_sparse_L_transpose_ / total_L_transpose_calls); + printf("dense L transpose %8d %8.2f%\n", total_dense_L_transpose_, 100.0 * total_dense_L_transpose_ / total_L_transpose_calls); + printf("sparse U transpose %8d %8.2f%\n", total_sparse_U_transpose_, 100.0 * total_sparse_U_transpose_ / total_U_transpose_calls); + printf("dense U transpose %8d %8.2f%\n", total_dense_U_transpose_, 100.0 * total_dense_U_transpose_ / total_U_transpose_calls); + printf("sparse L %8d %8.2f%\n", total_sparse_L_, 100.0 * total_sparse_L_ / total_L_calls); + printf("dense L %8d %8.2f%\n", total_dense_L_, 100.0 * total_dense_L_ / total_L_calls); + printf("sparse U %8d %8.2f%\n", total_sparse_U_, 100.0 * total_sparse_U_ / total_U_calls); + printf("dense U %8d %8.2f%\n", total_dense_U_, 100.0 * total_dense_U_ / total_U_calls); + // clang-format on + } + + void reset_stas() + { + num_calls_L_ = 0; + num_calls_U_ = 0; + num_calls_L_transpose_ = 0; + num_calls_U_transpose_ = 0; + sum_L_ = 0.0; + sum_U_ = 0.0; + sum_L_transpose_ = 0.0; + sum_U_transpose_ = 0.0; + } + + i_t reset(const csc_matrix_t& Linit, + const csc_matrix_t& Uinit, + const std::vector& p) 
+ { + L0_ = Linit; + U0_ = Uinit; + assert(p.size() == Linit.m); + row_permutation_ = p; + inverse_permutation(row_permutation_, inverse_row_permutation_); + clear(); + compute_transposes(); + reset_stas(); + return 0; + } + + f_t estimate_solution_density(f_t rhs_nz, f_t sum, i_t& num_calls, bool& use_hypersparse) const + { + num_calls++; + const f_t average_growth = std::max(1.0, sum / static_cast(num_calls)); + const f_t predicted_nz = rhs_nz * average_growth; + const f_t predicted_density = predicted_nz / static_cast(L0_.m); + use_hypersparse = predicted_density < hypersparse_threshold_; + return predicted_nz; + } + + // Solves for x such that B*x = b, where B is the basis matrix + i_t b_solve(const std::vector& rhs, std::vector& solution) const; + i_t b_solve(const sparse_vector_t& rhs, sparse_vector_t& solution) const; + i_t b_solve(const std::vector& rhs, + std::vector& solution, + std::vector& Lsol, + bool need_Lsol = true) const; + i_t b_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& Lsol, + bool need_Lsol = true) const; + + // Solves for y such that B'*y = c, where B is the basis matrix + i_t b_transpose_solve(const std::vector& rhs, std::vector& solution) const; + i_t b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution) const; + i_t b_transpose_solve(const std::vector& rhs, + std::vector& solution, + std::vector& UTsol) const; + i_t b_transpose_solve(const sparse_vector_t& rhs, + sparse_vector_t& solution, + sparse_vector_t& UTsol) const; + // Solve for x such that L*x = y + i_t l_solve(std::vector& rhs) const; + + // Solve for x such that L*x = y + i_t l_solve(sparse_vector_t& rhs) const; + + // Solve for x such that L'*x = y + i_t l_transpose_solve(std::vector& rhs) const; + + // Solve for x such that L'*x = y + i_t l_transpose_solve(sparse_vector_t& rhs) const; + + // Solve for x such that U*x = y + i_t u_solve(std::vector& rhs) const; + + // Solve for x such that U*x = y + i_t 
u_solve(sparse_vector_t& rhs) const; + + // Solve for x such that U'*x = y + i_t u_transpose_solve(std::vector& rhs) const; + + // Solve for x such that U'*x = y + i_t u_transpose_solve(sparse_vector_t& rhs) const; + + // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde + // = abar + i_t update(const std::vector& utilde, const std::vector& etilde, i_t leaving_index); + + // Replace the column B(:, leaving_index) with the vector abar. Pass in utilde such that L*utilde + // = abar + i_t update(const sparse_vector_t& utilde, + sparse_vector_t& etilde, + i_t leaving_index); + + i_t num_updates() const { return num_updates_; } + + const std::vector& row_permutation() const { return row_permutation_; } + + void compute_transposes() + { + L0_.transpose(L0_transpose_); + U0_.transpose(U0_transpose_); + } + + void multiply_lu(csc_matrix_t& out) const; + + private: + void clear() + { + pivot_indices_.clear(); + pivot_indices_.reserve(L0_.m); + std::iota(col_permutation_.begin(), col_permutation_.end(), 0); + std::iota(inverse_col_permutation_.begin(), inverse_col_permutation_.end(), 0); + S_.col_start.resize(refactor_frequency_ + 1); + S_.col_start[0] = 0; + S_.col_start[1] = 0; + S_.i.clear(); + S_.x.clear(); + S_.n = 0; + mu_values_.clear(); + mu_values_.reserve(refactor_frequency_); + num_updates_ = 0; + } + void grow_storage(i_t nz, i_t& S_start, i_t& S_nz); + i_t index_map(i_t leaving) const; + f_t u_diagonal(i_t j) const; + i_t place_diagonals(); + f_t update_lower(const std::vector& sind, const std::vector& sval, i_t leaving); + i_t update_upper(const std::vector& ind, const std::vector& baru, i_t t); + i_t lower_triangular_multiply(const csc_matrix_t& in, + i_t in_col, + csc_matrix_t& out, + i_t out_col) const; + + void solve_to_workspace(i_t top) const; + void solve_to_sparse_vector(i_t top, sparse_vector_t& out) const; + i_t scatter_into_workspace(const sparse_vector_t& in) const; + void gather_into_sparse_vector(i_t 
nz, sparse_vector_t& out) const; + i_t nonzeros(const std::vector& x) const; + f_t dot_product(i_t col, const std::vector& x) const; + f_t dot_product(i_t col, const std::vector& mark, const std::vector& x) const; + void add_sparse_column(const csc_matrix_t& S, + i_t col, + f_t theta, + std::vector& x) const; + void add_sparse_column(const csc_matrix_t& S, + i_t col, + f_t theta, + std::vector& mark, + i_t& nz, + std::vector& x) const; + + void l_multiply(std::vector& inout) const; + void l_transpose_multiply(std::vector& inout) const; + + i_t num_updates_; // Number of rank-1 updates to L0 + i_t refactor_frequency_; // Average updates before refactoring + mutable csc_matrix_t L0_; // Sparse lower triangular matrix from initial factorization + mutable csc_matrix_t U0_; // Sparse upper triangular matrix from initial factorization + std::vector row_permutation_; // Row permutation from initial factorization L*U = P*B + std::vector + inverse_row_permutation_; // Inverse row permutation from initial factorization L*U = P*B + std::vector pivot_indices_; // indicies for rank-1 updates to L + csc_matrix_t S_; // stores information about the rank-1 updates to L + std::vector mu_values_; // stores information about the rank-1 updates to L + std::vector col_permutation_; // symmetric permuation q used in U(q, q) represents Q + std::vector inverse_col_permutation_; // inverse permutation represents Q' + mutable std::vector xi_workspace_; + mutable std::vector x_workspace_; + mutable csc_matrix_t U0_transpose_; // Needed for sparse solves + mutable csc_matrix_t L0_transpose_; // Needed for sparse solves + + mutable i_t total_sparse_L_transpose_; + mutable i_t total_dense_L_transpose_; + mutable i_t total_sparse_L_; + mutable i_t total_dense_L_; + mutable i_t total_sparse_U_transpose_; + mutable i_t total_dense_U_transpose_; + mutable i_t total_sparse_U_; + mutable i_t total_dense_U_; + + mutable i_t num_calls_L_; + mutable i_t num_calls_U_; + mutable i_t 
num_calls_L_transpose_; + mutable i_t num_calls_U_transpose_; + + mutable f_t sum_L_; + mutable f_t sum_U_; + mutable f_t sum_L_transpose_; + mutable f_t sum_U_transpose_; + + f_t hypersparse_threshold_; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp new file mode 100644 index 000000000..11753cbcb --- /dev/null +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp @@ -0,0 +1,346 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +i_t bound_flipping_ratio_test_t::compute_breakpoints(std::vector& indicies, + std::vector& ratios) +{ + i_t n = n_; + i_t m = m_; + constexpr bool verbose = false; + f_t pivot_tol = settings_.pivot_tol; + const f_t dual_tol = settings_.dual_tol / 10; + + i_t idx = 0; + while (idx == 0 && pivot_tol >= 1e-12) { + // for (i_t k = 0; k < n - m; ++k) { + // const i_t j = nonbasic_list_[k]; + for (i_t h = 0; h < delta_z_indices_.size(); ++h) { + const i_t j = delta_z_indices_[h]; + const i_t k = nonbasic_mark_[j]; + if (vstatus_[j] == variable_status_t::NONBASIC_FIXED) { continue; } + if (vstatus_[j] == variable_status_t::NONBASIC_LOWER && delta_z_[j] < -pivot_tol) { + indicies[idx] = k; + ratios[idx] = std::max((-dual_tol - z_[j]) / delta_z_[j], 0.0); + if constexpr (verbose) { settings_.log.printf("ratios[%d] = %e\n", idx, ratios[idx]); } + idx++; + } + if (vstatus_[j] == variable_status_t::NONBASIC_UPPER && delta_z_[j] > pivot_tol) { + indicies[idx] = k; + ratios[idx] = std::max((dual_tol - z_[j]) / delta_z_[j], 0.0); + if constexpr (verbose) { settings_.log.printf("ratios[%d] = %e\n", idx, ratios[idx]); } + idx++; + } + } + pivot_tol /= 10; + } + return idx; +} + +template +i_t bound_flipping_ratio_test_t::single_pass(i_t start, + i_t end, + const std::vector& indicies, + const std::vector& ratios, + f_t& slope, + f_t& step_length, + i_t& nonbasic_entering, + i_t& entering_index) +{ + // Find the minimum ratio + f_t min_val = inf; + entering_index = -1; + i_t candidate = -1; + f_t zero_tol = settings_.zero_tol; + i_t k_idx = -1; + for (i_t k = start; k < end; ++k) { + if (ratios[k] < min_val) { + min_val = ratios[k]; + candidate = indicies[k]; + k_idx = k; + } else if (ratios[k] < min_val + zero_tol) { + // Use Harris to select variables with larger pivots + const i_t j = nonbasic_list_[indicies[k]]; + if (std::abs(delta_z_[j]) > 
std::abs(delta_z_[candidate])) { + min_val = ratios[k]; + candidate = indicies[k]; + k_idx = k; + } + } + } + step_length = min_val; + nonbasic_entering = candidate; + const i_t j = entering_index = nonbasic_list_[nonbasic_entering]; + + constexpr bool verbose = false; + if (bounded_variables_[j]) { + const f_t interval = upper_[j] - lower_[j]; + const f_t delta_slope = std::abs(delta_z_[j]) * interval; + if constexpr (verbose) { + settings_.log.printf("single pass delta slope %e slope %e after slope %e step length %e\n", + delta_slope, + slope, + slope - delta_slope, + step_length); + } + slope -= delta_slope; + return k_idx; // we should see if we can continue to increase the step-length + } + return -1; // we are done. do not increase the step-length further +} + +template +i_t bound_flipping_ratio_test_t::compute_step_length(f_t& step_length, + i_t& nonbasic_entering) +{ + const i_t m = m_; + const i_t n = n_; + const i_t nz = delta_z_indices_.size(); + constexpr bool verbose = false; + + // Compute the initial set of breakpoints + std::vector indicies(nz); + std::vector ratios(nz); + i_t num_breakpoints = compute_breakpoints(indicies, ratios); + if constexpr (verbose) { settings_.log.printf("Initial breakpoints %d\n", num_breakpoints); } + if (num_breakpoints == 0) { + nonbasic_entering = -1; + return -1; + } + + f_t slope = slope_; + nonbasic_entering = -1; + i_t entering_index = -1; + + i_t k_idx = single_pass( + 0, num_breakpoints, indicies, ratios, slope, step_length, nonbasic_entering, entering_index); + bool continue_search = k_idx >= 0 && num_breakpoints > 1 && slope > 0.0; + if (!continue_search) { + if constexpr (0) { + settings_.log.printf( + "BFRT stopping. No bound flips. 
Step length %e Nonbasic entering %d Entering %d pivot %e\n", + step_length, + nonbasic_entering, + entering_index, + std::abs(delta_z_[entering_index])); + } + return entering_index; + } + + if constexpr (verbose) { + settings_.log.printf( + "Continuing past initial step length %e entering index %d nonbasic entering %d slope %e\n", + step_length, + entering_index, + nonbasic_entering, + slope); + } + + // Continue the search using a heap to order the breakpoints + ratios[k_idx] = ratios[num_breakpoints - 1]; + indicies[k_idx] = indicies[num_breakpoints - 1]; + + constexpr bool use_bucket_pass = false; + + if (use_bucket_pass) { + f_t max_ratio = 0.0; + for (i_t k = 0; k < num_breakpoints - 1; ++k) { + if (ratios[k] > max_ratio) { max_ratio = ratios[k]; } + } + settings_.log.printf( + "Starting heap passes. %d breakpoints max ratio %e\n", num_breakpoints - 1, max_ratio); + bucket_pass( + indicies, ratios, num_breakpoints - 1, slope, step_length, nonbasic_entering, entering_index); + } + + heap_passes( + indicies, ratios, num_breakpoints - 1, slope, step_length, nonbasic_entering, entering_index); + + if constexpr (verbose) { + settings_.log.printf("BFRT step length %e entering index %d non basic entering %d pivot %e\n", + step_length, + entering_index, + nonbasic_entering, + std::abs(delta_z_[entering_index])); + } + return entering_index; +} + +template +void bound_flipping_ratio_test_t::heap_passes(const std::vector& current_indicies, + const std::vector& current_ratios, + i_t num_breakpoints, + f_t& slope, + f_t& step_length, + i_t& nonbasic_entering, + i_t& entering_index) +{ + std::vector bare_idx(num_breakpoints); + constexpr bool verbose = false; + const f_t dual_tol = settings_.dual_tol; + const f_t zero_tol = settings_.zero_tol; + const std::vector& delta_z = delta_z_; + const std::vector& nonbasic_list = nonbasic_list_; + const i_t N = num_breakpoints; + for (i_t k = 0; k < N; ++k) { + bare_idx[k] = k; + if constexpr (verbose) { + 
settings_.log.printf("Adding index %d ratio %e pivot %e to heap\n", + current_indicies[k], + current_ratios[k], + std::abs(delta_z[nonbasic_list[current_indicies[k]]])); + } + } + + auto compare = [zero_tol, ¤t_ratios, ¤t_indicies, &delta_z, &nonbasic_list]( + const i_t& a, const i_t& b) { + return (current_ratios[a] > current_ratios[b]) || + (current_ratios[b] - current_ratios[a] < zero_tol && + std::abs(delta_z[nonbasic_list[current_indicies[a]]]) > + std::abs(delta_z[nonbasic_list[current_indicies[b]]])); + }; + + std::make_heap(bare_idx.begin(), bare_idx.end(), compare); + + while (bare_idx.size() > 0 && slope > 0) { + // Remove minimum ratio from the heap and rebalance + i_t heap_index = bare_idx.front(); + std::pop_heap(bare_idx.begin(), bare_idx.end(), compare); + bare_idx.pop_back(); + + nonbasic_entering = current_indicies[heap_index]; + const i_t j = entering_index = nonbasic_list_[nonbasic_entering]; + step_length = current_ratios[heap_index]; + + if (bounded_variables_[j]) { + // We have a bounded variable + const f_t interval = upper_[j] - lower_[j]; + const f_t delta_slope = std::abs(delta_z_[j]) * interval; + const f_t pivot = std::abs(delta_z[j]); + if constexpr (verbose) { + settings_.log.printf( + "heap %d step-length %.12e pivot %e nonbasic entering %d slope %e delta_slope %e new " + "slope %e\n", + bare_idx.size(), + current_ratios[heap_index], + pivot, + nonbasic_entering, + slope, + delta_slope, + slope - delta_slope); + } + slope -= delta_slope; + } else { + // The variable is not bounded. Stop the search. 
+ break; + } + + if (toc(start_time_) > settings_.time_limit) { + entering_index = -2; + return; + } + if (settings_.concurrent_halt != nullptr && + settings_.concurrent_halt->load(std::memory_order_acquire) == 1) { + entering_index = -3; + return; + } + } +} + +template +void bound_flipping_ratio_test_t::bucket_pass(const std::vector& current_indicies, + const std::vector& current_ratios, + i_t num_breakpoints, + f_t& slope, + f_t& step_length, + i_t& nonbasic_entering, + i_t& entering_index) +{ + const f_t dual_tol = settings_.dual_tol; + const f_t zero_tol = settings_.zero_tol; + const std::vector& delta_z = delta_z_; + const std::vector& nonbasic_list = nonbasic_list_; + const i_t N = num_breakpoints; + + const i_t K = 400; // 0, -16, -15, ...., 0, 1, ...., 400 - 18 = 382 + std::vector buckets(K, 0.0); + std::vector bucket_count(K, 0); + for (i_t k = 0; k < N; ++k) { + const i_t idx = current_indicies[k]; + const f_t ratio = current_ratios[k]; + const f_t min_exponent = -16.0; + const f_t max_exponent = 382.0; + const f_t exponent = std::max(min_exponent, std::min(max_exponent, std::log10(ratio))); + const i_t bucket_idx = ratio == 0.0 ? 0 : static_cast(exponent - min_exponent + 1); + // settings_.log.printf("Ratio %e exponent %e bucket_idx %d\n", ratio, exponent, bucket_idx); + const i_t j = nonbasic_list[idx]; + const f_t interval = upper_[j] - lower_[j]; + const f_t delta_slope = std::abs(delta_z_[j]) * interval; + buckets[bucket_idx] += delta_slope; + bucket_count[bucket_idx]++; + } + + std::vector cumulative_sum(K, 0.0); + cumulative_sum[0] = buckets[0]; + if (cumulative_sum[0] > slope) { + settings_.log.printf( + "Bucket 0. Count in bucket %d. Slope %e. Cumulative sum %e. 
Bucket value %e\n", + bucket_count[0], + slope, + cumulative_sum[0], + buckets[0]); + return; + } + i_t k; + bool exceeded = false; + for (k = 1; k < K; ++k) { + cumulative_sum[k] = cumulative_sum[k - 1] + buckets[k]; + if (cumulative_sum[k] > slope) { + exceeded = true; + break; + } + } + + if (exceeded) { + settings_.log.printf( + "Value in bucket %d. Count in buckets %d. Slope %e. Cumulative sum %e. Next sum %e Bucket " + "value %e\n", + k, + bucket_count[k], + slope, + cumulative_sum[k - 1], + cumulative_sum[k], + buckets[k - 1]); + } +} + +#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE + +template class bound_flipping_ratio_test_t; + +#endif + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp new file mode 100644 index 000000000..1d741ba28 --- /dev/null +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp @@ -0,0 +1,107 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +class bound_flipping_ratio_test_t { + public: + bound_flipping_ratio_test_t(const simplex_solver_settings_t& settings, + f_t start_time, + i_t m, + i_t n, + f_t initial_slope, + const std::vector& lower, + const std::vector& upper, + const std::vector& bounded_variables, + const std::vector& vstatus, + const std::vector& nonbasic_list, + const std::vector& z, + const std::vector& delta_z, + const std::vector& delta_z_indices, + const std::vector& nonbasic_mark) + : settings_(settings), + start_time_(start_time), + m_(m), + n_(n), + slope_(initial_slope), + lower_(lower), + upper_(upper), + bounded_variables_(bounded_variables), + vstatus_(vstatus), + nonbasic_list_(nonbasic_list), + z_(z), + delta_z_(delta_z), + delta_z_indices_(delta_z_indices), + nonbasic_mark_(nonbasic_mark) + { + } + + i_t compute_step_length(f_t& step_length, i_t& nonbasic_entering); + + private: + i_t compute_breakpoints(std::vector& indices, std::vector& ratios); + i_t single_pass(i_t start, + i_t end, + const std::vector& indices, + const std::vector& ratios, + f_t& slope, + f_t& step_length, + i_t& nonbasic_entering, + i_t& enetering_index); + void heap_passes(const std::vector& current_indicies, + const std::vector& current_ratios, + i_t num_breakpoints, + f_t& slope, + f_t& step_lenght, + i_t& nonbasic_entering, + i_t& entering_index); + + void bucket_pass(const std::vector& current_indicies, + const std::vector& current_ratios, + i_t num_breakpoints, + f_t& slope, + f_t& step_length, + i_t& nonbasic_entering, + i_t& entering_index); + + const std::vector& lower_; + const std::vector& upper_; + const std::vector& bounded_variables_; + const std::vector& nonbasic_list_; + const std::vector& vstatus_; + const std::vector& z_; + const std::vector& delta_z_; + const std::vector& delta_z_indices_; + const std::vector& nonbasic_mark_; + + const simplex_solver_settings_t& 
settings_; + + f_t start_time_; + f_t slope_; + + i_t n_; + i_t m_; +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp index e141a71bf..906704012 100644 --- a/cpp/src/dual_simplex/branch_and_bound.cpp +++ b/cpp/src/dual_simplex/branch_and_bound.cpp @@ -205,6 +205,29 @@ void graphviz_edge(const simplex_solver_settings_t& settings, } } +dual::status_t convert_lp_status_to_dual_status(lp_status_t status) +{ + if (status == lp_status_t::OPTIMAL) { + return dual::status_t::OPTIMAL; + } else if (status == lp_status_t::INFEASIBLE) { + return dual::status_t::DUAL_UNBOUNDED; + } else if (status == lp_status_t::ITERATION_LIMIT) { + return dual::status_t::ITERATION_LIMIT; + } else if (status == lp_status_t::TIME_LIMIT) { + return dual::status_t::TIME_LIMIT; + } else if (status == lp_status_t::NUMERICAL_ISSUES) { + return dual::status_t::NUMERICAL; + } else if (status == lp_status_t::CUTOFF) { + return dual::status_t::CUTOFF; + } else if (status == lp_status_t::CONCURRENT_LIMIT) { + return dual::status_t::CONCURRENT_LIMIT; + } else if (status == lp_status_t::UNSET) { + return dual::status_t::UNSET; + } else { + return dual::status_t::NUMERICAL; + } +} + } // namespace template @@ -380,7 +403,7 @@ branch_and_bound_t::branch_and_bound_t( : original_problem(user_problem), settings(solver_settings), original_lp(1, 1, 1) { start_time = tic(); - convert_user_problem(original_problem, original_lp, new_slacks); + convert_user_problem(original_problem, settings, original_lp, new_slacks); full_variable_types(original_problem, original_lp, var_types); global_variables::mutex_upper.lock(); @@ -674,6 +697,12 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut leaf_solution, node_iter, leaf_edge_norms); + if (lp_status == dual::status_t::NUMERICAL) { + settings.log.printf("Numerical issue node %d. 
Resolving from scratch.\n", nodes_explored); + lp_status_t second_status = solve_linear_program_advanced( + leaf_problem, lp_start_time, lp_settings, leaf_solution, leaf_vstatus, leaf_edge_norms); + lp_status = convert_lp_status_to_dual_status(second_status); + } total_lp_solve_time += toc(lp_start_time); total_lp_iters += node_iter; diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index cbfc66a22..76f4768ab 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -17,14 +17,17 @@ #include #include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -34,17 +37,467 @@ namespace cuopt::linear_programming::dual_simplex { namespace phase2 { +// Computes vectors farkas_y, farkas_zl, farkas_zu that satisfy +// +// A'*farkas_y + farkas_zl - farkas_zu ~= 0 +// farkas_zl, farkas_zu >= 0, +// b'*farkas_y + l'*farkas_zl - u'*farkas_zu = farkas_constant > 0 +// +// This is a Farkas certificate for the infeasibility of the primal problem +// +// A*x = b, l <= x <= u +template +void compute_farkas_certificate(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& vstatus, + const std::vector& x, + const std::vector& y, + const std::vector& z, + const std::vector& delta_y, + const std::vector& delta_z, + i_t direction, + i_t leaving_index, + f_t obj_val, + std::vector& farkas_y, + std::vector& farkas_zl, + std::vector& farkas_zu, + f_t& farkas_constant) +{ + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + + std::vector original_residual = z; + matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, original_residual); + for (i_t j = 0; j < n; ++j) { + original_residual[j] -= lp.objective[j]; + } + const f_t original_residual_norm = vector_norm2(original_residual); + settings.log.printf("|| A'*y + z - c || = %e\n", original_residual_norm); + + std::vector zl(n); + std::vector zu(n); + for (i_t j = 0; j < n; ++j) 
{ + zl[j] = std::max(0.0, z[j]); + zu[j] = -std::min(0.0, z[j]); + } + + original_residual = zl; + matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, original_residual); + for (i_t j = 0; j < n; ++j) { + original_residual[j] -= (zu[j] + lp.objective[j]); + } + const f_t original_residual_2 = vector_norm2(original_residual); + settings.log.printf("|| A'*y + zl - zu - c || = %e\n", original_residual_2); + + std::vector search_dir_residual = delta_z; + matrix_transpose_vector_multiply(lp.A, 1.0, delta_y, 1.0, search_dir_residual); + settings.log.printf("|| A'*delta_y + delta_z || = %e\n", + vector_norm2(search_dir_residual)); + + std::vector y_bar(m); + for (i_t i = 0; i < m; ++i) { + y_bar[i] = y[i] + delta_y[i]; + } + original_residual = z; + matrix_transpose_vector_multiply(lp.A, 1.0, y_bar, 1.0, original_residual); + for (i_t j = 0; j < n; ++j) { + original_residual[j] += (delta_z[j] - lp.objective[j]); + } + const f_t original_residual_3 = vector_norm2(original_residual); + settings.log.printf("|| A'*(y + delta_y) + (z + delta_z) - c || = %e\n", original_residual_3); + + farkas_y.resize(m); + farkas_zl.resize(n); + farkas_zu.resize(n); + + f_t gamma = 0.0; + for (i_t j = 0; j < n; ++j) { + const f_t cj = lp.objective[j]; + const f_t lower = lp.lower[j]; + const f_t upper = lp.upper[j]; + if (lower > -inf) { gamma -= lower * std::min(0.0, cj); } + if (upper < inf) { gamma -= upper * std::max(0.0, cj); } + } + printf("gamma = %e\n", gamma); + + const f_t threshold = 1.0; + const f_t positive_threshold = std::max(-gamma, 0.0) + threshold; + printf("positive_threshold = %e\n", positive_threshold); + + // We need to increase the dual objective to positive threshold + f_t alpha = threshold; + const f_t infeas = (direction == 1) ? 
(lp.lower[leaving_index] - x[leaving_index]) + : (x[leaving_index] - lp.upper[leaving_index]); + // We need the new objective to be at least positive_threshold + // positive_threshold = obj_val+ alpha * infeas + // infeas > 0, alpha > 0, positive_threshold > 0 + printf("direction = %d\n", direction); + printf( + "lower %e x %e upper %d\n", lp.lower[leaving_index], x[leaving_index], lp.upper[leaving_index]); + printf("infeas = %e\n", infeas); + printf("obj_val = %e\n", obj_val); + alpha = std::max(threshold, (positive_threshold - obj_val) / infeas); + printf("alpha = %e\n", alpha); + + std::vector y_prime(m); + std::vector zl_prime(n); + std::vector zu_prime(n); + + // farkas_y = y + alpha * delta_y + for (i_t i = 0; i < m; ++i) { + farkas_y[i] = y[i] + alpha * delta_y[i]; + y_prime[i] = y[i] + alpha * delta_y[i]; + } + // farkas_zl = z + alpha * delta_z - c- + for (i_t j = 0; j < n; ++j) { + const f_t cj = lp.objective[j]; + const f_t z_j = z[j]; + const f_t delta_z_j = delta_z[j]; + farkas_zl[j] = std::max(0.0, z_j) + alpha * std::max(0.0, delta_z_j) + -std::min(0.0, cj); + zl_prime[j] = zl[j] + alpha * std::max(0.0, delta_z_j); + } + + // farkas_zu = z + alpha * delta_z + c+ + for (i_t j = 0; j < n; ++j) { + const f_t cj = lp.objective[j]; + const f_t z_j = z[j]; + const f_t delta_z_j = delta_z[j]; + farkas_zu[j] = -std::min(0.0, z_j) - alpha * std::min(0.0, delta_z_j) + std::max(0.0, cj); + zu_prime[j] = zu[j] + alpha * (-std::min(0.0, delta_z_j)); + } + + // farkas_constant = b'*farkas_y + l'*farkas_zl - u'*farkas_zu + farkas_constant = 0.0; + f_t test_constant = 0.0; + f_t test_3 = 0.0; + for (i_t i = 0; i < m; ++i) { + farkas_constant += lp.rhs[i] * farkas_y[i]; + test_constant += lp.rhs[i] * y_prime[i]; + test_3 += lp.rhs[i] * delta_y[i]; + } + printf("b'*delta_y = %e\n", test_3); + printf("|| b || %e\n", vector_norm_inf(lp.rhs)); + printf("|| delta y || %e\n", vector_norm_inf(delta_y)); + for (i_t j = 0; j < n; ++j) { + const f_t lower = lp.lower[j]; + 
const f_t upper = lp.upper[j]; + if (lower > -inf) { + farkas_constant += lower * farkas_zl[j]; + test_constant += lower * zl_prime[j]; + const f_t delta_z_l_j = std::max(delta_z[j], 0.0); + test_3 += lower * delta_z_l_j; + } + if (upper < inf) { + farkas_constant -= upper * farkas_zu[j]; + test_constant -= upper * zu_prime[j]; + const f_t delta_z_u_j = -std::min(delta_z[j], 0.0); + test_3 -= upper * delta_z_u_j; + } + } + + // Verify that the Farkas certificate is valid + std::vector residual = farkas_zl; + matrix_transpose_vector_multiply(lp.A, 1.0, farkas_y, 1.0, residual); + for (i_t j = 0; j < n; ++j) { + residual[j] -= farkas_zu[j]; + } + const f_t residual_norm = vector_norm2(residual); + + f_t zl_min = 0.0; + for (i_t j = 0; j < n; ++j) { + zl_min = std::min(zl_min, farkas_zl[j]); + } + settings.log.printf("farkas_zl_min = %e\n", zl_min); + f_t zu_min = 0.0; + for (i_t j = 0; j < n; ++j) { + zu_min = std::min(zu_min, farkas_zu[j]); + } + settings.log.printf("farkas_zu_min = %e\n", zu_min); + + settings.log.printf("|| A'*farkas_y + farkas_zl - farkas_zu || = %e\n", residual_norm); + settings.log.printf("b'*farkas_y + l'*farkas_zl - u'*farkas_zu = %e\n", farkas_constant); + + if (residual_norm < 1e-6 && farkas_constant > 0.0 && zl_min >= 0.0 && zu_min >= 0.0) { + settings.log.printf("Farkas certificate of infeasibility constructed\n"); + } +} + +template +void initial_perturbation(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& vstatus, + std::vector& objective) +{ + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + f_t max_abs_obj_coeff = 0.0; + for (i_t j = 0; j < n; ++j) { + max_abs_obj_coeff = std::max(max_abs_obj_coeff, std::abs(lp.objective[j])); + } + + const f_t dual_tol = settings.dual_tol; + + std::srand(static_cast(std::time(nullptr))); + + objective.resize(n); + f_t sum_perturb = 0.0; + i_t num_perturb = 0; + + random_t random(settings.seed); + for (i_t j = 0; j < n; ++j) { + f_t obj = objective[j] 
= lp.objective[j]; + + const f_t lower = lp.lower[j]; + const f_t upper = lp.upper[j]; + if (vstatus[j] == variable_status_t::NONBASIC_FIXED || + vstatus[j] == variable_status_t::NONBASIC_FREE || lower == upper || + lower == -inf && upper == inf) { + continue; + } + + const f_t rand_val = random.random(); + const f_t perturb = + (1e-5 * std::abs(obj) + 1e-7 * max_abs_obj_coeff + 10 * dual_tol) * (1.0 + rand_val); + + if (vstatus[j] == variable_status_t::NONBASIC_LOWER || lower > -inf && upper < inf && obj > 0) { + objective[j] = obj + perturb; + sum_perturb += perturb; + num_perturb++; + } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER || + lower > -inf && upper < inf && obj < 0) { + objective[j] = obj - perturb; + sum_perturb += perturb; + num_perturb++; + } + } + + settings.log.printf("Applied initial perturbation of %e to %d/%d objective coefficients\n", + sum_perturb, + num_perturb, + n); +} + +template +void compute_reduced_cost_update(const lp_problem_t& lp, + const std::vector& basic_list, + const std::vector& nonbasic_list, + const std::vector& delta_y, + i_t leaving_index, + i_t direction, + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z) +{ + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + + // delta_zB = sigma*ei + for (i_t k = 0; k < m; k++) { + const i_t j = basic_list[k]; + delta_z[j] = 0; + } + delta_z[leaving_index] = direction; + // delta_zN = -N'*delta_y + for (i_t k = 0; k < n - m; k++) { + const i_t j = nonbasic_list[k]; + // z_j <- -A(:, j)'*delta_y + const i_t col_start = lp.A.col_start[j]; + const i_t col_end = lp.A.col_start[j + 1]; + f_t dot = 0.0; + for (i_t p = col_start; p < col_end; ++p) { + dot += lp.A.x[p] * delta_y[lp.A.i[p]]; + } + delta_z[j] = -dot; + if (dot != 0.0) { + delta_z_indices.push_back(j); // Note delta_z_indices has n elements reserved + delta_z_mark[j] = 1; + } + } +} + +template +void compute_delta_z(const csc_matrix_t& A_transpose, + const sparse_vector_t& 
delta_y, + i_t leaving_index, + i_t direction, + std::vector& nonbasic_mark, + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z) +{ + // delta_zN = - N'*delta_y + const i_t nz_delta_y = delta_y.i.size(); + for (i_t k = 0; k < nz_delta_y; k++) { + const i_t i = delta_y.i[k]; + const f_t delta_y_i = delta_y.x[k]; + if (std::abs(delta_y_i) < 1e-12) { continue; } + const i_t row_start = A_transpose.col_start[i]; + const i_t row_end = A_transpose.col_start[i + 1]; + for (i_t p = row_start; p < row_end; ++p) { + const i_t j = A_transpose.i[p]; + if (nonbasic_mark[j] >= 0) { + delta_z[j] -= delta_y_i * A_transpose.x[p]; + if (!delta_z_mark[j]) { + delta_z_mark[j] = 1; + delta_z_indices.push_back(j); // Note delta_z_indices has n elements reserved + } + } + } + } + + // delta_zB = sigma*ei + delta_z[leaving_index] = direction; + +#ifdef CHECK_CHANGE_IN_REDUCED_COST + delta_y_sparse.to_dense(delta_y); + std::vector delta_z_check(n); + std::vector delta_z_mark_check(n, 0); + std::vector delta_z_indices_check; + phase2::compute_reduced_cost_update(lp, + basic_list, + nonbasic_list, + delta_y, + leaving_index, + direction, + delta_z_mark_check, + delta_z_indices_check, + delta_z_check); + f_t error_check = 0.0; + for (i_t k = 0; k < n; ++k) { + const f_t diff = std::abs(delta_z[k] - delta_z_check[k]); + if (diff > 1e-6) { + printf("delta_z error %d transpose %e no transpose %e diff %e\n", + k, + delta_z[k], + delta_z_check[k], + diff); + } + error_check = std::max(error_check, diff); + } + if (error_check > 1e-6) { printf("delta_z error %e\n", error_check); } +#endif +} + +template +void compute_reduced_costs(const std::vector& objective, + const csc_matrix_t& A, + const std::vector& y, + const std::vector& basic_list, + const std::vector& nonbasic_list, + std::vector& z) +{ + const i_t m = A.m; + const i_t n = A.n; + // zN = cN - N'*y + for (i_t k = 0; k < n - m; k++) { + const i_t j = nonbasic_list[k]; + // z_j <- c_j + z[j] = objective[j]; + 
+ // z_j <- z_j - A(:, j)'*y + const i_t col_start = A.col_start[j]; + const i_t col_end = A.col_start[j + 1]; + f_t dot = 0.0; + for (i_t p = col_start; p < col_end; ++p) { + dot += A.x[p] * y[A.i[p]]; + } + z[j] -= dot; + } + // zB = 0 + for (i_t k = 0; k < m; ++k) { + z[basic_list[k]] = 0.0; + } +} + +template +void compute_primal_variables(const basis_update_mpf_t& ft, + const std::vector& lp_rhs, + const csc_matrix_t& A, + const std::vector& basic_list, + const std::vector& nonbasic_list, + f_t tight_tol, + std::vector& x) +{ + const i_t m = A.m; + const i_t n = A.n; + std::vector rhs = lp_rhs; + // rhs = b - sum_{j : x_j = l_j} A(:, j) * l(j) + // - sum_{j : x_j = u_j} A(:, j) * u(j) + for (i_t k = 0; k < n - m; ++k) { + const i_t j = nonbasic_list[k]; + const i_t col_start = A.col_start[j]; + const i_t col_end = A.col_start[j + 1]; + const f_t xj = x[j]; + if (std::abs(xj) < tight_tol * 10) continue; + for (i_t p = col_start; p < col_end; ++p) { + rhs[A.i[p]] -= xj * A.x[p]; + } + } + + std::vector xB(m); + ft.b_solve(rhs, xB); + + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + x[j] = xB[k]; + } +} + +template +void clear_delta_z(i_t entering_index, + i_t leaving_index, + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z) +{ + for (i_t k = 0; k < delta_z_indices.size(); k++) { + const i_t j = delta_z_indices[k]; + delta_z[j] = 0.0; + delta_z_mark[j] = 0; + } + if (entering_index != -1) { delta_z[entering_index] = 0.0; } + delta_z[leaving_index] = 0.0; + delta_z_indices.clear(); +} + +template +void clear_delta_x(const std::vector& basic_list, + i_t entering_index, + sparse_vector_t& scaled_delta_xB_sparse, + std::vector& delta_x) +{ + const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); + for (i_t k = 0; k < scaled_delta_xB_nz; ++k) { + const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; + delta_x[j] = 0.0; + } + // Leaving index already included above + delta_x[entering_index] = 0.0; + 
scaled_delta_xB_sparse.i.clear(); + scaled_delta_xB_sparse.x.clear(); +} + +template +void compute_dual_residual(const csc_matrix_t& A, + const std::vector& objective, + const std::vector& y, + const std::vector& z, + std::vector& dual_residual) +{ + dual_residual = z; + const i_t n = A.n; + // r = A'*y + z - c + for (i_t j = 0; j < n; ++j) { + dual_residual[j] -= objective[j]; + } + matrix_transpose_vector_multiply(A, 1.0, y, 1.0, dual_residual); +} + template f_t l2_dual_residual(const lp_problem_t& lp, const lp_solution_t& solution) { - std::vector dual_residual = solution.z; - const i_t n = lp.num_cols; - // dual_residual <- z - c - for (i_t j = 0; j < n; j++) { - dual_residual[j] -= lp.objective[j]; - } - // dual_residual <- 1.0*A'*y + 1.0*(z - c) - matrix_transpose_vector_multiply(lp.A, 1.0, solution.y, 1.0, dual_residual); + std::vector dual_residual; + compute_dual_residual(lp.A, lp.objective, solution.y, solution.z, dual_residual); return vector_norm2(dual_residual); } @@ -56,9 +509,38 @@ f_t l2_primal_residual(const lp_problem_t& lp, const lp_solution_t(primal_residual); } +template +void vstatus_changes(const std::vector& vstatus, + const std::vector& vstatus_old, + const std::vector& z, + const std::vector& z_old, + i_t& num_vstatus_changes, + i_t& num_z_changes) +{ + num_vstatus_changes = 0; + num_z_changes = 0; + const i_t n = vstatus.size(); + for (i_t j = 0; j < n; ++j) { + if (vstatus[j] != vstatus_old[j]) { num_vstatus_changes++; } + if (std::abs(z[j] - z_old[j]) > 1e-6) { num_z_changes++; } + } +} + +template +void compute_bounded_info(const std::vector& lower, + const std::vector& upper, + std::vector& bounded_variables) +{ + const size_t n = lower.size(); + for (size_t j = 0; j < n; j++) { + const bool bounded = (lower[j] > -inf) && (upper[j] < inf) && (lower[j] != upper[j]); + bounded_variables[j] = static_cast(bounded); + } +} + template void compute_dual_solution_from_basis(const lp_problem_t& lp, - basis_update_t& ft, + basis_update_mpf_t& 
ft, const std::vector& basic_list, const std::vector& nonbasic_list, std::vector& y, @@ -101,6 +583,217 @@ void compute_dual_solution_from_basis(const lp_problem_t& lp, } } +template +i_t compute_primal_solution_from_basis(const lp_problem_t& lp, + basis_update_mpf_t& ft, + const std::vector& basic_list, + const std::vector& nonbasic_list, + const std::vector& vstatus, + std::vector& x) +{ + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + std::vector rhs = lp.rhs; + + for (i_t k = 0; k < n - m; ++k) { + const i_t j = nonbasic_list[k]; + if (vstatus[j] == variable_status_t::NONBASIC_LOWER || + vstatus[j] == variable_status_t::NONBASIC_FIXED) { + x[j] = lp.lower[j]; + } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) { + x[j] = lp.upper[j]; + } else if (vstatus[j] == variable_status_t::NONBASIC_FREE) { + x[j] = 0.0; + } + } + + // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) * + // u(j) + for (i_t k = 0; k < n - m; ++k) { + const i_t j = nonbasic_list[k]; + const i_t col_start = lp.A.col_start[j]; + const i_t col_end = lp.A.col_start[j + 1]; + const f_t xj = x[j]; + for (i_t p = col_start; p < col_end; ++p) { + rhs[lp.A.i[p]] -= xj * lp.A.x[p]; + } + } + + std::vector xB(m); + ft.b_solve(rhs, xB); + + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + x[j] = xB[k]; + } + return 0; +} + +template +f_t compute_initial_primal_infeasibilities(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& basic_list, + const std::vector& x, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices) +{ + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + squared_infeasibilities.resize(n, 0.0); + infeasibility_indices.reserve(n); + infeasibility_indices.clear(); + f_t primal_inf = 0.0; + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + const f_t lower_infeas = lp.lower[j] - x[j]; + const f_t upper_infeas = x[j] - lp.upper[j]; + const f_t infeas = 
std::max(lower_infeas, upper_infeas); + if (infeas > settings.primal_tol) { + const f_t square_infeas = infeas * infeas; + squared_infeasibilities[j] = square_infeas; + infeasibility_indices.push_back(j); + primal_inf += square_infeas; + } + } + return primal_inf; +} + +template +void update_single_primal_infeasibility(const std::vector& lower, + const std::vector& upper, + const std::vector& x, + f_t primal_tol, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, + i_t j, + f_t& primal_inf) +{ + const f_t old_val = squared_infeasibilities[j]; + // x_j < l_j - epsilon => -x_j + l_j > epsilon + const f_t lower_infeas = lower[j] - x[j]; + // x_j > u_j + epsilon => x_j - u_j > epsilon + const f_t upper_infeas = x[j] - upper[j]; + const f_t infeas = std::max(lower_infeas, upper_infeas); + const f_t new_val = infeas * infeas; + if (infeas > primal_tol) { + primal_inf = std::max(0.0, primal_inf + (new_val - old_val)); + // We are infeasible w.r.t the tolerance + if (old_val == 0.0) { + // This is a new infeasibility + // We need to add it to the list + infeasibility_indices.push_back(j); + } else { + // Already infeasible + } + squared_infeasibilities[j] = new_val; + } else { + // We are feasible w.r.t the tolerance + if (old_val != 0.0) { + // We were previously infeasible, + primal_inf = std::max(0.0, primal_inf - old_val); + squared_infeasibilities[j] = 0.0; + } else { + // Still feasible + } + } +} + +template +void update_primal_infeasibilities(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& basic_list, + const std::vector& x, + i_t entering_index, + i_t leaving_index, + std::vector& basic_change_list, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, + f_t& primal_inf) +{ + const f_t primal_tol = settings.primal_tol; + const i_t nz = basic_change_list.size(); + for (i_t k = 0; k < nz; ++k) { + const i_t j = basic_list[basic_change_list[k]]; + // The change list will 
contain the leaving variable, + // But not the entering variable. + + if (j == leaving_index) { + // Force the leaving variable to be feasible + const f_t old_val = squared_infeasibilities[j]; + squared_infeasibilities[j] = 0.0; + primal_inf = std::max(0.0, primal_inf - old_val); + continue; + } + update_single_primal_infeasibility(lp.lower, + lp.upper, + x, + primal_tol, + squared_infeasibilities, + infeasibility_indices, + j, + primal_inf); + } +} + +template +void clean_up_infeasibilities(std::vector& squared_infeasibilities, + std::vector& infeasibility_indices) +{ + bool needs_clean_up = false; + for (i_t k = 0; k < infeasibility_indices.size(); ++k) { + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; + if (squared_infeas == 0.0) { needs_clean_up = true; } + } + + if (needs_clean_up) { + for (i_t k = 0; k < infeasibility_indices.size(); ++k) { + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; + if (squared_infeas == 0.0) { + // Set to the last element + const i_t sz = infeasibility_indices.size(); + infeasibility_indices[k] = infeasibility_indices[sz - 1]; + infeasibility_indices.pop_back(); + i_t new_j = infeasibility_indices[k]; + if (squared_infeasibilities[new_j] == 0.0) { k--; } + } + } + } +} + +template +i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& x, + const std::vector& dy_steepest_edge, + const std::vector& basic_mark, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, + i_t& direction, + i_t& basic_leaving, + f_t& max_val) +{ + max_val = 0.0; + i_t leaving_index = -1; + const i_t nz = infeasibility_indices.size(); + for (i_t k = 0; k < nz; ++k) { + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; + const f_t val = squared_infeas / dy_steepest_edge[j]; + if (val > max_val || val == max_val 
&& j > leaving_index) { + max_val = val; + leaving_index = j; + const f_t lower_infeas = lp.lower[j] - x[j]; + const f_t upper_infeas = x[j] - lp.upper[j]; + direction = lower_infeas >= upper_infeas ? 1 : -1; + } + } + + basic_leaving = leaving_index >= 0 ? basic_mark[leaving_index] : -1; + return leaving_index; +} + template i_t steepest_edge_pricing(const lp_problem_t& lp, const simplex_solver_settings_t& settings, @@ -344,185 +1037,200 @@ i_t phase2_ratio_test(const lp_problem_t& lp, return entering_index; } -template -i_t bound_flipping_ratio_test(const lp_problem_t& lp, - const simplex_solver_settings_t& settings, - f_t start_time, - const std::vector& vstatus, - const std::vector& nonbasic_list, - const std::vector& x, - std::vector& z, - std::vector& delta_z, - i_t direction, - i_t leaving_index, - f_t& step_length, - i_t& nonbasic_entering) -{ - const i_t n = lp.num_cols; - const i_t m = lp.num_rows; - - f_t slope = direction == 1 ? (lp.lower[leaving_index] - x[leaving_index]) - : (x[leaving_index] - lp.upper[leaving_index]); - assert(slope > 0); - - const f_t pivot_tol = settings.pivot_tol; - const f_t relaxed_pivot_tol = settings.pivot_tol; - const f_t zero_tol = settings.zero_tol; - std::list q_pos; - assert(nonbasic_list.size() == n - m); - for (i_t k = 0; k < n - m; ++k) { - const i_t j = nonbasic_list[k]; - if (vstatus[j] == variable_status_t::NONBASIC_FIXED) { continue; } - if (vstatus[j] == variable_status_t::NONBASIC_LOWER && delta_z[j] < -pivot_tol) { - q_pos.push_back(k); - } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && delta_z[j] > pivot_tol) { - q_pos.push_back(k); - } - } - i_t entering_index = -1; - step_length = inf; - const f_t dual_tol = settings.dual_tol / 10; - while (q_pos.size() > 0 && slope > 0) { - // Find the minimum ratio for nonbasic variables in q_pos - f_t min_val = inf; - typename std::list::iterator q_index; - i_t candidate = -1; - for (typename std::list::iterator it = q_pos.begin(); it != q_pos.end(); ++it) { 
- const i_t k = *it; - const i_t j = nonbasic_list[k]; - f_t ratio = inf; - if (vstatus[j] == variable_status_t::NONBASIC_LOWER && delta_z[j] < -pivot_tol) { - ratio = (-dual_tol - z[j]) / delta_z[j]; - } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && delta_z[j] > pivot_tol) { - ratio = (dual_tol - z[j]) / delta_z[j]; - } else if (min_val != inf) { - // We've already found something just continue; - } else if (vstatus[j] == variable_status_t::NONBASIC_LOWER) { - ratio = (-dual_tol - z[j]) / delta_z[j]; - } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) { - ratio = (dual_tol - z[j]) / delta_z[j]; - } else { - assert(1 == 0); - } - - ratio = std::max(ratio, 0.0); - - if (ratio < min_val) { - min_val = ratio; - q_index = it; // Save the iterator so we can remove the element it - // points to from the q_pos list later (if it corresponds - // to a bounded variable) - candidate = j; - } else if (ratio < min_val + zero_tol && - std::abs(delta_z[j]) > std::abs(delta_z[candidate])) { - min_val = ratio; - q_index = it; - candidate = j; - } - } - step_length = min_val; // Save the step length - nonbasic_entering = *q_index; - const i_t j = entering_index = nonbasic_list[nonbasic_entering]; - if (lp.lower[j] > -inf && lp.upper[j] < inf && lp.lower[j] != lp.upper[j]) { - const f_t interval = lp.upper[j] - lp.lower[j]; - const f_t delta_slope = std::abs(delta_z[j]) * interval; -#ifdef BOUND_FLIP_DEBUG - if (slope - delta_slope > 0) { - log.printf( - "Bound flip %d slope change %e prev slope %e slope %e. curr step " - "length %e\n", - j, - delta_slope, - slope, - slope - delta_slope, - step_length); - } -#endif - slope -= delta_slope; - q_pos.erase(q_index); - } else { - // we hit a variable that is not bounded. 
Exit - break; - } - - if (toc(start_time) > settings.time_limit) { return -2; } - if (settings.concurrent_halt != nullptr && - settings.concurrent_halt->load(std::memory_order_acquire) == 1) { - return -3; - } - } - // step_length, nonbasic_entering, and entering_index are defined after the - // while loop - assert(step_length >= 0); - - return entering_index; -} - template i_t flip_bounds(const lp_problem_t& lp, const simplex_solver_settings_t& settings, + const std::vector& bounded_variables, const std::vector& objective, const std::vector& z, + const std::vector& delta_z_indices, const std::vector& nonbasic_list, i_t entering_index, std::vector& vstatus, std::vector& delta_x, - std::vector& atilde) + std::vector& mark, + std::vector& atilde, + std::vector& atilde_index) { - f_t delta_obj = 0; - for (i_t j : nonbasic_list) { + i_t num_flipped = 0; + for (i_t j : delta_z_indices) { if (j == entering_index) { continue; } - const bool bounded = - (lp.lower[j] > -inf) && (lp.upper[j] < inf) && (lp.lower[j] != lp.upper[j]); - if (!bounded) { continue; } + if (!bounded_variables[j]) { continue; } // x_j is now a nonbasic bounded variable that will not enter the basis this // iteration const f_t dual_tol = settings.dual_tol; // lower to 1e-7 or less will cause 25fv47 and d2q06c to cycle if (vstatus[j] == variable_status_t::NONBASIC_LOWER && z[j] < -dual_tol) { const f_t delta = lp.upper[j] - lp.lower[j]; - scatter_dense(lp.A, j, -delta, atilde); - delta_obj += delta * objective[j]; + scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index); delta_x[j] += delta; vstatus[j] = variable_status_t::NONBASIC_UPPER; #ifdef BOUND_FLIP_DEBUG settings.log.printf( "Flipping nonbasic %d from lo %e to up %e. 
z %e\n", j, lp.lower[j], lp.upper[j], z[j]); #endif + num_flipped++; } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && z[j] > dual_tol) { const f_t delta = lp.lower[j] - lp.upper[j]; - scatter_dense(lp.A, j, -delta, atilde); - delta_obj += delta * objective[j]; + scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index); delta_x[j] += delta; vstatus[j] = variable_status_t::NONBASIC_LOWER; #ifdef BOUND_FLIP_DEBUG settings.log.printf( "Flipping nonbasic %d from up %e to lo %e. z %e\n", j, lp.upper[j], lp.lower[j], z[j]); #endif + num_flipped++; } } - return 0; + return num_flipped; } template -i_t initialize_steepest_edge_norms(const simplex_solver_settings_t& settings, +void initialize_steepest_edge_norms_from_slack_basis(const std::vector& basic_list, + const std::vector& nonbasic_list, + std::vector& delta_y_steepest_edge) +{ + const i_t m = basic_list.size(); + const i_t n = delta_y_steepest_edge.size(); + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + delta_y_steepest_edge[j] = 1.0; + } + const i_t n_minus_m = n - m; + for (i_t k = 0; k < n_minus_m; ++k) { + const i_t j = nonbasic_list[k]; + delta_y_steepest_edge[j] = 1e-4; + } +} + +template +i_t initialize_steepest_edge_norms(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, const f_t start_time, const std::vector& basic_list, - const basis_update_t& ft, + basis_update_mpf_t& ft, std::vector& delta_y_steepest_edge) { - // TODO: Skip this initialization when starting from a slack basis - // Or skip individual columns corresponding to slack variables - const i_t m = basic_list.size(); + const i_t m = basic_list.size(); + + // We want to compute B^T delta_y_i = -e_i + // If there is a column u of B^T such that B^T(:, u) = alpha * e_i than the + // solve delta_y_i = -1/alpha * e_u + // So we need to find columns of B^T (or rows of B) with only a single non-zero entry + f_t start_singleton_rows = tic(); + std::vector row_degree(m, 0); + std::vector mapping(m, -1); + 
std::vector coeff(m, 0.0); + + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + const i_t col_start = lp.A.col_start[j]; + const i_t col_end = lp.A.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = lp.A.i[p]; + row_degree[i]++; + // column j of A is column k of B + mapping[k] = i; + coeff[k] = lp.A.x[p]; + } + } + +#ifdef CHECK_SINGLETON_ROWS + csc_matrix_t B(m, m, 0); + form_b(lp.A, basic_list, B); + csc_matrix_t B_transpose(m, m, 0); + B.transpose(B_transpose); +#endif + + i_t num_singleton_rows = 0; + for (i_t i = 0; i < m; ++i) { + if (row_degree[i] == 1) { + num_singleton_rows++; +#ifdef CHECK_SINGLETON_ROWS + const i_t col_start = B_transpose.col_start[i]; + const i_t col_end = B_transpose.col_start[i + 1]; + if (col_end - col_start != 1) { + settings.log.printf("Singleton row %d has %d non-zero entries\n", i, col_end - col_start); + } +#endif + } + } + + if (num_singleton_rows > 0) { + settings.log.printf("Found %d singleton rows for steepest edge norms in %.2fs\n", + num_singleton_rows, + toc(start_singleton_rows)); + } + f_t last_log = tic(); for (i_t k = 0; k < m; ++k) { - std::vector ei(m); - std::vector dy(m); - const i_t j = basic_list[k]; - ei[k] = -1.0; - ft.b_transpose_solve(ei, dy); - ei[k] = 0.0; - const f_t init = vector_norm2_squared(dy); + sparse_vector_t sparse_ei(m, 1); + sparse_ei.x[0] = -1.0; + sparse_ei.i[0] = k; + const i_t j = basic_list[k]; + f_t init = -1.0; + if (row_degree[mapping[k]] == 1) { + const i_t u = mapping[k]; + const f_t alpha = coeff[k]; + // dy[u] = -1.0 / alpha; + f_t my_init = 1.0 / (alpha * alpha); + init = my_init; +#ifdef CHECK_HYPERSPARSE + std::vector residual(m); + b_transpose_multiply(lp, basic_list, dy, residual); + float error = 0; + for (i_t h = 0; h < m; ++h) { + const f_t error_component = std::abs(residual[h] - ei[h]); + error += error_component; + if (error_component > 1e-12) { + settings.log.printf("Singleton row %d component %d error %e residual %e ei %e\n", + 
k, + h, + error_component, + residual[h], + ei[h]); + } + } + if (error > 1e-12) { settings.log.printf("Singleton row %d error %e\n", k, error); } +#endif + +#ifdef CHECK_HYPERSPARSE + dy[u] = 0.0; + ft.b_transpose_solve(ei, dy); + init = vector_norm2_squared(dy); + if (init != my_init) { + settings.log.printf("Singleton row %d error %.16e init %.16e my_init %.16e\n", + k, + std::abs(init - my_init), + init, + my_init); + } +#endif + } else { +#if COMPARE_WITH_DENSE + ft.b_transpose_solve(ei, dy); + init = vector_norm2_squared(dy); +#else + sparse_vector_t sparse_dy(m, 0); + ft.b_transpose_solve(sparse_ei, sparse_dy); + f_t my_init = 0.0; + for (i_t p = 0; p < sparse_dy.x.size(); ++p) { + my_init += sparse_dy.x[p] * sparse_dy.x[p]; + } +#endif +#if COMPARE_WITH_DENSE + if (std::abs(init - my_init) > 1e-12) { + settings.log.printf("Singleton row %d error %.16e init %.16e my_init %.16e\n", + k, + std::abs(init - my_init), + init, + my_init); + } +#endif + init = my_init; + } + // ei[k] = 0.0; + // init = vector_norm2_squared(dy); assert(init > 0); delta_y_steepest_edge[j] = init; @@ -544,26 +1252,25 @@ i_t initialize_steepest_edge_norms(const simplex_solver_settings_t& se template i_t update_steepest_edge_norms(const simplex_solver_settings_t& settings, const std::vector& basic_list, - const basis_update_t& ft, + const basis_update_mpf_t& ft, i_t direction, - const std::vector& delta_y, - const std::vector& scaled_delta_xB, + const sparse_vector_t& delta_y_sparse, + f_t dy_norm_squared, + const sparse_vector_t& scaled_delta_xB, i_t basic_leaving_index, i_t entering_index, + std::vector& v, std::vector& delta_y_steepest_edge) { - i_t m = delta_y.size(); - std::vector v(m); + i_t m = basic_list.size(); + const i_t delta_y_nz = delta_y_sparse.i.size(); + sparse_vector_t v_sparse(m, 0); // B^T delta_y = - direction * e_basic_leaving_index - // We want B v = - B^{-T} e_basic_leaving_index - ft.b_solve(delta_y, v); - // if direction = -1 we need to scale v - if (direction 
== -1) { - for (i_t k = 0; k < m; ++k) { - v[k] *= -1; - } - } - const f_t dy_norm_squared = vector_norm2_squared(delta_y); + // We want B v = - B^{-T} e_basic_leaving_index + ft.b_solve(delta_y_sparse, v_sparse); + if (direction == -1) { v_sparse.negate(); } + v_sparse.scatter(v); + const i_t leaving_index = basic_list[basic_leaving_index]; const f_t prev_dy_norm_squared = delta_y_steepest_edge[leaving_index]; #ifdef STEEPEST_EDGE_DEBUG @@ -580,17 +1287,18 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin // B*w = A(:, leaving_index) // B*scaled_delta_xB = -A(:, leaving_index) so w = -scaled_delta_xB - const f_t wr = -scaled_delta_xB[basic_leaving_index]; + const f_t wr = -scaled_delta_xB.find_coefficient(basic_leaving_index); if (wr == 0) { return -1; } - const f_t omegar = dy_norm_squared / (wr * wr); - - for (i_t k = 0; k < m; ++k) { + const f_t omegar = dy_norm_squared / (wr * wr); + const i_t scaled_delta_xB_nz = scaled_delta_xB.i.size(); + for (i_t h = 0; h < scaled_delta_xB_nz; ++h) { + const i_t k = scaled_delta_xB.i[h]; const i_t j = basic_list[k]; if (k == basic_leaving_index) { - const f_t w_squared = scaled_delta_xB[k] * scaled_delta_xB[k]; + const f_t w_squared = scaled_delta_xB.x[h] * scaled_delta_xB.x[h]; delta_y_steepest_edge[j] = (1.0 / w_squared) * dy_norm_squared; } else { - const f_t wk = -scaled_delta_xB[k]; + const f_t wk = -scaled_delta_xB.x[h]; f_t new_val = delta_y_steepest_edge[j] + wk * (2.0 * v[k] / wr + wk * omegar); new_val = std::max(new_val, 1e-4); #ifdef STEEPEST_EDGE_DEBUG @@ -611,23 +1319,30 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin } } + const i_t v_nz = v_sparse.i.size(); + for (i_t k = 0; k < v_nz; ++k) { + v[v_sparse.i[k]] = 0.0; + } + return 0; } // Compute steepest edge info for entering variable template -i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t& setttings, +i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t& 
settings, i_t m, - const basis_update_t& ft, + const basis_update_mpf_t& ft, i_t basic_leaving_index, i_t entering_index, std::vector& steepest_edge_norms) { - std::vector es(m); - es[basic_leaving_index] = -1.0; - std::vector delta_ys(m); - ft.b_transpose_solve(es, delta_ys); - steepest_edge_norms[entering_index] = vector_norm2_squared(delta_ys); + sparse_vector_t es_sparse(m, 1); + es_sparse.i[0] = basic_leaving_index; + es_sparse.x[0] = -1.0; + sparse_vector_t delta_ys_sparse(m, 0); + ft.b_transpose_solve(es_sparse, delta_ys_sparse); + steepest_edge_norms[entering_index] = delta_ys_sparse.norm2_squared(); + #ifdef STEEPEST_EDGE_DEBUG settings.log.printf("Steepest edge norm %e for entering j %d at i %d\n", steepest_edge_norms[entering_index], @@ -640,7 +1355,7 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t i_t check_steepest_edge_norms(const simplex_solver_settings_t& settings, const std::vector& basic_list, - const basis_update_t& ft, + const basis_update_mpf_t& ft, const std::vector& delta_y_steepest_edge) { const i_t m = basic_list.size(); @@ -664,6 +1379,7 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t& setting template i_t compute_perturbation(const lp_problem_t& lp, const simplex_solver_settings_t& settings, + const std::vector& delta_z_indices, std::vector& z, std::vector& objective, f_t& sum_perturb) @@ -673,7 +1389,8 @@ i_t compute_perturbation(const lp_problem_t& lp, const f_t tight_tol = settings.tight_tol; i_t num_perturb = 0; sum_perturb = 0.0; - for (i_t j = 0; j < n; ++j) { + for (i_t k = 0; k < delta_z_indices.size(); ++k) { + const i_t j = delta_z_indices[k]; if (lp.upper[j] == inf && lp.lower[j] > -inf && z[j] < -tight_tol) { const f_t violation = -z[j]; z[j] += violation; // z[j] <- 0 @@ -708,6 +1425,245 @@ i_t compute_perturbation(const lp_problem_t& lp, return 0; } +template +void reset_basis_mark(const std::vector& basic_list, + const std::vector& nonbasic_list, + std::vector& basic_mark, + 
std::vector& nonbasic_mark) +{ + const i_t m = basic_list.size(); + const i_t n = nonbasic_mark.size(); + const i_t n_minus_m = n - m; + + for (i_t k = 0; k < n; k++) { + basic_mark[k] = -1; + } + + for (i_t k = 0; k < n; k++) { + nonbasic_mark[k] = -1; + } + + for (i_t k = 0; k < n_minus_m; k++) { + nonbasic_mark[nonbasic_list[k]] = k; + } + + for (i_t k = 0; k < m; k++) { + basic_mark[basic_list[k]] = k; + } +} + +template +void compute_delta_y(const basis_update_mpf_t& ft, + i_t basic_leaving_index, + i_t direction, + sparse_vector_t& delta_y_sparse, + sparse_vector_t& UTsol_sparse) +{ + const i_t m = delta_y_sparse.n; + // BT*delta_y = -delta_zB = -sigma*ei + sparse_vector_t ei_sparse(m, 1); + ei_sparse.i[0] = basic_leaving_index; + ei_sparse.x[0] = -direction; + ft.b_transpose_solve(ei_sparse, delta_y_sparse, UTsol_sparse); + + if (direction != -1) { + // We solved BT*delta_y = -sigma*ei, but for the update we need + // UT*etilde = ei. So we need to flip the sign of the solution + // in the case that sigma == 1. 
+ UTsol_sparse.negate(); + } + +#ifdef CHECK_B_TRANSPOSE_SOLVE + std::vector delta_y_sparse_vector_check(m); + delta_y_sparse.to_dense(delta_y_sparse_vector_check); + f_t error_check = 0.0; + for (i_t k = 0; k < m; ++k) { + if (std::abs(delta_y[k] - delta_y_sparse_vector_check[k]) > 1e-6) { + settings.log.printf( + "\tBTranspose error %d %e %e\n", k, delta_y[k], delta_y_sparse_vector_check[k]); + } + error_check += std::abs(delta_y[k] - delta_y_sparse_vector_check[k]); + } + if (error_check > 1e-6) { settings.log.printf("BTranspose error %e\n", error_check); } + std::vector residual(m); + b_transpose_multiply(lp, basic_list, delta_y_sparse_vector_check, residual); + for (i_t k = 0; k < m; ++k) { + if (std::abs(residual[k] - ei[k]) > 1e-6) { + settings.log.printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]); + } + } +#endif +} + +template +void update_dual_variables(const sparse_vector_t& delta_y_sparse, + const std::vector& delta_z_indices, + const std::vector& delta_z, + f_t step_length, + i_t leaving_index, + std::vector& y, + std::vector& z) +{ + // Update dual variables + // y <- y + steplength * delta_y + const i_t delta_y_nz = delta_y_sparse.i.size(); + for (i_t k = 0; k < delta_y_nz; ++k) { + const i_t i = delta_y_sparse.i[k]; + y[i] += step_length * delta_y_sparse.x[k]; + } + // z <- z + steplength * delta_z + const i_t delta_z_nz = delta_z_indices.size(); + for (i_t k = 0; k < delta_z_nz; ++k) { + const i_t j = delta_z_indices[k]; + z[j] += step_length * delta_z[j]; + } + z[leaving_index] += step_length * delta_z[leaving_index]; +} + +template +void adjust_for_flips(const basis_update_mpf_t& ft, + const std::vector& basic_list, + const std::vector& delta_z_indices, + std::vector& atilde_index, + std::vector& atilde, + std::vector& atilde_mark, + sparse_vector_t& delta_xB_0_sparse, + std::vector& delta_x_flip, + std::vector& x) +{ + const i_t m = basic_list.size(); + const i_t atilde_nz = atilde_index.size(); + // B*delta_xB_0 = atilde + 
sparse_vector_t atilde_sparse(m, atilde_nz); + for (i_t k = 0; k < atilde_nz; ++k) { + atilde_sparse.i[k] = atilde_index[k]; + atilde_sparse.x[k] = atilde[atilde_index[k]]; + } + ft.b_solve(atilde_sparse, delta_xB_0_sparse); + const i_t delta_xB_0_nz = delta_xB_0_sparse.i.size(); + for (i_t k = 0; k < delta_xB_0_nz; ++k) { + const i_t j = basic_list[delta_xB_0_sparse.i[k]]; + x[j] += delta_xB_0_sparse.x[k]; + } + + for (i_t j : delta_z_indices) { + x[j] += delta_x_flip[j]; + delta_x_flip[j] = 0.0; + } + + // Clear atilde + for (i_t k = 0; k < atilde_index.size(); ++k) { + atilde[atilde_index[k]] = 0.0; + } + // Clear atilde_mark + for (i_t k = 0; k < atilde_mark.size(); ++k) { + atilde_mark[k] = 0; + } + atilde_index.clear(); +} + +template +i_t compute_delta_x(const lp_problem_t& lp, + const basis_update_mpf_t& ft, + i_t entering_index, + i_t leaving_index, + i_t basic_leaving_index, + i_t direction, + const std::vector& basic_list, + const std::vector& delta_x_flip, + const sparse_vector_t& rhs_sparse, + const std::vector& x, + sparse_vector_t& utilde_sparse, + sparse_vector_t& scaled_delta_xB_sparse, + std::vector& delta_x) +{ + f_t delta_x_leaving = direction == 1 ? 
lp.lower[leaving_index] - x[leaving_index] + : lp.upper[leaving_index] - x[leaving_index]; + // B*w = -A(:, entering) + ft.b_solve(rhs_sparse, scaled_delta_xB_sparse, utilde_sparse); + scaled_delta_xB_sparse.negate(); + +#ifdef CHECK_B_SOLVE + std::vector scaled_delta_xB(m); + { + std::vector residual_B(m); + b_multiply(lp, basic_list, scaled_delta_xB, residual_B); + f_t err_max = 0; + for (i_t k = 0; k < m; ++k) { + const f_t err = std::abs(rhs[k] + residual_B[k]); + if (err >= 1e-6) { + settings.log.printf( + "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]); + } + err_max = std::max(err_max, err); + } + if (err_max > 1e-6) { settings.log.printf("B multiply error %e\n", err_max); } + } +#endif + + f_t scale = scaled_delta_xB_sparse.find_coefficient(basic_leaving_index); + if (scale != scale) { + // We couldn't find a coefficient for the basic leaving index. + // The coefficient might be very small. Switch to a regular solve and try to recover. + std::vector rhs; + rhs_sparse.to_dense(rhs); + const i_t m = basic_list.size(); + std::vector scaled_delta_xB(m); + ft.b_solve(rhs, scaled_delta_xB); + if (scaled_delta_xB[basic_leaving_index] != 0.0 && + !std::isnan(scaled_delta_xB[basic_leaving_index])) { + scaled_delta_xB_sparse.from_dense(scaled_delta_xB); + scaled_delta_xB_sparse.negate(); + scale = -scaled_delta_xB[basic_leaving_index]; + } else { + return -1; + } + } + const f_t primal_step_length = delta_x_leaving / scale; + const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); + for (i_t k = 0; k < scaled_delta_xB_nz; ++k) { + const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; + delta_x[j] = primal_step_length * scaled_delta_xB_sparse.x[k]; + } + delta_x[leaving_index] = delta_x_leaving; + delta_x[entering_index] = primal_step_length; + return 0; +} + +template +void update_primal_variables(const sparse_vector_t& scaled_delta_xB_sparse, + const std::vector& basic_list, + const std::vector& delta_x, + i_t entering_index, + 
std::vector& x) +{ + // x <- x + delta_x + const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); + for (i_t k = 0; k < scaled_delta_xB_nz; ++k) { + const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; + x[j] += delta_x[j]; + } + // Leaving index already included above + x[entering_index] += delta_x[entering_index]; +} + +template +void update_objective(const std::vector& basic_list, + const std::vector& changed_basic_indices, + const std::vector& objective, + const std::vector& delta_x, + i_t entering_index, + f_t& obj) +{ + const i_t changed_basic_nz = changed_basic_indices.size(); + for (i_t k = 0; k < changed_basic_nz; ++k) { + const i_t j = basic_list[changed_basic_indices[k]]; + obj += delta_x[j] * objective[j]; + } + // Leaving index already included above + obj += delta_x[entering_index] * objective[entering_index]; +} + template f_t dual_infeasibility(const lp_problem_t& lp, const simplex_solver_settings_t& settings, @@ -833,6 +1789,103 @@ f_t primal_infeasibility(const lp_problem_t& lp, return primal_inf; } +template +void check_primal_infeasibilities(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& basic_list, + const std::vector& x, + const std::vector& squared_infeasibilities, + const std::vector& infeasibility_indices) +{ + const i_t m = basic_list.size(); + for (i_t k = 0; k < m; ++k) { + const i_t j = basic_list[k]; + const f_t lower_infeas = lp.lower[j] - x[j]; + const f_t upper_infeas = x[j] - lp.upper[j]; + const f_t infeas = std::max(lower_infeas, upper_infeas); + if (infeas > settings.primal_tol) { + const f_t square_infeas = infeas * infeas; + if (square_infeas != squared_infeasibilities[j]) { + settings.log.printf("Primal infeasibility mismatch %d %e != %e\n", + j, + square_infeas, + squared_infeasibilities[j]); + } + bool found = false; + for (i_t h = 0; h < infeasibility_indices.size(); ++h) { + if (infeasibility_indices[h] == j) { + found = true; + break; + } + } + if (!found) { 
settings.log.printf("Infeasibility index not found %d\n", j); } + } + } +} + +template +void check_basic_infeasibilities(const std::vector& basic_list, + const std::vector& basic_mark, + const std::vector& infeasibility_indices, + i_t info) +{ + for (i_t k = 0; k < infeasibility_indices.size(); ++k) { + const i_t j = infeasibility_indices[k]; + if (basic_mark[j] < 0) { printf("%d basic_infeasibilities basic_mark[%d] < 0\n", info, j); } + } +} + +template +void check_update(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const basis_update_t& ft, + const std::vector& basic_list, + const std::vector& basic_leaving_index) +{ + const i_t m = basic_list.size(); + csc_matrix_t Btest(m, m, 1); + ft.multiply_lu(Btest); + { + csc_matrix_t B(m, m, 1); + form_b(lp.A, basic_list, B); + csc_matrix_t Diff(m, m, 1); + add(Btest, B, 1.0, -1.0, Diff); + const f_t err = Diff.norm1(); + if (err > settings.primal_tol) { settings.log.printf("|| B - L*U || %e\n", Diff.norm1()); } + if (err > settings.primal_tol) { + for (i_t j = 0; j < m; ++j) { + for (i_t p = Diff.col_start[j]; p < Diff.col_start[j + 1]; ++p) { + const i_t i = Diff.i[p]; + if (Diff.x[p] != 0.0) { settings.log.printf("Diff %d %d %e\n", j, i, Diff.x[p]); } + } + } + } + settings.log.printf("basic leaving index %d\n", basic_leaving_index); + assert(err < settings.primal_tol); + } +} + +template +void check_basis_mark(const simplex_solver_settings_t& settings, + const std::vector& basic_list, + const std::vector& nonbasic_list, + const std::vector& basic_mark, + const std::vector& nonbasic_mark) +{ + const i_t m = basic_list.size(); + const i_t n = basic_mark.size(); + for (i_t k = 0; k < m; k++) { + if (basic_mark[basic_list[k]] != k) { + settings.log.printf("Basic mark %d %d\n", basic_list[k], k); + } + } + for (i_t k = 0; k < n - m; k++) { + if (nonbasic_mark[nonbasic_list[k]] != k) { + settings.log.printf("Nonbasic mark %d %d\n", nonbasic_list[k], k); + } + } +} + template void bound_info(const 
lp_problem_t& lp, const simplex_solver_settings_t& settings) @@ -953,10 +2006,32 @@ void set_primal_variables_on_bounds(const lp_problem_t& lp, } } +template +f_t compute_perturbed_objective(const std::vector& objective, const std::vector& x) +{ + const size_t n = objective.size(); + f_t obj_val = 0.0; + for (size_t j = 0; j < n; ++j) { + obj_val += objective[j] * x[j]; + } + return obj_val; +} + +template +f_t amount_of_perturbation(const lp_problem_t& lp, const std::vector& objective) +{ + f_t perturbation = 0.0; + const i_t n = lp.num_cols; + for (i_t j = 0; j < n; ++j) { + perturbation += std::abs(lp.objective[j] - objective[j]); + } + return perturbation; +} + template void prepare_optimality(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - basis_update_t& ft, + basis_update_mpf_t& ft, const std::vector& objective, const std::vector& basic_list, const std::vector& nonbasic_list, @@ -975,11 +2050,7 @@ void prepare_optimality(const lp_problem_t& lp, sol.objective = compute_objective(lp, sol.x); sol.user_objective = compute_user_objective(lp, sol.objective); - f_t perturbation = 0.0; - for (i_t j = 0; j < n; ++j) { - perturbation += std::abs(lp.objective[j] - objective[j]); - } - + f_t perturbation = phase2::amount_of_perturbation(lp, objective); if (perturbation > 1e-6 && phase == 2) { // Try to remove perturbation std::vector unperturbed_y(m); @@ -994,6 +2065,8 @@ void prepare_optimality(const lp_problem_t& lp, z = unperturbed_z; y = unperturbed_y; perturbation = 0.0; + } else { + settings.log.printf("Failed to remove perturbation of %.2e.\n", perturbation); } } } @@ -1015,7 +2088,6 @@ void prepare_optimality(const lp_problem_t& lp, settings.log.printf("Primal infeasibility (abs): %.2e\n", primal_infeas); settings.log.printf("Dual infeasibility (abs): %.2e\n", dual_infeas); settings.log.printf("Perturbation: %.2e\n", perturbation); - settings.log.printf("Max steepest edge norm: %.2e\n", max_val); } else { settings.log.printf("\n"); 
settings.log.printf( @@ -1026,6 +2098,81 @@ void prepare_optimality(const lp_problem_t& lp, } } +template +class phase2_timers_t { + public: + phase2_timers_t(bool should_time) + : record_time(should_time), + bfrt_time(0), + pricing_time(0), + btran_time(0), + ftran_time(0), + flip_time(0), + delta_z_time(0), + se_norms_time(0), + se_entering_time(0), + lu_update_time(0), + perturb_time(0), + vector_time(0), + objective_time(0), + update_infeasibility_time(0) + { + } + + void start_timer() + { + if (!record_time) { return; } + start_time = tic(); + } + + f_t stop_timer() + { + if (!record_time) { return 0.0; } + return toc(start_time); + } + + void print_timers(const simplex_solver_settings_t& settings) const + { + if (!record_time) { return; } + const f_t total_time = bfrt_time + pricing_time + btran_time + ftran_time + flip_time + + delta_z_time + lu_update_time + se_norms_time + se_entering_time + + perturb_time + vector_time + objective_time + update_infeasibility_time; + // clang-format off + settings.log.printf("BFRT time %.2fs %4.1f%\n", bfrt_time, 100.0 * bfrt_time / total_time); + settings.log.printf("Pricing time %.2fs %4.1f%\n", pricing_time, 100.0 * pricing_time / total_time); + settings.log.printf("BTran time %.2fs %4.1f%\n", btran_time, 100.0 * btran_time / total_time); + settings.log.printf("FTran time %.2fs %4.1f%\n", ftran_time, 100.0 * ftran_time / total_time); + settings.log.printf("Flip time %.2fs %4.1f%\n", flip_time, 100.0 * flip_time / total_time); + settings.log.printf("Delta_z time %.2fs %4.1f%\n", delta_z_time, 100.0 * delta_z_time / total_time); + settings.log.printf("LU update time %.2fs %4.1f%\n", lu_update_time, 100.0 * lu_update_time / total_time); + settings.log.printf("SE norms time %.2fs %4.1f%\n", se_norms_time, 100.0 * se_norms_time / total_time); + settings.log.printf("SE enter time %.2fs %4.1f%\n", se_entering_time, 100.0 * se_entering_time / total_time); + settings.log.printf("Perturb time %.2fs %4.1f%\n", perturb_time, 100.0 
* perturb_time / total_time); + settings.log.printf("Vector time %.2fs %4.1f%\n", vector_time, 100.0 * vector_time / total_time); + settings.log.printf("Objective time %.2fs %4.1f%\n", objective_time, 100.0 * objective_time / total_time); + settings.log.printf("Inf update time %.2fs %4.1f%\n", update_infeasibility_time, 100.0 * update_infeasibility_time / total_time); + settings.log.printf("Sum %.2fs\n", total_time); + // clang-format on + } + f_t bfrt_time; + f_t pricing_time; + f_t btran_time; + f_t ftran_time; + f_t flip_time; + f_t delta_z_time; + f_t se_norms_time; + f_t se_entering_time; + f_t lu_update_time; + f_t perturb_time; + f_t vector_time; + f_t objective_time; + f_t update_infeasibility_time; + + private: + f_t start_time; + bool record_time; +}; + } // namespace phase2 template @@ -1093,7 +2240,7 @@ dual::status_t dual_phase2(i_t phase, if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } assert(q.size() == m); reorder_basic_list(q, basic_list); - basis_update_t ft(L, U, p); + basis_update_mpf_t ft(L, U, p, settings.refactor_frequency); std::vector c_basic(m); for (i_t k = 0; k < m; ++k) { @@ -1105,41 +2252,19 @@ dual::status_t dual_phase2(i_t phase, ft.b_transpose_solve(c_basic, y); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } constexpr bool print_norms = false; - if (print_norms) { + if constexpr (print_norms) { settings.log.printf( "|| y || %e || cB || %e\n", vector_norm_inf(y), vector_norm_inf(c_basic)); } - // zN = cN - N'*y - for (i_t k = 0; k < n - m; k++) { - const i_t j = nonbasic_list[k]; - // z_j <- c_j - z[j] = objective[j]; - - // z_j <- z_j - A(:, j)'*y - const i_t col_start = lp.A.col_start[j]; - const i_t col_end = lp.A.col_start[j + 1]; - f_t dot = 0.0; - for (i_t p = col_start; p < col_end; ++p) { - dot += lp.A.x[p] * y[lp.A.i[p]]; - } - z[j] -= dot; - } - // zB = 0 - for (i_t k = 0; k < m; ++k) { - z[basic_list[k]] = 0.0; - } - if (print_norms) { 
settings.log.printf("|| z || %e\n", vector_norm_inf(z)); } + phase2::compute_reduced_costs(objective, lp.A, y, basic_list, nonbasic_list, z); + if constexpr (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf(z)); } #ifdef COMPUTE_DUAL_RESIDUAL - // || A'*y + z - c||_inf - std::vector dual_res1 = z; - for (i_t j = 0; j < n; ++j) { - dual_res1[j] -= objective[j]; - } - matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, dual_res1); + std::vector dual_res1; + compute_dual_residual(lp.A, objective, y, z, dual_res1); f_t dual_res_norm = vector_norm_inf(dual_res1); - if (1 || dual_res_norm > settings.tight_tol) { + if (dual_res_norm > settings.tight_tol) { settings.log.printf("|| A'*y + z - c || %e\n", dual_res_norm); } assert(dual_res_norm < 1e-3); @@ -1148,15 +2273,11 @@ dual::status_t dual_phase2(i_t phase, phase2::set_primal_variables_on_bounds(lp, settings, z, vstatus, x); #ifdef PRINT_VSTATUS_CHANGES - i_t num_vstatus_changes = 0; - i_t num_z_changes = 0; - for (i_t j = 0; j < n; ++j) { - if (vstatus[j] != vstatus_old[j]) { num_vstatus_changes++; } - if (std::abs(z[j] - z_old[j]) > 1e-6) { num_z_changes++; } - } - - printf("Number of vstatus changes %d\n", num_vstatus_changes); - printf("Number of z changes %d\n", num_z_changes); + i_t num_vstatus_changes; + i_t num_z_changes; + phase2::vstatus_changes(vstatus, vstatus_old, z, z_old, num_vstatus_changes, num_z_changes); + settings.log.printf("Number of vstatus changes %d\n", num_vstatus_changes); + settings.log.printf("Number of z changes %d\n", num_z_changes); #endif const f_t init_dual_inf = @@ -1171,28 +2292,10 @@ dual::status_t dual_phase2(i_t phase, } } - std::vector rhs = lp.rhs; - // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) * - // u(j) - for (i_t k = 0; k < n - m; ++k) { - const i_t j = nonbasic_list[k]; - const i_t col_start = lp.A.col_start[j]; - const i_t col_end = lp.A.col_start[j + 1]; - const f_t xj = x[j]; - if (std::abs(xj) < settings.tight_tol * 10) 
continue; - for (i_t p = col_start; p < col_end; ++p) { - rhs[lp.A.i[p]] -= xj * lp.A.x[p]; - } - } + phase2::compute_primal_variables( + ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x); - std::vector xB(m); - ft.b_solve(rhs, xB); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } - - for (i_t k = 0; k < m; ++k) { - const i_t j = basic_list[k]; - x[j] = xB[k]; - } if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2(x)); } #ifdef COMPUTE_PRIMAL_RESIDUAL @@ -1207,18 +2310,12 @@ dual::status_t dual_phase2(i_t phase, if (delta_y_steepest_edge.size() == 0) { delta_y_steepest_edge.resize(n); if (slack_basis) { - for (i_t k = 0; k < m; ++k) { - const i_t j = basic_list[k]; - delta_y_steepest_edge[j] = 1.0; - } - for (i_t k = 0; k < n - m; ++k) { - const i_t j = nonbasic_list[k]; - delta_y_steepest_edge[j] = 1e-4; - } + phase2::initialize_steepest_edge_norms_from_slack_basis( + basic_list, nonbasic_list, delta_y_steepest_edge); } else { std::fill(delta_y_steepest_edge.begin(), delta_y_steepest_edge.end(), -1); if (phase2::initialize_steepest_edge_norms( - settings, start_time, basic_list, ft, delta_y_steepest_edge) == -1) { + lp, settings, start_time, basic_list, ft, delta_y_steepest_edge) == -1) { return dual::status_t::TIME_LIMIT; } } @@ -1227,35 +2324,74 @@ dual::status_t dual_phase2(i_t phase, vector_norm2(delta_y_steepest_edge)); } - if (phase == 2) { settings.log.printf(" Iter Objective Primal Infeas Perturb Time\n"); } + if (phase == 2) { + settings.log.printf(" Iter Objective Num Inf. Sum Inf. 
Perturb Time\n"); + } const i_t iter_limit = settings.iteration_limit; - std::vector delta_y(m); - std::vector delta_z(n); - std::vector delta_x(n); + std::vector delta_y(m, 0.0); + std::vector delta_z(n, 0.0); + std::vector delta_x(n, 0.0); + std::vector delta_x_flip(n, 0.0); + std::vector atilde(m, 0.0); + std::vector atilde_mark(m, 0); + std::vector atilde_index; + std::vector nonbasic_mark(n); + std::vector basic_mark(n); + std::vector delta_z_mark(n, 0); + std::vector delta_z_indices; + std::vector v(m, 0.0); + std::vector squared_infeasibilities; + std::vector infeasibility_indices; + + delta_z_indices.reserve(n); + + phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); + + std::vector bounded_variables(n, 0); + phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables); + + f_t primal_infeasibility = phase2::compute_initial_primal_infeasibilities( + lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices); + +#ifdef CHECK_BASIC_INFEASIBILITIES + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); +#endif + + csc_matrix_t A_transpose(1, 1, 0); + lp.A.transpose(A_transpose); + + f_t obj = compute_objective(lp, x); const i_t start_iter = iter; + + i_t sparse_delta_z = 0; + i_t dense_delta_z = 0; + phase2::phase2_timers_t timers(false); + while (iter < iter_limit) { // Pricing - i_t direction; - i_t basic_leaving_index; - f_t primal_infeasibility; - i_t leaving_index = -1; + i_t direction = 0; + i_t basic_leaving_index = -1; + i_t leaving_index = -1; f_t max_val; + timers.start_timer(); if (settings.use_steepest_edge_pricing) { - leaving_index = phase2::steepest_edge_pricing(lp, - settings, - x, - delta_y_steepest_edge, - basic_list, - direction, - basic_leaving_index, - primal_infeasibility, - max_val); + leaving_index = phase2::steepest_edge_pricing_with_infeasibilities(lp, + settings, + x, + delta_y_steepest_edge, + basic_mark, + squared_infeasibilities, + 
infeasibility_indices, + direction, + basic_leaving_index, + max_val); } else { // Max infeasibility pricing leaving_index = phase2::phase2_pricing( lp, settings, x, basic_list, direction, basic_leaving_index, primal_infeasibility); } + timers.pricing_time += timers.stop_timer(); if (leaving_index == -1) { phase2::prepare_optimality(lp, settings, @@ -1277,18 +2413,18 @@ dual::status_t dual_phase2(i_t phase, } // BTran - // TODO: replace with sparse solve. - std::vector ei(m, 0.0); - std::vector delta_y(m); - ei[basic_leaving_index] = -direction; // BT*delta_y = -delta_zB = -sigma*ei - ft.b_transpose_solve(ei, delta_y); + timers.start_timer(); + sparse_vector_t delta_y_sparse(m, 0); + sparse_vector_t UTsol_sparse(m, 0); + phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); + timers.btran_time += timers.stop_timer(); - const f_t steepest_edge_norm_check = vector_norm2_squared(delta_y); + const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared(); if (delta_y_steepest_edge[leaving_index] < settings.steepest_edge_ratio * steepest_edge_norm_check) { constexpr bool verbose = false; - if (verbose) { + if constexpr (verbose) { settings.log.printf( "iteration restart due to steepest edge. Leaving %d. 
Actual %.2e " "from update %.2e\n", @@ -1300,43 +2436,48 @@ dual::status_t dual_phase2(i_t phase, continue; } -#ifdef COMPUTE_BTRANSPOSE_RESIDUAL - { - std::vector res(m); - b_transpose_multiply(lp, basic_list, delta_y, res); - for (Int k = 0; k < m; k++) { - const f_t err = std::abs(res[k] - ei[k]); - if (err > 1e-4) { settings.log.printf("BT err %d %e\n", k, err); } - assert(err < 1e-4); - } + timers.start_timer(); + i_t delta_y_nz0 = 0; + const i_t nz_delta_y = delta_y_sparse.i.size(); + for (i_t k = 0; k < nz_delta_y; k++) { + if (std::abs(delta_y_sparse.x[k]) > 1e-12) { delta_y_nz0++; } } -#endif - - // delta_zB = sigma*ei - for (i_t k = 0; k < m; k++) { - const i_t j = basic_list[k]; - delta_z[j] = 0; - } - delta_z[leaving_index] = direction; - // delta_zN = -N'*delta_y - for (i_t k = 0; k < n - m; k++) { - const i_t j = nonbasic_list[k]; - // z_j <- -A(:, j)'*delta_y - const i_t col_start = lp.A.col_start[j]; - const i_t col_end = lp.A.col_start[j + 1]; - f_t dot = 0.0; - for (i_t p = col_start; p < col_end; ++p) { - dot += lp.A.x[p] * delta_y[lp.A.i[p]]; - } - delta_z[j] = -dot; + const f_t delta_y_nz_percentage = delta_y_nz0 / static_cast(m) * 100.0; + const bool use_transpose = delta_y_nz_percentage <= 30.0; + if (use_transpose) { + sparse_delta_z++; + phase2::compute_delta_z(A_transpose, + delta_y_sparse, + leaving_index, + direction, + nonbasic_mark, + delta_z_mark, + delta_z_indices, + delta_z); + } else { + dense_delta_z++; + // delta_zB = sigma*ei + delta_y_sparse.to_dense(delta_y); + phase2::compute_reduced_cost_update(lp, + basic_list, + nonbasic_list, + delta_y, + leaving_index, + direction, + delta_z_mark, + delta_z_indices, + delta_z); } + timers.delta_z_time += timers.stop_timer(); #ifdef COMPUTE_DUAL_RESIDUAL - std::vector dual_residual = delta_z; + std::vector dual_residual; + std::vector zeros(n, 0.0); + phase2::compute_dual_residual(lp.A, zeros, delta_y, delta_z, dual_residual); // || A'*delta_y + delta_z ||_inf - 
matrix_transpose_vector_multiply(lp.A, 1.0, delta_y, 1.0, dual_residual); f_t dual_residual_norm = vector_norm_inf(dual_residual); - settings.log.printf("|| A'*dy - dz || %e\n", dual_residual_norm); + settings.log.printf( + "|| A'*dy - dz || %e use transpose %d\n", dual_residual_norm, use_transpose); #endif // Ratio test @@ -1356,18 +2497,25 @@ dual::status_t dual_phase2(i_t phase, step_length, nonbasic_entering_index); } else if (bound_flip_ratio) { - entering_index = phase2::bound_flipping_ratio_test(lp, - settings, - start_time, - vstatus, - nonbasic_list, - x, - z, - delta_z, - direction, - leaving_index, - step_length, - nonbasic_entering_index); + timers.start_timer(); + f_t slope = direction == 1 ? (lp.lower[leaving_index] - x[leaving_index]) + : (x[leaving_index] - lp.upper[leaving_index]); + bound_flipping_ratio_test_t bfrt(settings, + start_time, + m, + n, + slope, + lp.lower, + lp.upper, + bounded_variables, + vstatus, + nonbasic_list, + z, + delta_z, + delta_z_indices, + nonbasic_mark); + entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index); + timers.bfrt_time += timers.stop_timer(); } else { entering_index = phase2::phase2_ratio_test( lp, settings, vstatus, nonbasic_list, z, delta_z, step_length, nonbasic_entering_index); @@ -1375,33 +2523,135 @@ dual::status_t dual_phase2(i_t phase, if (entering_index == -2) { return dual::status_t::TIME_LIMIT; } if (entering_index == -3) { return dual::status_t::CONCURRENT_LIMIT; } if (entering_index == -1) { - if (primal_infeasibility > settings.primal_tol && - max_val < settings.steepest_edge_primal_tol) { - // We could be done - settings.log.printf("Exiting due to small primal infeasibility se %e\n", max_val); - phase2::prepare_optimality(lp, - settings, - ft, - objective, - basic_list, - nonbasic_list, - vstatus, - phase, - start_time, - max_val, - iter, - x, - y, - z, - sol); - status = dual::status_t::OPTIMAL; - break; + settings.log.printf("No entering variable found. 
Iter %d\n", iter); + settings.log.printf("Scaled infeasibility %e\n", max_val); + f_t perturbation = phase2::amount_of_perturbation(lp, objective); + + if (perturbation > 0.0 && phase == 2) { + // Try to remove perturbation + std::vector unperturbed_y(m); + std::vector unperturbed_z(n); + phase2::compute_dual_solution_from_basis( + lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z); + { + const f_t dual_infeas = phase2::dual_infeasibility( + lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol); + settings.log.printf("Dual infeasibility after removing perturbation %e\n", dual_infeas); + if (dual_infeas <= settings.dual_tol) { + settings.log.printf("Removed perturbation of %.2e.\n", perturbation); + z = unperturbed_z; + y = unperturbed_y; + perturbation = 0.0; + + std::vector unperturbed_x(n); + phase2::compute_primal_solution_from_basis( + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); + x = unperturbed_x; + primal_infeasibility = phase2::compute_initial_primal_infeasibilities( + lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices); + settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility); + + objective = lp.objective; + // Need to reset the objective value, since we have recomputed x + obj = phase2::compute_perturbed_objective(objective, x); + if (dual_infeas <= settings.dual_tol && primal_infeasibility <= settings.primal_tol) { + phase2::prepare_optimality(lp, + settings, + ft, + objective, + basic_list, + nonbasic_list, + vstatus, + phase, + start_time, + max_val, + iter, + x, + y, + z, + sol); + status = dual::status_t::OPTIMAL; + break; + } + settings.log.printf( + "Continuing with perturbation removed and steepest edge norms reset\n"); + // Clear delta_z before restarting the iteration + phase2::clear_delta_z( + entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z); + continue; + } else { + std::vector unperturbed_x(n); + 
phase2::compute_primal_solution_from_basis( + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); + x = unperturbed_x; + primal_infeasibility = phase2::compute_initial_primal_infeasibilities( + lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices); + + const f_t orig_dual_infeas = phase2::dual_infeasibility( + lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol); + + if (primal_infeasibility <= settings.primal_tol && + orig_dual_infeas <= settings.dual_tol) { + phase2::prepare_optimality(lp, + settings, + ft, + objective, + basic_list, + nonbasic_list, + vstatus, + phase, + start_time, + max_val, + iter, + x, + y, + z, + sol); + status = dual::status_t::OPTIMAL; + break; + } + settings.log.printf("Failed to remove perturbation of %.2e.\n", perturbation); + } + } } + + if (perturbation == 0.0 && phase == 2) { + constexpr bool use_farkas = false; + if constexpr (use_farkas) { + std::vector farkas_y; + std::vector farkas_zl; + std::vector farkas_zu; + f_t farkas_constant; + std::vector my_delta_y; + delta_y_sparse.to_dense(my_delta_y); + + // TODO(CMM): Do I use the perturbed or unperturbed objective? + const f_t obj_val = phase2::compute_perturbed_objective(objective, x); + phase2::compute_farkas_certificate(lp, + settings, + vstatus, + x, + y, + z, + my_delta_y, + delta_z, + direction, + leaving_index, + obj_val, + farkas_y, + farkas_zl, + farkas_zu, + farkas_constant); + } + } + const f_t dual_infeas = phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol); settings.log.printf("Dual infeasibility %e\n", dual_infeas); const f_t primal_inf = phase2::primal_infeasibility(lp, settings, vstatus, x); settings.log.printf("Primal infeasibility %e\n", primal_inf); + settings.log.printf("Updates %d\n", ft.num_updates()); + settings.log.printf("Steepest edge %e\n", max_val); if (dual_infeas > settings.dual_tol) { settings.log.printf( "Numerical issues encountered. 
No entering variable found with large infeasibility.\n"); @@ -1410,121 +2660,111 @@ dual::status_t dual_phase2(i_t phase, return dual::status_t::DUAL_UNBOUNDED; } + timers.start_timer(); // Update dual variables // y <- y + steplength * delta_y - for (i_t i = 0; i < m; ++i) { - y[i] += step_length * delta_y[i]; - } - // z <- z + steplength * delta_z - for (i_t j = 0; j < n; ++j) { - z[j] += step_length * delta_z[j]; - } + phase2::update_dual_variables( + delta_y_sparse, delta_z_indices, delta_z, step_length, leaving_index, y, z); + timers.vector_time += timers.stop_timer(); #ifdef COMPUTE_DUAL_RESIDUAL - dual_res1 = z; - for (i_t j = 0; j < n; ++j) { - dual_res1[j] -= objective[j]; - } - matrix_transpose_vector_multiply(lp.A, 1.0, y, 1.0, dual_res1); + phase2::compute_dual_residual(lp.A, objective, y, z, dual_res1); f_t dual_res_norm = vector_norm_inf(dual_res1); if (dual_res_norm > settings.dual_tol) { settings.log.printf("|| A'*y + z - c || %e steplength %e\n", dual_res_norm, step_length); } #endif + timers.start_timer(); // Update primal variable - std::vector atilde(m); - std::vector delta_x_flip(n); - phase2::flip_bounds( - lp, settings, objective, z, nonbasic_list, entering_index, vstatus, delta_x_flip, atilde); - - // B*delta_xB_0 = atilde - std::vector delta_xB_0(m); - ft.b_solve(atilde, delta_xB_0); - for (i_t k = 0; k < m; ++k) { - const i_t j = basic_list[k]; - x[j] += delta_xB_0[k]; - } - for (i_t k = 0; k < n - m; ++k) { - const i_t j = nonbasic_list[k]; - x[j] += delta_x_flip[j]; - } - - f_t delta_x_leaving; - if (direction == 1) { - delta_x_leaving = lp.lower[leaving_index] - x[leaving_index]; - } else { - delta_x_leaving = lp.upper[leaving_index] - x[leaving_index]; - } - // B*w = -A(:, entering) - std::vector scaled_delta_xB(m); - std::fill(rhs.begin(), rhs.end(), 0.0); - lp.A.load_a_column(entering_index, rhs); - std::vector utilde(m); - ft.b_solve(rhs, scaled_delta_xB, utilde); - for (i_t i = 0; i < m; ++i) { - scaled_delta_xB[i] *= -1.0; + const 
i_t num_flipped = phase2::flip_bounds(lp, + settings, + bounded_variables, + objective, + z, + delta_z_indices, + nonbasic_list, + entering_index, + vstatus, + delta_x_flip, + atilde_mark, + atilde, + atilde_index); + + timers.flip_time += timers.stop_timer(); + + sparse_vector_t delta_xB_0_sparse(m, 0); + if (num_flipped > 0) { + timers.start_timer(); + phase2::adjust_for_flips(ft, + basic_list, + delta_z_indices, + atilde_index, + atilde, + atilde_mark, + delta_xB_0_sparse, + delta_x_flip, + x); + timers.ftran_time += timers.stop_timer(); } -#ifdef COMPUTE_BSOLVE_RESIDUAL - { - std::vector residual_B(m); - b_multiply(lp, basic_list, scaled_delta_xB, residual_B); - f_t err_max = 0; - for (Int k = 0; k < m; ++k) { - const f_t err = std::abs(rhs[k] - residual_B[k]); - if (err >= 1e-5) { - settings.log.printf( - "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]); - } - err_max = std::max(err_max, err); - } - assert(err_max < 1e-4); + timers.start_timer(); + sparse_vector_t utilde_sparse(m, 0); + sparse_vector_t scaled_delta_xB_sparse(m, 0); + sparse_vector_t rhs_sparse(lp.A, entering_index); + if (phase2::compute_delta_x(lp, + ft, + entering_index, + leaving_index, + basic_leaving_index, + direction, + basic_list, + delta_x_flip, + rhs_sparse, + x, + utilde_sparse, + scaled_delta_xB_sparse, + delta_x) == -1) { + settings.log.printf("Failed to compute delta_x. 
Iter %d\n", iter); + return dual::status_t::NUMERICAL; } -#endif - f_t primal_step_length = delta_x_leaving / scaled_delta_xB[basic_leaving_index]; - for (i_t k = 0; k < m; ++k) { - const i_t j = basic_list[k]; - delta_x[j] = primal_step_length * scaled_delta_xB[k]; - } - delta_x[leaving_index] = delta_x_leaving; - for (i_t k = 0; k < n - m; k++) { - const i_t j = nonbasic_list[k]; - delta_x[j] = 0.0; - } - delta_x[entering_index] = primal_step_length; + timers.ftran_time += timers.stop_timer(); -#ifdef COMPUTE_PRIMAL_STEP_RESIDUAL +#ifdef CHECK_PRIMAL_STEP + std::vector residual(m); matrix_vector_multiply(lp.A, 1.0, delta_x, 1.0, residual); - f_t primal_step_err = vector_norm_inf(residual); + f_t primal_step_err = vector_norm_inf(residual); if (primal_step_err > 1e-4) { settings.log.printf("|| A * dx || %e\n", primal_step_err); } #endif + timers.start_timer(); const i_t steepest_edge_status = phase2::update_steepest_edge_norms(settings, basic_list, ft, direction, - delta_y, - scaled_delta_xB, + delta_y_sparse, + steepest_edge_norm_check, + scaled_delta_xB_sparse, basic_leaving_index, entering_index, + v, delta_y_steepest_edge); #ifdef STEEPEST_EDGE_DEBUG if (steepest_edge_status == -1) { settings.log.printf("Num updates %d\n", ft.num_updates()); - settings.log.printf(" Primal step length %e\n", primal_step_length); - settings.log.printf("|| delta_xB || %e\n", vector_norm_inf(scaled_delta_xB)); settings.log.printf("|| rhs || %e\n", vector_norm_inf(rhs)); } #endif assert(steepest_edge_status == 0); + timers.se_norms_time += timers.stop_timer(); + timers.start_timer(); // x <- x + delta_x - for (i_t j = 0; j < n; ++j) { - x[j] += delta_x[j]; - } + phase2::update_primal_variables(scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x); + timers.vector_time += timers.stop_timer(); + #ifdef COMPUTE_PRIMAL_RESIDUAL residual = lp.rhs; matrix_vector_multiply(lp.A, 1.0, x, -1.0, residual); @@ -1534,10 +2774,68 @@ dual::status_t dual_phase2(i_t phase, } #endif + 
timers.start_timer(); + // TODO(CMM): Do I also need to update the objective due to the bound flips? + // TODO(CMM): I'm using the unperturbed objective here, should this be the perturbed objective? + phase2::update_objective( + basic_list, scaled_delta_xB_sparse.i, lp.objective, delta_x, entering_index, obj); + timers.objective_time += timers.stop_timer(); + + timers.start_timer(); + // Update primal infeasibilities due to changes in basic variables + // from flipping bounds +#ifdef CHECK_BASIC_INFEASIBILITIES + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 2); +#endif + phase2::update_primal_infeasibilities(lp, + settings, + basic_list, + x, + entering_index, + leaving_index, + delta_xB_0_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility); + // Update primal infeasibilities due to changes in basic variables + // from the leaving and entering variables + phase2::update_primal_infeasibilities(lp, + settings, + basic_list, + x, + entering_index, + leaving_index, + scaled_delta_xB_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility); + // Update the entering variable + phase2::update_single_primal_infeasibility(lp.lower, + lp.upper, + x, + settings.primal_tol, + squared_infeasibilities, + infeasibility_indices, + entering_index, + primal_infeasibility); + + phase2::clean_up_infeasibilities(squared_infeasibilities, infeasibility_indices); + +#if CHECK_PRIMAL_INFEASIBILITIES + phase2::check_primal_infeasibilities( + lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices); +#endif + timers.update_infeasibility_time += timers.stop_timer(); + + // Clear delta_x + phase2::clear_delta_x(basic_list, entering_index, scaled_delta_xB_sparse, delta_x); + + timers.start_timer(); f_t sum_perturb = 0.0; - phase2::compute_perturbation(lp, settings, z, objective, sum_perturb); + phase2::compute_perturbation(lp, settings, delta_z_indices, z, objective, 
sum_perturb); + timers.perturb_time += timers.stop_timer(); - // Update basis + // Update basis information vstatus[entering_index] = variable_status_t::BASIC; if (lp.lower[leaving_index] != lp.upper[leaving_index]) { vstatus[leaving_index] = static_cast(-direction); @@ -1546,66 +2844,119 @@ dual::status_t dual_phase2(i_t phase, } basic_list[basic_leaving_index] = entering_index; nonbasic_list[nonbasic_entering_index] = leaving_index; + nonbasic_mark[entering_index] = -1; + nonbasic_mark[leaving_index] = nonbasic_entering_index; + basic_mark[leaving_index] = -1; + basic_mark[entering_index] = basic_leaving_index; + +#ifdef CHECK_BASIC_INFEASIBILITIES + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 5); +#endif - // Refactor or Update + timers.start_timer(); + // Refactor or update the basis factorization bool should_refactor = ft.num_updates() > settings.refactor_frequency; if (!should_refactor) { - i_t recommend_refactor = ft.update(utilde, basic_leaving_index); -#ifdef CHECK_FT - { - csc_matrix_t Btest(m, m, 1); - ft.multiply_lu(Btest); - { - csc_matrix_t B(m, m, 1); - form_b(lp, basic_list, B); - csc_matrix_t Diff(m, m, 1); - add(Btest, B, 1.0, -1.0, Diff); - const f_t err = Diff.norm1(); - if (err > settings.primal_tol) { - settings.log.printf("|| B - L*U || %e\n", Diff.norm1()); - } - assert(err < settings.primal_tol); - } - } + i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index); +#ifdef CHECK_UPDATE + phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index); #endif should_refactor = recommend_refactor == 1; } +#ifdef CHECK_BASIC_INFEASIBILITIES + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6); +#endif if (should_refactor) { + bool should_recompute_x = false; if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) { + should_recompute_x = true; + settings.log.printf("Failed to factorize basis. 
Iteration %d\n", iter); + if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus); - if (factorize_basis( - lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) { - return dual::status_t::NUMERICAL; + i_t count = 0; + while (factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) { + settings.log.printf("Failed to repair basis. Iteration %d. %d deficient columns.\n", + iter, + static_cast(deficient.size())); + if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } + settings.threshold_partial_pivoting_tol = 1.0; + count++; + if (count > 10) { return dual::status_t::NUMERICAL; } + basis_repair( + lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus); + +#ifdef CHECK_BASIS_REPAIR + csc_matrix_t B(m, m, 0); + form_b(lp.A, basic_list, B); + for (i_t k = 0; k < deficient.size(); ++k) { + const i_t j = deficient[k]; + const i_t col_start = B.col_start[j]; + const i_t col_end = B.col_start[j + 1]; + const i_t col_nz = col_end - col_start; + if (col_nz != 1) { + settings.log.printf("Deficient column %d has %d nonzeros\n", j, col_nz); + } + const i_t i = B.i[col_start]; + if (i != slacks_needed[k]) { + settings.log.printf("Slack %d needed but found %d instead\n", slacks_needed[k], i); + } + } +#endif } + + settings.log.printf("Successfully repaired basis. 
Iteration %d\n", iter); } reorder_basic_list(q, basic_list); ft.reset(L, U, p); + phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); + if (should_recompute_x) { + std::vector unperturbed_x(n); + phase2::compute_primal_solution_from_basis( + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); + x = unperturbed_x; + } + phase2::compute_initial_primal_infeasibilities( + lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices); } +#ifdef CHECK_BASIC_INFEASIBILITIES + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7); +#endif + timers.lu_update_time += timers.stop_timer(); + timers.start_timer(); phase2::compute_steepest_edge_norm_entering( settings, m, ft, basic_leaving_index, entering_index, delta_y_steepest_edge); + timers.se_entering_time += timers.stop_timer(); #ifdef STEEPEST_EDGE_DEBUG if (iter < 100 || iter % 100 == 0)) - { - phase2::check_steepest_edge_norms(settings, basic_list, ft, delta_y_steepest_edge); - } + { + phase2::check_steepest_edge_norms(settings, basic_list, ft, delta_y_steepest_edge); + } +#endif + +#ifdef CHECK_BASIS_MARK + phase2::check_basis_mark(settings, basic_list, nonbasic_list, basic_mark, nonbasic_mark); #endif iter++; - const f_t obj = compute_objective(lp, x); - f_t now = toc(start_time); + // Clear delta_z + phase2::clear_delta_z(entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z); + + f_t now = toc(start_time); if ((iter - start_iter) < settings.first_iteration_log || (iter % settings.iteration_log_frequency) == 0) { if (phase == 1 && iter == 1) { - settings.log.printf(" Iter Objective Primal Infeas Perturb Time\n"); + settings.log.printf(" Iter Objective Num Inf. Sum Inf. 
Perturb Time\n"); } - settings.log.printf("%5d %+.8e %.8e %.2e %.2f\n", + settings.log.printf("%5d %+.16e %7d %.8e %.2e %.2f\n", iter, compute_user_objective(lp, obj), + infeasibility_indices.size(), primal_infeasibility, sum_perturb, now); @@ -1624,6 +2975,20 @@ dual::status_t dual_phase2(i_t phase, } } if (iter >= iter_limit) { status = dual::status_t::ITERATION_LIMIT; } + + if (phase == 2) { + timers.print_timers(settings); + constexpr bool print_stats = false; + if constexpr (print_stats) { + settings.log.printf("Sparse delta_z %8d %8.2f%\n", + sparse_delta_z, + 100.0 * sparse_delta_z / (sparse_delta_z + dense_delta_z)); + settings.log.printf("Dense delta_z %8d %8.2f%\n", + dense_delta_z, + 100.0 * dense_delta_z / (sparse_delta_z + dense_delta_z)); + ft.print_stats(); + } + } return status; } diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index d94fd0f6c..68043e06a 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -23,6 +23,128 @@ namespace cuopt::linear_programming::dual_simplex { +template +void bound_strengthening(const std::vector& row_sense, + const simplex_solver_settings_t& settings, + lp_problem_t& problem) +{ + const i_t m = problem.num_rows; + const i_t n = problem.num_cols; + + std::vector constraint_lower(m); + std::vector num_lower_infinity(m); + std::vector num_upper_infinity(m); + + csc_matrix_t Arow(1, 1, 1); + problem.A.transpose(Arow); + + std::vector less_rows; + less_rows.reserve(m); + + for (i_t i = 0; i < m; ++i) { + if (row_sense[i] == 'L') { less_rows.push_back(i); } + } + + std::vector lower = problem.lower; + std::vector upper = problem.upper; + + std::vector updated_variables_list; + updated_variables_list.reserve(n); + std::vector updated_variables_mark(n, 0); + + i_t iter = 0; + const i_t iter_limit = 10; + i_t total_strengthened_variables = 0; + settings.log.printf("Less equal rows %d\n", less_rows.size()); + while (iter < iter_limit && less_rows.size() > 
0) { + // Derive bounds on the constraints + settings.log.printf("Running bound strengthening on %d rows\n", + static_cast(less_rows.size())); + for (i_t i : less_rows) { + const i_t row_start = Arow.col_start[i]; + const i_t row_end = Arow.col_start[i + 1]; + num_lower_infinity[i] = 0; + num_upper_infinity[i] = 0; + + f_t lower_limit = 0.0; + for (i_t p = row_start; p < row_end; ++p) { + const i_t j = Arow.i[p]; + const f_t a_ij = Arow.x[p]; + if (a_ij > 0) { + lower_limit += a_ij * lower[j]; + } else if (a_ij < 0) { + lower_limit += a_ij * upper[j]; + } + if (lower[j] == -inf && a_ij > 0) { + num_lower_infinity[i]++; + lower_limit = -inf; + } + if (upper[j] == inf && a_ij < 0) { + num_lower_infinity[i]++; + lower_limit = -inf; + } + } + constraint_lower[i] = lower_limit; + } + + // Use the constraint bounds to derive new bounds on the variables + for (i_t i : less_rows) { + if (std::isfinite(constraint_lower[i]) && num_lower_infinity[i] == 0) { + const i_t row_start = Arow.col_start[i]; + const i_t row_end = Arow.col_start[i + 1]; + for (i_t p = row_start; p < row_end; ++p) { + const i_t k = Arow.i[p]; + const f_t a_ik = Arow.x[p]; + if (a_ik > 0) { + const f_t new_upper = lower[k] + (problem.rhs[i] - constraint_lower[i]) / a_ik; + if (new_upper < upper[k]) { + upper[k] = new_upper; + if (lower[k] > upper[k]) { + settings.log.printf( + "\t INFEASIBLE!!!!!!!!!!!!!!!!! 
constraint_lower %e lower %e rhs %e\n", + constraint_lower[i], + lower[k], + problem.rhs[i]); + } + if (!updated_variables_mark[k]) { updated_variables_list.push_back(k); } + } + } else if (a_ik < 0) { + const f_t new_lower = upper[k] + (problem.rhs[i] - constraint_lower[i]) / a_ik; + if (new_lower > lower[k]) { + lower[k] = new_lower; + if (lower[k] > upper[k]) { + settings.log.printf("\t INFEASIBLE !!!!!!!!!!!!!!!!!!1\n"); + } + if (!updated_variables_mark[k]) { updated_variables_list.push_back(k); } + } + } + } + } + } + less_rows.clear(); + + // Update the bounds on the constraints + settings.log.printf("Round %d: Strengthend %d variables\n", + iter, + static_cast(updated_variables_list.size())); + total_strengthened_variables += updated_variables_list.size(); + for (i_t j : updated_variables_list) { + updated_variables_mark[j] = 0; + const i_t col_start = problem.A.col_start[j]; + const i_t col_end = problem.A.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = problem.A.i[p]; + less_rows.push_back(i); + } + } + updated_variables_list.clear(); + iter++; + } + settings.log.printf("Total strengthened variables %d\n", total_strengthened_variables); + problem.lower = lower; + problem.upper = upper; +} + template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, @@ -500,6 +622,7 @@ i_t add_artifical_variables(lp_problem_t& problem, template void convert_user_problem(const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, lp_problem_t& problem, std::vector& new_slacks) { @@ -559,6 +682,14 @@ void convert_user_problem(const user_problem_t& user_problem, convert_greater_to_less(user_problem, row_sense, problem, greater_rows, less_rows); } + // At this point the problem representation is in the form: A*x {<=, =} b + // This is the time to run bound strengthening + constexpr bool run_bound_strengthening = false; + if constexpr (run_bound_strengthening) { + settings.log.printf("Running bound 
strengthening\n"); + bound_strengthening(row_sense, settings, problem); + } + // The original problem may have a variable without a lower bound // but a finite upper bound // -inf < x_j <= u_j @@ -669,7 +800,8 @@ void convert_user_lp_with_guess(const user_problem_t& user_problem, lp_solution_t& converted_solution) { std::vector new_slacks; - convert_user_problem(user_problem, problem, new_slacks); + simplex_solver_settings_t settings; + convert_user_problem(user_problem, settings, problem, new_slacks); crush_primal_solution_with_slack( user_problem, problem, initial_solution.x, initial_slack, new_slacks, converted_solution.x); crush_dual_solution(user_problem, @@ -900,9 +1032,11 @@ void uncrush_solution(const presolve_info_t& presolve_info, #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template void convert_user_problem(const user_problem_t& user_problem, - lp_problem_t& problem, - std::vector& new_slacks); +template void convert_user_problem( + const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, + lp_problem_t& problem, + std::vector& new_slacks); template void convert_user_lp_with_guess( const user_problem_t& user_problem, diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp index 947c637cb..7a307e6f7 100644 --- a/cpp/src/dual_simplex/presolve.hpp +++ b/cpp/src/dual_simplex/presolve.hpp @@ -63,6 +63,7 @@ struct presolve_info_t { template void convert_user_problem(const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, lp_problem_t& problem, std::vector& new_slacks); diff --git a/cpp/src/dual_simplex/random.hpp b/cpp/src/dual_simplex/random.hpp index e1ad01fef..dfc60dbd5 100644 --- a/cpp/src/dual_simplex/random.hpp +++ b/cpp/src/dual_simplex/random.hpp @@ -21,7 +21,7 @@ namespace cuopt::linear_programming::dual_simplex { -template +template class random_t { public: random_t(i_t seed) : gen(seed) {} @@ -34,6 +34,12 @@ class random_t { return distrib(gen); } + f_t random() + { + 
std::uniform_real_distribution<> distrib(0.0, 1.0); + return distrib(gen); + } + private: std::mt19937 gen; }; diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index 57eb9b01d..caf80ad11 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -31,12 +31,11 @@ namespace { // submatrix during the LU factorization template struct element_t { - i_t i; // row index - i_t j; // column index - f_t x; // coefficient value - i_t - next_in_column; // index of the next element in the column: nullptr if there is no next element - i_t next_in_row; // index of the next element in the row: nullptr if there is no next element + i_t i; // row index + i_t j; // column index + f_t x; // coefficient value + i_t next_in_column; // index of the next element in the column: kNone if there is no next element + i_t next_in_row; // index of the next element in the row: kNone if there is no next element }; constexpr int kNone = -1; @@ -165,6 +164,34 @@ void initialize_max_in_column(const std::vector& first_in_col, } } +template +f_t maximum_in_row(i_t i, + const std::vector& first_in_row, + std::vector>& elements) +{ + f_t max_in_row = 0.0; + for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { + element_t* entry = &elements[p]; + assert(entry->i == i); + max_in_row = std::max(max_in_row, std::abs(entry->x)); + } + return max_in_row; +} + +template +void initialize_max_in_row(const std::vector& first_in_row, + std::vector>& elements, + std::vector& max_in_row) +{ + const i_t m = first_in_row.size(); + for (i_t i = 0; i < m; ++i) { + max_in_row[i] = maximum_in_row(i, first_in_row, elements); + } +} + +#undef THRESHOLD_ROOK_PIVOTING // Disable threshold rook pivoting for now. + // 3% slower when enabled. But keep it around + // for challenging numerical problems. 
template i_t markowitz_search(const std::vector& Cdegree, const std::vector& Rdegree, @@ -173,6 +200,7 @@ i_t markowitz_search(const std::vector& Cdegree, const std::vector& first_in_row, const std::vector& first_in_col, const std::vector& max_in_column, + const std::vector& max_in_row, std::vector>& elements, f_t pivot_tol, f_t threshold_tol, @@ -199,6 +227,7 @@ i_t markowitz_search(const std::vector& Cdegree, element_t* entry = &elements[p]; const i_t i = entry->i; assert(entry->j == j); +#ifdef CHECK_RDEGREE if (Rdegree[i] < 0) { if (verbose) { printf("Rdegree[%d] %d. Searching in column %d. Entry i %d j %d val %e\n", @@ -210,9 +239,13 @@ i_t markowitz_search(const std::vector& Cdegree, entry->x); } } +#endif assert(Rdegree[i] >= 0); const i_t Mij = (Rdegree[i] - 1) * (nz - 1); if (Mij < markowitz && std::abs(entry->x) >= threshold_tol * max_in_col && +#ifdef THRESHOLD_ROOK_PIVOTING + std::abs(entry->x) >= threshold_tol * max_in_row[i] && +#endif std::abs(entry->x) >= pivot_tol) { markowitz = Mij; pivot_i = i; @@ -233,6 +266,9 @@ i_t markowitz_search(const std::vector& Cdegree, assert(row_count[nz].size() >= 0); for (const i_t i : row_count[nz]) { assert(Rdegree[i] == nz); +#ifdef THRESHOLD_ROOK_PIVOTING + const f_t max_in_row_i = max_in_row[i]; +#endif for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { element_t* entry = &elements[p]; const i_t j = entry->j; @@ -241,6 +277,9 @@ i_t markowitz_search(const std::vector& Cdegree, assert(Cdegree[j] >= 0); const i_t Mij = (nz - 1) * (Cdegree[j] - 1); if (Mij < markowitz && std::abs(entry->x) >= threshold_tol * max_in_col && +#ifdef THRESHOLD_ROOK_PIVOTING + std::abs(entry->x) >= threshold_tol * max_in_row_i && +#endif std::abs(entry->x) >= pivot_tol) { markowitz = Mij; pivot_i = i; @@ -257,7 +296,7 @@ i_t markowitz_search(const std::vector& Cdegree, nz++; } if (nsearch > 10) { - if (verbose) { printf("nsearch %d\n", nsearch); } + if constexpr (verbose) { printf("nsearch %d\n", nsearch); } } 
return nsearch; } @@ -333,6 +372,7 @@ void schur_complement(i_t pivot_i, std::vector& row_last_workspace, std::vector& column_j_workspace, std::vector& max_in_column, + std::vector& max_in_row, std::vector& Rdegree, std::vector& Cdegree, std::vector>& row_count, @@ -378,6 +418,9 @@ void schur_complement(i_t pivot_i, e2->x -= val; const f_t abs_e2x = std::abs(e2->x); if (abs_e2x > max_in_column[j]) { max_in_column[j] = abs_e2x; } +#ifdef THRESHOLD_ROOK_PIVOTING + if (abs_e2x > max_in_row[i]) { max_in_row[i] = abs_e2x; } +#endif } else { element_t fill; fill.i = i; @@ -385,6 +428,9 @@ void schur_complement(i_t pivot_i, fill.x = -val; const f_t abs_fillx = std::abs(fill.x); if (abs_fillx > max_in_column[j]) { max_in_column[j] = abs_fillx; } +#ifdef THRESHOLD_ROOK_PIVOTING + if (abs_fillx > max_in_row[i]) { max_in_row[i] = abs_fillx; } +#endif fill.next_in_column = kNone; fill.next_in_row = kNone; elements.push_back(fill); @@ -484,7 +530,7 @@ void remove_pivot_col(i_t pivot_i, i_t pivot_j, std::vector& first_in_col, std::vector& first_in_row, - std::vector& max_in_column, + std::vector& max_in_row, std::vector>& elements) { // Remove the pivot col @@ -492,6 +538,9 @@ void remove_pivot_col(i_t pivot_i, element_t* e = &elements[p1]; const i_t i = e->i; i_t last = kNone; +#ifdef THRESHOLD_ROOK_PIVOTING + f_t max_in_row_i = 0.0; +#endif for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { element_t* entry = &elements[p]; if (entry->j == pivot_j) { @@ -504,8 +553,17 @@ void remove_pivot_col(i_t pivot_i, entry->j = -1; entry->x = std::numeric_limits::quiet_NaN(); } +#ifdef THRESHOLD_ROOK_PIVOTING + else { + const f_t abs_entryx = std::abs(entry->x); + if (abs_entryx > max_in_row_i) { max_in_row_i = abs_entryx; } + } +#endif last = p; } +#ifdef THRESHOLD_ROOK_PIVOTING + max_in_row[i] = max_in_row_i; +#endif } first_in_col[pivot_j] = kNone; } @@ -549,7 +607,11 @@ i_t right_looking_lu(const csc_matrix_t& A, std::vector column_j_workspace(n, kNone); 
std::vector row_last_workspace(n); std::vector max_in_column(n); + std::vector max_in_row(m); initialize_max_in_column(first_in_col, elements, max_in_column); +#ifdef THRESHOLD_ROOK_PIVOTING + initialize_max_in_row(first_in_row, elements, max_in_row); +#endif csr_matrix_t Urow; // We will store U by rows in Urow during the factorization and // translate back to U at the end @@ -561,22 +623,22 @@ i_t right_looking_lu(const csc_matrix_t& A, L.x.clear(); L.i.clear(); - for (i_t k = 0; k < n; ++k) { - pinv[k] = -1; - q[k] = -1; - } + std::fill(q.begin(), q.end(), -1); + std::fill(pinv.begin(), pinv.end(), -1); + std::vector qinv(n); + std::fill(qinv.begin(), qinv.end(), -1); i_t pivots = 0; for (i_t k = 0; k < n; ++k) { // Find pivot that satisfies // abs(pivot) >= abstol, // abs(pivot) >= threshold_tol * max abs[pivot column] - i_t pivot_i = -1; - i_t pivot_j = -1; - i_t pivot_p = kNone; - constexpr f_t pivot_tol = 1e-11; - constexpr f_t drop_tol = 1e-13; - constexpr f_t threshold_tol = 1.0 / 10.0; + i_t pivot_i = -1; + i_t pivot_j = -1; + i_t pivot_p = kNone; + constexpr f_t pivot_tol = 1e-11; + const f_t drop_tol = tol == 1.0 ? 
0.0 : 1e-13; + const f_t threshold_tol = tol; markowitz_search(Cdegree, Rdegree, col_count, @@ -584,6 +646,7 @@ i_t right_looking_lu(const csc_matrix_t& A, first_in_row, first_in_col, max_in_column, + max_in_row, elements, pivot_tol, threshold_tol, @@ -598,6 +661,7 @@ i_t right_looking_lu(const csc_matrix_t& A, // Pivot pinv[pivot_i] = k; // pivot_i is the kth pivot row q[k] = pivot_j; + qinv[pivot_j] = k; const f_t pivot_val = pivot_entry->x; assert(std::abs(pivot_val) >= pivot_tol); pivots++; @@ -656,6 +720,7 @@ i_t right_looking_lu(const csc_matrix_t& A, row_last_workspace, column_j_workspace, max_in_column, + max_in_row, Rdegree, Cdegree, row_count, @@ -664,7 +729,7 @@ i_t right_looking_lu(const csc_matrix_t& A, // Remove the pivot row remove_pivot_row(pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); - remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); + remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements); // Set pivot entry to sentinel value pivot_entry->i = -1; @@ -695,6 +760,30 @@ i_t right_looking_lu(const csc_matrix_t& A, } #endif +#ifdef CHECK_MAX_IN_ROW + // Check that maximum in row is maintained + for (i_t i = 0; i < m; ++i) { + if (Rdegree[i] == -1) { continue; } + const f_t max_in_row_i = max_in_row[i]; + bool found_max = false; + f_t largest_abs_x = 0.0; + for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { + const f_t abs_e2x = std::abs(elements[p].x); + if (abs_e2x > largest_abs_x) { largest_abs_x = abs_e2x; } + if (abs_e2x > max_in_row_i) { + printf("Found max in row %d is %e but %e\n", i, max_in_row_i, abs_e2x); + } + assert(abs_e2x <= max_in_row_i); + if (abs_e2x == max_in_row_i) { found_max = true; } + } + if (!found_max) { + printf( + "Did not find max %e in row %d. 
Largest abs x is %e\n", max_in_row_i, i, largest_abs_x); + } + assert(found_max); + } +#endif + #if CHECK_BAD_ENTRIES for (Int j = 0; j < n; j++) { for (Int p = first_in_col[j]; p != kNone; p = elements[p].next_in_column) { @@ -761,6 +850,15 @@ i_t right_looking_lu(const csc_matrix_t& A, for (i_t i = 0; i < m; ++i) { if (pinv[i] == -1) { pinv[i] = start++; } } + + // Finalize the permutation q. Do this by first completing the inverse permutation qinv. + // Then invert qinv to get the final permutation q. + start = pivots; + for (i_t j = 0; j < n; ++j) { + if (qinv[j] == -1) { qinv[j] = start++; } + } + inverse_permutation(qinv, q); + return pivots; } @@ -852,7 +950,11 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, std::vector column_j_workspace(m, kNone); std::vector row_last_workspace(m); std::vector max_in_column(n); + std::vector max_in_row(m); initialize_max_in_column(first_in_col, elements, max_in_column); +#ifdef THRESHOLD_ROOK_PIVOTING + initialize_max_in_row(first_in_row, elements, max_in_row); +#endif settings.log.debug("Empty rows %ld\n", row_count[0].size()); settings.log.debug("Empty cols %ld\n", col_count[0].size()); @@ -884,6 +986,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, first_in_row, first_in_col, max_in_column, + max_in_row, elements, pivot_tol, threshold_tol, @@ -924,6 +1027,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, row_last_workspace, column_j_workspace, max_in_column, + max_in_row, Rdegree, Cdegree, row_count, @@ -933,8 +1037,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, // Remove the pivot row remove_pivot_row( pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); - remove_pivot_col( - pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); + remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements); // Set pivot entry to sentinel value pivot_entry->i = -1; diff --git 
a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index df5e4e1d0..a51ed19bc 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -46,6 +46,8 @@ struct simplex_solver_settings_t { cut_off(std::numeric_limits::infinity()), steepest_edge_ratio(0.5), steepest_edge_primal_tol(1e-9), + hypersparse_threshold(0.05), + threshold_partial_pivoting_tol(1.0 / 10.0), use_steepest_edge_pricing(true), use_harris_ratio(false), use_bound_flip_ratio(true), @@ -86,7 +88,9 @@ struct simplex_solver_settings_t { f_t cut_off; // If the dual objective is greater than the cutoff we stop f_t steepest_edge_ratio; // the ratio of computed steepest edge mismatch from updated steepest edge - f_t steepest_edge_primal_tol; // Primal tolerance divided by steepest edge norm + f_t steepest_edge_primal_tol; // Primal tolerance divided by steepest edge norm + f_t hypersparse_threshold; + mutable f_t threshold_partial_pivoting_tol; bool use_steepest_edge_pricing; // true if using steepest edge pricing, false if using max // infeasibility pricing bool use_harris_ratio; // true if using the harris ratio test diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 464bd7047..e665bae97 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -244,7 +244,7 @@ lp_status_t solve_linear_program(const user_problem_t& user_problem, f_t start_time = tic(); lp_problem_t original_lp(1, 1, 1); std::vector new_slacks; - convert_user_problem(user_problem, original_lp, new_slacks); + convert_user_problem(user_problem, settings, original_lp, new_slacks); solution.resize(user_problem.num_rows, user_problem.num_cols); lp_solution_t lp_solution(original_lp.num_rows, original_lp.num_cols); std::vector vstatus; @@ -283,7 +283,7 @@ i_t solve(const user_problem_t& problem, lp_problem_t original_lp( problem.num_rows, problem.num_cols, 
problem.A.col_start[problem.A.n]); std::vector new_slacks; - convert_user_problem(problem, original_lp, new_slacks); + convert_user_problem(problem, settings, original_lp, new_slacks); lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); std::vector vstatus; std::vector edge_norms; diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp index 830838bf5..dc4df3990 100644 --- a/cpp/src/dual_simplex/sparse_matrix.cpp +++ b/cpp/src/dual_simplex/sparse_matrix.cpp @@ -16,6 +16,7 @@ */ #include +#include #include @@ -148,6 +149,61 @@ i_t csc_matrix_t::load_a_column(i_t j, std::vector& Aj) const return (col_end - col_start); } +template +void csc_matrix_t::append_column(const std::vector& x) +{ + const i_t m = this->m; + assert(x.size() == m); + const i_t xsz = x.size(); + i_t nz = this->col_start[this->n]; + for (i_t j = 0; j < xsz; ++j) { + if (x[j] != 0.0) { + this->i[nz] = j; + this->x[nz] = x[j]; + nz++; + } + } + this->col_start[this->n + 1] = nz; + this->n++; +} + +template +void csc_matrix_t::append_column(const sparse_vector_t& x) +{ + const i_t m = this->m; + assert(x.n == m); + i_t nz = this->col_start[this->n]; + const i_t xnz = x.i.size(); + for (i_t k = 0; k < xnz; ++k) { + const i_t i = x.i[k]; + const f_t x_val = x.x[k]; + if (x_val != 0.0) { + this->i[nz] = i; + this->x[nz] = x_val; + nz++; + } + } + this->col_start[this->n + 1] = nz; + this->n++; +} + +template +void csc_matrix_t::append_column(i_t x_nz, i_t* i, f_t* x) +{ + i_t nz = this->col_start[this->n]; + for (i_t k = 0; k < x_nz; ++k) { + const i_t i_val = i[k]; + const f_t x_val = x[i_val]; + if (x_val != 0.0) { + this->i[nz] = i_val; + this->x[nz] = x_val; + nz++; + } + } + this->col_start[this->n + 1] = nz; + this->n++; +} + template i_t csc_matrix_t::transpose(csc_matrix_t& AT) const { @@ -360,6 +416,28 @@ void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vecto } } +// x <- x + alpha * A(:, j) +template +void scatter_dense(const 
csc_matrix_t& A, + i_t j, + f_t alpha, + std::vector& x, + std::vector& mark, + std::vector& indices) +{ + const i_t col_start = A.col_start[j]; + const i_t col_end = A.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = A.i[p]; + const f_t ax = A.x[p]; + x[i] += alpha * ax; + if (!mark[i]) { + mark[i] = 1; + indices.push_back(i); + } + } +} + // Compute C = A*B where C is m x n, A is m x k, and B = k x n // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j) template @@ -695,6 +773,13 @@ template void scatter_dense(const csc_matrix_t& A, double alpha, std::vector& x); +template void scatter_dense(const csc_matrix_t& A, + int j, + double alpha, + std::vector& x, + std::vector& mark, + std::vector& indices); + template int multiply(const csc_matrix_t& A, const csc_matrix_t& B, csc_matrix_t& C); diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp index 29e6a0cf4..9cc3d6380 100644 --- a/cpp/src/dual_simplex/sparse_matrix.hpp +++ b/cpp/src/dual_simplex/sparse_matrix.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,9 @@ namespace cuopt::linear_programming::dual_simplex { template class csr_matrix_t; // Forward declaration of CSR matrix needed to define CSC matrix +template +class sparse_vector_t; // Forward declaration of sparse vector needed to define CSC matrix + // A sparse matrix stored in compressed sparse column format template class csc_matrix_t { @@ -59,6 +63,15 @@ class csc_matrix_t { // Compute the transpose of A i_t transpose(csc_matrix_t& AT) const; + // Append a dense column to the matrix. Assumes the matrix has already been resized accordingly + void append_column(const std::vector& x); + + // Append a sparse column to the matrix. Assumes the matrix has already been resized accordingly + void append_column(const sparse_vector_t& x); + + // Append a sparse column to the matrix. 
Assumes the matrix has already been resized accordingly + void append_column(i_t nz, i_t* i, f_t* x); + // Remove columns from the matrix i_t remove_columns(const std::vector& cols_to_remove); @@ -131,6 +144,14 @@ i_t scatter(const csc_matrix_t& A, template void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vector& x); +template +void scatter_dense(const csc_matrix_t& A, + i_t j, + f_t alpha, + std::vector& x, + std::vector& mark, + std::vector& indices); + // Compute C = A*B where C is m x n, A is m x k, and B = k x n // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j) template diff --git a/cpp/src/dual_simplex/sparse_vector.cpp b/cpp/src/dual_simplex/sparse_vector.cpp new file mode 100644 index 000000000..73a0c0a8f --- /dev/null +++ b/cpp/src/dual_simplex/sparse_vector.cpp @@ -0,0 +1,224 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +sparse_vector_t::sparse_vector_t(const csc_matrix_t& A, i_t col) +{ + const i_t col_start = A.col_start[col]; + const i_t col_end = A.col_start[col + 1]; + n = A.m; + const i_t nz = col_end - col_start; + i.reserve(nz); + x.reserve(nz); + for (i_t k = col_start; k < col_end; ++k) { + i.push_back(A.i[k]); + x.push_back(A.x[k]); + } +} + +template +void sparse_vector_t::from_dense(const std::vector& in) +{ + i.clear(); + x.clear(); + n = in.size(); + i.reserve(n); + x.reserve(n); + for (i_t k = 0; k < n; ++k) { + if (in[k] != 0) { + i.push_back(k); + x.push_back(in[k]); + } + } +} + +template +void sparse_vector_t::to_csc(csc_matrix_t& A) const +{ + A.m = n; + A.n = 1; + A.nz_max = i.size(); + A.col_start.resize(2); + A.col_start[0] = 0; + A.col_start[1] = i.size(); + A.i = i; + A.x = x; +} + +template +void sparse_vector_t::to_dense(std::vector& x_dense) const +{ + x_dense.clear(); + x_dense.resize(n, 0.0); + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + x_dense[i[k]] = x[k]; + } +} + +template +void sparse_vector_t::scatter(std::vector& x_dense) const +{ + // Assumes x_dense is already cleared + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + x_dense[i[k]] += x[k]; + } +} + +template +void sparse_vector_t::inverse_permute_vector(const std::vector& p) +{ + assert(p.size() == n); + i_t nz = i.size(); + std::vector i_perm(nz); + for (i_t k = 0; k < nz; ++k) { + i_perm[k] = p[i[k]]; + } + i = i_perm; +} + +template +void sparse_vector_t::inverse_permute_vector(const std::vector& p, + sparse_vector_t& y) const +{ + i_t m = p.size(); + assert(n == m); + i_t nz = i.size(); + y.n = n; + y.x = x; + std::vector i_perm(nz); + for (i_t k = 0; k < nz; ++k) { + i_perm[k] = p[i[k]]; + } + y.i = i_perm; +} + +template +f_t sparse_vector_t::sparse_dot(const csc_matrix_t& Y, i_t y_col) const +{ + const i_t col_start = Y.col_start[y_col]; + 
const i_t col_end = Y.col_start[y_col + 1]; + const i_t ny = col_end - col_start; + const i_t nx = i.size(); + f_t dot = 0.0; + for (i_t h = 0, k = col_start; h < nx && k < col_end;) { + const i_t p = i[h]; + const i_t q = Y.i[k]; + if (p == q) { + dot += Y.x[k] * x[h]; + h++; + k++; + } else if (p < q) { + h++; + } else if (q < p) { + k++; + } + } + return dot; +} + +template +void sparse_vector_t::sort() +{ + if (i.size() == 1) { return; } + // If the number of nonzeros is large, use a O(n) bucket sort + if (i.size() > 0.3 * n) { + std::vector bucket(n, 0.0); + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + bucket[i[k]] = x[k]; + } + i.clear(); + i.reserve(nz); + x.clear(); + x.reserve(nz); + for (i_t k = 0; k < n; ++k) { + if (bucket[k] != 0.0) { + i.push_back(k); + x.push_back(bucket[k]); + } + } + } else { + // Use a n log n sort + const i_t nz = i.size(); + std::vector i_sorted(nz); + std::vector x_sorted(nz); + std::vector perm(nz); + for (i_t k = 0; k < nz; ++k) { + perm[k] = k; + } + std::vector& iunsorted = i; + std::sort( + perm.begin(), perm.end(), [&iunsorted](i_t a, i_t b) { return iunsorted[a] < iunsorted[b]; }); + for (i_t k = 0; k < nz; ++k) { + i_sorted[k] = i[perm[k]]; + x_sorted[k] = x[perm[k]]; + } + i = i_sorted; + x = x_sorted; + } + + // Check +#ifdef CHECK_SORT + if (!std::is_sorted(i.begin(), i.end())) { printf("Sort error\n"); } +#endif +} + +template +f_t sparse_vector_t::norm2_squared() const +{ + f_t dot = 0.0; + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + dot += x[k] * x[k]; + } + return dot; +} + +template +void sparse_vector_t::negate() +{ + const i_t nz = x.size(); + for (i_t k = 0; k < nz; ++k) { + x[k] *= -1.0; + } +} + +template +f_t sparse_vector_t::find_coefficient(i_t index) const +{ + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + if (i[k] == index) { return x[k]; } + } + return std::numeric_limits::quiet_NaN(); +} + +#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE +template class 
sparse_vector_t; +#endif + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/sparse_vector.hpp b/cpp/src/dual_simplex/sparse_vector.hpp new file mode 100644 index 000000000..cf970acda --- /dev/null +++ b/cpp/src/dual_simplex/sparse_vector.hpp @@ -0,0 +1,64 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace cuopt::linear_programming::dual_simplex { + +// A sparse vector stored as a list of nonzero coefficients and their indices +template +class sparse_vector_t { + public: + // Construct a sparse vector of dimension n with nz nonzero coefficients + sparse_vector_t(i_t n, i_t nz) : n(n), i(nz), x(nz) {} + // Construct a sparse vector from a dense vector. + sparse_vector_t(const std::vector& in) { from_dense(in); } + // Construct a sparse vector from a column of a CSC matrix + sparse_vector_t(const csc_matrix_t& A, i_t col); + // gather a dense vector into a sparse vector + void from_dense(const std::vector& in); + // convert a sparse vector into a CSC matrix with a single column + void to_csc(csc_matrix_t& A) const; + // convert a sparse vector into a dense vector. Dense vector is cleared and resized. + void to_dense(std::vector& x_dense) const; + // scatter a sparse vector into a dense vector. 
Assumes x_dense is already cleared or + // preinitialized + void scatter(std::vector& x_dense) const; + // inverse permute the current sparse vector + void inverse_permute_vector(const std::vector& p); + // inverse permute a sparse vector into another sparse vector + void inverse_permute_vector(const std::vector& p, sparse_vector_t& y) const; + // compute the dot product of a sparse vector with a column of a CSC matrix + f_t sparse_dot(const csc_matrix_t& Y, i_t y_col) const; + // ensure the coefficients in the sparse vectory are sorted in terms of increasing index + void sort(); + // compute the squared 2-norm of the sparse vector + f_t norm2_squared() const; + void negate(); + f_t find_coefficient(i_t index) const; + + i_t n; + std::vector i; + std::vector x; +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/triangle_solve.cpp b/cpp/src/dual_simplex/triangle_solve.cpp index eddf04843..13a42d2f9 100644 --- a/cpp/src/dual_simplex/triangle_solve.cpp +++ b/cpp/src/dual_simplex/triangle_solve.cpp @@ -89,26 +89,25 @@ i_t upper_triangular_transpose_solve(const csc_matrix_t& U, std::vecto return 0; } -// \brief Reach computes the reach of b=B(:, col) in the graph of G -// \param[in] B - Sparse CSC matrix containing rhs -// \param[in] col - column of B +// \brief Reach computes the reach of b in the graph of G +// \param[in] b - Sparse vector containing the rhs // \param[in] pinv - inverse permuation vector // \param[in, out] G - Sparse CSC matrix G. The column pointers of G are // modified (but restored) during this call \param[out] xi - stack of size 2*n. // xi[top] .. 
xi[n-1] contains the reachable indicies \returns top - the size of // the stack template -i_t reach(const csc_matrix_t& B, - i_t col, +i_t reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, std::vector& xi) { - const i_t m = G.m; - i_t top = m; - for (i_t p = B.col_start[col]; p < B.col_start[col + 1]; ++p) { - if (!MARKED(G.col_start, B.i[p])) { // start a DFS at unmarked node i - top = depth_first_search(B.i[p], pinv, G, top, xi, xi.begin() + m); + const i_t m = G.m; + i_t top = m; + const i_t bnz = b.i.size(); + for (i_t p = 0; p < bnz; ++p) { + if (!MARKED(G.col_start, b.i[p])) { // start a DFS at unmarked node i + top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m); } } for (i_t p = top; p < m; ++p) { // restore G @@ -152,7 +151,7 @@ i_t depth_first_search(i_t j, } done = 1; // Node j is done if no unvisited neighbors i_t p2 = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]); - for (i_t p = pstack[head]; p < p2; ++p) { // Examin all neighbors of j + for (i_t p = pstack[head]; p < p2; ++p) { // Examine all neighbors of j i_t i = G.i[p]; // Consider neighbor i if (MARKED(G.col_start, i)) { continue; // skip visited node i @@ -163,29 +162,31 @@ i_t depth_first_search(i_t j, break; // break to start dfs at node i } if (done) { - head--; // remove j from the recursion stack - xi[--top] = j; // and place it the output stack + pstack[head] = 0; // restore pstack so it can be used again in other routines + xi[head] = 0; // restore xi so it can be used again in other routines + head--; // remove j from the recursion stack + xi[--top] = j; // and place it the output stack } } return top; } template -i_t sparse_triangle_solve(const csc_matrix_t& B, - i_t col, +i_t sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, f_t* x) { i_t m = G.m; - assert(B.m == m); - i_t top = reach(B, col, pinv, G, xi); + assert(b.n == m); + i_t top = reach(b, pinv, G, xi); for (i_t p = top; p < m; 
++p) { x[xi[p]] = 0; // Clear x vector } - for (i_t p = B.col_start[col]; p < B.col_start[col + 1]; ++p) { - x[B.i[p]] = B.x[p]; // Scatter b + const i_t bnz = b.i.size(); + for (i_t p = 0; p < bnz; ++p) { + x[b.i[p]] = b.x[p]; // Scatter b } for (i_t px = top; px < m; ++px) { i_t j = xi[px]; // x(j) is nonzero @@ -225,8 +226,7 @@ template int upper_triangular_solve(const csc_matrix_t template int upper_triangular_transpose_solve(const csc_matrix_t& U, std::vector& x); -template int reach(const csc_matrix_t& B, - int col, +template int reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, std::vector& xi); @@ -238,12 +238,17 @@ template int depth_first_search(int j, std::vector& xi, std::vector::iterator pstack); -template int sparse_triangle_solve(const csc_matrix_t& B, - int col, +template int sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, double* x); + +template int sparse_triangle_solve(const sparse_vector_t& b, + const std::optional>& pinv, + std::vector& xi, + csc_matrix_t& G, + double* x); #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/triangle_solve.hpp b/cpp/src/dual_simplex/triangle_solve.hpp index fc01613c7..5016332da 100644 --- a/cpp/src/dual_simplex/triangle_solve.hpp +++ b/cpp/src/dual_simplex/triangle_solve.hpp @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -52,17 +53,15 @@ i_t upper_triangular_solve(const csc_matrix_t& U, std::vector& x) template i_t upper_triangular_transpose_solve(const csc_matrix_t& U, std::vector& x); -// \brief Reach computes the reach of b=B(:, col) in the graph of G -// \param[in] B - Sparse CSC matrix containing rhs -// \param[in] col - column of B +// \brief Reach computes the reach of b in the graph of G +// \param[in] b - sparse vector containing the rhs // \param[in] pinv - inverse permuation vector // \param[in, out] G - Sparse CSC matrix G. 
The column pointers of G are // modified (but restored) during this call \param[out] xi - stack of size 2*n. // xi[top] .. xi[n-1] contains the reachable indicies \returns top - the size of // the stack template -i_t reach(const csc_matrix_t& B, - i_t col, +i_t reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, std::vector& xi); @@ -95,8 +94,7 @@ i_t depth_first_search(i_t j, // and U is a sparse upper triangular matrix, and b is a sparse // right-hand side. The vector b is obtained from the column of a sparse // matrix. -// \param[in] B - Sparse CSC matrix contain the rhs -// \param[in] col - the column of B to use as b. b = B(:, col) +// \param[in] b - Sparse vector contain the rhs // \param[in] pinv - optional inverse permutation vector // \param[in, out] xi - An array of size 2*m, on output it contains the non-zero // pattern of x in xi[top] through xi[m-1] @@ -104,8 +102,7 @@ i_t depth_first_search(i_t j, // G.col_start is marked and restored during the algorithm // \param[out] - The solution vector xi_t template -i_t sparse_triangle_solve(const csc_matrix_t& B, - i_t col, +i_t sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, diff --git a/cpp/src/dual_simplex/vector_math.cpp b/cpp/src/dual_simplex/vector_math.cpp index aa05d5743..239848ac3 100644 --- a/cpp/src/dual_simplex/vector_math.cpp +++ b/cpp/src/dual_simplex/vector_math.cpp @@ -67,6 +67,47 @@ f_t dot(const std::vector& x, const std::vector& y) return dot; } +template +f_t sparse_dot( + i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val) +{ + f_t dot = 0.0; + for (i_t i = 0, j = 0; i < nx && j < ny;) { + const i_t p = xind[i]; + const i_t q = yind[j]; + if (p == q) { + dot += xval[i] * y_scatter_val[q]; + i++; + j++; + } else if (p < q) { + i++; + } else if (q < p) { + j++; + } + } + return dot; +} + +template +f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, 
i_t ny) +{ + f_t dot = 0.0; + for (i_t i = 0, j = 0; i < nx && j < ny;) { + const i_t p = xind[i]; + const i_t q = yind[j]; + if (p == q) { + dot += xval[i] * yval[j]; + i++; + j++; + } else if (p < q) { + i++; + } else if (q < p) { + j++; + } + } + return dot; +} + template f_t sparse_dot(const std::vector& xind, const std::vector& xval, @@ -146,6 +187,16 @@ template double sparse_dot(const std::vector& xind, const std::vector& yind, const std::vector& yval); +template double sparse_dot(int const* xind, + double const* xval, + int nx, + int const* yind, + int ny, + double const* y_scatter_val); + +template double sparse_dot( + int* xind, double* xval, int nx, int* yind, double* yval, int ny); + template int permute_vector(const std::vector& p, const std::vector& b, std::vector& x); diff --git a/cpp/src/dual_simplex/vector_math.hpp b/cpp/src/dual_simplex/vector_math.hpp index 962b21743..c5bd12863 100644 --- a/cpp/src/dual_simplex/vector_math.hpp +++ b/cpp/src/dual_simplex/vector_math.hpp @@ -44,6 +44,13 @@ f_t sparse_dot(const std::vector& xind, const std::vector& yind, const std::vector& yval); +template +f_t sparse_dot( + i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val); + +template +f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny); + // Computes x = P*b or x=b(p) in MATLAB. 
template i_t permute_vector(const std::vector& p, const std::vector& b, std::vector& x); diff --git a/cpp/src/linear_programming/solve.cu b/cpp/src/linear_programming/solve.cu index df3d3d1e1..c06985997 100644 --- a/cpp/src/linear_programming/solve.cu +++ b/cpp/src/linear_programming/solve.cu @@ -467,7 +467,7 @@ void run_dual_simplex_thread( template optimization_problem_solution_t run_concurrent( - optimization_problem_t& op_problem, + const optimization_problem_t& op_problem, detail::problem_t& problem, pdlp_solver_settings_t const& settings, bool is_batch_mode) @@ -540,7 +540,7 @@ optimization_problem_solution_t run_concurrent( template optimization_problem_solution_t solve_lp_with_method( - optimization_problem_t& op_problem, + const optimization_problem_t& op_problem, detail::problem_t& problem, pdlp_solver_settings_t const& settings, bool is_batch_mode) @@ -714,6 +714,12 @@ optimization_problem_solution_t solve_lp( bool problem_checking, \ bool use_pdlp_solver_mode); \ \ + template optimization_problem_solution_t solve_lp_with_method( \ + const optimization_problem_t& op_problem, \ + detail::problem_t& problem, \ + pdlp_solver_settings_t const& settings, \ + bool is_batch_mode = false); \ + \ template optimization_problem_t mps_data_model_to_optimization_problem( \ raft::handle_t const* handle_ptr, \ const cuopt::mps_parser::mps_data_model_t& data_model); diff --git a/cpp/src/linear_programming/solve.cuh b/cpp/src/linear_programming/solve.cuh index bd7eee8df..3024d6774 100644 --- a/cpp/src/linear_programming/solve.cuh +++ b/cpp/src/linear_programming/solve.cuh @@ -30,4 +30,11 @@ cuopt::linear_programming::optimization_problem_t mps_data_model_to_op raft::handle_t const* handle_ptr, const cuopt::mps_parser::mps_data_model_t& data_model); +template +cuopt::linear_programming::optimization_problem_solution_t solve_lp_with_method( + const optimization_problem_t& op_problem, + detail::problem_t& problem, + pdlp_solver_settings_t const& settings, + bool 
is_batch_mode = false); + } // namespace cuopt::linear_programming diff --git a/cpp/src/linear_programming/utilities/ping_pong_graph.cuh b/cpp/src/linear_programming/utilities/ping_pong_graph.cuh index 4939e0711..9fd5c6586 100644 --- a/cpp/src/linear_programming/utilities/ping_pong_graph.cuh +++ b/cpp/src/linear_programming/utilities/ping_pong_graph.cuh @@ -59,14 +59,12 @@ class ping_pong_graph_t { if (!is_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - // Extra NULL NULL 0 mandatory for cuda 11.8 - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph, nullptr, nullptr, 0)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - // Extra NULL NULL 0 mandatory for cuda 11.8 - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph, nullptr, nullptr, 0)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); } diff --git a/cpp/src/mip/diversity/assignment_hash_map.cu b/cpp/src/mip/diversity/assignment_hash_map.cu index cdef55ea5..91ef05bd1 100644 --- a/cpp/src/mip/diversity/assignment_hash_map.cu +++ b/cpp/src/mip/diversity/assignment_hash_map.cu @@ -97,7 +97,7 @@ size_t assignment_hash_map_t::hash_solution(solution_t& solu hash_solution_kernel <<<(integer_assignment.size() + TPB - 1) / TPB, TPB, 0, solution.handle_ptr->get_stream()>>>( cuopt::make_span(integer_assignment), cuopt::make_span(reduction_buffer)); - RAFT_CHECK_CUDA(handle_ptr->get_stream()); + RAFT_CHECK_CUDA(solution.handle_ptr->get_stream()); // Get the number of blocks used in the hash_solution_kernel int num_blocks = (integer_assignment.size() + TPB - 1) / TPB; diff --git 
a/cpp/src/mip/feasibility_jump/feasibility_jump.cu b/cpp/src/mip/feasibility_jump/feasibility_jump.cu index ba92a99a8..fa88bddd1 100644 --- a/cpp/src/mip/feasibility_jump/feasibility_jump.cu +++ b/cpp/src/mip/feasibility_jump/feasibility_jump.cu @@ -785,7 +785,7 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream if (use_graph) { cudaStreamEndCapture(climber_stream, &graph); - cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0); + cudaGraphInstantiate(&graph_instance, graph); RAFT_CHECK_CUDA(climber_stream); cudaGraphDestroy(graph); graph_created = true; diff --git a/cpp/src/mip/presolve/bounds_presolve.cu b/cpp/src/mip/presolve/bounds_presolve.cu index 45fee622e..72440cd9a 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cu +++ b/cpp/src/mip/presolve/bounds_presolve.cu @@ -202,6 +202,7 @@ termination_criterion_t bound_presolve_t::bound_update_loop(problem_t< } pb.handle_ptr->sync_stream(); calculate_infeasible_redundant_constraints(pb); + solve_iter = iter; return criteria; } diff --git a/cpp/src/mip/presolve/bounds_presolve.cuh b/cpp/src/mip/presolve/bounds_presolve.cuh index 9a25b05e9..84853a781 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cuh +++ b/cpp/src/mip/presolve/bounds_presolve.cuh @@ -86,6 +86,7 @@ class bound_presolve_t { i_t infeas_constraints_count = 0; i_t redund_constraints_count = 0; probing_cache_t probing_cache; + i_t solve_iter; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu index 091f8a53b..4b65de9c2 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu @@ -203,10 +203,10 @@ bool build_graph(managed_stream_pool& streams, if (graph_exec != nullptr) { cudaGraphExecDestroy(graph_exec); - cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0); + cudaGraphInstantiate(&graph_exec, graph); 
RAFT_CHECK_CUDA(handle_ptr->get_stream()); } else { - cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0); + cudaGraphInstantiate(&graph_exec, graph); RAFT_CHECK_CUDA(handle_ptr->get_stream()); } @@ -245,6 +245,7 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.cnst_bin_offsets, problem.offsets); + RAFT_CHECK_CUDA(stream_heavy_cnst); num_blocks_heavy_vars = create_heavy_item_block_segments(stream_heavy_vars, heavy_vars_vertex_ids, @@ -253,49 +254,33 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.vars_bin_offsets, problem.reverse_offsets); + RAFT_CHECK_CUDA(stream_heavy_vars); tmp_act.resize(2 * num_blocks_heavy_cnst, stream_heavy_cnst); tmp_bnd.resize(2 * num_blocks_heavy_vars, stream_heavy_vars); - std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); + std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = + sub_warp_meta(stream, warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); - std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); + std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = + sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); - stream.synchronize(); - streams.sync_all_issued(); + RAFT_CHECK_CUDA(stream); + streams.sync_test_all_issued(); if (!calc_slack_erase_inf_cnst_graph_created) { - bool erase_inf_cnst = true; - calc_slack_erase_inf_cnst_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_erase_inf_cnst_graph, - calc_slack_erase_inf_cnst_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(true); + 
calc_slack_erase_inf_cnst_graph_created = true; } if (!calc_slack_graph_created) { - bool erase_inf_cnst = false; - calc_slack_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_graph, - calc_slack_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(false); + calc_slack_graph_created = true; } if (!upd_bnd_graph_created) { - upd_bnd_graph_created = build_graph( - streams, - handle_ptr, - upd_bnd_graph, - upd_bnd_exec, - [this]() { this->calculate_bounds_update_graph(true); }, - [this]() { this->calculate_bounds_update_graph(); }); + create_bounds_update_graph(); + upd_bnd_graph_created = true; } } @@ -368,6 +353,116 @@ void load_balanced_bounds_presolve_t::calculate_activity_graph(bool er dry_run); } +template +void load_balanced_bounds_presolve_t::create_bounds_update_graph() +{ + using f_t2 = typename type_2::type; + cudaGraph_t upd_graph; + cudaGraphCreate(&upd_graph, 0); + cudaGraphNode_t bounds_changed_node; + { + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + cudaGraphAddMemcpyNode(&bounds_changed_node, upd_graph, NULL, 0, &memcpyParams); + } + + auto bounds_update_view = get_bounds_update_view(*pb); + + create_update_bounds_heavy_vars(upd_graph, + bounds_changed_node, + bounds_update_view, + make_span_2(tmp_bnd), + heavy_vars_vertex_ids, + heavy_vars_pseudo_block_ids, + heavy_vars_block_segments, + pb->vars_bin_offsets, + 
heavy_degree_cutoff, + num_blocks_heavy_vars); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, bounds_update_view, pb->vars_bin_offsets, heavy_degree_cutoff); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + bounds_update_view, + is_vars_sub_warp_single_bin, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets, + pb->vars_bin_offsets); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void load_balanced_bounds_presolve_t::create_constraint_slack_graph(bool erase_inf_cnst) +{ + using f_t2 = typename type_2::type; + cudaGraph_t cnst_slack_graph; + cudaGraphCreate(&cnst_slack_graph, 0); + + cudaGraphNode_t set_bounds_changed_node; + { + // TODO : Investigate why memset node is not captured manually + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + cudaGraphAddMemcpyNode(&set_bounds_changed_node, cnst_slack_graph, NULL, 0, &memcpyParams); + } + + auto activity_view = get_activity_view(*pb); + + create_activity_heavy_cnst(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + make_span_2(tmp_act), + heavy_cnst_vertex_ids, + heavy_cnst_pseudo_block_ids, + heavy_cnst_block_segments, + pb->cnst_bin_offsets, + heavy_degree_cutoff, + num_blocks_heavy_cnst, + erase_inf_cnst); + create_activity_per_block(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + 
pb->cnst_bin_offsets, + heavy_degree_cutoff, + erase_inf_cnst); + create_activity_sub_warp(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + is_cnst_sub_warp_single_bin, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + pb->cnst_bin_offsets, + erase_inf_cnst); + if (erase_inf_cnst) { + cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0); + } else { + cudaGraphInstantiate(&calc_slack_exec, cnst_slack_graph, NULL, NULL, 0); + } +} + template void load_balanced_bounds_presolve_t::calculate_bounds_update_graph(bool dry_run) { @@ -401,12 +496,13 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack_iter( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { // writes nans to constraint activities that are infeasible //-> less expensive checks for update bounds step raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_erase_inf_cnst_exec, handle_ptr->get_stream()); - handle_ptr->sync_stream(); } infeas_cnst_slack_set_to_nan = true; RAFT_CHECK_CUDA(handle_ptr->get_stream()); @@ -416,6 +512,8 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_exec, handle_ptr->get_stream()); @@ -428,13 +526,10 @@ template bool load_balanced_bounds_presolve_t::update_bounds_from_slack( const raft::handle_t* handle_ptr) { - i_t h_bounds_changed; - bounds_changed.set_value_to_zero_async(handle_ptr->get_stream()); - + // bounds_changed is copied to h_bounds_changed in upd_bnd_exec { raft::common::nvtx::range scope("upd_cuda_task_graph"); cudaGraphLaunch(upd_bnd_exec, handle_ptr->get_stream()); - h_bounds_changed = bounds_changed.value(handle_ptr->get_stream()); 
} RAFT_CHECK_CUDA(handle_ptr->get_stream()); constexpr i_t zero = 0; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh index 19aef04f8..42736b3a0 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh @@ -212,6 +212,8 @@ class load_balanced_bounds_presolve_t { activity_view_t get_activity_view(const load_balanced_problem_t& pb); bounds_update_view_t get_bounds_update_view(const load_balanced_problem_t& pb); + void create_bounds_update_graph(); + void create_constraint_slack_graph(bool erase_inf_cnst); rmm::cuda_stream main_stream; rmm::cuda_stream act_stream; @@ -221,6 +223,7 @@ class load_balanced_bounds_presolve_t { const load_balanced_problem_t* pb; rmm::device_scalar bounds_changed; + i_t h_bounds_changed; rmm::device_uvector cnst_slack; rmm::device_uvector vars_bnd; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh index 53a76536b..7eb2b41a9 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -108,6 +108,7 @@ i_t create_heavy_item_block_segments(rmm::cuda_stream_view stream, // Inclusive scan so that each block can determine which item it belongs to item_block_segments.set_element_to_zero_async(0, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), calc_blocks_per_vertex_iter, calc_blocks_per_vertex_iter + heavy_id_count, @@ -156,6 +157,7 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams, { if (num_blocks_heavy_cnst != 0) { auto heavy_cnst_stream = streams.get_stream(); + RAFT_CHECK_CUDA(heavy_cnst_stream); // TODO : Check heavy_cnst_block_segments size for profiling if (!dry_run) { auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); @@ -167,15 +169,18 @@ void 
calc_activity_heavy_cnst(managed_stream_pool& streams, heavy_degree_cutoff, view, tmp_cnst_act); + RAFT_CHECK_CUDA(heavy_cnst_stream); auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; if (erase_inf_cnst) { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } else { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } } } @@ -201,9 +206,11 @@ void calc_activity_per_block(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } else { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } } } @@ -260,9 +267,11 @@ void calc_activity_sub_warp(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } else { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } } } @@ -303,10 +312,12 @@ void calc_activity_sub_warp(managed_stream_pool& streams, lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } else { lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } } } @@ -358,44 +369,310 @@ void calc_activity_sub_warp(managed_stream_pool& streams, } } -/// BOUNDS UPDATE +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 32; + auto cnst_per_block = block_dim / threads_per_constraint; + auto [cnst_id_beg, cnst_id_end] = 
get_id_range(cnst_bin_offsets, degree_beg, degree_end); -template -void upd_bounds_heavy_vars(managed_stream_pool& streams, - bounds_update_view_t view, - raft::device_span tmp_vars_bnd, - const rmm::device_uvector& heavy_vars_vertex_ids, - const rmm::device_uvector& heavy_vars_pseudo_block_ids, - const rmm::device_uvector& heavy_vars_block_segments, - const std::vector& vars_bin_offsets, - i_t heavy_degree_cutoff, - i_t num_blocks_heavy_vars, - bool dry_run = false) + auto block_count = raft::ceildiv(cnst_id_end - cnst_id_beg, cnst_per_block); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + void* kernelArgs[] = {&cnst_id_beg, &cnst_id_end, &view}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) { - if (num_blocks_heavy_vars != 0) { - auto heavy_vars_stream = streams.get_stream(); - // TODO : Check heavy_vars_block_segments size for profiling - if (!dry_run) { - auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); - lb_upd_bnd_heavy_kernel - <<>>( - heavy_vars_beg_id, - make_span(heavy_vars_vertex_ids), - make_span(heavy_vars_pseudo_block_ids), - heavy_degree_cutoff, - view, - tmp_vars_bnd); - auto num_heavy_vars = vars_bin_offsets.back() - 
heavy_vars_beg_id; - finalize_upd_bnd_kernel<<>>( - heavy_vars_beg_id, make_span(heavy_vars_block_segments), tmp_vars_bnd, view); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, degree, degree, cnst_bin_offsets, erase_inf_cnst); +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(cnst_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + auto warp_cnst_offsets_span = make_span(warp_cnst_offsets); + auto warp_cnst_id_offsets_span = make_span(warp_cnst_id_offsets); + + void* kernelArgs[] = {&view, &warp_cnst_offsets_span, &warp_cnst_id_offsets_span}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + bool is_cnst_sub_warp_single_bin, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 
16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 4, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 2, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, cnst_bin_offsets, erase_inf_cnst); + } else { + if (is_cnst_sub_warp_single_bin) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 64, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 32, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, 4, cnst_bin_offsets, erase_inf_cnst); + } else { + create_activity_sub_warp(act_graph, + set_bounds_changed_node, + view, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + erase_inf_cnst); + } + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t degree_beg, + i_t degree_end, + bool erase_inf_cnst) +{ + static_assert(block_dim <= 1024, "Cannot launch kernel with more than 1024 threads"); + + auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end); + + auto block_count = cnst_id_end - cnst_id_beg; + if (block_count > 0) { + cudaGraphNode_t act_block_node; + void* kernelArgs[] = {&cnst_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + 
kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; + } + + cudaGraphAddKernelNode(&act_block_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_block_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 32, 32, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 64, 64, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 128, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 256, 256, erase_inf_cnst); + } else { + //[1024, heavy_degree_cutoff/2] -> 1024 block size + create_activity_per_block(act_graph, + set_bounds_changed_node, + view, + cnst_bin_offsets, + 1024, + heavy_degree_cutoff / 2, + erase_inf_cnst); + //[512, 512] -> 128 block size + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 512, erase_inf_cnst); + } +} + +template +void create_activity_heavy_cnst(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + raft::device_span tmp_cnst_act, + const rmm::device_uvector& heavy_cnst_vertex_ids, + const rmm::device_uvector& heavy_cnst_pseudo_block_ids, + const rmm::device_uvector& heavy_cnst_block_segments, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_cnst, + bool 
erase_inf_cnst, + bool dry_run = false) +{ + if (num_blocks_heavy_cnst != 0) { + cudaGraphNode_t act_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto heavy_cnst_vertex_ids_span = make_span(heavy_cnst_vertex_ids); + auto heavy_cnst_pseudo_block_ids_span = make_span(heavy_cnst_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_cnst_beg_id, + &heavy_cnst_vertex_ids_span, + &heavy_cnst_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_cnst_act}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_calc_act_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&act_heavy_node, act_graph, NULL, 0, &kernelNodeParams); } + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; + auto heavy_cnst_block_segments_span = make_span(heavy_cnst_block_segments); + + void* kernelArgs[] = { + &heavy_cnst_beg_id, &heavy_cnst_block_segments_span, &tmp_cnst_act, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(num_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } else { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } + + cudaGraphAddKernelNode(&finalize_heavy_node, act_graph, NULL, 0, &kernelNodeParams); + } + + cudaGraphAddDependencies(act_graph, &act_heavy_node, 
&finalize_heavy_node, 1); + cudaGraphAddDependencies(act_graph, &finalize_heavy_node, &set_bounds_changed_node, 1); } } +/// BOUNDS UPDATE + template void upd_bounds_heavy_vars(managed_stream_pool& streams, bounds_update_view_t view, raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, const rmm::device_uvector& heavy_vars_block_segments, const std::vector& vars_bin_offsets, i_t heavy_degree_cutoff, @@ -410,7 +687,8 @@ void upd_bounds_heavy_vars(managed_stream_pool& streams, lb_upd_bnd_heavy_kernel <<>>( heavy_vars_beg_id, - make_span(heavy_vars_block_segments, 1, heavy_vars_block_segments.size()), + make_span(heavy_vars_vertex_ids), + make_span(heavy_vars_pseudo_block_ids), heavy_degree_cutoff, view, tmp_vars_bnd); @@ -555,4 +833,275 @@ void upd_bounds_sub_warp(managed_stream_pool& streams, } } } + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& vars_bin_offsets) +{ + constexpr i_t block_dim = 32; + auto vars_per_block = block_dim / threads_per_variable; + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = raft::ceildiv(vars_id_end - vars_id_beg, vars_per_block); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + void* kernelArgs[] = {&vars_id_beg, &vars_id_end, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + 
cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree, + const std::vector& vars_bin_offsets) +{ + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, degree, degree, vars_bin_offsets); +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(vars_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + auto warp_vars_offsets_span = make_span(warp_vars_offsets); + auto warp_vars_id_offsets_span = make_span(warp_vars_id_offsets); + + void* kernelArgs[] = {&view, &warp_vars_offsets_span, &warp_vars_id_offsets_span}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + bool is_vars_sub_warp_single_bin, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets, + const 
std::vector& vars_bin_offsets) +{ + if (view.nnz < 10000) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 4, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 2, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, vars_bin_offsets); + } else { + if (is_vars_sub_warp_single_bin) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 64, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 32, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, 4, vars_bin_offsets); + } else { + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + view, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets); + } + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t degree_beg, + i_t degree_end) +{ + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = vars_id_end - vars_id_beg; + if (block_count > 0) { + cudaGraphNode_t upd_bnd_block_node; + + void* kernelArgs[] = {&vars_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_block_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = 
(void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_block_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_block_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff) +{ + if (view.nnz < 10000) { + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 32, 32); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 64, 64); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 128); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 256, 256); + } else { + //[1024, heavy_degree_cutoff/2] -> 128 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 1024, heavy_degree_cutoff / 2); + //[64, 512] -> 32 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 512); + } +} + +template +void create_update_bounds_heavy_vars(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, + const rmm::device_uvector& heavy_vars_block_segments, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_vars) +{ + if (num_blocks_heavy_vars != 0) { + cudaGraphNode_t upd_bnd_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto heavy_vars_vertex_ids_span = 
make_span(heavy_vars_vertex_ids); + auto heavy_vars_pseudo_block_ids_span = make_span(heavy_vars_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_vars_beg_id, + &heavy_vars_vertex_ids_span, + &heavy_vars_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_vars_bnd}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + // Add finalize + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id; + auto heavy_vars_block_segments_span = make_span(heavy_vars_block_segments); + + void* kernelArgs[] = { + &heavy_vars_beg_id, &heavy_vars_block_segments_span, &tmp_vars_bnd, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)finalize_upd_bnd_kernel; + kernelNodeParams.gridDim = dim3(num_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&finalize_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + cudaGraphAddDependencies(upd_graph, &upd_bnd_heavy_node, &finalize_heavy_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphAddDependencies(upd_graph, &finalize_heavy_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + } // namespace cuopt::linear_programming::detail diff --git 
a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh index 328fa25b9..10089664a 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh @@ -98,40 +98,6 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; } } -template -__global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, - raft::device_span item_block_segments, - i_t work_per_block, - activity_view_t view, - raft::device_span tmp_cnst_act) -{ - __shared__ i_t id_map; - __shared__ i_t pseudo_block_id; - if (threadIdx.x == 0) { - id_map = thrust::upper_bound( - thrust::seq, item_block_segments.begin(), item_block_segments.end(), blockIdx.x) - - item_block_segments.begin(); - pseudo_block_id = blockIdx.x - item_block_segments[id_map - 1]; - } - __syncthreads(); - auto idx = id_range_beg + id_map; - i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; - i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - auto act = calc_act(view, threadIdx.x, item_off_beg, item_off_end); - - act.x = BlockReduce(temp_storage).Sum(act.x); - __syncthreads(); - act.y = BlockReduce(temp_storage).Sum(act.y); - - // don't subtract constraint bounds yet - // to be done in post processing in finalize_calc_act_kernel - if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; } -} - template inline __device__ void write_cnst_slack( activity_view_t view, i_t cnst_idx, f_t2 cnst_lb_ub, f_t2 act, f_t eps) diff --git a/cpp/src/mip/solution/solution.cu b/cpp/src/mip/solution/solution.cu index 74ec4c41c..b3a7f6dbb 100644 --- a/cpp/src/mip/solution/solution.cu +++ b/cpp/src/mip/solution/solution.cu @@ -541,6 +541,11 @@ f_t solution_t::compute_max_int_violation() template 
f_t solution_t::compute_max_variable_violation() { + cuopt_assert(problem_ptr->n_variables == assignment.size(), "Size mismatch"); + cuopt_assert(problem_ptr->n_variables == problem_ptr->variable_lower_bounds.size(), + "Size mismatch"); + cuopt_assert(problem_ptr->n_variables == problem_ptr->variable_upper_bounds.size(), + "Size mismatch"); return thrust::transform_reduce( handle_ptr->get_thrust_policy(), thrust::make_counting_iterator(0), diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu index dcfcdd0b1..841770c4d 100644 --- a/cpp/src/mip/solve.cu +++ b/cpp/src/mip/solve.cu @@ -125,10 +125,12 @@ mip_solution_t run_mip(detail::problem_t& problem, running_mip); cuopt_func_call(auto saved_problem = scaled_problem); - if (settings.mip_scaling) { scaling.scale_problem(); } - if (settings.initial_solutions.size() > 0) { - for (const auto& initial_solution : settings.initial_solutions) { - scaling.scale_primal(*initial_solution); + if (settings.mip_scaling) { + scaling.scale_problem(); + if (settings.initial_solutions.size() > 0) { + for (const auto& initial_solution : settings.initial_solutions) { + scaling.scale_primal(*initial_solution); + } } } // only call preprocess on scaled problem, so we can compute feasibility on the original problem diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index a60aa77b5..0f2117991 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -23,6 +23,9 @@ #include "local_search/rounding/simple_rounding.cuh" #include "solver.cuh" +#include +#include + #include #include #include @@ -124,6 +127,27 @@ solution_t mip_solver_t::run_solver() return sol; } + // if the problem was reduced to a LP: run concurrent LP + if (context.problem_ptr->n_integer_vars == 0) { + CUOPT_LOG_INFO("Problem reduced to a LP, running concurrent LP"); + pdlp_solver_settings_t settings{}; + settings.time_limit = timer_.remaining_time(); + settings.method = method_t::Concurrent; + + auto opt_sol = solve_lp_with_method( + 
*context.problem_ptr->original_problem_ptr, *context.problem_ptr, settings); + + solution_t sol(*context.problem_ptr); + sol.copy_new_assignment(host_copy(opt_sol.get_primal_solution())); + if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal || + opt_sol.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + opt_sol.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + sol.set_problem_fully_reduced(); + } + context.problem_ptr->post_process_solution(sol); + return sol; + } + namespace dual_simplex = cuopt::linear_programming::dual_simplex; std::future branch_and_bound_status_future; dual_simplex::user_problem_t branch_and_bound_problem; diff --git a/cpp/src/routing/cuda_graph.cuh b/cpp/src/routing/cuda_graph.cuh index 563cc6f56..898bde65d 100644 --- a/cpp/src/routing/cuda_graph.cuh +++ b/cpp/src/routing/cuda_graph.cuh @@ -54,7 +54,7 @@ struct cuda_graph_t { if (graph_created) { cudaGraphExecDestroy(instance); } // Instantiate graphExec from graph. The error node and // error message parameters are unused here. 
- cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + cudaGraphInstantiate(&instance, graph); graph_created = true; } cudaGraphDestroy(graph); diff --git a/cpp/src/utilities/macros.cuh b/cpp/src/utilities/macros.cuh index 1d9b50bf5..0d6e69fb5 100644 --- a/cpp/src/utilities/macros.cuh +++ b/cpp/src/utilities/macros.cuh @@ -23,7 +23,6 @@ // 2) medium // 3) heavy #ifdef ASSERT_MODE -#undef NDEBUG #include #define cuopt_assert(val, msg) assert(val&& msg) #define cuopt_func_call(func) func; diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp index e675a3d5d..bfd100946 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp +++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp @@ -57,7 +57,7 @@ TEST_P(TimeLimitTestFixture, time_limit) method), CUOPT_SUCCESS); EXPECT_EQ(termination_status, CUOPT_TERIMINATION_STATUS_TIME_LIMIT); - EXPECT_NEAR(solve_time, target_solve_time, 0.1); + EXPECT_NEAR(solve_time, target_solve_time, 1.0); } INSTANTIATE_TEST_SUITE_P( c_api, diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt index b9fd249a5..020c537f6 100644 --- a/cpp/tests/mip/CMakeLists.txt +++ b/cpp/tests/mip/CMakeLists.txt @@ -27,6 +27,9 @@ ConfigureTest(ELIM_VAR_REMAP_TEST ConfigureTest(STANDARDIZATION_TEST ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu ) +ConfigureTest(LB_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/load_balancing_test.cu +) ConfigureTest(MULTI_PROBE_TEST ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu ) diff --git a/cpp/tests/mip/bounds_standardization_test.cu b/cpp/tests/mip/bounds_standardization_test.cu index 07e386bbd..77b4acfd7 100644 --- a/cpp/tests/mip/bounds_standardization_test.cu +++ b/cpp/tests/mip/bounds_standardization_test.cu @@ -71,6 +71,7 @@ void test_bounds_standardization_test(std::string test_instance) init_handler(op_problem.get_handle_ptr()); // run the problem constructor of MIP, so that we do bounds 
standardization detail::problem_t standardized_problem(op_problem); + detail::problem_t original_problem(op_problem); standardized_problem.preprocess_problem(); detail::trivial_presolve(standardized_problem); detail::solution_t solution_1(standardized_problem); @@ -88,6 +89,7 @@ void test_bounds_standardization_test(std::string test_instance) // only consider the pdlp results EXPECT_TRUE(sol_1_feasible); standardized_problem.post_process_solution(solution_1); + solution_1.problem_ptr = &original_problem; auto optimization_prob_solution = solution_1.get_solution(sol_1_feasible, solver_stats_t{}); test_objective_sanity(problem, diff --git a/cpp/tests/mip/elim_var_remap_test.cu b/cpp/tests/mip/elim_var_remap_test.cu index a1af26a33..aeb48fe5d 100644 --- a/cpp/tests/mip/elim_var_remap_test.cu +++ b/cpp/tests/mip/elim_var_remap_test.cu @@ -154,6 +154,7 @@ void test_elim_var_solution(std::string test_instance) init_handler(op_problem.get_handle_ptr()); // run the problem constructor of MIP, so that we do bounds standardization detail::problem_t standardized_problem(op_problem); + detail::problem_t original_problem(op_problem); standardized_problem.preprocess_problem(); trivial_presolve(standardized_problem); detail::problem_t sub_problem(standardized_problem); @@ -171,7 +172,8 @@ void test_elim_var_solution(std::string test_instance) bool sol_1_feasible = (int)result_1.get_termination_status() == CUOPT_TERIMINATION_STATUS_OPTIMAL; EXPECT_EQ((int)result_1.get_termination_status(), CUOPT_TERIMINATION_STATUS_OPTIMAL); standardized_problem.post_process_solution(solution_1); - auto opt_sol_1 = solution_1.get_solution(sol_1_feasible, solver_stats_t{}); + solution_1.problem_ptr = &original_problem; + auto opt_sol_1 = solution_1.get_solution(sol_1_feasible, solver_stats_t{}); test_objective_sanity( mps_problem, opt_sol_1.get_solution(), opt_sol_1.get_objective_value(), 1e-3); test_constraint_sanity_per_row( @@ -198,7 +200,8 @@ void test_elim_var_solution(std::string 
test_instance) bool sol_2_feasible = (int)result_2.get_termination_status() == CUOPT_TERIMINATION_STATUS_OPTIMAL; EXPECT_EQ((int)result_2.get_termination_status(), CUOPT_TERIMINATION_STATUS_OPTIMAL); sub_problem.post_process_solution(solution_2); - auto opt_sol_2 = solution_2.get_solution(sol_2_feasible, solver_stats_t{}); + solution_2.problem_ptr = &original_problem; + auto opt_sol_2 = solution_2.get_solution(sol_2_feasible, solver_stats_t{}); test_objective_sanity( mps_problem, opt_sol_2.get_solution(), opt_sol_2.get_objective_value(), 1e-3); test_constraint_sanity_per_row( diff --git a/cpp/tests/mip/empty_fixed_problems_test.cu b/cpp/tests/mip/empty_fixed_problems_test.cu index 06ad24df2..30d1ecf1d 100644 --- a/cpp/tests/mip/empty_fixed_problems_test.cu +++ b/cpp/tests/mip/empty_fixed_problems_test.cu @@ -78,4 +78,11 @@ TEST(mip_solve, empty_max_problem_with_objective_test) EXPECT_NEAR(obj_val, 11, 1e-5); } +TEST(mip_solve, mip_presolved_to_lp) +{ + auto [termination_status, obj_val, lb] = test_mps_file("mip/mip-presolved-to-lp.mps", 5, false); + EXPECT_EQ(termination_status, mip_termination_status_t::Optimal); + EXPECT_NEAR(obj_val, 0, 1e-5); +} + } // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/load_balancing_test.cu b/cpp/tests/mip/load_balancing_test.cu new file mode 100644 index 000000000..deed9ea85 --- /dev/null +++ b/cpp/tests/mip/load_balancing_test.cu @@ -0,0 +1,188 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +inline auto make_async() { return std::make_shared(); } + +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +std::tuple, std::vector, std::vector> select_k_random( + detail::problem_t& problem, int sample_size) +{ + auto seed = std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + problem.compute_n_integer_vars(); + auto v_lb = host_copy(problem.variable_lower_bounds); + auto v_ub = host_copy(problem.variable_upper_bounds); + auto int_var_id = host_copy(problem.integer_indices); + int_var_id.erase(std::remove_if(int_var_id.begin(), + int_var_id.end(), + [v_lb, v_ub](auto id) { + return !(std::isfinite(v_lb[id]) && std::isfinite(v_ub[id])); + }), + int_var_id.end()); + sample_size = std::min(sample_size, static_cast(int_var_id.size())); + std::vector random_int_vars; + std::mt19937 m{seed}; + 
std::sample( + int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m); + std::vector probe_0(sample_size); + std::vector probe_1(sample_size); + for (int i = 0; i < static_cast(random_int_vars.size()); ++i) { + if (i % 2) { + probe_0[i] = v_lb[random_int_vars[i]]; + probe_1[i] = v_ub[random_int_vars[i]]; + } else { + probe_1[i] = v_lb[random_int_vars[i]]; + probe_0[i] = v_ub[random_int_vars[i]]; + } + } + return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1)); +} + +std::pair>, std::vector>> +convert_probe_tuple(std::tuple, std::vector, std::vector>& probe) +{ + std::vector> probe_first; + std::vector> probe_second; + for (size_t i = 0; i < std::get<0>(probe).size(); ++i) { + probe_first.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<1>(probe)[i])); + probe_second.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<2>(probe)[i])); + } + return std::make_pair(std::move(probe_first), std::move(probe_second)); +} + +std::tuple, std::vector, std::vector, std::vector> +bounds_probe_results(detail::bound_presolve_t& bnd_prb_0, + detail::bound_presolve_t& bnd_prb_1, + detail::problem_t& problem, + const std::pair>, + std::vector>>& probe) +{ + auto& probe_first = std::get<0>(probe); + auto& probe_second = std::get<1>(probe); + rmm::device_uvector b_lb_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_lb_1(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_1(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb_0.solve(problem, probe_first); + bnd_prb_0.set_updated_bounds(problem.handle_ptr, make_span(b_lb_0), make_span(b_ub_0)); + bnd_prb_1.solve(problem, probe_second); + bnd_prb_1.set_updated_bounds(problem.handle_ptr, make_span(b_lb_1), make_span(b_ub_1)); + + auto h_lb_0 = host_copy(b_lb_0); + auto h_ub_0 = 
host_copy(b_ub_0); + auto h_lb_1 = host_copy(b_lb_1); + auto h_ub_1 = host_copy(b_ub_1); + return std::make_tuple( + std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1)); +} + +void test_multi_probe(std::string path) +{ + auto memory_resource = make_async(); + rmm::mr::set_current_device_resource(memory_resource.get()); + const raft::handle_t handle_{}; + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + detail::problem_t problem(op_problem); + mip_solver_settings_t default_settings{}; + detail::pdhg_solver_t pdhg_solver(problem.handle_ptr, problem); + detail::pdlp_initial_scaling_strategy_t scaling(&handle_, + problem, + 10, + 1.0, + pdhg_solver, + problem.reverse_coefficients, + problem.reverse_offsets, + problem.reverse_constraints, + true); + detail::mip_solver_t solver(problem, default_settings, scaling, cuopt::timer_t(0)); + detail::load_balanced_problem_t lb_problem(problem); + detail::load_balanced_bounds_presolve_t lb_prs(lb_problem, solver.context); + + detail::bound_presolve_t bnd_prb(solver.context); + + auto probe_tuple = select_k_random(problem, 100); + auto bounds_probe_vals = convert_probe_tuple(probe_tuple); + { + auto& probe_first = std::get<0>(bounds_probe_vals); + bnd_prb.solve(problem, probe_first); + rmm::device_uvector b_lb(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb.set_updated_bounds(problem.handle_ptr, make_span(b_lb), make_span(b_ub)); + + auto h_lb = host_copy(b_lb); + auto h_ub = host_copy(b_ub); + + lb_prs.solve(probe_first); + + auto bnds = host_copy(lb_prs.vars_bnd); + for (int i = 0; i < (int)h_lb.size(); ++i) { + EXPECT_DOUBLE_EQ(bnds[2 * i], h_lb[i]); + EXPECT_DOUBLE_EQ(bnds[2 * i + 1], 
h_ub[i]); + } + } +} + +TEST(presolve, multi_probe) +{ + std::vector test_instances = { + "mip/50v-10-free-bound.mps", "mip/neos5-free-bound.mps", "mip/neos5.mps"}; + for (const auto& test_instance : test_instances) { + auto path = make_path_absolute(test_instance); + test_multi_probe(path); + } +} + +} // namespace cuopt::linear_programming::test diff --git a/datasets/cuopt_service_data/lpmip_configs.json b/datasets/cuopt_service_data/lpmip_configs.json new file mode 100644 index 000000000..cb920de50 --- /dev/null +++ b/datasets/cuopt_service_data/lpmip_configs.json @@ -0,0 +1,103 @@ +{ + "csr_constraint_matrix": { + "offsets": [ + 0, + 2, + 4 + ], + "indices": [ + 0, + 1, + 0, + 1 + ], + "values": [ + 3.0, + 4.0, + 2.7, + 10.1 + ] + }, + "constraint_bounds": { + "bounds": [ + 5.4, + 4.9 + ], + "upper_bounds": [ + 5.4, + 4.9 + ], + "lower_bounds": [ + "ninf", + "ninf" + ] + }, + "objective_data": { + "coefficients": [ + 0.2, + 0.1 + ], + "scalability_factor": 1.0, + "offset": 0.0 + }, + "variable_bounds": { + "upper_bounds": [ + "inf", + "inf" + ], + "lower_bounds": [ + 0.0, + 0.0 + ] + }, + "maximize": false, + "solver_config": { + "tolerances": { + "optimality": 0.0001, + "absolute_primal_tolerance": 0.0001, + "absolute_dual_tolerance": 0.0001, + "absolute_gap_tolerance": 0.0001, + "relative_primal_tolerance": 0.0001, + "relative_dual_tolerance": 0.0001, + "relative_gap_tolerance": 0.0001, + "primal_infeasible_tolerance": 0.0001, + "dual_infeasible_tolerance": 0.0001, + "mip_integrality_tolerance": 0.0001, + "mip_absolute_gap": 0.0001, + "mip_relative_gap": 0.0001, + "mip_absolute_tolerance": 0.0001, + "mip_relative_tolerance": 0.0001, + + "absolute_primal": 0.0001, + "absolute_dual": 0.0001, + "absolute_gap": 0.0001, + "relative_primal": 0.0001, + "relative_dual": 0.0001, + "relative_gap": 0.0001, + "primal_infeasible": 0.0001, + "dual_infeasible": 0.0001, + "integrality_tolerance": 0.0001, + "absolute_mip_gap": 0.0001, + "relative_mip_gap": 0.0001 + }, + 
"infeasibility_detection": true, + "time_limit": 5, + "iteration_limit": 100, + "pdlp_solver_mode": 2, + "method": 2, + "mip_scaling": true, + "mip_heuristics_only": true, + "num_cpu_threads": 100, + "crossover": true, + "log_to_console": false, + "strict_infeasibility": false, + "user_problem_file": "bob", + "per_constraint_residual": true, + "save_best_primal_so_far": true, + "first_primal_feasible": true, + "log_file": "bill", + "solution_file": "barry", + "solver_mode": 3, + "heuristics_only": false + } +} diff --git a/datasets/mip/mip-presolved-to-lp.mps b/datasets/mip/mip-presolved-to-lp.mps new file mode 100644 index 000000000..755bcc328 --- /dev/null +++ b/datasets/mip/mip-presolved-to-lp.mps @@ -0,0 +1,32 @@ +NAME LP_PROBLEM +ROWS + N OBJ + E R001 + E R002 + E R003 + L R004 + L R005 + L R006 + L R007 +COLUMNS + X001 OBJ 1.000000 + X001 R004 -1.000000 + X001 R006 -1.000000 + X002 OBJ 1.000000 + X002 R005 -1.000000 + X002 R007 -1.000000 + X003 R001 1.000000 + X003 R004 1.000000 + X003 R006 -1.000000 + X004 R002 1.000000 + X004 R005 1.000000 + X004 R007 -1.000000 + X005 R001 -1.000000 + X005 R002 -1.000000 + X005 R003 1.000000 +RHS +BOUNDS + LO BND1 X005 0.000000 + UP BND1 X005 1.000000 + BV BND1 X005 +ENDATA diff --git a/dependencies.yaml b/dependencies.yaml index 3aa6c9460..b382a9a0e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -537,7 +537,6 @@ dependencies: - output_types: conda packages: - cupy>=12.0.0 - depends_on_rapids_logger: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cuopt/source/cuopt-python/index.rst b/docs/cuopt/source/cuopt-python/index.rst index d54d1f835..7a412804a 100644 --- a/docs/cuopt/source/cuopt-python/index.rst +++ b/docs/cuopt/source/cuopt-python/index.rst @@ -21,4 +21,13 @@ This section contains details on the cuOpt Python package. :name: Routing Optimization :titlesonly: - Routing Optimization \ No newline at end of file + Routing Optimization + + +.. 
toctree:: + :maxdepth: 3 + :caption: Linear Programming and Mixed Integer Linear Programming + :name: LP and MILP API + :titlesonly: + + Linear Programming and Mixed Integer Linear Programming \ No newline at end of file diff --git a/docs/cuopt/source/cuopt-python/lp-milp/index.rst b/docs/cuopt/source/cuopt-python/lp-milp/index.rst new file mode 100644 index 000000000..0d60ccc41 --- /dev/null +++ b/docs/cuopt/source/cuopt-python/lp-milp/index.rst @@ -0,0 +1,14 @@ +======================================================= +Linear Programming and Mixed Integer Linear Programming +======================================================= + +This section contains details on the cuOpt linear programming and mixed integer linear programming Python API. + +.. toctree:: + :maxdepth: 3 + :caption: LP and MILP + :name: LP and MILP + :titlesonly: + + lp-milp-api.rst + lp-milp-examples.rst \ No newline at end of file diff --git a/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-api.rst b/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-api.rst new file mode 100644 index 000000000..ea6b0ff79 --- /dev/null +++ b/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-api.rst @@ -0,0 +1,44 @@ +========================= +LP and MILP API Reference +========================= + +.. autoclass:: cuopt.linear_programming.problem.VType + :members: + :member-order: bysource + :undoc-members: + :exclude-members: capitalize, casefold, center, count, encode, endswith, expandtabs, find, format, format_map, index, isalnum, isalpha, isascii, isdecimal, isdigit, isidentifier, islower, isnumeric, isprintable, isspace, istitle, isupper, join, ljust, lower, lstrip, maketrans, partition, removeprefix, removesuffix, replace, rfind, rindex, rjust, rpartition, rsplit, rstrip, split, splitlines, startswith, strip, swapcase, title, translate, upper, zfill + +.. 
autoclass:: cuopt.linear_programming.problem.CType + :members: + :member-order: bysource + :undoc-members: + :exclude-members: capitalize, casefold, center, count, encode, endswith, expandtabs, find, format, format_map, index, isalnum, isalpha, isascii, isdecimal, isdigit, isidentifier, islower, isnumeric, isprintable, isspace, istitle, isupper, join, ljust, lower, lstrip, maketrans, partition, removeprefix, removesuffix, replace, rfind, rindex, rjust, rpartition, rsplit, rstrip, split, splitlines, startswith, strip, swapcase, title, translate, upper, zfill + +.. autoclass:: cuopt.linear_programming.problem.sense + :members: + :member-order: bysource + :exclude-members: __new__, __init__, _generate_next_value_, as_integer_ratio, bit_count, bit_length, conjugate, denominator, from_bytes, imag, is_integer, numerator, real, to_bytes + :no-inherited-members: + +.. autoclass:: cuopt.linear_programming.problem.Problem + :members: + :undoc-members: + :show-inheritance: + :exclude-members: reset_solved_values, post_solve, dict_to_object, NumNZs, NumVariables, NumConstraints, IsMIP + +.. autoclass:: cuopt.linear_programming.problem.Variable + :members: + :undoc-members: + :show-inheritance: + :exclude-members: + +.. autoclass:: cuopt.linear_programming.problem.LinearExpression + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: cuopt.linear_programming.problem.Constraint + :members: + :undoc-members: + :show-inheritance: + :exclude-members: compute_slack diff --git a/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-examples.rst b/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-examples.rst new file mode 100644 index 000000000..98ef2d75d --- /dev/null +++ b/docs/cuopt/source/cuopt-python/lp-milp/lp-milp-examples.rst @@ -0,0 +1,313 @@ +==================== +LP and MILP Examples +==================== + +This section contains examples of how to use the cuOpt linear programming and mixed integer linear programming Python API. + +.. 
note:: + + The examples in this section are not exhaustive. They are provided to help you get started with the cuOpt linear programming and mixed integer linear programming Python API. For more examples, please refer to the `cuopt-examples GitHub repository `_. + + +Simple Linear Programming Example +--------------------------------- + +.. code-block:: python + + from cuopt.linear_programming.problem import Problem, CONTINUOUS, MAXIMIZE + from cuopt.linear_programming.solver_settings import SolverSettings + + # Create a new problem + problem = Problem("Simple LP") + + # Add variables + x = problem.addVariable(lb=0, vtype=CONTINUOUS, name="x") + y = problem.addVariable(lb=0, vtype=CONTINUOUS, name="y") + + # Add constraints + problem.addConstraint(x + y <= 10, name="c1") + problem.addConstraint(x - y >= 0, name="c2") + + # Set objective function + problem.setObjective(x + y, sense=MAXIMIZE) + + # Configure solver settings + settings = SolverSettings() + settings.set_parameter("time_limit", 60) + + # Solve the problem + problem.solve(settings) + + # Check solution status + if problem.Status.name == "Optimal": + print(f"Optimal solution found in {problem.SolveTime:.2f} seconds") + print(f"x = {x.getValue()}") + print(f"y = {y.getValue()}") + print(f"Objective value = {problem.ObjValue}") + +The response is as follows: + +.. code-block:: text + + Optimal solution found in 0.01 seconds + x = 10.0 + y = 0.0 + Objective value = 10.0 + +Mixed Integer Linear Programming Example +---------------------------------------- + +.. 
code-block:: python + + from cuopt.linear_programming.problem import Problem, INTEGER, MAXIMIZE + from cuopt.linear_programming.solver_settings import SolverSettings + + # Create a new MIP problem + problem = Problem("Simple MIP") + + # Add integer variables with bounds + x = problem.addVariable(vtype=INTEGER, name="V_x") + y = problem.addVariable(lb=10, ub=50, vtype=INTEGER, name="V_y") + + # Add constraints + problem.addConstraint(2 * x + 4 * y >= 230, name="C1") + problem.addConstraint(3 * x + 2 * y <= 190, name="C2") + + # Set objective function + problem.setObjective(5 * x + 3 * y, sense=MAXIMIZE) + + # Configure solver settings + settings = SolverSettings() + settings.set_parameter("time_limit", 60) + + # Solve the problem + problem.solve(settings) + + # Check solution status and results + if problem.Status.name == "Optimal": + print(f"Optimal solution found in {problem.SolveTime:.2f} seconds") + print(f"x = {x.getValue()}") + print(f"y = {y.getValue()}") + print(f"Objective value = {problem.ObjValue}") + else: + print(f"Problem status: {problem.Status.name}") + +The response is as follows: + +.. code-block:: text + + Optimal solution found in 0.00 seconds + x = 36.0 + y = 40.99999999999999 + Objective value = 303.0 + + +Advanced Example: Production Planning +------------------------------------- + +.. 
code-block:: python + + from cuopt.linear_programming.problem import Problem, INTEGER, MAXIMIZE + from cuopt.linear_programming.solver_settings import SolverSettings + + # Production planning problem + problem = Problem("Production Planning") + + # Decision variables: production quantities + # x1 = units of product A + # x2 = units of product B + x1 = problem.addVariable(lb=10, vtype=INTEGER, name="Product_A") + x2 = problem.addVariable(lb=15, vtype=INTEGER, name="Product_B") + + # Resource constraints + # Machine time: 2 hours per unit of A, 1 hour per unit of B, max 100 hours + problem.addConstraint(2 * x1 + x2 <= 100, name="Machine_Time") + + # Labor: 1 hour per unit of A, 3 hours per unit of B, max 120 hours + problem.addConstraint(x1 + 3 * x2 <= 120, name="Labor_Hours") + + # Material: 4 units per unit of A, 2 units per unit of B, max 200 units + problem.addConstraint(4 * x1 + 2 * x2 <= 200, name="Material") + + # Objective: maximize profit + # Profit: $50 per unit of A, $30 per unit of B + problem.setObjective(50 * x1 + 30 * x2, sense=MAXIMIZE) + + # Solve with time limit + settings = SolverSettings() + settings.set_parameter("time_limit", 30) + problem.solve(settings) + + # Display results + if problem.Status.name == "Optimal": + print("=== Production Planning Solution ===") + print(f"Status: {problem.Status.name}") + print(f"Solve time: {problem.SolveTime:.2f} seconds") + print(f"Product A production: {x1.getValue()} units") + print(f"Product B production: {x2.getValue()} units") + print(f"Total profit: ${problem.ObjValue:.2f}") + + else: + print(f"Problem not solved optimally. Status: {problem.Status.name}") + +The response is as follows: + +.. code-block:: text + + === Production Planning Solution === + + Status: Optimal + Solve time: 0.09 seconds + Product A production: 36.0 units + Product B production: 28.000000000000004 units + Total profit: $2640.00 + +Working with Expressions and Constraints +---------------------------------------- + +.. 
code-block:: python + + from cuopt.linear_programming.problem import Problem, MAXIMIZE + from cuopt.linear_programming.solver_settings import SolverSettings + + problem = Problem("Expression Example") + + # Create variables + x = problem.addVariable(lb=0, name="x") + y = problem.addVariable(lb=0, name="y") + z = problem.addVariable(lb=0, name="z") + + # Create complex expressions + expr1 = 2 * x + 3 * y - z + expr2 = x + y + z + + # Add constraints using expressions + problem.addConstraint(expr1 <= 100, name="Complex_Constraint_1") + problem.addConstraint(expr2 >= 20, name="Complex_Constraint_2") + + # Add constraint with different senses + problem.addConstraint(x + y == 50, name="Equality_Constraint") + problem.addConstraint(1 * x <= 30, name="Upper_Bound_X") + problem.addConstraint(1 * y >= 10, name="Lower_Bound_Y") + problem.addConstraint(1 * z <= 100, name="Upper_Bound_Z") + + # Set objective + problem.setObjective(x + 2 * y + 3 * z, sense=MAXIMIZE) + + settings = SolverSettings() + settings.set_parameter("time_limit", 20) + + problem.solve(settings) + + + if problem.Status.name == "Optimal": + print("=== Expression Example Results ===") + print(f"x = {x.getValue()}") + print(f"y = {y.getValue()}") + print(f"z = {z.getValue()}") + print(f"Objective value = {problem.ObjValue}") + +The response is as follows: + +.. code-block:: text + + === Expression Example Results === + x = 0.0 + y = 50.0 + z = 99.99999999999999 + Objective value = 399.99999999999994 + +Working with Incumbent Solutions +-------------------------------- + +Incumbent solutions are intermediate feasible solutions found during the MIP solving process. They represent the best integer-feasible solution discovered so far and can be accessed through callback functions. + +.. note:: + Incumbent solutions are only available for Mixed Integer Programming (MIP) problems, not for pure Linear Programming (LP) problems. + +.. 
code-block:: python + + from cuopt.linear_programming.problem import Problem, INTEGER, MAXIMIZE + from cuopt.linear_programming.solver_settings import SolverSettings + from cuopt.linear_programming.solver.solver_parameters import CUOPT_TIME_LIMIT + from cuopt.linear_programming.internals import GetSolutionCallback, SetSolutionCallback + + # Create a callback class to receive incumbent solutions + class IncumbentCallback(GetSolutionCallback): + def __init__(self): + super().__init__() + self.solutions = [] + self.n_callbacks = 0 + + def get_solution(self, solution, solution_cost): + """ + Called whenever the solver finds a new incumbent solution. + + Parameters + ---------- + solution : array-like + The variable values of the incumbent solution + solution_cost : array-like + The objective value of the incumbent solution + """ + self.n_callbacks += 1 + + # Store the incumbent solution + incumbent = { + "solution": solution.copy_to_host(), + "cost": solution_cost.copy_to_host()[0], + "iteration": self.n_callbacks + } + self.solutions.append(incumbent) + + print(f"Incumbent {self.n_callbacks}: {incumbent['solution']}, cost: {incumbent['cost']:.2f}") + + # Create a more complex MIP problem that will generate multiple incumbents + problem = Problem("Incumbent Example") + + # Add integer variables + x = problem.addVariable(vtype=INTEGER) + y = problem.addVariable(vtype=INTEGER) + + # Add constraints to create a problem that will generate multiple incumbents + problem.addConstraint(2 * x + 4 * y >= 230) + problem.addConstraint(3 * x + 2 * y <= 190) + + # Set objective to maximize + problem.setObjective(5 * x + 3 * y, sense=MAXIMIZE) + + # Configure solver settings with callback + settings = SolverSettings() + # Set the incumbent callback + incumbent_callback = IncumbentCallback() + settings.set_mip_callback(incumbent_callback) + settings.set_parameter(CUOPT_TIME_LIMIT, 30) # Allow enough time to find multiple incumbents + + # Solve the problem + problem.solve(settings) + + 
# Display final results + print(f"\n=== Final Results ===") + print(f"Problem status: {problem.Status.name}") + print(f"Solve time: {problem.SolveTime:.2f} seconds") + print(f"Final solution: x={x.getValue()}, y={y.getValue()}") + print(f"Final objective value: {problem.ObjValue:.2f}") + +The response is as follows: + +.. code-block:: text + + Optimal solution found. + Incumbent 1: [ 0. 58.], cost: 174.00 + Incumbent 2: [36. 41.], cost: 303.00 + Generated fast solution in 0.158467 seconds with objective 303.000000 + Consuming B&B solutions, solution queue size 2 + Solution objective: 303.000000 , relative_mip_gap 0.000000 solution_bound 303.000000 presolve_time 0.043211 total_solve_time 0.160270 max constraint violation 0.000000 max int violation 0.000000 max var bounds violation 0.000000 nodes 4 simplex_iterations 3 + + === Final Results === + Problem status: Optimal + Solve time: 0.16 seconds + Final solution: x=36.0, y=40.99999999999999 + Final objective value: 303.00 + + diff --git a/docs/cuopt/source/introduction.rst b/docs/cuopt/source/introduction.rst index 671446727..aaf164198 100644 --- a/docs/cuopt/source/introduction.rst +++ b/docs/cuopt/source/introduction.rst @@ -112,8 +112,7 @@ cuOpt supports the following APIs: - cuOpt is written in C++ and includes a native C++ API. However, we do not provide documentation for the C++ API at this time. We anticipate that the C++ API will change significantly in the future. Use it at your own risk. - Python support - :doc:`Routing (TSP, VRP, and PDP) - Python ` - - Linear Programming (LP) and Mixed Integer Linear Programming (MILP) - - cuOpt includes a Python API that is used as the backend of the cuOpt server. However, we do not provide documentation for the Python API at this time. We suggest using cuOpt server to access cuOpt via Python. We anticipate that the Python API will change significantly in the future. Use it at your own risk. 
+ - :doc:`Linear Programming (LP) and Mixed Integer Linear Programming (MILP) - Python ` - Server support - :doc:`Linear Programming (LP) - Server ` - :doc:`Mixed Integer Linear Programming (MILP) - Server ` diff --git a/docs/cuopt/source/lp-features.rst b/docs/cuopt/source/lp-features.rst index f3861ffac..b89ace5d3 100644 --- a/docs/cuopt/source/lp-features.rst +++ b/docs/cuopt/source/lp-features.rst @@ -16,6 +16,8 @@ The LP solver can be accessed in the following ways: - **C API**: A native C API that provides direct low-level access to cuOpt's LP capabilities, enabling integration into any application or system that can interface with C. +- **Python SDK**: A Python package that provides direct access to cuOpt's LP capabilities through a simple, intuitive API. This allows for seamless integration into Python applications and workflows. For more information, see :doc:`cuopt-python/quick-start`. + - **As a Self-Hosted Service**: cuOpt's LP solver can be deployed as a in your own infrastructure, enabling you to maintain full control while integrating it into your existing systems. Each option provide the same powerful linear optimization capabilities while offering flexibility in deployment and integration. @@ -76,9 +78,17 @@ Crossover Crossover allows you to obtain a high-quality basic solution from the results of a PDLP solve. More details can be found :ref:`here `. -Logging Callback ----------------- -With logging callback, users can fetch server-side logs for additional debugs and to get details on solver process details. :ref:`Examples ` are shared on the self-hosted page. +Logging +------- + +The CUOPT_LOG_FILE parameter can be set to write detailed solver logs for LP problems. This parameter is available in all APIs that allow setting solver parameters except the cuOpt service. For the service, see the logging callback below. 
+ +Logging Callback in the Service +------------------------------- + +In the cuOpt service API, the ``log_file`` value in ``solver_configs`` is ignored. + +If however you set the ``solver_logs`` flag on the ``/cuopt/request`` REST API call, users can fetch the log file content from the webserver at ``/cuopt/logs/{id}``. Using the logging callback feature through the cuOpt client is shown in :ref:`Examples ` on the self-hosted page. Infeasibility Detection diff --git a/docs/cuopt/source/lp-milp-settings.rst b/docs/cuopt/source/lp-milp-settings.rst index 28e5105d0..8e15f36c7 100644 --- a/docs/cuopt/source/lp-milp-settings.rst +++ b/docs/cuopt/source/lp-milp-settings.rst @@ -39,19 +39,19 @@ Log File ^^^^^^^^ ``CUOPT_LOG_FILE`` controls the name of a log file where cuOpt should write information about the solve. -Note: the default value is ``""`` and no log file is written. +Note: the default value is ``""`` and no log file is written. This setting is ignored by the cuOpt service, use the log callback feature instead. Solution File ^^^^^^^^^^^^^ -``CUOPT_SOL_FILE`` controls the name of a file where cuOpt should write the solution. +``CUOPT_SOLUTION_FILE`` controls the name of a file where cuOpt should write the solution. -Note: the default value is ``""`` and no solution file is written. +Note: the default value is ``""`` and no solution file is written. This setting is ignored by the cuOpt service. User Problem File ^^^^^^^^^^^^^^^^^ ``CUOPT_USER_PROBLEM_FILE`` controls the name of a file where cuOpt should write the user problem. -Note: the default value is ``""`` and no user problem file is written. +Note: the default value is ``""`` and no user problem file is written. This setting is ignored by the cuOpt service. Num CPU Threads ^^^^^^^^^^^^^^^ @@ -257,7 +257,7 @@ We now describe parameter settings for the MILP solvers Heuristics only ^^^^^^^^^^^^^^^ -``CUOPT_HEURISTICS_ONLY`` controls if only the GPU heuristics should be run. 
When set to true, only the primal +``CUOPT_MIP_HEURISTICS_ONLY`` controls if only the GPU heuristics should be run for the MIP problem. When set to true, only the primal bound is improved via the GPU. When set to false, both the GPU and CPU are used and the dual bound is improved on the CPU. @@ -275,14 +275,14 @@ Note: the defaulte value is true. Absolute Tolerance ^^^^^^^^^^^^^^^^^^ -``CUOPT_ABSOLUTE_TOLERANCE`` controls the MIP absolute tolerance. +``CUOPT_MIP_ABSOLUTE_TOLERANCE`` controls the MIP absolute tolerance. Note: the default value is ``1e-4``. Relative Tolerance ^^^^^^^^^^^^^^^^^^ -``CUOPT_RELATIVE_TOLERANCE`` controls the MIP relative tolerance. +``CUOPT_MIP_RELATIVE_TOLERANCE`` controls the MIP relative tolerance. Note: the default value is ``1e-6``. diff --git a/docs/cuopt/source/milp-features.rst b/docs/cuopt/source/milp-features.rst index 814207a1c..40eba5c40 100644 --- a/docs/cuopt/source/milp-features.rst +++ b/docs/cuopt/source/milp-features.rst @@ -16,6 +16,8 @@ The MILP solver can be accessed in the following ways: - **C API**: A native C API that provides direct low-level access to cuOpt's MILP solver, enabling integration into any application or system that can interface with C. +- **Python SDK**: A Python package that provides direct access to cuOpt's MILP capabilities through a simple, intuitive API. This allows for seamless integration into Python applications and workflows. For more information, see :doc:`cuopt-python/quick-start`. + - **As a Self-Hosted Service**: cuOpt's MILP solver can be deployed in your own infrastructure, enabling you to maintain full control while integrating it into your existing systems. Each option provide the same powerful mixed-integer linear optimization capabilities while offering flexibility in deployment and integration. @@ -50,15 +52,23 @@ There are two ways to specify constraints in cuOpt MILP: Both forms are mathematically equivalent. 
The choice between them is a matter of convenience depending on your problem formulation. -Incumbent Solution Callback ---------------------------- +Incumbent Solution Callback in the Service +------------------------------------------ + +When using the service, users can provide a callback to receive new integer feasible solutions that improve the objective (called incumbents) while the solver is running. An :ref:`Incumbent Example ` is shared on the self-hosted page. + +Logging +------- + +The CUOPT_LOG_FILE parameter can be set to write detailed solver logs for MILP problems. This parameter is available in all APIs that allow setting solver parameters except for the cuOpt service. For the service, see the logging callback below. + +Logging Callback in the Service +------------------------------- -User can provide a callback to receive new integer feasible solutions that improve the objective (called incumbents) while the solver is running. An :ref:`Incumbent Example ` is shared on the self-hosted page. +In the cuOpt service API, the ``log_file`` value in ``solver_configs`` is ignored. -Logging Callback ----------------- +If however you set the ``solver_logs`` flag on the ``/cuopt/request`` REST API call, users can fetch the log file content from the webserver at ``/cuopt/logs/{id}``. Using the logging callback feature through the cuOpt client is shown in :ref:`Logging Callback Example ` on the self-hosted page. -A logging callback allows users to get additional information about how the solve is progressing. A :ref:`Logging Callback Example ` is shared on the self-hosted page. 
Time Limit -------------- diff --git a/docs/cuopt/source/transition.rst b/docs/cuopt/source/transition.rst index 97d48f7b7..dd3d47bbf 100644 --- a/docs/cuopt/source/transition.rst +++ b/docs/cuopt/source/transition.rst @@ -1,6 +1,6 @@ -======================================== +======================================= Transition Guide for Change in Features -======================================== +======================================= In addition to the quality improvements, some new features were added, and some features were deprecated to improve user experience. For any questions, please reach out to the cuOpt team through github issues. @@ -10,8 +10,72 @@ Parameter/option statuses are listed below, they express how each of these optio **Update** - A change in definition of feature. - **Deprecated** - These are “no operation” options, they will be accepted by the server, but they will not be used anywhere. And the solver will also return a warning about them being deprecated. + **Deprecated** - These options will be accepted but will be removed in the future. In the case of the cuOpt service, the server will also return a warning noting that a feature is deprecated. **Limited** - These options are limited with respect to the number of dimensions that can be provided. - **Removed** - These features were deprecated in previous release and completely removed in this one. \ No newline at end of file + **Removed** - These features were deprecated in a previous release and completely removed in this one. + +For all solver_configs fields, see the LP/MILP settings guide :doc:`lp-milp-settings` or the service openapi spec :doc:`open-api`. 
+ +Changes to solver_configs.tolerances +------------------------------------ + +The following fields are **Deprecated** in ``solver_configs.tolerances`` for the service: + +- absolute_primal +- absolute_dual +- absolute_gap +- relative_primal +- relative_dual +- relative_gap +- primal_infeasible +- dual_infeasible +- integrality_tolerance +- absolute_mip_gap +- relative_mip_gap + +The following fields are **New** in ``solver_configs.tolerances`` for the service and replace the deprecated fields above: + +- absolute_primal_tolerance +- absolute_dual_tolerance +- absolute_gap_tolerance +- relative_primal_tolerance +- relative_dual_tolerance +- relative_gap_tolerance +- primal_infeasible_tolerance +- dual_infeasible_tolerance +- mip_integrality_tolerance +- mip_absolute_gap +- mip_relative_gap + +The following fields are **New** in ``solver_configs.tolerances`` for the service but were available in the C API in 25.05 + +- mip_absolute_tolerance +- mip_relative_tolerance + +Changes to solver_configs +------------------------- + +The following fields are **Deprecated** in ``solver_configs`` for the service: + +- solver_mode +- heuristics_only + +The following fields are **New** in ``solver_configs`` for the service and replace the deprecated fields above: + +- pdlp_solver_mode +- mip_heuristics_only + +The following are **New** in ``solver_configs`` for the service but were available in the C API in 25.05 + +- strict_infeasibility +- user_problem_file +- per_constraint_residual +- save_best_primal_so_far +- first_primal_feasible +- log_file +- solution_file + + + diff --git a/python/cuopt/cuopt/linear_programming/__init__.py b/python/cuopt/cuopt/linear_programming/__init__.py index 4d88382eb..7941ad911 100644 --- a/python/cuopt/cuopt/linear_programming/__init__.py +++ b/python/cuopt/cuopt/linear_programming/__init__.py @@ -15,6 +15,7 @@ from cuopt.linear_programming import internals from cuopt.linear_programming.data_model import DataModel +from 
cuopt.linear_programming.problem import Problem from cuopt.linear_programming.solution import Solution from cuopt.linear_programming.solver import BatchSolve, Solve from cuopt.linear_programming.solver_settings import ( diff --git a/python/cuopt/cuopt/linear_programming/problem.py b/python/cuopt/cuopt/linear_programming/problem.py new file mode 100644 index 000000000..1a14e17cf --- /dev/null +++ b/python/cuopt/cuopt/linear_programming/problem.py @@ -0,0 +1,997 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + +import numpy as np + +import cuopt.linear_programming.data_model as data_model +import cuopt.linear_programming.solver as solver +import cuopt.linear_programming.solver_settings as solver_settings + + +class VType(str, Enum): + """ + The type of a variable is either continuous or integer. + Variable Types can be directly used as a constant. + CONTINUOUS is VType.CONTINUOUS + INTEGER is VType.INTEGER + """ + + CONTINUOUS = "C" + INTEGER = "I" + + +CONTINUOUS = VType.CONTINUOUS +INTEGER = VType.INTEGER + + +class CType(str, Enum): + """ + The sense of a constraint is either LE, GE or EQ. + Constraint Sense Types can be directly used as a constant. 
+ LE is CType.LE + GE is CType.GE + EQ is CType.EQ + """ + + LE = "L" + GE = "G" + EQ = "E" + + +LE = CType.LE +GE = CType.GE +EQ = CType.EQ + + +class sense(int, Enum): + """ + The sense of a model is either MINIMIZE or MAXIMIZE. + Model objective sense can be directly used as a constant. + MINIMIZE is sense.MINIMIZE + MAXIMIZE is sense.MAXIMIZE + """ + + MAXIMIZE = -1 + MINIMIZE = 1 + + +MAXIMIZE = sense.MAXIMIZE +MINIMIZE = sense.MINIMIZE + + +class Variable: + """ + cuOpt variable object initialized with details of the variable + such as lower bound, upper bound, type and name. + Variables are always associated with a problem and can be + created using problem.addVariable (See problem class). + + Parameters + ---------- + lb : float + Lower bound of the variable. Defaults to 0. + ub : float + Upper bound of the variable. Defaults to infinity. + vtype : enum + CONTINUOUS or INTEGER. Defaults to CONTINUOUS. + obj : float + Coefficient of the Variable in the objective. + vname : str + Name of the variable. Optional. + + Attributes + ---------- + VariableName : str + Name of the Variable. + VariableType : CONTINUOUS or INTEGER + Variable type. + LB : float + Lower Bound of the Variable. + UB : float + Upper Bound of the Variable. + Obj : float + Coefficient of the variable in the Objective function. + Value : float + Value of the variable after solving. + ReducedCost : float + Reduced Cost after solving an LP problem. + """ + + def __init__( + self, + lb=0.0, + ub=float("inf"), + obj=0.0, + vtype=CONTINUOUS, + vname="", + ): + self.index = -1 + self.LB = lb + self.UB = ub + self.Obj = obj + self.Value = float("nan") + self.ReducedCost = float("nan") + self.VariableType = vtype + self.VariableName = vname + + def getIndex(self): + """ + Get the index position of the variable in the problem. + """ + return self.index + + def getValue(self): + """ + Returns the Value of the variable computed in current solution.
+ Defaults to 0 + """ + return self.Value + + def getObjectiveCoefficient(self): + """ + Returns the objective coefficient of the variable. + """ + return self.Obj + + def setObjectiveCoefficient(self, val): + """ + Sets the objective coefficient of the variable. + """ + self.Obj = val + + def setLowerBound(self, val): + """ + Sets the lower bound of the variable. + """ + self.LB = val + + def getLowerBound(self): + """ + Returns the lower bound of the variable. + """ + return self.LB + + def setUpperBound(self, val): + """ + Sets the upper bound of the variable. + """ + self.UB = val + + def getUpperBound(self): + """ + Returns the upper bound of the variable. + """ + return self.UB + + def setVariableType(self, val): + """ + Sets the variable type of the variable. + Variable types can be either CONTINUOUS or INTEGER. + """ + self.VariableType = val + + def getVariableType(self): + """ + Returns the type of the variable. + """ + return self.VariableType + + def setVariableName(self, val): + """ + Sets the name of the variable. + """ + self.VariableName = val + + def getVariableName(self): + """ + Returns the name of the variable. + """ + return self.VariableName + + def __add__(self, other): + match other: + case int() | float(): + return LinearExpression([self], [1.0], float(other)) + case Variable(): + # Change?
+ return LinearExpression([self, other], [1.0, 1.0], 0.0) + case LinearExpression(): + return other + self + case _: + raise ValueError( + "Cannot add type %s to variable" % type(other).__name__ + ) + + def __radd__(self, other): + return self + other + + def __sub__(self, other): + match other: + case int() | float(): + return LinearExpression([self], [1.0], -float(other)) + case Variable(): + return LinearExpression([self, other], [1.0, -1.0], 0.0) + case LinearExpression(): + # self - other -> other * -1.0 + self + return other * -1.0 + self + case _: + raise ValueError( + "Cannot subtract type %s from variable" + % type(other).__name__ + ) + + def __rsub__(self, other): + # other - self -> other + self * -1.0 + return other + self * -1.0 + + def __mul__(self, other): + match other: + case int() | float(): + return LinearExpression([self], [float(other)], 0.0) + case _: + raise ValueError( + "Cannot multiply type %s with variable" + % type(other).__name__ + ) + + def __rmul__(self, other): + return self * other + + def __le__(self, other): + match other: + case int() | float(): + expr = LinearExpression([self], [1.0], 0.0) + return Constraint(expr, LE, float(other)) + case Variable() | LinearExpression(): + # var1 <= var2 -> var1 - var2 <= 0 + expr = self - other + return Constraint(expr, LE, 0.0) + case _: + raise ValueError("Unsupported operation") + + def __ge__(self, other): + match other: + case int() | float(): + expr = LinearExpression([self], [1.0], 0.0) + return Constraint(expr, GE, float(other)) + case Variable() | LinearExpression(): + # var1 >= var2 -> var1 - var2 >= 0 + expr = self - other + return Constraint(expr, GE, 0.0) + case _: + raise ValueError("Unsupported operation") + + def __eq__(self, other): + match other: + case int() | float(): + expr = LinearExpression([self], [1.0], 0.0) + return Constraint(expr, EQ, float(other)) + case Variable() | LinearExpression(): + # var1 == var2 -> var1 - var2 == 0 + expr = self - other + return 
Constraint(expr, EQ, 0.0) + case _: + raise ValueError("Unsupported operation") + + +class LinearExpression: + """ + LinearExpressions contain a set of variables, the coefficients + for the variables, and a constant. + LinearExpressions can be used to create constraints and the + objective in the Problem. + LinearExpressions can be added and subtracted with other + LinearExpressions and Variables and can also be multiplied and + divided by scalars. + LinearExpressions can be compared with scalars, Variables, and + other LinearExpressions to create Constraints. + + Parameters + ---------- + vars : List + List of Variables in the linear expression. + coefficients : List + List of coefficients corresponding to the variables. + constant : float + Constant of the linear expression. + """ + + def __init__(self, vars, coefficients, constant): + self.vars = vars + self.coefficients = coefficients + self.constant = constant + + def getVariables(self): + """ + Returns all the variables in the linear expression. + """ + return self.vars + + def getVariable(self, i): + """ + Gets Variable at ith index in the linear expression. + """ + return self.vars[i] + + def getCoefficients(self): + """ + Returns all the coefficients in the linear expression. + """ + return self.coefficients + + def getCoefficient(self, i): + """ + Gets the coefficient of the variable at ith index of the + linear expression. + """ + return self.coefficients[i] + + def getConstant(self): + """ + Returns the constant in the linear expression. + """ + return self.constant + + def zipVarCoefficients(self): + return zip(self.vars, self.coefficients) + + def getValue(self): + """ + Returns the value of the expression computed with the + current solution. 
+ """ + value = 0.0 + for i, var in enumerate(self.vars): + value += var.Value * self.coefficients[i] + return value + self.constant + + def __len__(self): + return len(self.vars) + + def __iadd__(self, other): + # Compute expr1 += expr2 + match other: + case int() | float(): + # Update just the constant value + self.constant += float(other) + return self + case Variable(): + # Append just a variable with coefficient 1.0 + self.vars.append(other) + self.coefficients.append(1.0) + return self + case LinearExpression(): + # Append all variables, coefficients and constants + self.vars.extend(other.vars) + self.coefficients.extend(other.coefficients) + self.constant += other.constant + return self + case _: + raise ValueError( + "Can't add type %s to Linear Expression" + % type(other).__name__ + ) + + def __add__(self, other): + # Compute expr3 = expr1 + expr2 + match other: + case int() | float(): + # Update just the constant value + return LinearExpression( + self.vars, self.coefficients, self.constant + float(other) + ) + case Variable(): + # Append just a variable with coefficient 1.0 + vars = self.vars + [other] + coeffs = self.coefficients + [1.0] + return LinearExpression(vars, coeffs, self.constant) + case LinearExpression(): + # Append all variables, coefficients and constants + vars = self.vars + other.vars + coeffs = self.coefficients + other.coefficients + constant = self.constant + other.constant + return LinearExpression(vars, coeffs, constant) + + def __radd__(self, other): + return self + other + + def __isub__(self, other): + # Compute expr1 -= expr2 + match other: + case int() | float(): + # Update just the constant value + self.constant -= float(other) + return self + case Variable(): + # Append just a variable with coefficient -1.0 + self.vars.append(other) + self.coefficients.append(-1.0) + return self + case LinearExpression(): + # Append all variables, coefficients and constants + self.vars.extend(other.vars) + for coeff in other.coefficients: + 
self.coefficients.append(-coeff) + self.constant -= other.constant + return self + case _: + raise ValueError( + "Can't sub type %s from LinearExpression" + % type(other).__name__ + ) + + def __sub__(self, other): + # Compute expr3 = expr1 - expr2 + match other: + case int() | float(): + # Update just the constant value + return LinearExpression( + self.vars, self.coefficients, self.constant - float(other) + ) + case Variable(): + # Append just a variable with coefficient -1.0 + vars = self.vars + [other] + coeffs = self.coefficients + [-1.0] + return LinearExpression(vars, coeffs, self.constant) + case LinearExpression(): + # Append all variables, coefficients and constants + vars = self.vars + other.vars + coeffs = [] + for i in self.coefficients: + coeffs.append(i) + for i in other.coefficients: + coeffs.append(-1.0 * i) + constant = self.constant - other.constant + return LinearExpression(vars, coeffs, constant) + + def __rsub__(self, other): + # other - self -> other + self * -1.0 + return other + self * -1.0 + + def __imul__(self, other): + # Compute expr *= constant + match other: + case int() | float(): + self.coefficients = [ + coeff * float(other) for coeff in self.coefficients + ] + self.constant = self.constant * float(other) + return self + case _: + raise ValueError( + "Can't multiply type %s by LinearExpresson" + % type(other).__name__ + ) + + def __mul__(self, other): + # Compute expr2 = expr1 * constant + match other: + case int() | float(): + coeffs = [coeff * float(other) for coeff in self.coefficients] + constant = self.constant * float(other) + return LinearExpression(self.vars, coeffs, constant) + case _: + raise ValueError( + "Can't multiply type %s by LinearExpresson" + % type(other).__name__ + ) + + def __rmul__(self, other): + return self * other + + def __itruediv__(self, other): + # Compute expr /= constant + match other: + case int() | float(): + self.coefficients = [ + coeff / float(other) for coeff in self.coefficients + ] + 
self.constant = self.constant / float(other) + return self + case _: + raise ValueError( + "Can't divide LinearExpression by type %s" + % type(other).__name__ + ) + + def __truediv__(self, other): + # Compute expr2 = expr1 / constant + match other: + case int() | float(): + coeffs = [coeff / float(other) for coeff in self.coefficients] + constant = self.constant / float(other) + return LinearExpression(self.vars, coeffs, constant) + case _: + raise ValueError( + "Can't divide LinearExpression by type %s" + % type(other).__name__ + ) + + def __le__(self, other): + match other: + case int() | float(): + return Constraint(self, LE, float(other)) + case Variable() | LinearExpression(): + # expr1 <= expr2 -> expr1 - expr2 <= 0 + expr = self - other + return Constraint(expr, LE, 0.0) + + def __ge__(self, other): + match other: + case int() | float(): + return Constraint(self, GE, float(other)) + case Variable() | LinearExpression(): + # expr1 >= expr2 -> expr1 - expr2 >= 0 + expr = self - other + return Constraint(expr, GE, 0.0) + + def __eq__(self, other): + match other: + case int() | float(): + return Constraint(self, EQ, float(other)) + case Variable() | LinearExpression(): + # expr1 == expr2 -> expr1 - expr2 == 0 + expr = self - other + return Constraint(expr, EQ, 0.0) + + +class Constraint: + """ + cuOpt constraint object containing a linear expression, + the sense of the constraint, and the right-hand side of + the constraint. + Constraints are associated with a problem and can be + created using problem.addConstraint (See problem class). + + Parameters + ---------- + expr : LinearExpression + Linear expression corresponding to a problem. + sense : enum + Sense of the constraint. Either LE for <=, + GE for >= or EQ for == . + rhs : float + Constraint right-hand side value. + name : str, Optional + Name of the constraint. Optional. + + Attributes + ---------- + ConstraintName : str + Name of the constraint. + Sense : LE, GE or EQ + Row sense. 
LE for <=, GE for >= or EQ for == . + RHS : float + Constraint right-hand side value. + Slack : float + Computed RHS - LHS with current solution. + DualValue : float + Constraint dual value in the current solution. + """ + + def __init__(self, expr, sense, rhs, name=""): + self.vindex_coeff_dict = {} + nz = len(expr) + self.vars = expr.vars + self.index = -1 + for i in range(nz): + v_idx = expr.vars[i].index + v_coeff = expr.coefficients[i] + self.vindex_coeff_dict[v_idx] = ( + self.vindex_coeff_dict[v_idx] + v_coeff + if v_idx in self.vindex_coeff_dict + else v_coeff + ) + self.Sense = sense + self.RHS = rhs - expr.getConstant() + self.ConstraintName = name + self.DualValue = float("nan") + self.Slack = float("nan") + + def __len__(self): + return len(self.vindex_coeff_dict) + + def getConstraintName(self): + """ + Returns the name of the constraint. + """ + return self.ConstraintName + + def getSense(self): + """ + Returns the sense of the constraint. + Constraint sense can be LE(<=), GE(>=) or EQ(==). + """ + return self.Sense + + def getRHS(self): + """ + Returns the right-hand side value of the constraint. + """ + return self.RHS + + def getCoefficient(self, var): + """ + Returns the coefficient of a variable in the constraint. + """ + v_idx = var.index + return self.vindex_coeff_dict[v_idx] + + def compute_slack(self): + # Computes the constraint Slack in the current solution. + lhs = 0.0 + for var in self.vars: + lhs += var.Value * self.vindex_coeff_dict[var.index] + return self.RHS - lhs + + +class Problem: + """ + A Problem defines a Linear Program or Mixed Integer Program + Variable can be created by calling addVariable() + Constraints can be added by calling addConstraint() + The objective can be set by calling setObjective() + The problem data is formed when calling solve(). + + Parameters + ---------- + model_name : str, optional + Name of the model. Default is an empty string. + + Attributes + ---------- + Name : str + Name of the model.
+ ObjSense : sense + Objective sense (MINIMIZE or MAXIMIZE). + ObjConstant : float + Constant term in the objective. + Status : int + Status of the problem after solving. + SolveTime : float + Time taken to solve the problem. + SolutionStats : object + Solution statistics for LP or MIP problem. + ObjValue : float + Objective value of the problem. + IsMIP : bool + Indicates if the problem is a Mixed Integer Program. + NumVariables : int + Number of Variables in the problem. + NumConstraints : int + Number of constraints in the problem. + NumNZs : int + Number of non-zeros in the problem. + + Examples + -------- + >>> problem = problem.Problem("MIP_model") + >>> x = problem.addVariable(lb=-2.0, ub=8.0, vtype=INTEGER) + >>> y = problem.addVariable(name="Var2") + >>> problem.addConstraint(2*x - 3*y <= 10, name="Constr1") + >>> expr = 3*x + y + >>> problem.addConstraint(expr + x == 20, name="Constr2") + >>> problem.setObjective(x + y, sense=MAXIMIZE) + >>> problem.solve() + """ + + def __init__(self, model_name=""): + self.Name = model_name + self.vars = [] + self.constrs = [] + self.ObjSense = MINIMIZE + self.Obj = None + self.ObjConstant = 0.0 + self.Status = -1 + self.ObjValue = float("nan") + + self.solved = False + self.rhs = None + self.row_sense = None + self.row_pointers = None + self.column_indicies = None + self.values = None + self.lower_bound = None + self.upper_bound = None + self.var_type = None + + class dict_to_object: + def __init__(self, mdict): + for key, value in mdict.items(): + setattr(self, key, value) + + def reset_solved_values(self): + # Resets all post solve values + for var in self.vars: + var.Value = float("nan") + var.ReducedCost = float("nan") + + for constr in self.constrs: + constr.Slack = float("nan") + constr.DualValue = float("nan") + + self.ObjValue = float("nan") + self.solved = False + + def addVariable( + self, lb=0.0, ub=float("inf"), obj=0.0, vtype=CONTINUOUS, name="" + ): + """ + Adds a variable to the problem defined by lower 
bound, + upper bound, type and name. + + Parameters + ---------- + lb : float + Lower bound of the variable. Defaults to 0. + ub : float + Upper bound of the variable. Defaults to infinity. + vtype : enum + vtype.CONTINUOUS or vtype.INTEGER. Defaults to CONTINUOUS. + name : string + Name of the variable. Optional. + + Examples + -------- + >>> problem = problem.Problem("MIP_model") + >>> x = problem.addVariable(lb=-2.0, ub=8.0, vtype=INTEGER, + name="Var1") + """ + if self.solved: + self.reset_solved_values() # Reset all solved values + n = len(self.vars) + var = Variable(lb, ub, obj, vtype, name) + var.index = n + self.vars.append(var) + return var + + def addConstraint(self, constr, name=""): + """ + Adds a constraint to the problem defined by constraint object + and name. A constraint is generated using LinearExpression, + Sense and RHS. + + Parameters + ---------- + constr : Constraint + Constructed using LinearExpressions (See Examples) + name : string + Name of the variable. Optional. + + Examples + -------- + >>> problem = problem.Problem("MIP_model") + >>> x = problem.addVariable(lb=-2.0, ub=8.0, vtype=INTEGER) + >>> y = problem.addVariable(name="Var2") + >>> problem.addConstraint(2*x - 3*y <= 10, name="Constr1") + >>> expr = 3*x + y + >>> problem.addConstraint(expr + x == 20, name="Constr2") + """ + if self.solved: + self.reset_solved_values() # Reset all solved values + n = len(self.constrs) + match constr: + case Constraint(): + constr.index = n + constr.ConstraintName = name + self.constrs.append(constr) + case _: + raise ValueError("addConstraint requires a Constraint object") + + def setObjective(self, expr, sense=MINIMIZE): + """ + Set the Objective of the problem with an expression that needs to + be MINIMIZED or MAXIMIZED. + + Parameters + ---------- + expr : LinearExpression or Variable or Constant + Objective expression that needs maximization or minimization. 
+ sense : enum + Sets whether the problem is a maximization or a minimization + problem. Values passed can either be MINIMIZE or MAXIMIZE. + Defaults to MINIMIZE. + + Examples + -------- + >>> problem = problem.Problem("MIP_model") + >>> x = problem.addVariable(lb=-2.0, ub=8.0, vtype=INTEGER) + >>> y = problem.addVariable(name="Var2") + >>> problem.addConstraint(2*x - 3*y <= 10, name="Constr1") + >>> expr = 3*x + y + >>> problem.addConstraint(expr + x == 20, name="Constr2") + >>> problem.setObjective(x + y, sense=MAXIMIZE) + """ + if self.solved: + self.reset_solved_values() # Reset all solved values + self.ObjSense = sense + match expr: + case int() | float(): + for var in self.vars: + var.setObjectiveCoefficient(0.0) + self.ObjConstant = float(expr) + case Variable(): + for var in self.vars: + var.setObjectiveCoefficient(0.0) + if var.getIndex() == expr.getIndex(): + var.setObjectiveCoefficient(1.0) + case LinearExpression(): + for var, coeff in expr.zipVarCoefficients(): + self.vars[var.getIndex()].setObjectiveCoefficient(coeff) + case _: + raise ValueError( + "Objective must be a LinearExpression or a constant" + ) + self.Obj = expr + + def getObjective(self): + """ + Get the Objective expression of the problem. + """ + return self.Obj + + def getVariables(self): + """ + Get a list of all the variables in the problem. + """ + return self.vars + + def getConstraints(self): + """ + Get a list of all the Constraints in a problem. + """ + return self.constrs + + @property + def NumVariables(self): + # Returns number of variables in the problem + return len(self.vars) + + @property + def NumConstraints(self): + # Returns number of constraints in the problem. + return len(self.constrs) + + @property + def NumNZs(self): + # Returns number of non-zeros in the problem. + nnz = 0 + for constr in self.constrs: + nnz += len(constr) + return nnz + + @property + def IsMIP(self): + # Returns if the problem is a MIP problem.
+ for var in self.vars: + if var.VariableType == "I": + return True + return False + + def getCSR(self): + """ + Computes and returns the CSR representation of the + constraint matrix. + """ + csr_dict = {"row_pointers": [0], "column_indices": [], "values": []} + for constr in self.constrs: + csr_dict["column_indices"].extend( + list(constr.vindex_coeff_dict.keys()) + ) + csr_dict["values"].extend(list(constr.vindex_coeff_dict.values())) + csr_dict["row_pointers"].append(len(csr_dict["column_indices"])) + return self.dict_to_object(csr_dict) + + def get_incumbent_values(self, solution, vars): + """ + Extract incumbent values of the vars from a problem solution. + """ + values = [] + for var in vars: + values.append(solution[var.index]) + return values + + def post_solve(self, solution): + self.Status = solution.get_termination_status() + self.SolveTime = solution.get_solve_time() + + IsMIP = False + if solution.problem_category == 0: + self.SolutionStats = self.dict_to_object(solution.get_lp_stats()) + else: + IsMIP = True + self.SolutionStats = self.dict_to_object(solution.get_milp_stats()) + + primal_sol = solution.get_primal_solution() + reduced_cost = solution.get_reduced_cost() + if len(primal_sol) > 0: + for var in self.vars: + var.Value = primal_sol[var.index] + if not IsMIP: + var.ReducedCost = reduced_cost[var.index] + dual_sol = None + if not IsMIP: + dual_sol = solution.get_dual_solution() + for i, constr in enumerate(self.constrs): + if dual_sol is not None: + constr.DualValue = dual_sol[i] + constr.Slack = constr.compute_slack() + self.ObjValue = self.Obj.getValue() + self.solved = True + + def solve(self, settings=solver_settings.SolverSettings()): + """ + Optimizes the LP or MIP problem with the added variables, + constraints and objective. 
+ + Examples + -------- + >>> problem = problem.Problem("MIP_model") + >>> x = problem.addVariable(lb=-2.0, ub=8.0, vtype=INTEGER) + >>> y = problem.addVariable(name="Var2") + >>> problem.addConstraint(2*x - 3*y <= 10, name="Constr1") + >>> expr = 3*x + y + >>> problem.addConstraint(expr + x == 20, name="Constr2") + >>> problem.setObjective(x + y, sense=MAXIMIZE) + >>> problem.solve() + """ + + # iterate through the constraints and construct the constraint matrix + n = len(self.vars) + self.row_pointers = [0] + self.column_indicies = [] + self.values = [] + self.rhs = [] + self.row_sense = [] + for constr in self.constrs: + self.column_indicies.extend(list(constr.vindex_coeff_dict.keys())) + self.values.extend(list(constr.vindex_coeff_dict.values())) + self.row_pointers.append(len(self.column_indicies)) + self.rhs.append(constr.RHS) + self.row_sense.append(constr.Sense) + + self.objective = np.zeros(n) + self.lower_bound, self.upper_bound = np.zeros(n), np.zeros(n) + self.var_type = np.empty(n, dtype="S1") + + for j in range(n): + self.objective[j] = self.vars[j].getObjectiveCoefficient() + self.var_type[j] = self.vars[j].getVariableType() + self.lower_bound[j] = self.vars[j].getLowerBound() + self.upper_bound[j] = self.vars[j].getUpperBound() + + # Initialize datamodel + dm = data_model.DataModel() + dm.set_csr_constraint_matrix( + np.array(self.values), + np.array(self.column_indicies), + np.array(self.row_pointers), + ) + if self.ObjSense == -1: + dm.set_maximize(True) + dm.set_constraint_bounds(np.array(self.rhs)) + dm.set_row_types(np.array(self.row_sense, dtype="S1")) + dm.set_objective_coefficients(self.objective) + dm.set_variable_lower_bounds(self.lower_bound) + dm.set_variable_upper_bounds(self.upper_bound) + dm.set_variable_types(self.var_type) + + # Call Solver + solution = solver.Solve(dm, settings) + + # Post Solve + self.post_solve(solution) diff --git a/python/cuopt/cuopt/linear_programming/solver/solver.py 
b/python/cuopt/cuopt/linear_programming/solver/solver.py index 24812e70c..12921ae7c 100644 --- a/python/cuopt/cuopt/linear_programming/solver/solver.py +++ b/python/cuopt/cuopt/linear_programming/solver/solver.py @@ -19,7 +19,7 @@ @catch_cuopt_exception -def Solve(data_model, solver_settings=None, log_file=""): +def Solve(data_model, solver_settings=None): """ Solve the Linear Program passed as input and returns the solution. @@ -84,21 +84,26 @@ def Solve(data_model, solver_settings=None, log_file=""): def is_mip(var_types): if len(var_types) == 0: return False - elif "I" in var_types: - return True - - return False + # Check if all types are the same (fast check) + if len(set(map(type, var_types))) == 1: + # Homogeneous - use appropriate check + if isinstance(var_types[0], bytes): + return b"I" in var_types + else: + return "I" in var_types + else: + # Mixed types - fallback to comprehensive check + return any(vt == "I" or vt == b"I" for vt in var_types) return solver_wrapper.Solve( data_model, solver_settings, - log_file, mip=is_mip(data_model.get_variable_types()), ) @catch_cuopt_exception -def BatchSolve(data_model_list, solver_settings=None, log_file=""): +def BatchSolve(data_model_list, solver_settings=None): """ Solve the list of Linear Programs passed as input and returns the solutions and total solve time. 
@@ -174,6 +179,4 @@ def BatchSolve(data_model_list, solver_settings=None, log_file=""): if solver_settings is None: solver_settings = SolverSettings() - return solver_wrapper.BatchSolve( - data_model_list, solver_settings, log_file - ) + return solver_wrapper.BatchSolve(data_model_list, solver_settings) diff --git a/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx b/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx index 93a303489..02782b8f9 100644 --- a/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx +++ b/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx @@ -65,7 +65,6 @@ from numba import cuda import cudf from cudf.core.buffer import as_buffer -from cuopt.linear_programming.solver.solver_parameters import CUOPT_LOG_FILE from cuopt.linear_programming.solver_settings.solver_settings import ( PDLPSolverMode, SolverSettings, @@ -279,7 +278,6 @@ cdef set_data_model_view(DataModel data_model_obj): cdef set_solver_setting( unique_ptr[solver_settings_t[int, double]]& unique_solver_settings, settings, - log_file, DataModel data_model_obj=None, mip=False): cdef solver_settings_t[int, double]* c_solver_settings = ( @@ -425,13 +423,6 @@ cdef set_solver_setting( settings.get_pdlp_warm_start_data().iterations_since_last_restart # noqa ) - # Common to LP and MIP - - c_solver_settings.set_parameter_from_string( - CUOPT_LOG_FILE.encode('utf-8'), - log_file.encode('utf-8') - ) - cdef create_solution(unique_ptr[solver_ret_t] sol_ret_ptr, DataModel data_model_obj, is_batch=False): @@ -670,7 +661,7 @@ cdef create_solution(unique_ptr[solver_ret_t] sol_ret_ptr, ) -def Solve(py_data_model_obj, settings, str log_file, mip=False): +def Solve(py_data_model_obj, settings, mip=False): cdef DataModel data_model_obj = py_data_model_obj cdef unique_ptr[solver_settings_t[int, double]] unique_solver_settings @@ -682,7 +673,7 @@ def Solve(py_data_model_obj, settings, str log_file, mip=False): ) set_solver_setting( - 
unique_solver_settings, settings, log_file, data_model_obj, mip + unique_solver_settings, settings, data_model_obj, mip ) set_data_model_view(data_model_obj) @@ -697,13 +688,13 @@ cdef insert_vector(DataModel data_model_obj, data_model_views.push_back(data_model_obj.c_data_model_view.get()) -def BatchSolve(py_data_model_list, settings, str log_file): +def BatchSolve(py_data_model_list, settings): cdef unique_ptr[solver_settings_t[int, double]] unique_solver_settings unique_solver_settings.reset(new solver_settings_t[int, double]()) if settings.get_pdlp_warm_start_data() is not None: # noqa raise Exception("Cannot use warmstart data with Batch Solve") - set_solver_setting(unique_solver_settings, settings, log_file) + set_solver_setting(unique_solver_settings, settings) cdef vector[data_model_view_t[int, double] *] data_model_views diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 9159ba933..9f429e655 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -21,22 +21,31 @@ CUOPT_ABSOLUTE_PRIMAL_TOLERANCE, CUOPT_CROSSOVER, CUOPT_DUAL_INFEASIBLE_TOLERANCE, + CUOPT_FIRST_PRIMAL_FEASIBLE, CUOPT_INFEASIBILITY_DETECTION, CUOPT_ITERATION_LIMIT, + CUOPT_LOG_FILE, CUOPT_LOG_TO_CONSOLE, CUOPT_METHOD, CUOPT_MIP_ABSOLUTE_GAP, + CUOPT_MIP_ABSOLUTE_TOLERANCE, CUOPT_MIP_HEURISTICS_ONLY, CUOPT_MIP_INTEGRALITY_TOLERANCE, CUOPT_MIP_RELATIVE_GAP, + CUOPT_MIP_RELATIVE_TOLERANCE, CUOPT_MIP_SCALING, CUOPT_NUM_CPU_THREADS, CUOPT_PDLP_SOLVER_MODE, + CUOPT_PER_CONSTRAINT_RESIDUAL, CUOPT_PRIMAL_INFEASIBLE_TOLERANCE, CUOPT_RELATIVE_DUAL_TOLERANCE, CUOPT_RELATIVE_GAP_TOLERANCE, CUOPT_RELATIVE_PRIMAL_TOLERANCE, + CUOPT_SAVE_BEST_PRIMAL_SO_FAR, + CUOPT_SOLUTION_FILE, + CUOPT_STRICT_INFEASIBILITY, CUOPT_TIME_LIMIT, + CUOPT_USER_PROBLEM_FILE, get_solver_setting, ) @@ 
-322,51 +331,72 @@ def toDict(self): time_limit = None solver_config = { - "tolerances": {}, + "tolerances": { + "absolute_dual_tolerance": self.get_parameter( + CUOPT_ABSOLUTE_DUAL_TOLERANCE + ), + "relative_dual_tolerance": self.get_parameter( + CUOPT_RELATIVE_DUAL_TOLERANCE + ), + "absolute_primal_tolerance": self.get_parameter( + CUOPT_ABSOLUTE_PRIMAL_TOLERANCE + ), + "relative_primal_tolerance": self.get_parameter( + CUOPT_RELATIVE_PRIMAL_TOLERANCE + ), + "absolute_gap_tolerance": self.get_parameter( + CUOPT_ABSOLUTE_GAP_TOLERANCE + ), + "relative_gap_tolerance": self.get_parameter( + CUOPT_RELATIVE_GAP_TOLERANCE + ), + "primal_infeasible_tolerance": self.get_parameter( + CUOPT_PRIMAL_INFEASIBLE_TOLERANCE + ), + "dual_infeasible_tolerance": self.get_parameter( + CUOPT_DUAL_INFEASIBLE_TOLERANCE + ), + "mip_integrality_tolerance": self.get_parameter( + CUOPT_MIP_INTEGRALITY_TOLERANCE + ), + "mip_absolute_gap": self.get_parameter(CUOPT_MIP_ABSOLUTE_GAP), + "mip_relative_gap": self.get_parameter(CUOPT_MIP_RELATIVE_GAP), + "mip_absolute_tolerance": self.get_parameter( + CUOPT_MIP_ABSOLUTE_TOLERANCE + ), + "mip_relative_tolerance": self.get_parameter( + CUOPT_MIP_RELATIVE_TOLERANCE + ), + }, "infeasibility_detection": self.get_parameter( CUOPT_INFEASIBILITY_DETECTION ), "time_limit": time_limit, "iteration_limit": self.get_parameter(CUOPT_ITERATION_LIMIT), - "solver_mode": self.get_parameter(CUOPT_PDLP_SOLVER_MODE), + "pdlp_solver_mode": self.get_parameter(CUOPT_PDLP_SOLVER_MODE), "method": self.get_parameter(CUOPT_METHOD), "mip_scaling": self.get_parameter(CUOPT_MIP_SCALING), - "heuristics_only": self.get_parameter(CUOPT_MIP_HEURISTICS_ONLY), + "mip_heuristics_only": self.get_parameter( + CUOPT_MIP_HEURISTICS_ONLY + ), "num_cpu_threads": self.get_parameter(CUOPT_NUM_CPU_THREADS), "crossover": self.get_parameter(CUOPT_CROSSOVER), "log_to_console": self.get_parameter(CUOPT_LOG_TO_CONSOLE), + "first_primal_feasible": self.get_parameter( + CUOPT_FIRST_PRIMAL_FEASIBLE + 
), + "log_file": self.get_parameter(CUOPT_LOG_FILE), + "per_constraint_residual": self.get_parameter( + CUOPT_PER_CONSTRAINT_RESIDUAL + ), + "save_best_primal_so_far": self.get_parameter( + CUOPT_SAVE_BEST_PRIMAL_SO_FAR + ), + "solution_file": self.get_parameter(CUOPT_SOLUTION_FILE), + "strict_infeasibility": self.get_parameter( + CUOPT_STRICT_INFEASIBILITY + ), + "user_problem_file": self.get_parameter(CUOPT_USER_PROBLEM_FILE), } - solver_config["tolerances"]["absolute_dual"] = self.get_parameter( - CUOPT_ABSOLUTE_DUAL_TOLERANCE - ) - solver_config["tolerances"]["relative_dual"] = self.get_parameter( - CUOPT_RELATIVE_DUAL_TOLERANCE - ) - solver_config["tolerances"]["absolute_primal"] = self.get_parameter( - CUOPT_ABSOLUTE_PRIMAL_TOLERANCE - ) - solver_config["tolerances"]["relative_primal"] = self.get_parameter( - CUOPT_RELATIVE_PRIMAL_TOLERANCE - ) - solver_config["tolerances"]["absolute_gap"] = self.get_parameter( - CUOPT_ABSOLUTE_GAP_TOLERANCE - ) - solver_config["tolerances"]["relative_gap"] = self.get_parameter( - CUOPT_RELATIVE_GAP_TOLERANCE - ) - solver_config["tolerances"]["primal_infeasible"] = self.get_parameter( - CUOPT_PRIMAL_INFEASIBLE_TOLERANCE - ) - solver_config["tolerances"]["dual_infeasible"] = self.get_parameter( - CUOPT_DUAL_INFEASIBLE_TOLERANCE - ) - solver_config["tolerances"][ - "integrality_tolerance" - ] = self.get_parameter(CUOPT_MIP_INTEGRALITY_TOLERANCE) - solver_config["tolerances"]["absolute_mip_gap"] = self.get_parameter( - CUOPT_MIP_ABSOLUTE_GAP - ) - solver_config["tolerances"]["relative_mip_gap"] = self.get_parameter( - CUOPT_MIP_RELATIVE_GAP - ) + return solver_config diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py new file mode 100644 index 000000000..132920a86 --- /dev/null +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -0,0 +1,328 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # noqa +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import pytest + +from cuopt.linear_programming import SolverSettings +from cuopt.linear_programming.internals import ( + GetSolutionCallback, + SetSolutionCallback, +) +from cuopt.linear_programming.problem import ( + CONTINUOUS, + INTEGER, + MAXIMIZE, + CType, + Problem, + VType, + sense, +) + + +def test_model(): + + prob = Problem("Simple MIP") + assert prob.Name == "Simple MIP" + + # Adding Variable + x = prob.addVariable(lb=0, vtype=VType.INTEGER, name="V_x") + y = prob.addVariable(lb=10, ub=50, vtype=INTEGER, name="V_y") + + assert x.getVariableName() == "V_x" + assert y.getUpperBound() == 50 + assert y.getLowerBound() == 10 + assert x.getVariableType() == VType.INTEGER + assert y.getVariableType() == "I" + assert [x.getIndex(), y.getIndex()] == [0, 1] + assert prob.IsMIP + + # Adding Constraints + prob.addConstraint(2 * x + 4 * y >= 230, name="C1") + prob.addConstraint(3 * x + 2 * y + 10 <= 200, name="C2") + + expected_name = ["C1", "C2"] + expected_coefficient_x = [2, 3] + expected_coefficient_y = [4, 2] + expected_sense = [CType.GE, "L"] + expected_rhs = [230, 190] + for i, c in enumerate(prob.getConstraints()): + assert c.getConstraintName() == expected_name[i] + assert c.getSense() == expected_sense[i] + assert c.getRHS() == expected_rhs[i] + assert c.getCoefficient(x) == expected_coefficient_x[i] + assert c.getCoefficient(y) 
== expected_coefficient_y[i] + + assert prob.NumVariables == 2 + assert prob.NumConstraints == 2 + assert prob.NumNZs == 4 + + # Setting Objective + expr = 5 * x + 3 * y + 50 + prob.setObjective(expr, sense=MAXIMIZE) + + expected_obj_coeff = [5, 3] + assert expr.getVariables() == [x, y] + assert expr.getCoefficients() == expected_obj_coeff + assert expr.getConstant() == 50 + assert prob.ObjSense == sense.MAXIMIZE + assert prob.getObjective() is expr + + # Initialize Settings + settings = SolverSettings() + settings.set_parameter("time_limit", 5) + + assert not prob.solved + # Solving Problem + prob.solve(settings) + assert prob.solved + assert prob.Status.name == "Optimal" + assert prob.SolveTime < 5 + + csr = prob.getCSR() + expected_row_pointers = [0, 2, 4] + expected_column_indices = [0, 1, 0, 1] + expected_values = [2.0, 4.0, 3.0, 2.0] + + assert csr.row_pointers == expected_row_pointers + assert csr.column_indices == expected_column_indices + assert csr.values == expected_values + + expected_slack = [-6, 0] + expected_var_values = [36, 41] + + for i, var in enumerate(prob.getVariables()): + assert var.Value == pytest.approx(expected_var_values[i]) + assert var.getObjectiveCoefficient() == expected_obj_coeff[i] + + assert prob.ObjValue == 353 + + for i, c in enumerate(prob.getConstraints()): + assert c.Slack == pytest.approx(expected_slack[i]) + + assert hasattr(prob.SolutionStats, "mip_gap") + + # Change Objective + prob.setObjective(expr + 20, sense.MINIMIZE) + assert not prob.solved + + # Check if values reset + for i, var in enumerate(prob.getVariables()): + assert math.isnan(var.Value) and math.isnan(var.ReducedCost) + for i, c in enumerate(prob.getConstraints()): + assert math.isnan(c.Slack) and math.isnan(c.DualValue) + + # Change Problem to LP + x.VariableType = VType.CONTINUOUS + y.VariableType = CONTINUOUS + y.UB = 45.5 + assert not prob.IsMIP + + prob.solve(settings) + assert prob.solved + assert prob.Status.name == "Optimal" + assert 
hasattr(prob.SolutionStats, "primal_residual") + + assert x.getValue() == 24 + assert y.getValue() == pytest.approx(45.5) + + assert prob.ObjValue == pytest.approx(5 * x.Value + 3 * y.Value + 70) + + +def test_linear_expression(): + + prob = Problem() + + x = prob.addVariable() + y = prob.addVariable() + z = prob.addVariable() + + expr1 = 2 * x + 5 + 3 * y + expr2 = y - z + 2 * x - 3 + + expr3 = expr1 + expr2 + expr4 = expr1 - expr2 + + # Test expr1 and expr 2 is unchanged + assert expr1.getCoefficients() == [2, 3] + assert expr1.getVariables() == [x, y] + assert expr1.getConstant() == 5 + assert expr2.getCoefficients() == [1, -1, 2] + assert expr2.getVariables() == [y, z, x] + assert expr2.getConstant() == -3 + + # Testing add and sub + assert expr3.getCoefficients() == [2, 3, 1, -1, 2] + assert expr3.getVariables() == [x, y, y, z, x] + assert expr3.getConstant() == 2 + assert expr4.getCoefficients() == [2, 3, -1, 1, -2] + assert expr4.getVariables() == [x, y, y, z, x] + assert expr4.getConstant() == 8 + + expr5 = 8 * y - x - 5 + expr6 = expr5 / 2 + expr7 = expr5 * 2 + + # Test expr5 is unchanged + assert expr5.getCoefficients() == [8, -1] + assert expr5.getVariables() == [y, x] + assert expr5.getConstant() == -5 + + # Test mul and truediv + assert expr6.getCoefficients() == [4, -0.5] + assert expr6.getVariables() == [y, x] + assert expr6.getConstant() == -2.5 + assert expr7.getCoefficients() == [16, -2] + assert expr7.getVariables() == [y, x] + assert expr7.getConstant() == -10 + + expr6 *= 2 + expr7 /= 2 + + # Test imul and itruediv + assert expr6.getCoefficients() == [8, -1] + assert expr6.getVariables() == [y, x] + assert expr6.getConstant() == -5 + assert expr7.getCoefficients() == [8, -1] + assert expr7.getVariables() == [y, x] + assert expr7.getConstant() == -5 + + +def test_constraint_matrix(): + + prob = Problem() + + a = prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="a") + b = prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="b") + c = 
prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="c") + d = prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="d") + e = prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="e") + f = prob.addVariable(lb=0, ub=float("inf"), vtype="C", name="f") + + # 2*a + 3*e + 1 + 4*d - 2*e + f - 8 <= 90 i.e. 2a + e + 4d + f <= 97 + prob.addConstraint(2 * a + 3 * e + 1 + 4 * d - 2 * e + f - 8 <= 90, "C1") + # d + 5*c - a - 4*d - 2 + 5*b - 20 >= 10 i.e. -3d + 5c - a + 5b >= 32 + prob.addConstraint(d + 5 * c - a - 4 * d - 2 + 5 * b - 20 >= 10, "C2") + # 7*f + 3 - 2*b + c == 3*f - 61 + 8*e i.e. 4f - 2b + c - 8e == -64 + prob.addConstraint(7 * f + 3 - 2 * b + c == 3 * f - 61 + 8 * e, "C3") + # a <= 5 + prob.addConstraint(a <= 5, "C4") + # d >= 7*f - b - 27 i.e. d - 7*f + b >= -27 + prob.addConstraint(d >= 7 * f - b - 27, "C5") + # c == e i.e. c - e == 0 + prob.addConstraint(c == e, "C6") + + sense = [] + rhs = [] + for c in prob.getConstraints(): + sense.append(c.Sense) + rhs.append(c.RHS) + + csr = prob.getCSR() + + exp_row_pointers = [0, 4, 8, 12, 13, 16, 18] + exp_column_indices = [0, 4, 3, 5, 2, 3, 0, 1, 5, 1, 2, 4, 0, 5, 1, 3, 2, 4] + exp_values = [ + 2.0, + 1.0, + 4.0, + 1.0, + 5.0, + -3.0, + -1.0, + 5.0, + 4.0, + -2.0, + 1.0, + -8.0, + 1.0, + -7.0, + 1.0, + 1.0, + 1.0, + -1.0, + ] + exp_sense = ["L", "G", "E", "L", "G", "E"] + exp_rhs = [97, 32, -64, 5, -27, 0] + + assert csr.row_pointers == exp_row_pointers + assert csr.column_indices == exp_column_indices + assert csr.values == exp_values + assert sense == exp_sense + assert rhs == exp_rhs + + +def test_incumbent_solutions(): + + # Callback for incumbent solution + class CustomGetSolutionCallback(GetSolutionCallback): + def __init__(self): + super().__init__() + self.n_callbacks = 0 + self.solutions = [] + + def get_solution(self, solution, solution_cost): + + self.n_callbacks += 1 + assert len(solution) > 0 + assert len(solution_cost) == 1 + + self.solutions.append( + { + "solution": 
solution.copy_to_host(), + "cost": solution_cost.copy_to_host()[0], + } + ) + + class CustomSetSolutionCallback(SetSolutionCallback): + def __init__(self, get_callback): + super().__init__() + self.n_callbacks = 0 + self.get_callback = get_callback + + def set_solution(self, solution, solution_cost): + self.n_callbacks += 1 + if self.get_callback.solutions: + solution[:] = self.get_callback.solutions[-1]["solution"] + solution_cost[0] = float( + self.get_callback.solutions[-1]["cost"] + ) + + prob = Problem() + x = prob.addVariable(vtype=VType.INTEGER) + y = prob.addVariable(vtype=VType.INTEGER) + prob.addConstraint(2 * x + 4 * y >= 230) + prob.addConstraint(3 * x + 2 * y <= 190) + prob.setObjective(5 * x + 3 * y, sense=sense.MAXIMIZE) + + get_callback = CustomGetSolutionCallback() + set_callback = CustomSetSolutionCallback(get_callback) + settings = SolverSettings() + settings.set_mip_callback(get_callback) + settings.set_mip_callback(set_callback) + settings.set_parameter("time_limit", 1) + + prob.solve(settings) + + assert get_callback.n_callbacks > 0 + + for sol in get_callback.solutions: + x_val = sol["solution"][0] + y_val = sol["solution"][1] + cost = sol["cost"] + assert 2 * x_val + 4 * y_val >= 230 + assert 3 * x_val + 2 * y_val <= 190 + assert 5 * x_val + 3 * y_val == cost diff --git a/python/cuopt_self_hosted/cuopt_sh_client/thin_client_solver_settings.py b/python/cuopt_self_hosted/cuopt_sh_client/thin_client_solver_settings.py index 9d2139931..63703b3e4 100644 --- a/python/cuopt_self_hosted/cuopt_sh_client/thin_client_solver_settings.py +++ b/python/cuopt_self_hosted/cuopt_sh_client/thin_client_solver_settings.py @@ -163,53 +163,42 @@ def toDict(self): "tolerances": {}, } + t = [ + "absolute_primal_tolerance", + "absolute_dual_tolerance", + "absolute_gap_tolerance", + "relative_primal_tolerance", + "relative_dual_tolerance", + "relative_gap_tolerance", + "primal_infeasible_tolerance", + "dual_infeasible_tolerance", + "mip_integrality_tolerance", + 
"mip_absolute_gap", + "mip_relative_gap", + "mip_absolute_tolerance", + "mip_relative_tolerance", + # deprecated parameters + "absolute_primal", + "absolute_dual", + "absolute_gap", + "relative_primal", + "relative_dual", + "relative_gap", + "primal_infeasible", + "dual_infeasible", + "integrality_tolerance", + "absolute_mip_gap", + "relative_mip_gap", + ] + # Grab everything that is not a tolerance for key in self.parameter_dict: - if "tolerance" not in key: + if key not in t: solver_config[key] = self.parameter_dict[key] - # Handle tolerance separately - if "absolute_dual_tolerance" in self.parameter_dict: - solver_config["tolerances"]["absolute_dual"] = self.parameter_dict[ - "absolute_dual_tolerance" - ] - if "relative_dual_tolerance" in self.parameter_dict: - solver_config["tolerances"]["relative_dual"] = self.parameter_dict[ - "relative_dual_tolerance" - ] - if "absolute_primal_tolerance" in self.parameter_dict: - solver_config["tolerances"][ - "absolute_primal" - ] = self.parameter_dict["absolute_primal_tolerance"] - if "relative_primal_tolerance" in self.parameter_dict: - solver_config["tolerances"][ - "relative_primal" - ] = self.parameter_dict["relative_primal_tolerance"] - if "absolute_gap_tolerance" in self.parameter_dict: - solver_config["tolerances"]["absolute_gap"] = self.parameter_dict[ - "absolute_gap_tolerance" - ] - if "relative_gap_tolerance" in self.parameter_dict: - solver_config["tolerances"]["relative_gap"] = self.parameter_dict[ - "relative_gap_tolerance" - ] - if "primal_infeasible_tolerance" in self.parameter_dict: - solver_config["tolerances"][ - "primal_infeasible" - ] = self.parameter_dict["primal_infeasible_tolerance"] - if "dual_infeasible_tolerance" in self.parameter_dict: - solver_config["tolerances"][ - "dual_infeasible" - ] = self.parameter_dict["dual_infeasible_tolerance"] - if "integrality_tolerance" in self.parameter_dict: - solver_config["tolerances"][ - "integrality_tolerance" - ] = 
self.parameter_dict["integrality_tolerance"] - if "absolute_mip_gap" in self.parameter_dict: - solver_config["tolerances"][ - "absolute_mip_gap" - ] = self.parameter_dict["absolute_mip_gap"] - if "relative_mip_gap" in self.parameter_dict: - solver_config["tolerances"][ - "relative_mip_gap" - ] = self.parameter_dict["relative_mip_gap"] + + # Now grab everything that is a tolerance and set in the dictionary + for name in t: + if name in self.parameter_dict: + solver_config["tolerances"][name] = self.parameter_dict[name] + return solver_config diff --git a/python/cuopt_server/cuopt_server/utils/job_queue.py b/python/cuopt_server/cuopt_server/utils/job_queue.py index 7d1986e56..5d2adc6a0 100644 --- a/python/cuopt_server/cuopt_server/utils/job_queue.py +++ b/python/cuopt_server/cuopt_server/utils/job_queue.py @@ -60,47 +60,6 @@ class PickleForbidden(Exception): msgpack_numpy.patch() -def lp_datamodel_compat(data): - """ - Maintain backward compat for some parameters - that change names in 25.05. 
Replace the - old parameters with the new names - """ - - sc = { - "solver_mode": "pdlp_solver_mode", - "heuristics_only": "mip_heuristics_only", - } - - tol = { - "integrality_tolerance": "mip_integrality_tolerance", - "absolute_mip_gap": "mip_absolute_gap", - "relative_mip_gap": "mip_relative_gap", - } - - replace = [] - if "solver_config" in data: - s = data["solver_config"] - for k, v in sc.items(): - if k in s: - replace.append([k, v, s[k]]) - - for r in replace: - data["solver_config"][r[1]] = r[2] - del data["solver_config"][r[0]] - - replace = [] - if "tolerances" in s: - t = s["tolerances"] - for k, v in tol.items(): - if k in t: - replace.append([k, v, t[k]]) - - for r in replace: - data["solver_config"]["tolerances"][r[1]] = r[2] - del data["solver_config"]["tolerances"][r[0]] - - def check_client_version(client_vers): logging.debug(f"client_vers is {client_vers} in check") if os.environ.get("CUOPT_CHECK_CLIENT", True) in ["True", True]: @@ -1289,7 +1248,6 @@ def _resolve_job(self): t = SolverLPJob(0, i_data, None, None) t._transform(t.LP_data) i_data = t.get_data() - lp_datamodel_compat(i_data) lpdata.append(LPData.parse_obj(i_data)) data = lpdata else: @@ -1299,7 +1257,6 @@ def _resolve_job(self): t = SolverLPJob(0, data, None, None) t._transform(t.LP_data) data = t.get_data() - lp_datamodel_compat(data) data = LPData.parse_obj(data) except Exception as e: raise HTTPException( @@ -1539,7 +1496,6 @@ def _resolve_job(self): t = SolverLPJob(0, i_data, None, None) t._transform(t.LP_data) i_data = t.get_data() - lp_datamodel_compat(i_data) lpdata.append(LPData.parse_obj(i_data)) data = lpdata else: @@ -1549,7 +1505,6 @@ def _resolve_job(self): t = SolverLPJob(0, data, None, None) t._transform(t.LP_data) data = t.get_data() - lp_datamodel_compat(data) data = LPData.parse_obj(data) except Exception as e: raise HTTPException( diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py 
b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index d66b7c817..c739c8b7e 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -336,34 +336,34 @@ class Tolerances(StrictModel): default=None, description="absolute and relative tolerance on the primal feasibility, dual feasibility, and gap", # noqa ) - absolute_primal: float = Field( + absolute_primal_tolerance: float = Field( default=None, description="Absolute primal tolerance" ) - absolute_dual: float = Field( + absolute_dual_tolerance: float = Field( default=None, description="Absolute dual tolerance" "NOTE: Only applicable to LP", ) - absolute_gap: float = Field( + absolute_gap_tolerance: float = Field( default=None, description="Absolute gap tolerance" "NOTE: Only applicable to LP", ) - relative_primal: float = Field( + relative_primal_tolerance: float = Field( default=None, description="Relative primal tolerance" ) - relative_dual: float = Field( + relative_dual_tolerance: float = Field( default=None, description="Relative dual tolerance" "NOTE: Only applicable to LP", ) - relative_gap: float = Field( + relative_gap_tolerance: float = Field( default=None, description="Relative gap tolerance" "NOTE: Only applicable to LP", ) - primal_infeasible: float = Field( + primal_infeasible_tolerance: float = Field( default=None, description="Primal infeasible tolerance" "NOTE: Only applicable to LP", ) - dual_infeasible: float = Field( + dual_infeasible_tolerance: float = Field( default=None, description="Dual infeasible tolerance" "NOTE: Only applicable to LP", ) @@ -381,6 +381,78 @@ class Tolerances(StrictModel): description="MIP gap relative tolerance" "NOTE: Only applicable to MILP", ) + mip_absolute_tolerance: float = Field( + default=None, description="MIP absolute tolerance" + ) + mip_relative_tolerance: float = Field( + default=None, description="MIP relative 
tolerance" + ) + absolute_primal: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use absolute_primal_tolerance instead", + ) + absolute_dual: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use absolute_dual_tolerance instead", + ) + absolute_gap: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use absolute_gap_tolerance instead", + ) + relative_primal: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use relative_primal_tolerance instead", + ) + relative_dual: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use relative_dual_tolerance instead", + ) + relative_gap: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use relative_gap_tolerance instead", + ) + primal_infeasible: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use primal_infeasible_tolerance instead", + ) + dual_infeasible: float = Field( + default=None, + deprecated=True, + description="Deprecated in 25.08. " + "Use dual_infeasible_tolerance instead", + ) + integrality_tolerance: float = Field( + default=None, + deprecated=True, + description="Deprecated starting in 25.05. " + "Use mip_integratlity_tolerance instead.", + ) + absolute_mip_gap: float = Field( + default=None, + deprecated=True, + description="Deprecated starting in 25.05. " + "Use mip_absolute_gap instead.", + ) + relative_mip_gap: float = Field( + default=None, + deprecated=True, + description="Deprecated starting in 25.05. 
" + "Use mip_relative_gap instead.", + ) class SolverConfig(StrictModel): @@ -468,6 +540,66 @@ class SolverConfig(StrictModel): description="Set True to write logs to console, False to " "not write logs to console.", ) + strict_infeasibility: Optional[bool] = Field( + default=False, + description=" controls the strict infeasibility " + "mode in PDLP. When true if either the current or " + "the average solution is detected as infeasible, " + "PDLP will stop. When false both the current and " + "average solution need to be detected as infeasible " + "for PDLP to stop.", + ) + user_problem_file: Optional[str] = Field( + default="", + description="Ignored by the service but included " + "for dataset compatibility", + ) + per_constraint_residual: Optional[bool] = Field( + default=False, + description="Controls whether PDLP should compute the " + "primal & dual residual per constraint instead of globally.", + ) + save_best_primal_so_far: Optional[bool] = Field( + default=False, + description="controls whether PDLP should save the " + "best primal solution so far. " + "With this parameter set to true, PDLP will always " + "prioritize a primal feasible " + "to a non primal feasible. " + "If a new primal feasible is found, the one with the " + "best primal objective will be kept. " + "If no primal feasible was found, the one " + "with the lowest primal residual will be kept. 
" + "If two have the same primal residual, " + "the one with the best objective will be kept.", + ) + first_primal_feasible: Optional[bool] = Field( + default=False, + description="Controls whether PDLP should stop when " + "the first primal feasible solution is found.", + ) + log_file: Optional[str] = Field( + default="", + description="Ignored by the service but included " + "for dataset compatibility", + ) + solution_file: Optional[str] = Field( + default="", + description="Ignored by the service but included " + "for dataset compatibility", + ) + solver_mode: Optional[int] = Field( + default=None, + deprecated=True, + description="Deprecated starting in 25.05. " + "Use pdlp_solver_mode instead.", + ) + heuristics_only: Optional[bool] = Field( + default=None, + deprecated=True, + description="Deprecated starting in 25.05. " + "Use mip_heuristics_only instead.", + ) class LPData(StrictModel): diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py b/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py index fcb9d0764..ccb5b1514 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py @@ -27,21 +27,28 @@ CUOPT_ABSOLUTE_PRIMAL_TOLERANCE, CUOPT_CROSSOVER, CUOPT_DUAL_INFEASIBLE_TOLERANCE, + CUOPT_FIRST_PRIMAL_FEASIBLE, CUOPT_INFEASIBILITY_DETECTION, CUOPT_ITERATION_LIMIT, + CUOPT_LOG_FILE, CUOPT_LOG_TO_CONSOLE, CUOPT_METHOD, CUOPT_MIP_ABSOLUTE_GAP, + CUOPT_MIP_ABSOLUTE_TOLERANCE, CUOPT_MIP_HEURISTICS_ONLY, CUOPT_MIP_INTEGRALITY_TOLERANCE, CUOPT_MIP_RELATIVE_GAP, + CUOPT_MIP_RELATIVE_TOLERANCE, CUOPT_MIP_SCALING, CUOPT_NUM_CPU_THREADS, CUOPT_PDLP_SOLVER_MODE, + CUOPT_PER_CONSTRAINT_RESIDUAL, CUOPT_PRIMAL_INFEASIBLE_TOLERANCE, CUOPT_RELATIVE_DUAL_TOLERANCE, CUOPT_RELATIVE_GAP_TOLERANCE, CUOPT_RELATIVE_PRIMAL_TOLERANCE, + CUOPT_SAVE_BEST_PRIMAL_SO_FAR, + CUOPT_STRICT_INFEASIBILITY, CUOPT_TIME_LIMIT, ) from 
cuopt.linear_programming.solver.solver_wrapper import ( @@ -55,9 +62,16 @@ OutOfMemoryError, ) -dep_warning = ( - "{field} is deprecated and will be removed in the next release. Ignored." -) + +def dep_warning(field): + return ( + f"solver config {field} is deprecated and will " + "be removed in a future release" + ) + + +def ignored_warning(field): + return f"solver config {field} ignored in the cuopt service" class CustomGetSolutionCallback(GetSolutionCallback): @@ -156,7 +170,15 @@ def create_solver(LP_data, warmstart_data): CUOPT_INFEASIBILITY_DETECTION, solver_config.infeasibility_detection, ) - if solver_config.pdlp_solver_mode is not None: + if solver_config.solver_mode is not None: + solver_settings.set_parameter( + CUOPT_PDLP_SOLVER_MODE, + linear_programming.solver_settings.PDLPSolverMode( + solver_config.solver_mode + ), + ) + warnings.append(dep_warning("solver_mode")) + elif solver_config.pdlp_solver_mode is not None: solver_settings.set_parameter( CUOPT_PDLP_SOLVER_MODE, linear_programming.solver_settings.PDLPSolverMode( @@ -212,59 +234,138 @@ def create_solver(LP_data, warmstart_data): tolerance = solver_config.tolerances if tolerance.optimality is not None: solver_settings.set_optimality_tolerance(tolerance.optimality) - if tolerance.absolute_dual is not None: + if tolerance.absolute_dual_tolerance is not None: + solver_settings.set_parameter( + CUOPT_ABSOLUTE_DUAL_TOLERANCE, + tolerance.absolute_dual_tolerance, + ) + elif tolerance.absolute_dual is not None: solver_settings.set_parameter( CUOPT_ABSOLUTE_DUAL_TOLERANCE, tolerance.absolute_dual ) - if tolerance.absolute_primal is not None: + warnings.append(dep_warning("absolute_dual")) + if tolerance.absolute_primal_tolerance is not None: + solver_settings.set_parameter( + CUOPT_ABSOLUTE_PRIMAL_TOLERANCE, + tolerance.absolute_primal_tolerance, + ) + elif tolerance.absolute_primal is not None: solver_settings.set_parameter( CUOPT_ABSOLUTE_PRIMAL_TOLERANCE, tolerance.absolute_primal ) - if 
tolerance.absolute_gap is not None: + warnings.append(dep_warning("absolute_primal")) + if tolerance.absolute_gap_tolerance is not None: + solver_settings.set_parameter( + CUOPT_ABSOLUTE_GAP_TOLERANCE, + tolerance.absolute_gap_tolerance, + ) + elif tolerance.absolute_gap is not None: solver_settings.set_parameter( CUOPT_ABSOLUTE_GAP_TOLERANCE, tolerance.absolute_gap ) - if tolerance.relative_dual is not None: + warnings.append(dep_warning("absolute_gap")) + if tolerance.relative_dual_tolerance is not None: + solver_settings.set_parameter( + CUOPT_RELATIVE_DUAL_TOLERANCE, + tolerance.relative_dual_tolerance, + ) + elif tolerance.relative_dual is not None: solver_settings.set_parameter( CUOPT_RELATIVE_DUAL_TOLERANCE, tolerance.relative_dual ) - if tolerance.relative_primal is not None: + warnings.append(dep_warning("relative_dual")) + if tolerance.relative_primal_tolerance is not None: + solver_settings.set_parameter( + CUOPT_RELATIVE_PRIMAL_TOLERANCE, + tolerance.relative_primal_tolerance, + ) + elif tolerance.relative_primal is not None: solver_settings.set_parameter( CUOPT_RELATIVE_PRIMAL_TOLERANCE, tolerance.relative_primal ) - if tolerance.relative_gap is not None: + warnings.append(dep_warning("relative_primal")) + if tolerance.relative_gap_tolerance is not None: + solver_settings.set_parameter( + CUOPT_RELATIVE_GAP_TOLERANCE, + tolerance.relative_gap_tolerance, + ) + elif tolerance.relative_gap is not None: solver_settings.set_parameter( CUOPT_RELATIVE_GAP_TOLERANCE, tolerance.relative_gap ) - if tolerance.primal_infeasible is not None: + warnings.append(dep_warning("relative_gap")) + if tolerance.primal_infeasible_tolerance is not None: + solver_settings.set_parameter( + CUOPT_PRIMAL_INFEASIBLE_TOLERANCE, + tolerance.primal_infeasible_tolerance, + ) + elif tolerance.primal_infeasible is not None: solver_settings.set_parameter( CUOPT_PRIMAL_INFEASIBLE_TOLERANCE, tolerance.primal_infeasible, ) - if tolerance.dual_infeasible is not None: + 
warnings.append(dep_warning("primal_infeasible")) + if tolerance.dual_infeasible_tolerance is not None: + solver_settings.set_parameter( + CUOPT_DUAL_INFEASIBLE_TOLERANCE, + tolerance.dual_infeasible_tolerance, + ) + elif tolerance.dual_infeasible is not None: solver_settings.set_parameter( CUOPT_DUAL_INFEASIBLE_TOLERANCE, tolerance.dual_infeasible ) + warnings.append(dep_warning("dual_infeasible")) if tolerance.mip_integrality_tolerance is not None: solver_settings.set_parameter( CUOPT_MIP_INTEGRALITY_TOLERANCE, tolerance.mip_integrality_tolerance, ) + elif tolerance.integrality_tolerance is not None: + solver_settings.set_parameter( + CUOPT_MIP_INTEGRALITY_TOLERANCE, + tolerance.integrality_tolerance, + ) + warnings.append(dep_warning("integrality_tolerance")) if tolerance.mip_absolute_gap is not None: solver_settings.set_parameter( CUOPT_MIP_ABSOLUTE_GAP, tolerance.mip_absolute_gap ) + elif tolerance.absolute_mip_gap is not None: + solver_settings.set_parameter( + CUOPT_MIP_ABSOLUTE_GAP, tolerance.absolute_mip_gap + ) + warnings.append(dep_warning("absolute_mip_gap")) if tolerance.mip_relative_gap is not None: solver_settings.set_parameter( CUOPT_MIP_RELATIVE_GAP, tolerance.mip_relative_gap ) + elif tolerance.relative_mip_gap is not None: + solver_settings.set_parameter( + CUOPT_MIP_RELATIVE_GAP, tolerance.relative_mip_gap + ) + warnings.append(dep_warning("relative_mip_gap")) + if tolerance.mip_absolute_tolerance is not None: + solver_settings.set_parameter( + CUOPT_MIP_ABSOLUTE_TOLERANCE, + tolerance.mip_absolute_tolerance, + ) + if tolerance.mip_relative_tolerance is not None: + solver_settings.set_parameter( + CUOPT_MIP_RELATIVE_TOLERANCE, + tolerance.mip_relative_tolerance, + ) if warmstart_data is not None: solver_settings.set_pdlp_warm_start_data(warmstart_data) if solver_config.mip_scaling is not None: solver_settings.set_parameter( CUOPT_MIP_SCALING, solver_config.mip_scaling ) - if solver_config.mip_heuristics_only is not None: + if 
solver_config.heuristics_only is not None: + solver_settings.set_parameter( + CUOPT_MIP_HEURISTICS_ONLY, solver_config.heuristics_only + ) + warnings.append(dep_warning("heuristics_only")) + elif solver_config.mip_heuristics_only is not None: solver_settings.set_parameter( CUOPT_MIP_HEURISTICS_ONLY, solver_config.mip_heuristics_only ) @@ -280,6 +381,34 @@ def create_solver(LP_data, warmstart_data): solver_settings.set_parameter( CUOPT_LOG_TO_CONSOLE, solver_config.log_to_console ) + if solver_config.strict_infeasibility is not None: + solver_settings.set_parameter( + CUOPT_STRICT_INFEASIBILITY, solver_config.strict_infeasibility + ) + if solver_config.user_problem_file != "": + warnings.append(ignored_warning("user_problem_file")) + if solver_config.per_constraint_residual is not None: + solver_settings.set_parameter( + CUOPT_PER_CONSTRAINT_RESIDUAL, + solver_config.per_constraint_residual, + ) + if solver_config.save_best_primal_so_far is not None: + solver_settings.set_parameter( + CUOPT_SAVE_BEST_PRIMAL_SO_FAR, + solver_config.save_best_primal_so_far, + ) + if solver_config.first_primal_feasible is not None: + solver_settings.set_parameter( + CUOPT_FIRST_PRIMAL_FEASIBLE, + solver_config.first_primal_feasible, + ) + if solver_config.log_file != "": + solver_settings.set_parameter( + CUOPT_LOG_FILE, solver_config.log_file + ) + if solver_config.solution_file != "": + warnings.append(ignored_warning("solution_file")) + return warnings, solver_settings @@ -300,7 +429,7 @@ def get_solver_exception_type(status, message): return RuntimeError(msg) -def solve(LP_data, reqId, intermediate_sender, warmstart_data, log_file): +def solve(LP_data, reqId, intermediate_sender, warmstart_data): notes = [] def get_if_attribute_is_valid_else_none(attr): @@ -431,7 +560,7 @@ def create_solution(sol): solver_settings.set_mip_callback(callback) solve_begin_time = time.time() sol = linear_programming.Solve( - data_model, solver_settings=solver_settings, log_file=log_file + data_model, 
solver_settings=solver_settings ) total_solve_time = time.time() - solve_begin_time diff --git a/python/cuopt_server/cuopt_server/utils/solver.py b/python/cuopt_server/cuopt_server/utils/solver.py index 0a98c368f..5c09b07f9 100644 --- a/python/cuopt_server/cuopt_server/utils/solver.py +++ b/python/cuopt_server/cuopt_server/utils/solver.py @@ -88,7 +88,7 @@ def solve_LP_sync( begin_time = time.time() - if type(LP_data) is list: + if isinstance(LP_data, list): for i_data in LP_data: validate_LP_data(i_data) else: @@ -98,15 +98,31 @@ def solve_LP_sync( logging.debug(f"etl_time {etl_end_time - begin_time}") if not validation_only: - if solver_logging: + # log_file setting is ignored in the service, + # instead we control it and use it as the basis for callbacks + if isinstance(LP_data, list): + # clear log_file setting for all because + # we don't support callbacks for batch mode + # and otherwise we ignore log_file + for i_data in LP_data: + i_data.solver_config.log_file = "" + elif solver_logging: log_dir, _, _ = settings.get_result_dir() log_fname = "log_" + reqId log_file = os.path.join(log_dir, log_fname) logging.info(f"Writing logs to {log_file}") - else: - log_file = "" + LP_data.solver_config.log_file = log_file + elif LP_data.solver_config.log_file: + warnings.append( + "solver config log_file ignored in the cuopt service" + ) + LP_data.solver_config.log_file = "" + notes, addl_warnings, res, total_solve_time = LP_solve( - LP_data, reqId, intermediate_sender, warmstart_data, log_file + LP_data, + reqId, + intermediate_sender, + warmstart_data, ) warnings.extend(addl_warnings) else: diff --git a/python/cuopt_server/cuopt_server/webserver.py b/python/cuopt_server/cuopt_server/webserver.py index edbc8234d..ac389652a 100644 --- a/python/cuopt_server/cuopt_server/webserver.py +++ b/python/cuopt_server/cuopt_server/webserver.py @@ -958,7 +958,7 @@ async def postrequest( ), solver_logs: Optional[bool] = Query( default=False, - description="If set to True, MIP 
problems will produce detailed solver logs that can be retrieved from /cuopt/log/{id}", # noqa + description="If set to True, math optimization problems will produce detailed solver logs that can be retrieved from /cuopt/log/{id}.", # noqa ), cuopt_data_file: str = Header( default=None,