diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1280356b9..1e56f9dfb 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,3 +1,26 @@
 # Benchmarks Scripts
 
-This directory contains the scripts for the benchmarks.
+This directory contains the scripts for running benchmarks. Currently it supports running benchmarks on LP with and MILP.
+
+
+# Linear Programming Benchmarking
+
+
+- Mittelmann LP benchmark
+
+```bash
+benchmarks/linear_programming/utils/benchmark_lp_mittelmann.sh
+```
+
+- MIPLIB Benchmark
+
+```
+mkdir miplib_data
+mkdir miplib_result
+wget https://miplib.zib.de/downloads/benchmark.zip -O miplib_data/benchmark.zip
+unzip miplib_data/benchmark.zip -d miplib_data
+find miplib_data -name "*.gz" -exec gunzip {} \;
+find miplib_data -name "*.gz" -delete
+
+benchmarks/linear_programming/run_mps_files.sh --path miplib_data/ --write-log-file --log-to-console false --output-dir miplib_result --time-limit 600 --presolve t > miplib_result/output.log 2>&1
+```
diff --git a/benchmarks/linear_programming/utils/get_datasets.py b/benchmarks/linear_programming/utils/get_datasets.py
index 447ac2e99..01ed86dc9 100644
--- a/benchmarks/linear_programming/utils/get_datasets.py
+++ b/benchmarks/linear_programming/utils/get_datasets.py
@@ -21,11 +21,11 @@
 import subprocess
 
 
-# From: http://plato.asu.edu/bench.html
+# From: https://plato.asu.edu/bench.html
 # Folder containg instances:
 # - https://miplib2010.zib.de/miplib2010.php
 # - https://www.netlib.org/lp/data/
-# - http://old.sztaki.hu/~meszaros/public_ftp/lptestset/ (and it's subfolders)
+# - https://old.sztaki.hu/~meszaros/public_ftp/lptestset/ (and it's subfolders)
 # - https://plato.asu.edu/ftp/lptestset/ (and it's subfolders)
 # - https://miplib.zib.de/tag_benchmark.html
 # - https://miplib.zib.de/tag_collection.html
@@ -83,7 +83,7 @@
 ]
 
 MittelmannInstances = {
-    "emps": "http://old.sztaki.hu/~meszaros/public_ftp/lptestset/emps.c",
+    "emps": "https://old.sztaki.hu/~meszaros/public_ftp/lptestset/emps.c",
     "problems" : {
         "irish-electricity" : [
             "https://plato.asu.edu/ftp/lptestset/irish-electricity.mps.bz2",
@@ -94,88 +94,88 @@
             "mps"
         ],
         "16_n14" : [
-            "http://plato.asu.edu/ftp/lptestset/network/16_n14.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/16_n14.mps.bz2",
             "mps"
         ],
         "Dual2_5000" : [
-            "http://plato.asu.edu/ftp/lptestset/Dual2_5000.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/Dual2_5000.mps.bz2",
             "mps"
         ],
         "L1_six1000" : [
-            "http://plato.asu.edu/ftp/lptestset/L1_sixm1000obs.bz2",
+            "https://plato.asu.edu/ftp/lptestset/L1_sixm1000obs.bz2",
             "netlib"
         ],
         "L1_sixm" : ["", "mps"],
         "L1_sixm1000obs" : [
-            "http://plato.asu.edu/ftp/lptestset/L1_sixm1000obs.bz2",
+            "https://plato.asu.edu/ftp/lptestset/L1_sixm1000obs.bz2",
             "netlib"
         ],
         "L1_sixm250" : ["", "netlib"],
         "L1_sixm250obs" : [
-            "http://plato.asu.edu/ftp/lptestset/L1_sixm250obs.bz2",
+            "https://plato.asu.edu/ftp/lptestset/L1_sixm250obs.bz2",
             "netlib"
         ],
         "L2CTA3D" : [
-            "http://plato.asu.edu/ftp/lptestset/L2CTA3D.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/L2CTA3D.mps.bz2",
             "mps"
         ],
         "Linf_520c" : [
-            "http://plato.asu.edu/ftp/lptestset/Linf_520c.bz2",
+            "https://plato.asu.edu/ftp/lptestset/Linf_520c.bz2",
             "netlib"
         ],
         "Primal2_1000" : [
-            "http://plato.asu.edu/ftp/lptestset/Primal2_1000.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/Primal2_1000.mps.bz2",
             "mps"
         ],
         "a2864" : [
-            "http://plato.asu.edu/ftp/lptestset/a2864.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/a2864.mps.bz2",
             "mps"
         ],
         "bdry2" : [
-            "http://plato.asu.edu/ftp/lptestset/bdry2.bz2",
+            "https://plato.asu.edu/ftp/lptestset/bdry2.bz2",
             "netlib"
         ],
         "braun" : ["", "mps"],
         "cont1" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/cont1.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/cont1.bz2",
             "netlib"
         ],
         "cont11" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/cont11.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/cont11.bz2",
             "netlib"
         ],
         "datt256" : [
-            "http://plato.asu.edu/ftp/lptestset/datt256_lp.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/datt256_lp.mps.bz2",
             "mps"
         ],
         "datt256_lp" : [
-            "http://plato.asu.edu/ftp/lptestset/datt256_lp.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/datt256_lp.mps.bz2",
             "mps"
         ],
         "degme" : [
-            "http://old.sztaki.hu/~meszaros/public_ftp/lptestset/New/degme.gz",
+            "https://old.sztaki.hu/~meszaros/public_ftp/lptestset/New/degme.gz",
             "netlib"
         ],
         "dlr1" : [
-            "http://plato.asu.edu/ftp/lptestset/dlr1.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/dlr1.mps.bz2",
             "mps"
         ],
         "dlr2" : [
-            "http://plato.asu.edu/ftp/lptestset/dlr2.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/dlr2.mps.bz2",
             "mps"
         ],
         "energy1" : ["", "mps"], # Kept secret by Mittlemman
         "energy2" : ["", "mps"],
         "ex10" : [
-            "http://plato.asu.edu/ftp/lptestset/ex10.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/ex10.mps.bz2",
             "mps"
         ],
         "fhnw-binschedule1" : [
-            "http://plato.asu.edu/ftp/lptestset/fhnw-binschedule1.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/fhnw-binschedule1.mps.bz2",
             "mps"
         ],
         "fome13" : [
-            "http://plato.asu.edu/ftp/lptestset/fome/fome13.bz2",
+            "https://plato.asu.edu/ftp/lptestset/fome/fome13.bz2",
             "netlib"
         ],
         "gamora" : ["", "mps"], # Kept secret by Mittlemman
@@ -195,106 +195,106 @@
         "goto32_512_4" : ["", "mps"],
         "goto32_512_5" : ["", "mps"],
         "graph40-40" : [
-            "http://plato.asu.edu/ftp/lptestset/graph40-40.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/graph40-40.mps.bz2",
             "mps"
         ],
         "graph40-40_lp" : [
-            "http://plato.asu.edu/ftp/lptestset/graph40-40.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/graph40-40.mps.bz2",
             "mps"
         ],
         "groot" : ["", "mps"], # Kept secret by Mittlemman
         "heimdall" : ["", "mps"], # Kept secret by Mittlemman
         "hulk" : ["", "mps"], # Kept secret by Mittlemman
         "i_n13" : [
-            "http://plato.asu.edu/ftp/lptestset/network/i_n13.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/i_n13.mps.bz2",
             "mps"
         ],
         "irish-e" : ["", "mps"],
         "karted" : [
-            "http://old.sztaki.hu/~meszaros/public_ftp/lptestset/New/karted.gz",
+            "https://old.sztaki.hu/~meszaros/public_ftp/lptestset/New/karted.gz",
             "netlib"
         ],
         "lo10" : [
-            "http://plato.asu.edu/ftp/lptestset/network/lo10.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/lo10.mps.bz2",
             "mps"
         ],
         "loki" : ["", "mps"], # Kept secret by Mittlemman
         "long15" : [
-            "http://plato.asu.edu/ftp/lptestset/network/long15.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/long15.mps.bz2",
             "mps"
         ],
         "nebula" : ["", "mps"], # Kept secret by Mittlemman
         "neos" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/neos.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/neos.bz2",
             "netlib"
         ],
         "neos-3025225" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-3025225.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-3025225.mps.bz2",
             "mps"
         ],
         "neos-3025225_lp" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-3025225.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-3025225.mps.bz2",
             "mps"
         ],
         "neos-5251015" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
             "mps"
         ],
         "neos-5251015_lp" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
             "mps"
         ],
         "neos3" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/neos3.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/neos3.bz2",
             "netlib"
         ],
         "neos-5052403-cygnet" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-5052403-cygnet.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-5052403-cygnet.mps.bz2",
             "mps"
         ],
         "neos5251015_lp" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
             "mps"
         ],
         "neos5251915" : [
-            "http://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/neos-5251015.mps.bz2",
             "mps"
         ],
         "netlarge1" : [
-            "http://plato.asu.edu/ftp/lptestset/network/netlarge1.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/netlarge1.mps.bz2",
             "mps"
         ],
         "netlarge2" : [
-            "http://plato.asu.edu/ftp/lptestset/network/netlarge2.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/netlarge2.mps.bz2",
             "mps"
         ],
         "netlarge3" : [
-            "http://plato.asu.edu/ftp/lptestset/network/netlarge3.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/netlarge3.mps.bz2",
             "mps"
         ],
         "netlarge6" : [
-            "http://plato.asu.edu/ftp/lptestset/network/netlarge6.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/netlarge6.mps.bz2",
             "mps"
         ],
         "ns1687037" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/ns1687037.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/ns1687037.bz2",
             "netlib"
         ],
         "ns1688926" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/ns1688926.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/ns1688926.bz2",
             "netlib"
         ],
         "nug08-3rd" : [
-            "http://plato.asu.edu/ftp/lptestset/nug/nug08-3rd.bz2",
+            "https://plato.asu.edu/ftp/lptestset/nug/nug08-3rd.bz2",
             "netlib"
         ],
         "pds-100" : [
-            "http://plato.asu.edu/ftp/lptestset/pds/pds-100.bz2",
+            "https://plato.asu.edu/ftp/lptestset/pds/pds-100.bz2",
             "netlib"
         ],
         "psched3-3" : ["", "mps"],
         "qap15" : [
-            "http://plato.asu.edu/ftp/lptestset/qap15.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/qap15.mps.bz2",
             "mps"
         ],
         "rail02" : [
@@ -302,35 +302,35 @@
             "mps"
         ],
         "rail4284" : [
-            "http://plato.asu.edu/ftp/lptestset/rail/rail4284.bz2",
+            "https://plato.asu.edu/ftp/lptestset/rail/rail4284.bz2",
             "netlib"
         ],
         "rmine15" : [
-            "http://plato.asu.edu/ftp/lptestset/rmine15.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/rmine15.mps.bz2",
             "mps"
         ],
         "s100" : [
-            "http://plato.asu.edu/ftp/lptestset/s100.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/s100.mps.bz2",
             "mps"
         ],
         "s250r10" : [
-            "http://plato.asu.edu/ftp/lptestset/s250r10.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/s250r10.mps.bz2",
             "mps"
         ],
         "s82" : [
-            "http://plato.asu.edu/ftp/lptestset/s82.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/s82.mps.bz2",
             "mps"
         ],
         "savsched1" : [
-            "http://plato.asu.edu/ftp/lptestset/savsched1.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/savsched1.mps.bz2",
             "mps"
         ],
         "scpm1" : [
-            "http://plato.asu.edu/ftp/lptestset/scpm1.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/scpm1.mps.bz2",
             "mps"
         ],
         "set-cover-model" : [
-            "http://plato.asu.edu/ftp/lptestset/set-cover-model.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/set-cover-model.mps.bz2",
             "mps"
         ],
         "shs1023" : [
@@ -338,19 +338,19 @@
             "mps"
         ],
         "square15" : [
-            "http://plato.asu.edu/ftp/lptestset/network/square15.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/square15.mps.bz2",
             "mps"
         ],
         "square41" : [
-            "http://plato.asu.edu/ftp/lptestset/square41.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/square41.mps.bz2",
             "mps"
         ],
         "stat96v2" : [
-            "http://old.sztaki.hu/~meszaros/public_ftp/lptestset/misc/stat96v2.gz",
+            "https://old.sztaki.hu/~meszaros/public_ftp/lptestset/misc/stat96v2.gz",
             "netlib"
         ],
         "stormG2_1000" : [
-            "http://plato.asu.edu/ftp/lptestset/misc/stormG2_1000.bz2",
+            "https://plato.asu.edu/ftp/lptestset/misc/stormG2_1000.bz2",
             "netlib"
         ],
         "storm_1000" : ["", "mps"],
@@ -359,15 +359,15 @@
             "mps"
         ],
         "supportcase10" : [
-            "http://plato.asu.edu/ftp/lptestset/supportcase10.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/supportcase10.mps.bz2",
             "mps"
         ],
         "support19" : [
-            "http://plato.asu.edu/ftp/lptestset/supportcase19.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/supportcase19.mps.bz2",
             "mps"
         ],
         "supportcase19" : [
-            "http://plato.asu.edu/ftp/lptestset/supportcase19.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/supportcase19.mps.bz2",
             "mps"
         ],
         "test03" : ["", "mps"], # Kept secret by Mittlemman
@@ -388,15 +388,15 @@
         "thor" : ["", "mps"],  # Kept secret by Mittlemman
         "tpl-tub-ws" : ["", "mps"],
         "tpl-tub-ws1617" : [
-            "http://plato.asu.edu/ftp/lptestset/tpl-tub-ws1617.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/tpl-tub-ws1617.mps.bz2",
             "mps"
         ],
         "wide15" : [
-            "http://plato.asu.edu/ftp/lptestset/network/wide15.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/network/wide15.mps.bz2",
             "mps"
         ],
         "woodlands09" : [
-            "http://plato.asu.edu/ftp/lptestset/woodlands09.mps.bz2",
+            "https://plato.asu.edu/ftp/lptestset/woodlands09.mps.bz2",
             "mps"
         ]
     },
@@ -550,7 +550,7 @@
             "square15",
             "wide15"
         ],
-        # <=100s in bench: http://plato.asu.edu/ftp/lpbar.html
+        # <=100s in bench: https://plato.asu.edu/ftp/lpbar.html
         "L0" : [
             "ex10",
             "datt256",
@@ -693,11 +693,11 @@ def extract(file, dir, type):
         raise Exception(f"Unknown file extension found for extraction {file}")
     # download emps and compile
     # Disable emps for now
-    if type == "netlib" and False:
+    if type == "netlib":
         url = MittelmannInstances["emps"]
         file = os.path.join(dir, "emps.c")
         download(url, file)
-        subprocess.run(f"cd {dir} && gcc emps.c -o emps",
+        subprocess.run(f"cd {dir} && gcc -Wno-implicit-int emps.c -o emps",
                         shell=True)
         # determine output file and run emps
         subprocess.run(f"cd {dir} && ./emps {unzippedfile} > {outfile}",
diff --git a/build.sh b/build.sh
index a5f57f9c9..ffb9e1a0f 100755
--- a/build.sh
+++ b/build.sh
@@ -27,7 +27,7 @@ REPODIR=$(cd "$(dirname "$0")"; pwd)
 LIBCUOPT_BUILD_DIR=${LIBCUOPT_BUILD_DIR:=${REPODIR}/cpp/build}
 LIBMPS_PARSER_BUILD_DIR=${LIBMPS_PARSER_BUILD_DIR:=${REPODIR}/cpp/libmps_parser/build}
 
-VALIDARGS="clean libcuopt libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs deb -a -b -g -fsanitize -v -l= --verbose-pdlp --build-lp-only  --no-fetch-rapids --skip-c-python-adapters --skip-tests-build --skip-routing-build --skip-fatbin-write [--cmake-args=\\\"<args>\\\"] [--cache-tool=<tool>] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help"
+VALIDARGS="clean libcuopt libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs deb -a -b -g -fsanitize -v -l= --verbose-pdlp --build-lp-only  --no-fetch-rapids --skip-c-python-adapters --skip-tests-build --skip-routing-build --skip-fatbin-write --host-lineinfo [--cmake-args=\\\"<args>\\\"] [--cache-tool=<tool>] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help"
 HELP="$0 [<target> ...] [<flag> ...]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
@@ -54,6 +54,7 @@ HELP="$0 [<target> ...] [<flag> ...]
    --skip-tests-build  - disable building of all tests
    --skip-routing-build - skip building routing components
    --skip-fatbin-write      - skip the fatbin write
+   --host-lineinfo           - build with debug line information for host code
    --cache-tool=<tool> - pass the build cache tool (eg: ccache, sccache, distcc) that will be used
                       to speedup the build process.
    --cmake-args=\\\"<args>\\\"   - pass arbitrary list of CMake configuration options (escape all quotes in argument)
@@ -91,6 +92,7 @@ SKIP_C_PYTHON_ADAPTERS=0
 SKIP_TESTS_BUILD=0
 SKIP_ROUTING_BUILD=0
 WRITE_FATBIN=1
+HOST_LINEINFO=0
 CACHE_ARGS=()
 PYTHON_ARGS_FOR_INSTALL=("-m" "pip" "install" "--no-build-isolation" "--no-deps")
 LOGGING_ACTIVE_LEVEL="INFO"
@@ -133,7 +135,7 @@ function cacheTool {
 }
 
 function loggingArgs {
-    if [[ $(echo "$ARGS" | { grep -Eo "\-l" || true; } | wc -l ) -gt 1 ]]; then
+    if [[ $(echo "$ARGS" | { grep -Eo "\-l=" || true; } | wc -l ) -gt 1 ]]; then
         echo "Multiple -l logging options were provided, please provide only one: ${ARGS}"
         exit 1
     fi
@@ -141,7 +143,7 @@ function loggingArgs {
     LOG_LEVEL_LIST=("TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF")
 
     # Check for logging option
-    if [[ -n $(echo "$ARGS" | { grep -E "\-l" || true; } ) ]]; then
+    if [[ -n $(echo "$ARGS" | { grep -E "\-l=" || true; } ) ]]; then
         LOGGING_ARGS=$(echo "$ARGS" | { grep -Eo "\-l=\S+" || true; })
         if [[ -n ${LOGGING_ARGS} ]]; then
             # Remove the full log argument from list of args so that it passes validArgs function
@@ -252,6 +254,9 @@ fi
 if hasArg --skip-fatbin-write; then
     WRITE_FATBIN=0
 fi
+if hasArg --host-lineinfo; then
+    HOST_LINEINFO=1
+fi
 
 function contains_string {
     local search_string="$1"
@@ -355,6 +360,7 @@ if buildAll || hasArg libcuopt; then
           -DBUILD_TESTS=$((1 - ${SKIP_TESTS_BUILD})) \
           -DSKIP_ROUTING_BUILD=${SKIP_ROUTING_BUILD} \
           -DWRITE_FATBIN=${WRITE_FATBIN} \
+          -DHOST_LINEINFO=${HOST_LINEINFO} \
           "${CACHE_ARGS[@]}" \
           "${EXTRA_CMAKE_ARGS[@]}" \
           "${REPODIR}"/cpp
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 2a3120f5a..e173a0410 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -30,7 +30,6 @@ dependencies:
 - gcc_linux-aarch64=14.*
 - gmock
 - gtest
-- httpx
 - ipython
 - jsonref==1.1.0
 - libcurand-dev
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index da6d19c19..7767e5a95 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -30,7 +30,6 @@ dependencies:
 - gcc_linux-64=14.*
 - gmock
 - gtest
-- httpx
 - ipython
 - jsonref==1.1.0
 - libcurand-dev
diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml
index cf306f74e..e8b8ab2b5 100644
--- a/conda/environments/all_cuda-130_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-130_arch-aarch64.yaml
@@ -30,7 +30,6 @@ dependencies:
 - gcc_linux-aarch64=14.*
 - gmock
 - gtest
-- httpx
 - ipython
 - jsonref==1.1.0
 - libcurand-dev
diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml
index c9ba5beba..66b5f8f4b 100644
--- a/conda/environments/all_cuda-130_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-130_arch-x86_64.yaml
@@ -30,7 +30,6 @@ dependencies:
 - gcc_linux-64=14.*
 - gmock
 - gtest
-- httpx
 - ipython
 - jsonref==1.1.0
 - libcurand-dev
diff --git a/conda/recipes/cuopt-server/recipe.yaml b/conda/recipes/cuopt-server/recipe.yaml
index ad9952f25..32c9efa0f 100644
--- a/conda/recipes/cuopt-server/recipe.yaml
+++ b/conda/recipes/cuopt-server/recipe.yaml
@@ -34,11 +34,11 @@ requirements:
     - cuopt =${{ version }}
     - fastapi >=0.104.1
     - jsonref =1.1.0
-    - httpx
     - msgpack-python =1.1.0
     - msgpack-numpy =0.4.8
     - numpy >=1.23,<3.0a0
     - pandas>=2
+    - psutil>=5.9,<6.0a0
     - python
     - uvicorn ${{ uvicorn_version }}
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 45789cedd..b377f659b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -56,6 +56,7 @@ option(BUILD_LP_ONLY "Build only linear programming components, exclude routing
 option(SKIP_C_PYTHON_ADAPTERS "Skip building C and Python adapter files (cython_solve.cu and cuopt_c.cpp)" OFF)
 option(SKIP_ROUTING_BUILD "Skip building routing components" OFF)
 option(WRITE_FATBIN "Enable fatbin writing" ON)
+option(HOST_LINEINFO "Build with debug line information for host code" OFF)
 
 message(VERBOSE "cuOpt: Enable nvcc -lineinfo: ${CMAKE_CUDA_LINEINFO}")
 message(VERBOSE "cuOpt: Build cuOpt unit-tests: ${BUILD_TESTS}")
@@ -64,6 +65,7 @@ message(VERBOSE "cuOpt: Disable OpenMP: ${DISABLE_OPENMP}")
 message(VERBOSE "cuOpt: Build LP-only mode: ${BUILD_LP_ONLY}")
 message(VERBOSE "cuOpt: Skip C/Python adapters: ${SKIP_C_PYTHON_ADAPTERS}")
 message(VERBOSE "cuOpt: Skip routing build: ${SKIP_ROUTING_BUILD}")
+message(VERBOSE "cuOpt: Build with debug line information for host code: ${HOST_LINEINFO}")
 message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}")
 
 # ##################################################################################################
@@ -203,6 +205,9 @@ endif()
 
 set(CUOPT_SRC_FILES )
 add_subdirectory(src)
+if (HOST_LINEINFO)
+  set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+endif()
 add_library(cuopt SHARED
   ${CUOPT_SRC_FILES}
 )
diff --git a/cpp/src/mip/CMakeLists.txt b/cpp/src/mip/CMakeLists.txt
index e694a9f6f..2d11a8dcb 100644
--- a/cpp/src/mip/CMakeLists.txt
+++ b/cpp/src/mip/CMakeLists.txt
@@ -48,7 +48,7 @@ set(MIP_NON_LP_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve/trivial_presolve.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu
-)
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu)
 
 # Choose which files to include based on build mode
 if(BUILD_LP_ONLY)
diff --git a/cpp/src/mip/diversity/diversity_manager.cu b/cpp/src/mip/diversity/diversity_manager.cu
index 0fb3f9fc0..5da9dff84 100644
--- a/cpp/src/mip/diversity/diversity_manager.cu
+++ b/cpp/src/mip/diversity/diversity_manager.cu
@@ -229,13 +229,16 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit)
     trivial_presolve(*problem_ptr);
     if (!problem_ptr->empty && !check_bounds_sanity(*problem_ptr)) { return false; }
   }
-  if (!problem_ptr->empty) {
-    // do the resizing no-matter what, bounds presolve might not change the bounds but initial
-    // trivial presolve might have
-    ls.constraint_prop.bounds_update.resize(*problem_ptr);
-    ls.constraint_prop.conditional_bounds_update.update_constraint_bounds(
-      *problem_ptr, ls.constraint_prop.bounds_update);
-    if (!check_bounds_sanity(*problem_ptr)) { return false; }
+  // May overconstrain if Papilo presolve has been run before
+  if (!context.settings.presolve) {
+    if (!problem_ptr->empty) {
+      // do the resizing no-matter what, bounds presolve might not change the bounds but initial
+      // trivial presolve might have
+      ls.constraint_prop.bounds_update.resize(*problem_ptr);
+      ls.constraint_prop.conditional_bounds_update.update_constraint_bounds(
+        *problem_ptr, ls.constraint_prop.bounds_update);
+      if (!check_bounds_sanity(*problem_ptr)) { return false; }
+    }
   }
   stats.presolve_time = presolve_timer.elapsed_time();
   lp_optimal_solution.resize(problem_ptr->n_variables, problem_ptr->handle_ptr->get_stream());
@@ -314,6 +317,13 @@ void diversity_manager_t<i_t, f_t>::run_fp_alone(solution_t<i_t, f_t>& solution)
   CUOPT_LOG_INFO("FP alone finished!");
 }
 
+template <typename i_t, typename f_t>
+struct ls_cpufj_raii_guard_t {
+  ls_cpufj_raii_guard_t(local_search_t<i_t, f_t>& ls) : ls(ls) {}
+  ~ls_cpufj_raii_guard_t() { ls.stop_cpufj_scratch_threads(); }
+  local_search_t<i_t, f_t>& ls;
+};
+
 // returns the best feasible solution
 template <typename i_t, typename f_t>
 solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
@@ -342,8 +352,15 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     "The problem must not be ii");
   population.initialize_population();
   if (check_b_b_preemption()) { return population.best_feasible(); }
+
+  // Run CPUFJ early to find quick initial solutions
+  population.allocate_solutions();
+  ls_cpufj_raii_guard_t ls_cpufj_raii_guard(ls);  // RAII to stop cpufj threads on solve stop
+  ls.start_cpufj_scratch_threads(population);
+
   // before probing cache or LP, run FJ to generate initial primal feasible solution
-  if (!from_dir && !fj_only_run) { generate_quick_feasible_solution(); }
+  // TODO: commenting this out decreases the gap on trento1.mps dramatically. figure out why?
+  // if (!from_dir && !fj_only_run) { generate_quick_feasible_solution(); }
   const f_t time_ratio_of_probing_cache = diversity_config.time_ratio_of_probing_cache;
   const f_t max_time_on_probing         = diversity_config.max_time_on_probing;
   f_t time_for_probing_cache =
@@ -412,8 +429,8 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
   }
 
-  population.allocate_solutions();
   population.add_solutions_from_vec(std::move(initial_sol_vector));
+
   if (check_b_b_preemption()) { return population.best_feasible(); }
 
   if (context.settings.benchmark_info_ptr != nullptr) {
diff --git a/cpp/src/mip/diversity/population.cu b/cpp/src/mip/diversity/population.cu
index 63f4b3c85..17c88df3f 100644
--- a/cpp/src/mip/diversity/population.cu
+++ b/cpp/src/mip/diversity/population.cu
@@ -142,13 +142,36 @@ size_t population_t<i_t, f_t>::get_external_solution_size()
 }
 
 template <typename i_t, typename f_t>
-void population_t<i_t, f_t>::add_external_solution(std::vector<f_t>& solution, f_t objective)
+void population_t<i_t, f_t>::add_external_solution(const std::vector<f_t>& solution,
+                                                   f_t objective,
+                                                   solution_origin_t origin)
 {
   std::lock_guard<std::mutex> lock(solution_mutex);
-  CUOPT_LOG_INFO("B&B added a solution to population, solution queue size %lu with objective %g",
+
+  if (origin == solution_origin_t::CPUFJ) {
+    external_solution_queue_cpufj.emplace_back(solution, objective, origin);
+  } else {
+    external_solution_queue.emplace_back(solution, objective, origin);
+  }
+
+  // Prevent CPUFJ scratch solutions from flooding the queue
+  if (external_solution_queue_cpufj.size() >= 10) {
+    auto worst_obj_it =
+      std::max_element(external_solution_queue_cpufj.begin(),
+                       external_solution_queue_cpufj.end(),
+                       [](const external_solution_t& a, const external_solution_t& b) {
+                         return a.objective < b.objective;
+                       });
+    if (objective > worst_obj_it->objective) return;
+    auto worst_obj_idx = std::distance(external_solution_queue_cpufj.begin(), worst_obj_it);
+
+    external_solution_queue_cpufj.erase(external_solution_queue_cpufj.begin() + worst_obj_idx);
+  }
+
+  CUOPT_LOG_INFO("%s added a solution to population, solution queue size %lu with objective %g",
+                 solution_origin_to_string(origin),
                  external_solution_queue.size(),
                  problem_ptr->get_user_obj_from_solver_obj(objective));
-  external_solution_queue.emplace_back(solution);
   if (external_solution_queue.size() >= 5) { early_exit_primal_generation = true; }
 }
 
@@ -166,18 +189,49 @@ std::vector<solution_t<i_t, f_t>> population_t<i_t, f_t>::get_external_solutions
 {
   std::lock_guard<std::mutex> lock(solution_mutex);
   std::vector<solution_t<i_t, f_t>> return_vector;
-  for (auto h_solution_vec : external_solution_queue) {
-    solution_t<i_t, f_t> sol(*problem_ptr);
-    sol.copy_new_assignment(h_solution_vec);
-    sol.compute_feasibility();
-    sol.handle_ptr->sync_stream();
-    return_vector.emplace_back(std::move(sol));
+  i_t counter                     = 0;
+  f_t new_best_feasible_objective = best_feasible_objective;
+  for (auto& queue : {external_solution_queue, external_solution_queue_cpufj}) {
+    for (auto& h_entry : queue) {
+      // ignore CPUFJ solutions if they're not better than the best feasible.
+      // It seems they worsen results on some instances despite the potential for improved diversity
+      if (h_entry.origin == solution_origin_t::CPUFJ &&
+          h_entry.objective > new_best_feasible_objective) {
+        continue;
+      } else if (h_entry.origin != solution_origin_t::CPUFJ &&
+                 h_entry.objective > new_best_feasible_objective) {
+        new_best_feasible_objective = h_entry.objective;
+      }
+
+      solution_t<i_t, f_t> sol(*problem_ptr);
+      sol.copy_new_assignment(h_entry.solution);
+      sol.compute_feasibility();
+      if (!sol.get_feasible()) {
+        CUOPT_LOG_ERROR(
+          "External solution %d is infeasible, excess %g, obj %g, int viol %g, var viol %g, cstr "
+          "viol %g, n_feasible %d/%d, integers %d/%d",
+          counter,
+          sol.get_total_excess(),
+          sol.get_user_objective(),
+          sol.compute_max_int_violation(),
+          sol.compute_max_variable_violation(),
+          sol.compute_max_constraint_violation(),
+          sol.n_feasible_constraints.value(sol.handle_ptr->get_stream()),
+          problem_ptr->n_constraints,
+          sol.compute_number_of_integers(),
+          problem_ptr->n_integer_vars);
+      }
+      sol.handle_ptr->sync_stream();
+      return_vector.emplace_back(std::move(sol));
+      counter++;
+    }
   }
   if (external_solution_queue.size() > 0) {
     CUOPT_LOG_INFO("Consuming B&B solutions, solution queue size %lu",
                    external_solution_queue.size());
     external_solution_queue.clear();
   }
+  external_solution_queue_cpufj.clear();
   return return_vector;
 }
 
@@ -276,7 +330,8 @@ void population_t<i_t, f_t>::run_solution_callbacks(solution_t<i_t, f_t>& sol)
       cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6,
                    "External solution objective mismatch");
       auto h_outside_sol = outside_sol.get_host_assignment();
-      add_external_solution(h_outside_sol, outside_sol.get_objective());
+      add_external_solution(
+        h_outside_sol, outside_sol.get_objective(), solution_origin_t::EXTERNAL);
     }
   }
 }
@@ -319,7 +374,7 @@ i_t population_t<i_t, f_t>::add_solution(solution_t<i_t, f_t>&& sol)
   raft::common::nvtx::range fun_scope("add_solution");
   population_hash_map.insert(sol);
   double sol_cost = sol.get_quality(weights);
-  CUOPT_LOG_TRACE("Adding solution with quality %f and objective %f n_integers %d!",
+  CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d!",
                   sol_cost,
                   sol.get_user_objective(),
                   sol.n_assigned_integers);
@@ -366,6 +421,7 @@ i_t population_t<i_t, f_t>::add_solution(solution_t<i_t, f_t>&& sol)
 
     int inserted_pos = insert_index(std::pair<size_t, double>((size_t)hint, sol_cost));
     cuopt_assert(test_invariant(), "Population invariant doesn't hold");
+    test_invariant();
     return inserted_pos;
 
   } else if (sol_cost + OBJECTIVE_EPSILON < indices[index].second) {
@@ -379,10 +435,12 @@ i_t population_t<i_t, f_t>::add_solution(solution_t<i_t, f_t>&& sol)
 
     int inserted_pos = insert_index(std::pair<size_t, double>((size_t)free, sol_cost));
     cuopt_assert(test_invariant(), "Population invariant doesn't hold");
+    test_invariant();
     return inserted_pos;
   }
   CUOPT_LOG_TRACE("Adding solution failed!");
   cuopt_assert(test_invariant(), "Population invariant doesn't hold");
+  test_invariant();
   return -1;
 }
 
diff --git a/cpp/src/mip/diversity/population.cuh b/cpp/src/mip/diversity/population.cuh
index 532b9f24d..79b2040ff 100644
--- a/cpp/src/mip/diversity/population.cuh
+++ b/cpp/src/mip/diversity/population.cuh
@@ -35,6 +35,18 @@ namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 class diversity_manager_t;
 
+enum class solution_origin_t { BRANCH_AND_BOUND, CPUFJ, EXTERNAL };
+
+constexpr const char* solution_origin_to_string(solution_origin_t origin)
+{
+  switch (origin) {
+    case solution_origin_t::BRANCH_AND_BOUND: return "B&B";
+    case solution_origin_t::CPUFJ: return "CPUFJ";
+    case solution_origin_t::EXTERNAL: return "injected";
+    default: return "unknown";
+  }
+}
+
 template <typename i_t, typename f_t>
 class population_t {
  public:
@@ -101,7 +113,9 @@ class population_t {
    *  \return { -1 = not inserted , others = inserted index}
    */
   i_t add_solution(solution_t<i_t, f_t>&& sol);
-  void add_external_solution(std::vector<f_t>& solution, f_t objective);
+  void add_external_solution(const std::vector<f_t>& solution,
+                             f_t objective,
+                             solution_origin_t origin);
   std::vector<solution_t<i_t, f_t>> get_external_solutions();
   size_t get_external_solution_size();
   void preempt_heuristic_solver();
@@ -172,7 +186,20 @@ class population_t {
   weight_t<i_t, f_t> weights;
   std::vector<std::pair<size_t, f_t>> indices;
   std::vector<std::pair<bool, solution_t<i_t, f_t>>> solutions;
-  std::vector<std::vector<f_t>> external_solution_queue;
+
+  struct external_solution_t {
+    external_solution_t() = default;
+    external_solution_t(const std::vector<f_t>& solution, f_t objective, solution_origin_t origin)
+      : solution(solution), objective(objective), origin(origin)
+    {
+    }
+    std::vector<f_t> solution;
+    f_t objective;
+    solution_origin_t origin;
+  };
+
+  std::vector<external_solution_t> external_solution_queue;
+  std::vector<external_solution_t> external_solution_queue_cpufj;
   std::mt19937 rng;
   i_t update_iter = 0;
   std::mutex solution_mutex;
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump.cu b/cpp/src/mip/feasibility_jump/feasibility_jump.cu
index a30a9b182..ddb45200f 100644
--- a/cpp/src/mip/feasibility_jump/feasibility_jump.cu
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump.cu
@@ -80,11 +80,13 @@ fj_t<i_t, f_t>::fj_t(mip_solver_context_t<i_t, f_t>& context_, fj_settings_t in_
                                   TPB_update_changed_constraints,
                                   pb_ptr->handle_ptr);
   resetmoves_launch_dims = get_launch_dims_max_occupancy(
-    (void*)compute_mtm_moves_kernel<i_t, f_t, FJ_MTM_VIOLATED>, TPB_resetmoves, pb_ptr->handle_ptr);
-  resetmoves_bin_launch_dims =
-    get_launch_dims_max_occupancy((void*)compute_mtm_moves_kernel<i_t, f_t, FJ_MTM_VIOLATED, true>,
-                                  TPB_resetmoves,
-                                  pb_ptr->handle_ptr);
+    (void*)compute_mtm_moves_kernel<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED>,
+    TPB_resetmoves,
+    pb_ptr->handle_ptr);
+  resetmoves_bin_launch_dims = get_launch_dims_max_occupancy(
+    (void*)compute_mtm_moves_kernel<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, true>,
+    TPB_resetmoves,
+    pb_ptr->handle_ptr);
   update_weights_launch_dims = get_launch_dims_max_occupancy(
     (void*)handle_local_minimum_kernel<i_t, f_t>, TPB_localmin, pb_ptr->handle_ptr);
   lift_move_launch_dims = get_launch_dims_max_occupancy(
@@ -149,6 +151,10 @@ void fj_t<i_t, f_t>::randomize_weights(const raft::handle_t* handle_ptr)
   f_t h_max_weight = *std::max_element(h_cstr_vec.begin(), h_cstr_vec.end());
   max_cstr_weight.set_value_async(h_max_weight, handle_ptr->get_stream());
   raft::copy(cstr_weights.data(), h_cstr_vec.data(), h_cstr_vec.size(), handle_ptr->get_stream());
+  raft::copy(
+    cstr_left_weights.data(), h_cstr_vec.data(), h_cstr_vec.size(), handle_ptr->get_stream());
+  raft::copy(
+    cstr_right_weights.data(), h_cstr_vec.data(), h_cstr_vec.size(), handle_ptr->get_stream());
   handle_ptr->sync_stream();
 }
 
@@ -722,7 +728,7 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
         } else {
           if (is_binary_pb) {
             cudaLaunchCooperativeKernel(
-              (void*)compute_mtm_moves_kernel<i_t, f_t, FJ_MTM_VIOLATED, true>,
+              (void*)compute_mtm_moves_kernel<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, true>,
               grid_resetmoves_bin,
               blocks_resetmoves_bin,
               reset_moves_args,
@@ -730,7 +736,7 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
               climber_stream);
           } else {
             cudaLaunchCooperativeKernel(
-              (void*)compute_mtm_moves_kernel<i_t, f_t, FJ_MTM_VIOLATED, false>,
+              (void*)compute_mtm_moves_kernel<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, false>,
               grid_resetmoves,
               blocks_resetmoves,
               reset_moves_args,
@@ -1061,6 +1067,7 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
   resize_vectors(solution.handle_ptr);
 
   bool is_initial_feasible = solution.compute_feasibility();
+  auto initial_solution    = solution;
   // if we're in rounding mode, split the time/iteration limit between the first and second stage
   cuopt_assert(settings.parameters.rounding_second_stage_split >= 0 &&
                  settings.parameters.rounding_second_stage_split <= 1,
@@ -1133,11 +1140,11 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
   bool is_new_feasible = solution.compute_feasibility();
 
   if (is_initial_feasible && !is_new_feasible) {
-    CUOPT_LOG_ERROR(
-      "Feasibility jump caused feasible solution to become infeasible\n"
-      "Best excess is %g",
+    CUOPT_LOG_WARN(
+      "Feasibility jump caused feasible solution to become infeasible: Best excess is %g",
       climbers[0]->best_excess.value(handle_ptr->get_stream()));
-    cuopt_assert(false, "Feasibility jump caused feasible solution to become infeasible");
+    solution.copy_from(initial_solution);
+    cuopt_assert(solution.compute_feasibility(), "Reverted solution should be feasible");
   }
 
   return is_new_feasible;
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
index e5e7c6d97..c7e999509 100644
--- a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
@@ -57,7 +57,7 @@ struct fj_hyper_parameters_t {
   int global_move_update_period      = 10;
   int heavy_move_update_period       = 50;
   int sync_period                    = 200;
-  int lhs_refresh_period             = 1000;
+  int lhs_refresh_period             = 500;
   int allow_infeasibility_iterations = 200;
   // The value added to the objective weight everytime a new best solution is
   // found in order to move towards better solutions
@@ -98,6 +98,8 @@ enum class fj_mode_t {
   EXIT_NON_IMPROVING  // iterate until we are don't improve the best
 };
 
+enum class MTMMoveType { FJ_MTM_VIOLATED, FJ_MTM_SATISFIED, FJ_MTM_ALL };
+
 enum class fj_load_balancing_mode_t { ALWAYS_ON, AUTO, ALWAYS_OFF };
 
 enum class fj_candidate_selection_t { WEIGHTED_SCORE, FEASIBLE_FIRST };
@@ -121,7 +123,16 @@ struct fj_move_t {
   int var_idx;
   double value;
 
-  bool operator<(const fj_move_t& rhs) const { return value < rhs.value; }
+  bool operator<(const fj_move_t& rhs) const
+  {
+    if (var_idx == rhs.var_idx) return value < rhs.value;
+    return var_idx < rhs.var_idx;
+  }
+  bool operator==(const fj_move_t& rhs) const
+  {
+    return var_idx == rhs.var_idx && value == rhs.value;
+  }
+  bool operator!=(const fj_move_t& rhs) const { return !(*this == rhs); }
 };
 
 // TODO: use 32bit integers instead,
@@ -187,6 +198,9 @@ struct fj_move_candidate_t {
   f_t delta;
 };
 
+template <typename i_t, typename f_t>
+struct fj_cpu_climber_t;
+
 template <typename i_t, typename f_t>
 class fj_t {
  public:
@@ -198,6 +212,15 @@ class fj_t {
   ~fj_t();
   void reset_cuda_graph();
   i_t solve(solution_t<i_t, f_t>& solution);
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> create_cpu_climber(
+    solution_t<i_t, f_t>& solution,
+    const std::vector<f_t>& left_weights,
+    const std::vector<f_t>& right_weights,
+    f_t objective_weight,
+    fj_settings_t settings = fj_settings_t{},
+    bool randomize_params  = false);
+  bool cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+                 f_t time_limit = +std::numeric_limits<f_t>::infinity());
   i_t alloc_max_climbers(i_t desired_climbers);
   void resize_vectors(const raft::handle_t* handle_ptr);
   void device_init(const rmm::cuda_stream_view& stream);
@@ -501,41 +524,52 @@ class fj_t {
 
       fj_settings_t* settings;
 
-      DI f_t lower_excess_score(i_t cstr, f_t lhs) const
+      HDI f_t lower_excess_score(i_t cstr, f_t lhs, f_t c_lb) const
       {
-        return raft::min(lhs - pb.constraint_lower_bounds[cstr], (f_t)0);
+        return raft::min(lhs - c_lb, (f_t)0);
       }
 
-      DI f_t upper_excess_score(i_t cstr, f_t lhs) const
+      HDI f_t upper_excess_score(i_t cstr, f_t lhs, f_t c_ub) const
       {
-        return raft::min(pb.constraint_upper_bounds[cstr] - lhs, (f_t)0);
+        return raft::min(c_ub - lhs, (f_t)0);
       }
 
       // Computes the constraint's contribution to the feasibility score:
       // If the constraint is satisfied by the given LHS value, returns 0.
       // If the constraint is violated by the given LHS value, returns -|lhs-rhs|.
       // caution: is inverted compared to solution_t's excess convention
-      DI f_t excess_score(i_t cstr, f_t lhs) const
+      HDI f_t excess_score(i_t cstr, f_t cnstr_value, f_t c_lb, f_t c_ub) const
+      {
+        f_t right_score = upper_excess_score(cstr, cnstr_value, c_ub);
+        if (right_score < 0.) { return right_score; }
+        return lower_excess_score(cstr, cnstr_value, c_lb);
+      }
+      HDI f_t excess_score(i_t cstr, f_t lhs) const
       {
-        f_t right_score = upper_excess_score(cstr, lhs);
+        f_t c_lb        = pb.constraint_lower_bounds[cstr];
+        f_t c_ub        = pb.constraint_upper_bounds[cstr];
+        f_t right_score = upper_excess_score(cstr, lhs, c_ub);
         if (right_score < 0.) { return right_score; }
-        return lower_excess_score(cstr, lhs);
+        return lower_excess_score(cstr, lhs, c_lb);
       }
 
       // FJ relies on maintaining a running LHS value for each constraint
       // which may suffer from numerical errors and lead to very slight (~machine epsilon)
       // violations of the actual bounds.
       // Use a slightly tightened tolerance in FJ to account for this.
-      DI f_t get_corrected_tolerance(i_t cstr) const
+      HDI f_t get_corrected_tolerance(i_t cstr, f_t c_lb, f_t c_ub) const
       {
-        f_t cstr_tolerance = get_cstr_tolerance<i_t, f_t>(pb.constraint_lower_bounds[cstr],
-                                                          pb.constraint_upper_bounds[cstr],
-                                                          pb.tolerances.absolute_tolerance,
-                                                          pb.tolerances.relative_tolerance);
-        return max((f_t)0, cstr_tolerance - MACHINE_EPSILON);
+        f_t cstr_tolerance = get_cstr_tolerance<i_t, f_t>(
+          c_lb, c_ub, pb.tolerances.absolute_tolerance, pb.tolerances.relative_tolerance);
+        return max((f_t)1e-12, cstr_tolerance - MACHINE_EPSILON);
+      }
+      HDI f_t get_corrected_tolerance(i_t cstr) const
+      {
+        return get_corrected_tolerance(
+          cstr, pb.constraint_lower_bounds[cstr], pb.constraint_upper_bounds[cstr]);
       }
 
-      DI bool cstr_satisfied(i_t cstr, f_t lhs) const
+      HDI bool cstr_satisfied(i_t cstr, f_t lhs) const
       {
         f_t cstr_tolerance = get_cstr_tolerance<i_t, f_t>(pb.constraint_lower_bounds[cstr],
                                                           pb.constraint_upper_bounds[cstr],
@@ -544,7 +578,7 @@ class fj_t {
         return excess_score(cstr, lhs) >= -cstr_tolerance;
       }
 
-      DI bool move_numerically_stable(f_t old_val, f_t new_val, f_t infeasibility) const
+      HDI bool move_numerically_stable(f_t old_val, f_t new_val, f_t infeasibility) const
       {
         return fabs(new_val - old_val) < 1e6 && fabs(new_val) < 1e20 &&
                fabs(*violation_score - infeasibility) < 1e20;
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump_impl_common.cuh b/cpp/src/mip/feasibility_jump/feasibility_jump_impl_common.cuh
new file mode 100644
index 000000000..71513a2ec
--- /dev/null
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump_impl_common.cuh
@@ -0,0 +1,221 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "feasibility_jump.cuh"
+
+namespace cuopt::linear_programming::detail {
+
+// Returns the current slack, and the variable delta that would nullify this slack ("tighten" it)
+template <typename i_t, typename f_t>
+HDI thrust::tuple<f_t, f_t> get_mtm_for_bound(
+  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
+  i_t var_idx,
+  i_t cstr_idx,
+  f_t cstr_coeff,
+  f_t bound,
+  f_t sign)
+{
+  f_t delta_ij = 0;
+  f_t slack    = 0;
+  f_t old_val  = fj.incumbent_assignment[var_idx];
+
+  f_t lhs = fj.incumbent_lhs[cstr_idx] * sign;
+  f_t rhs = bound * sign;
+  slack   = rhs - lhs;  // bound might be infinite. let the caller handle this case
+
+  delta_ij = slack / (cstr_coeff * sign);
+
+  return {delta_ij, slack};
+}
+
+template <typename i_t, typename f_t, MTMMoveType move_type>
+HDI thrust::tuple<f_t, f_t, f_t, f_t> get_mtm_for_constraint(
+  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
+  i_t var_idx,
+  i_t cstr_idx,
+  f_t cstr_coeff,
+  f_t c_lb,
+  f_t c_ub)
+{
+  f_t sign     = -1;
+  f_t delta_ij = 0;
+  f_t slack    = 0;
+
+  f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx);
+
+  f_t old_val = fj.incumbent_assignment[var_idx];
+
+  // process each bound as two separate constraints
+  f_t bounds[2] = {c_lb, c_ub};
+  cuopt_assert(isfinite(bounds[0]) || isfinite(bounds[1]), "bounds are not finite");
+
+  for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) {
+    if (!isfinite(bounds[bound_idx])) continue;
+
+    // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into
+    // two virtual constraints lhs <= ub and -lhs <= -lb
+    sign    = bound_idx == 0 ? -1 : 1;
+    f_t lhs = fj.incumbent_lhs[cstr_idx] * sign;
+    f_t rhs = bounds[bound_idx] * sign;
+    slack   = rhs - lhs;
+
+    // skip constraints that are violated/satisfied based on the MTM move type
+    bool violated = slack < -cstr_tolerance;
+    if (move_type == MTMMoveType::FJ_MTM_VIOLATED ? !violated : violated) continue;
+
+    f_t new_val = old_val;
+
+    delta_ij = slack / (cstr_coeff * sign);
+    break;
+  }
+
+  return {delta_ij, sign, slack, cstr_tolerance};
+}
+
+template <typename i_t, typename f_t>
+HDI std::pair<f_t, f_t> feas_score_constraint(
+  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
+  i_t var_idx,
+  f_t delta,
+  i_t cstr_idx,
+  f_t cstr_coeff,
+  f_t c_lb,
+  f_t c_ub,
+  f_t current_lhs)
+{
+  cuopt_assert(isfinite(delta), "invalid delta");
+  cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient");
+
+  f_t base_feas    = 0;
+  f_t bonus_robust = 0;
+
+  f_t bounds[2] = {c_lb, c_ub};
+  cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range");
+  for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) {
+    if (!isfinite(bounds[bound_idx])) continue;
+
+    // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into
+    // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match
+    // the convention of the paper
+
+    // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums
+    // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum)
+    // likely unreasonable and overkill however
+    f_t cstr_weight =
+      bound_idx == 0 ? fj.cstr_left_weights[cstr_idx] : fj.cstr_right_weights[cstr_idx];
+    f_t sign      = bound_idx == 0 ? -1 : 1;
+    f_t rhs       = bounds[bound_idx] * sign;
+    f_t old_lhs   = current_lhs * sign;
+    f_t new_lhs   = (current_lhs + cstr_coeff * delta) * sign;
+    f_t old_slack = rhs - old_lhs;
+    f_t new_slack = rhs - new_lhs;
+
+    cuopt_assert(isfinite(cstr_weight), "invalid weight");
+    cuopt_assert(cstr_weight >= 0, "invalid weight");
+    cuopt_assert(isfinite(old_lhs), "");
+    cuopt_assert(isfinite(new_lhs), "");
+    cuopt_assert(isfinite(old_slack) && isfinite(new_slack), "");
+
+    f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx);
+
+    bool old_viol = fj.excess_score(cstr_idx, current_lhs, c_lb, c_ub) < -cstr_tolerance;
+    bool new_viol =
+      fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta, c_lb, c_ub) < -cstr_tolerance;
+
+    bool old_sat = old_lhs < rhs + cstr_tolerance;
+    bool new_sat = new_lhs < rhs + cstr_tolerance;
+
+    // equality
+    if (fj.pb.integer_equal(c_lb, c_ub)) {
+      if (!old_viol) cuopt_assert(old_sat == !old_viol, "");
+      if (!new_viol) cuopt_assert(new_sat == !new_viol, "");
+    }
+
+    // if it would feasibilize this constraint
+    if (!old_sat && new_sat) {
+      cuopt_assert(old_viol, "");
+      base_feas += cstr_weight;
+    }
+    // would cause this constraint to be violated
+    else if (old_sat && !new_sat) {
+      cuopt_assert(new_viol, "");
+      base_feas -= cstr_weight;
+    }
+    // simple improvement
+    else if (!old_sat && !new_sat && old_lhs > new_lhs) {
+      cuopt_assert(old_viol && new_viol, "");
+      base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
+    }
+    // simple worsening
+    else if (!old_sat && !new_sat && old_lhs <= new_lhs) {
+      cuopt_assert(old_viol && new_viol, "");
+      base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
+    }
+
+    // robustness score bonus if this would leave some strick slack
+    bool old_stable = old_lhs < rhs - cstr_tolerance;
+    bool new_stable = new_lhs < rhs - cstr_tolerance;
+    if (!old_stable && new_stable) {
+      bonus_robust += cstr_weight;
+    } else if (old_stable && !new_stable) {
+      bonus_robust -= cstr_weight;
+    }
+  }
+
+  return {base_feas, bonus_robust};
+}
+
+template <typename i_t, typename f_t>
+HDI f_t get_breakthrough_move(typename fj_t<i_t, f_t>::climber_data_t::view_t fj, i_t var_idx)
+{
+  f_t obj_coeff = fj.pb.objective_coefficients[var_idx];
+  auto bounds   = fj.pb.variable_bounds[var_idx];
+  f_t v_lb      = get_lower(bounds);
+  f_t v_ub      = get_upper(bounds);
+  cuopt_assert(isfinite(v_lb) || isfinite(v_ub), "unexpected free variable");
+  cuopt_assert(v_lb <= v_ub, "invalid bounds");
+  cuopt_assert(fj.pb.check_variable_within_bounds(var_idx, fj.incumbent_assignment[var_idx]),
+               "invalid incumbent assignment");
+  cuopt_assert(isfinite(obj_coeff), "invalid objective coefficient");
+  cuopt_assert(obj_coeff != 0, "objective coefficient shouldn't be null");
+
+  f_t excess = (*fj.best_objective) - *fj.incumbent_objective;
+
+  cuopt_assert(isfinite(excess) && excess < 0,
+               "breakthru move invoked during invalid solver state");
+
+  f_t old_val = fj.incumbent_assignment[var_idx];
+  f_t new_val = old_val;
+
+  f_t delta_ij = excess / obj_coeff;
+
+  if (fj.pb.is_integer_var(var_idx)) {
+    new_val = obj_coeff > 0 ? floor(old_val + delta_ij + fj.pb.tolerances.integrality_tolerance)
+                            : ceil(old_val + delta_ij - fj.pb.tolerances.integrality_tolerance);
+  } else {
+    new_val = old_val + delta_ij;
+  }
+
+  // fallback
+  if (!fj.pb.check_variable_within_bounds(var_idx, new_val)) {
+    new_val = obj_coeff > 0 ? v_lb : v_ub;
+  }
+
+  return new_val;
+}
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cu b/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cu
index 0d96cd4b2..dc63d94cd 100644
--- a/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cu
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cu
@@ -26,6 +26,8 @@
 
 #include <cooperative_groups.h>
 
+#include "feasibility_jump_impl_common.cuh"
+
 namespace cg = cooperative_groups;
 
 #define CONSTRAINT_FLAG_INSERT 0
@@ -76,98 +78,6 @@ DI thrust::pair<f_t, f_t> move_objective_score(
   return {base_obj, bonus_breakthrough};
 }
 
-template <typename i_t, typename f_t>
-DI std::pair<f_t, f_t> feas_score_constraint(
-  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
-  i_t var_idx,
-  f_t delta,
-  i_t cstr_idx,
-  f_t cstr_coeff,
-  f_t c_lb,
-  f_t c_ub,
-  f_t current_lhs)
-{
-  cuopt_assert(isfinite(delta), "invalid delta");
-  cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient");
-
-  f_t base_feas    = 0;
-  f_t bonus_robust = 0;
-
-  f_t bounds[2] = {c_lb, c_ub};
-  cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range");
-  for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) {
-    if (!isfinite(bounds[bound_idx])) continue;
-
-    // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into
-    // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match
-    // the convention of the paper
-
-    // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums
-    // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum)
-    // likely unreasonable and overkill however
-    f_t cstr_weight =
-      bound_idx == 0 ? fj.cstr_left_weights[cstr_idx] : fj.cstr_right_weights[cstr_idx];
-    f_t sign      = bound_idx == 0 ? -1 : 1;
-    f_t rhs       = bounds[bound_idx] * sign;
-    f_t old_lhs   = current_lhs * sign;
-    f_t new_lhs   = (current_lhs + cstr_coeff * delta) * sign;
-    f_t old_slack = rhs - old_lhs;
-    f_t new_slack = rhs - new_lhs;
-
-    cuopt_assert(isfinite(cstr_weight), "invalid weight");
-    cuopt_assert(cstr_weight >= 0, "invalid weight");
-    cuopt_assert(isfinite(old_lhs), "");
-    cuopt_assert(isfinite(new_lhs), "");
-    cuopt_assert(isfinite(old_slack) && isfinite(new_slack), "");
-
-    f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx);
-
-    bool old_viol = fj.excess_score(cstr_idx, current_lhs) < -cstr_tolerance;
-    bool new_viol = fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta) < -cstr_tolerance;
-
-    bool old_sat = old_lhs < rhs + cstr_tolerance;
-    bool new_sat = new_lhs < rhs + cstr_tolerance;
-
-    // equality
-    if (fj.pb.integer_equal(c_lb, c_ub)) {
-      if (!old_viol) cuopt_assert(old_sat == !old_viol, "");
-      if (!new_viol) cuopt_assert(new_sat == !new_viol, "");
-    }
-
-    // if it would feasibilize this constraint
-    if (!old_sat && new_sat) {
-      cuopt_assert(old_viol, "");
-      base_feas += cstr_weight;
-    }
-    // would cause this constraint to be violated
-    else if (old_sat && !new_sat) {
-      cuopt_assert(new_viol, "");
-      base_feas -= cstr_weight;
-    }
-    // simple improvement
-    else if (!old_sat && !new_sat && old_lhs > new_lhs) {
-      cuopt_assert(old_viol && new_viol, "");
-      base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
-    }
-    // simple worsening
-    else if (!old_sat && !new_sat && old_lhs <= new_lhs) {
-      cuopt_assert(old_viol && new_viol, "");
-      base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
-    }
-
-    // robustness score bonus if this would leave some strick slack
-    bool old_stable = old_lhs < rhs - cstr_tolerance;
-    bool new_stable = new_lhs < rhs - cstr_tolerance;
-    if (!old_stable && new_stable) {
-      bonus_robust += cstr_weight;
-    } else if (old_stable && !new_stable) {
-      bonus_robust -= cstr_weight;
-    }
-  }
-
-  return {base_feas, bonus_robust};
-}
-
 template <typename i_t, typename f_t>
 DI void smooth_weights(typename fj_t<i_t, f_t>::climber_data_t::view_t& fj)
 {
@@ -203,9 +113,11 @@ DI void update_weights(typename fj_t<i_t, f_t>::climber_data_t::view_t& fj)
   for (i_t i = blockIdx.x; i < fj.violated_constraints.size(); i += gridDim.x) {
     i_t cstr_idx           = fj.violated_constraints.contents[i];
     f_t curr_incumbent_lhs = fj.incumbent_lhs[cstr_idx];
-    f_t curr_lower_excess  = fj.lower_excess_score(cstr_idx, curr_incumbent_lhs);
-    f_t curr_upper_excess  = fj.upper_excess_score(cstr_idx, curr_incumbent_lhs);
-    f_t curr_excess_score  = curr_lower_excess + curr_upper_excess;
+    f_t curr_lower_excess =
+      fj.lower_excess_score(cstr_idx, curr_incumbent_lhs, fj.pb.constraint_lower_bounds[cstr_idx]);
+    f_t curr_upper_excess =
+      fj.upper_excess_score(cstr_idx, curr_incumbent_lhs, fj.pb.constraint_upper_bounds[cstr_idx]);
+    f_t curr_excess_score = curr_lower_excess + curr_upper_excess;
 
     f_t old_weight;
     if (curr_lower_excess < 0.) {
@@ -362,73 +274,6 @@ DI typename fj_t<i_t, f_t>::move_score_info_t compute_new_score(
   return score_info;
 }
 
-// Returns the current slack, and the variable delta that would nullify this slack ("tighten" it)
-template <typename i_t, typename f_t>
-DI thrust::tuple<f_t, f_t> get_mtm_for_bound(
-  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
-  i_t var_idx,
-  i_t cstr_idx,
-  f_t cstr_coeff,
-  f_t bound,
-  f_t sign)
-{
-  f_t delta_ij = 0;
-  f_t slack    = 0;
-  f_t old_val  = fj.incumbent_assignment[var_idx];
-
-  f_t lhs = fj.incumbent_lhs[cstr_idx] * sign;
-  f_t rhs = bound * sign;
-  slack   = rhs - lhs;  // bound might be infinite. let the caller handle this case
-
-  delta_ij = slack / (cstr_coeff * sign);
-
-  return {delta_ij, slack};
-}
-
-template <typename i_t, typename f_t, MTMMoveType move_type>
-DI thrust::tuple<f_t, f_t, f_t, f_t> get_mtm_for_constraint(
-  const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
-  i_t var_idx,
-  i_t cstr_idx,
-  f_t cstr_coeff)
-{
-  f_t sign     = -1;
-  f_t delta_ij = 0;
-  f_t slack    = 0;
-
-  f_t c_lb           = fj.pb.constraint_lower_bounds[cstr_idx];
-  f_t c_ub           = fj.pb.constraint_upper_bounds[cstr_idx];
-  f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx);
-
-  f_t old_val = fj.incumbent_assignment[var_idx];
-
-  // process each bound as two separate constraints
-  f_t bounds[2] = {c_lb, c_ub};
-  cuopt_assert(isfinite(bounds[0]) || isfinite(bounds[1]), "");
-
-  for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) {
-    if (!isfinite(bounds[bound_idx])) continue;
-
-    // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into
-    // two virtual constraints lhs <= ub and -lhs <= -lb
-    sign    = bound_idx == 0 ? -1 : 1;
-    f_t lhs = fj.incumbent_lhs[cstr_idx] * sign;
-    f_t rhs = bounds[bound_idx] * sign;
-    slack   = rhs - lhs;
-
-    // skip constraints that are violated/satisfied based on the MTM move type
-    bool violated = slack < -cstr_tolerance;
-    if (move_type == MTMMoveType::FJ_MTM_VIOLATED ? !violated : violated) continue;
-
-    f_t new_val = old_val;
-
-    delta_ij = slack / (cstr_coeff * sign);
-    break;
-  }
-
-  return {delta_ij, sign, slack, cstr_tolerance};
-}
-
 // find best mixed tight move
 template <typename i_t, typename f_t, i_t TPB, MTMMoveType move_type>
 DI std::pair<f_t, typename fj_t<i_t, f_t>::move_score_info_t> compute_best_mtm(
@@ -472,9 +317,11 @@ DI std::pair<f_t, typename fj_t<i_t, f_t>::move_score_info_t> compute_best_mtm(
       ;
     }
 
+    f_t c_lb = fj.pb.constraint_lower_bounds[cstr_idx];
+    f_t c_ub = fj.pb.constraint_upper_bounds[cstr_idx];
     f_t new_val;
     auto [delta_ij, sign, slack, cstr_tolerance] =
-      get_mtm_for_constraint<i_t, f_t, move_type>(fj, var_idx, cstr_idx, cstr_coeff);
+      get_mtm_for_constraint<i_t, f_t, move_type>(fj, var_idx, cstr_idx, cstr_coeff, c_lb, c_ub);
     if (fj.pb.is_integer_var(var_idx)) {
       new_val = cstr_coeff * sign > 0
                   ? floor(old_val + delta_ij + fj.pb.tolerances.integrality_tolerance)
@@ -990,45 +837,6 @@ __global__ void update_lift_moves_kernel(typename fj_t<i_t, f_t>::climber_data_t
   update_lift_moves<i_t, f_t, TPB_liftmoves>(fj);
 }
 
-template <typename i_t, typename f_t>
-DI f_t get_breakthrough_move(typename fj_t<i_t, f_t>::climber_data_t::view_t fj, i_t var_idx)
-{
-  f_t obj_coeff = fj.pb.objective_coefficients[var_idx];
-  auto bounds   = fj.pb.variable_bounds[var_idx];
-  f_t v_lb      = get_lower(bounds);
-  f_t v_ub      = get_upper(bounds);
-  cuopt_assert(isfinite(v_lb) || isfinite(v_ub), "unexpected free variable");
-  cuopt_assert(v_lb <= v_ub, "invalid bounds");
-  cuopt_assert(fj.pb.check_variable_within_bounds(var_idx, fj.incumbent_assignment[var_idx]),
-               "invalid incumbent assignment");
-  cuopt_assert(isfinite(obj_coeff), "invalid objective coefficient");
-  cuopt_assert(obj_coeff != 0, "objective coefficient shouldn't be null");
-
-  f_t excess = (*fj.best_objective) - *fj.incumbent_objective;
-
-  cuopt_assert(isfinite(excess) && excess < 0,
-               "breakthru move invoked during invalid solver state");
-
-  f_t old_val = fj.incumbent_assignment[var_idx];
-  f_t new_val = old_val;
-
-  f_t delta_ij = excess / obj_coeff;
-
-  if (fj.pb.is_integer_var(var_idx)) {
-    new_val = obj_coeff > 0 ? floor(old_val + delta_ij + fj.pb.tolerances.integrality_tolerance)
-                            : ceil(old_val + delta_ij - fj.pb.tolerances.integrality_tolerance);
-  } else {
-    new_val = old_val + delta_ij;
-  }
-
-  // fallback
-  if (!fj.pb.check_variable_within_bounds(var_idx, new_val)) {
-    new_val = obj_coeff > 0 ? v_lb : v_ub;
-  }
-
-  return new_val;
-}
-
 template <typename i_t, typename f_t, i_t TPB>
 DI void update_breakthrough_moves(typename fj_t<i_t, f_t>::climber_data_t::view_t fj)
 {
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cuh b/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cuh
index 2b299ccd2..080a6c210 100644
--- a/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cuh
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump_kernels.cuh
@@ -96,11 +96,9 @@ __global__ void update_changed_constraints_kernel(
 template <typename i_t, typename f_t>
 __global__ void update_best_solution_kernel(typename fj_t<i_t, f_t>::climber_data_t::view_t fj);
 
-enum MTMMoveType { FJ_MTM_VIOLATED, FJ_MTM_SATISFIED, FJ_MTM_ALL };
-
 template <typename i_t,
           typename f_t,
-          MTMMoveType move_type = FJ_MTM_VIOLATED,
+          MTMMoveType move_type = MTMMoveType::FJ_MTM_VIOLATED,
           bool is_binary_pb     = false>
 __global__ void compute_mtm_moves_kernel(typename fj_t<i_t, f_t>::climber_data_t::view_t fj,
                                          bool ForceRefresh = false);
diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cu b/cpp/src/mip/feasibility_jump/fj_cpu.cu
new file mode 100644
index 000000000..b617f5a1d
--- /dev/null
+++ b/cpp/src/mip/feasibility_jump/fj_cpu.cu
@@ -0,0 +1,1004 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <mip/mip_constants.hpp>
+
+#include "feasibility_jump.cuh"
+#include "feasibility_jump_impl_common.cuh"
+#include "fj_cpu.cuh"
+
+#include <utilities/seed_generator.cuh>
+
+#include <chrono>
+#include <random>
+#include <unordered_set>
+#include <vector>
+
+#define CPUFJ_TIMING_TRACE 0
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+class timing_raii_t {
+ public:
+  timing_raii_t(std::vector<double>& times_vec)
+    : times_vec_(times_vec), start_time_(std::chrono::high_resolution_clock::now())
+  {
+  }
+
+  ~timing_raii_t()
+  {
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration =
+      std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time_);
+    times_vec_.push_back(duration.count());
+  }
+
+ private:
+  std::vector<double>& times_vec_;
+  std::chrono::high_resolution_clock::time_point start_time_;
+};
+
+template <typename i_t, typename f_t>
+static void print_timing_stats(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  auto compute_avg_and_total = [](const std::vector<double>& times) -> std::pair<double, double> {
+    if (times.empty()) return {0.0, 0.0};
+    double sum = 0.0;
+    for (double time : times)
+      sum += time;
+    return {sum / times.size(), sum};
+  };
+
+  auto [lift_avg, lift_total]       = compute_avg_and_total(fj_cpu.find_lift_move_times);
+  auto [viol_avg, viol_total]       = compute_avg_and_total(fj_cpu.find_mtm_move_viol_times);
+  auto [sat_avg, sat_total]         = compute_avg_and_total(fj_cpu.find_mtm_move_sat_times);
+  auto [apply_avg, apply_total]     = compute_avg_and_total(fj_cpu.apply_move_times);
+  auto [weights_avg, weights_total] = compute_avg_and_total(fj_cpu.update_weights_times);
+  auto [compute_score_avg, compute_score_total] = compute_avg_and_total(fj_cpu.compute_score_times);
+  CUOPT_LOG_TRACE("=== Timing Statistics (Iteration %d) ===\n", fj_cpu.iterations);
+  CUOPT_LOG_TRACE("find_lift_move:      avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  lift_avg * 1000.0,
+                  lift_total * 1000.0,
+                  fj_cpu.find_lift_move_times.size());
+  CUOPT_LOG_TRACE("find_mtm_move_viol:  avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  viol_avg * 1000.0,
+                  viol_total * 1000.0,
+                  fj_cpu.find_mtm_move_viol_times.size());
+  CUOPT_LOG_TRACE("find_mtm_move_sat:   avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  sat_avg * 1000.0,
+                  sat_total * 1000.0,
+                  fj_cpu.find_mtm_move_sat_times.size());
+  CUOPT_LOG_TRACE("apply_move:          avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  apply_avg * 1000.0,
+                  apply_total * 1000.0,
+                  fj_cpu.apply_move_times.size());
+  CUOPT_LOG_TRACE("update_weights:      avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  weights_avg * 1000.0,
+                  weights_total * 1000.0,
+                  fj_cpu.update_weights_times.size());
+  CUOPT_LOG_TRACE("compute_score:       avg=%.6f ms, total=%.6f ms, calls=%zu\n",
+                  compute_score_avg * 1000.0,
+                  compute_score_total * 1000.0,
+                  fj_cpu.compute_score_times.size());
+  CUOPT_LOG_TRACE("cache hit percentage: %.2f%%\n",
+                  (double)fj_cpu.hit_count / (fj_cpu.hit_count + fj_cpu.miss_count) * 100.0);
+  CUOPT_LOG_TRACE("bin  candidate move hit percentage: %.2f%%\n",
+                  (double)fj_cpu.candidate_move_hits[0] /
+                    (fj_cpu.candidate_move_hits[0] + fj_cpu.candidate_move_misses[0]) * 100.0);
+  CUOPT_LOG_TRACE("int  candidate move hit percentage: %.2f%%\n",
+                  (double)fj_cpu.candidate_move_hits[1] /
+                    (fj_cpu.candidate_move_hits[1] + fj_cpu.candidate_move_misses[1]) * 100.0);
+  CUOPT_LOG_TRACE("cont candidate move hit percentage: %.2f%%\n",
+                  (double)fj_cpu.candidate_move_hits[2] /
+                    (fj_cpu.candidate_move_hits[2] + fj_cpu.candidate_move_misses[2]) * 100.0);
+  CUOPT_LOG_TRACE("========================================\n");
+}
+
+template <typename i_t, typename f_t>
+static inline bool tabu_check(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+                              i_t var_idx,
+                              f_t delta,
+                              bool localmin = false)
+{
+  if (localmin) {
+    return (delta < 0 && fj_cpu.iterations == fj_cpu.h_tabu_lastinc[var_idx] + 1) ||
+           (delta >= 0 && fj_cpu.iterations == fj_cpu.h_tabu_lastdec[var_idx] + 1);
+  } else {
+    return (delta < 0 && fj_cpu.iterations < fj_cpu.h_tabu_nodec_until[var_idx]) ||
+           (delta >= 0 && fj_cpu.iterations < fj_cpu.h_tabu_noinc_until[var_idx]);
+  }
+}
+
+template <typename i_t, typename f_t>
+static inline fj_staged_score_t compute_score(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+                                              i_t var_idx,
+                                              f_t delta)
+{
+  // timing_raii_t<i_t, f_t> timer(fj_cpu.compute_score_times);
+
+  f_t obj_diff = fj_cpu.h_obj_coeffs[var_idx] * delta;
+
+  cuopt_assert(isfinite(delta), "");
+
+  cuopt_assert(var_idx < fj_cpu.view.pb.n_variables, "variable index out of bounds");
+
+  f_t base_feas_sum    = 0;
+  f_t bonus_robust_sum = 0;
+
+  auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+  for (i_t i = offset_begin; i < offset_end; i++) {
+    auto cstr_idx     = fj_cpu.h_reverse_constraints[i];
+    auto cstr_coeff   = fj_cpu.h_reverse_coefficients[i];
+    auto [c_lb, c_ub] = fj_cpu.cached_cstr_bounds[i];
+
+    cuopt_assert(c_lb <= c_ub, "invalid bounds");
+
+    auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint<i_t, f_t>(
+      fj_cpu.view, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj_cpu.h_lhs[cstr_idx]);
+
+    base_feas_sum += cstr_base_feas;
+    bonus_robust_sum += cstr_bonus_robust;
+  }
+
+  f_t base_obj = 0;
+  if (obj_diff < 0)  // improving move wrt objective
+    base_obj = fj_cpu.h_objective_weight;
+  else if (obj_diff > 0)
+    base_obj = -fj_cpu.h_objective_weight;
+
+  f_t bonus_breakthrough = 0;
+
+  bool old_obj_better = fj_cpu.h_incumbent_objective < fj_cpu.h_best_objective;
+  bool new_obj_better = fj_cpu.h_incumbent_objective + obj_diff < fj_cpu.h_best_objective;
+  if (!old_obj_better && new_obj_better)
+    bonus_breakthrough += fj_cpu.h_objective_weight;
+  else if (old_obj_better && !new_obj_better) {
+    bonus_breakthrough -= fj_cpu.h_objective_weight;
+  }
+
+  fj_staged_score_t score;
+  score.base  = round(base_obj + base_feas_sum);
+  score.bonus = round(bonus_breakthrough + bonus_robust_sum);
+  return score;
+}
+
+template <typename i_t, typename f_t>
+static void smooth_weights(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  for (i_t cstr_idx = 0; cstr_idx < fj_cpu.view.pb.n_constraints; cstr_idx++) {
+    // consider only satisfied constraints
+    if (fj_cpu.violated_constraints.count(cstr_idx)) continue;
+
+    f_t weight_l = max((f_t)0, fj_cpu.h_cstr_left_weights[cstr_idx] - 1);
+    f_t weight_r = max((f_t)0, fj_cpu.h_cstr_right_weights[cstr_idx] - 1);
+
+    fj_cpu.h_cstr_left_weights[cstr_idx]  = weight_l;
+    fj_cpu.h_cstr_right_weights[cstr_idx] = weight_r;
+  }
+
+  if (fj_cpu.h_objective_weight > 0 && fj_cpu.h_incumbent_objective >= fj_cpu.h_best_objective) {
+    fj_cpu.h_objective_weight = max((f_t)0, fj_cpu.h_objective_weight - 1);
+  }
+}
+
+template <typename i_t, typename f_t>
+static void update_weights(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  timing_raii_t<i_t, f_t> timer(fj_cpu.update_weights_times);
+
+  raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);
+  bool smoothing = rng.next_float() <= fj_cpu.settings.parameters.weight_smoothing_probability;
+
+  if (smoothing) {
+    smooth_weights<i_t, f_t>(fj_cpu);
+    return;
+  }
+
+  for (auto cstr_idx : fj_cpu.violated_constraints) {
+    f_t curr_incumbent_lhs = fj_cpu.h_lhs[cstr_idx];
+    f_t curr_lower_excess =
+      fj_cpu.view.lower_excess_score(cstr_idx, curr_incumbent_lhs, fj_cpu.h_cstr_lb[cstr_idx]);
+    f_t curr_upper_excess =
+      fj_cpu.view.upper_excess_score(cstr_idx, curr_incumbent_lhs, fj_cpu.h_cstr_ub[cstr_idx]);
+    f_t curr_excess_score = curr_lower_excess + curr_upper_excess;
+
+    f_t old_weight;
+    if (curr_lower_excess < 0.) {
+      old_weight = fj_cpu.h_cstr_left_weights[cstr_idx];
+    } else {
+      old_weight = fj_cpu.h_cstr_right_weights[cstr_idx];
+    }
+
+    cuopt_assert(curr_excess_score < 0, "constraint not violated");
+
+    i_t int_delta = 1.0;
+    f_t delta     = int_delta;
+
+    f_t new_weight = old_weight + delta;
+    new_weight     = round(new_weight);
+
+    if (curr_lower_excess < 0.) {
+      fj_cpu.h_cstr_left_weights[cstr_idx] = new_weight;
+      fj_cpu.max_weight                    = max(fj_cpu.max_weight, new_weight);
+    } else {
+      fj_cpu.h_cstr_right_weights[cstr_idx] = new_weight;
+      fj_cpu.max_weight                     = max(fj_cpu.max_weight, new_weight);
+    }
+
+    // Invalidate related cached move scores
+    auto [relvar_offset_begin, relvar_offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    for (auto i = relvar_offset_begin; i < relvar_offset_end; i++) {
+      fj_cpu.cached_mtm_moves[i].first = 0;
+    }
+  }
+
+  if (fj_cpu.violated_constraints.empty()) { fj_cpu.h_objective_weight += 1; }
+}
+
+template <typename i_t, typename f_t>
+static void apply_move(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+                       i_t var_idx,
+                       f_t delta,
+                       bool localmin = false)
+{
+  timing_raii_t<i_t, f_t> timer(fj_cpu.apply_move_times);
+
+  raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);
+
+  cuopt_assert(var_idx < fj_cpu.view.pb.n_variables, "variable index out of bounds");
+  // Update the LHSs of all involved constraints.
+  auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+
+  i_t previous_viol = fj_cpu.violated_constraints.size();
+
+  for (auto i = offset_begin; i < offset_end; i++) {
+    cuopt_assert(i < (i_t)fj_cpu.h_reverse_constraints.size(), "");
+    auto [c_lb, c_ub] = fj_cpu.cached_cstr_bounds[i];
+
+    auto cstr_idx   = fj_cpu.h_reverse_constraints[i];
+    auto cstr_coeff = fj_cpu.h_reverse_coefficients[i];
+
+    f_t old_lhs        = fj_cpu.h_lhs[cstr_idx];
+    f_t new_lhs        = old_lhs + cstr_coeff * delta;
+    f_t old_cost       = fj_cpu.view.excess_score(cstr_idx, old_lhs, c_lb, c_ub);
+    f_t new_cost       = fj_cpu.view.excess_score(cstr_idx, new_lhs, c_lb, c_ub);
+    f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
+
+    if (new_cost < -cstr_tolerance && !fj_cpu.violated_constraints.count(cstr_idx)) {
+      fj_cpu.violated_constraints.insert(cstr_idx);
+      fj_cpu.satisfied_constraints.erase(cstr_idx);
+    } else if (!(new_cost < -cstr_tolerance) && fj_cpu.violated_constraints.count(cstr_idx)) {
+      fj_cpu.violated_constraints.erase(cstr_idx);
+      fj_cpu.satisfied_constraints.insert(cstr_idx);
+    }
+
+    cuopt_assert(isfinite(delta), "delta should be finite");
+    // Kahan compensated summation
+    f_t y                          = cstr_coeff * delta - fj_cpu.h_lhs_sumcomp[cstr_idx];
+    f_t t                          = old_lhs + y;
+    fj_cpu.h_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y;
+    fj_cpu.h_lhs[cstr_idx]         = t;
+    cuopt_assert(isfinite(fj_cpu.h_lhs[cstr_idx]), "assignment should be finite");
+
+    // Invalidate related cached move scores
+    auto [relvar_offset_begin, relvar_offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    for (auto i = relvar_offset_begin; i < relvar_offset_end; i++) {
+      fj_cpu.cached_mtm_moves[i].first = 0;
+    }
+  }
+
+  if (previous_viol > 0 && fj_cpu.violated_constraints.empty()) {
+    fj_cpu.last_feasible_entrance_iter = fj_cpu.iterations;
+  }
+
+  // update the assignment and objective proper
+  f_t new_val = fj_cpu.h_assignment[var_idx] + delta;
+  if (fj_cpu.view.pb.is_integer_var(var_idx)) {
+    cuopt_assert(fj_cpu.view.pb.integer_equal(new_val, round(new_val)), "new_val is not integer");
+    new_val = round(new_val);
+  }
+  fj_cpu.h_assignment[var_idx] = new_val;
+
+  cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val),
+               "assignment not within bounds");
+  cuopt_assert(isfinite(new_val), "assignment is not finite");
+
+  fj_cpu.h_incumbent_objective += fj_cpu.h_obj_coeffs[var_idx] * delta;
+  if (fj_cpu.h_incumbent_objective < fj_cpu.h_best_objective &&
+      fj_cpu.violated_constraints.empty()) {
+    cuopt_assert(fj_cpu.satisfied_constraints.size() == fj_cpu.view.pb.n_constraints, "");
+    fj_cpu.h_best_objective =
+      fj_cpu.h_incumbent_objective - fj_cpu.settings.parameters.breakthrough_move_epsilon;
+    fj_cpu.h_best_assignment = fj_cpu.h_assignment;
+    CUOPT_LOG_TRACE("%sCPUFJ: new best objective: %g\n",
+                    fj_cpu.log_prefix.c_str(),
+                    fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective));
+    if (fj_cpu.improvement_callback) {
+      fj_cpu.improvement_callback(fj_cpu.h_best_objective, fj_cpu.h_assignment);
+    }
+    fj_cpu.feasible_found = true;
+  }
+
+  i_t tabu_tenure = fj_cpu.settings.parameters.tabu_tenure_min +
+                    rng.next_u32() % (fj_cpu.settings.parameters.tabu_tenure_max -
+                                      fj_cpu.settings.parameters.tabu_tenure_min);
+  if (delta > 0) {
+    fj_cpu.h_tabu_lastinc[var_idx]     = fj_cpu.iterations;
+    fj_cpu.h_tabu_nodec_until[var_idx] = fj_cpu.iterations + tabu_tenure;
+    fj_cpu.h_tabu_noinc_until[var_idx] = fj_cpu.iterations + tabu_tenure / 2;
+    // CUOPT_LOG_TRACE("CPU: tabu nodec_until: %d\n", fj_cpu.h_tabu_nodec_until[var_idx]);
+  } else {
+    fj_cpu.h_tabu_lastdec[var_idx]     = fj_cpu.iterations;
+    fj_cpu.h_tabu_noinc_until[var_idx] = fj_cpu.iterations + tabu_tenure;
+    fj_cpu.h_tabu_nodec_until[var_idx] = fj_cpu.iterations + tabu_tenure / 2;
+    // CUOPT_LOG_TRACE("CPU: tabu noinc_until: %d\n", fj_cpu.h_tabu_noinc_until[var_idx]);
+  }
+
+  std::fill(fj_cpu.flip_move_computed.begin(), fj_cpu.flip_move_computed.end(), false);
+  std::fill(fj_cpu.var_bitmap.begin(), fj_cpu.var_bitmap.end(), false);
+  fj_cpu.iter_mtm_vars.clear();
+}
+
+template <typename i_t, typename f_t, MTMMoveType move_type>
+static thrust::tuple<fj_move_t, fj_staged_score_t> find_mtm_move(
+  fj_cpu_climber_t<i_t, f_t>& fj_cpu, const std::vector<i_t>& target_cstrs, bool localmin = false)
+{
+  auto& problem = *fj_cpu.pb_ptr;
+
+  raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);
+
+  fj_move_t best_move          = fj_move_t{-1, 0};
+  fj_staged_score_t best_score = fj_staged_score_t::invalid();
+
+  // collect all the variables that are involved in the target constraints
+  for (size_t cstr_idx : target_cstrs) {
+    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    for (auto i = offset_begin; i < offset_end; i++) {
+      i_t var_idx = fj_cpu.h_variables[i];
+      if (fj_cpu.var_bitmap[var_idx]) continue;
+      fj_cpu.iter_mtm_vars.push_back(var_idx);
+      fj_cpu.var_bitmap[var_idx] = true;
+    }
+  }
+  // estimate the amount of nnzs to consider
+  i_t nnz_sum = 0;
+  for (auto var_idx : fj_cpu.iter_mtm_vars) {
+    auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+    nnz_sum += offset_end - offset_begin;
+  }
+
+  f_t nnz_pick_probability = 1;
+  if (nnz_sum > fj_cpu.nnz_samples) nnz_pick_probability = (f_t)fj_cpu.nnz_samples / nnz_sum;
+
+  for (size_t cstr_idx : target_cstrs) {
+    f_t cstr_tol = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+
+    cuopt_assert(cstr_idx < fj_cpu.h_cstr_lb.size(), "cstr_idx is out of bounds");
+    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    for (auto i = offset_begin; i < offset_end; i++) {
+      // early cached check
+      if (auto& cached_move = fj_cpu.cached_mtm_moves[i]; cached_move.first != 0) {
+        if (best_score < cached_move.second) {
+          auto var_idx = fj_cpu.h_variables[i];
+          if (fj_cpu.view.pb.check_variable_within_bounds(
+                var_idx, fj_cpu.h_assignment[var_idx] + cached_move.first)) {
+            best_score = cached_move.second;
+            best_move  = fj_move_t{var_idx, cached_move.first};
+          }
+          // cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx,
+          // fj_cpu.h_assignment[var_idx] + cached_move.first), "best move not within bounds");
+        }
+        fj_cpu.hit_count++;
+        continue;
+      }
+
+      // random chance to skip this nnz if there are many to consider
+      if (nnz_pick_probability < 1)
+        if (rng.next_float() > nnz_pick_probability) continue;
+
+      auto var_idx = fj_cpu.h_variables[i];
+
+      f_t val     = fj_cpu.h_assignment[var_idx];
+      f_t new_val = val;
+      f_t delta   = 0;
+
+      // Special case for binary variables
+      if (fj_cpu.h_is_binary_variable[var_idx]) {
+        if (fj_cpu.flip_move_computed[var_idx]) continue;
+        fj_cpu.flip_move_computed[var_idx] = true;
+        new_val                            = 1 - val;
+      } else {
+        auto cstr_coeff = fj_cpu.h_coefficients[i];
+
+        f_t c_lb                                  = fj_cpu.h_cstr_lb[cstr_idx];
+        f_t c_ub                                  = fj_cpu.h_cstr_ub[cstr_idx];
+        auto [delta, sign, slack, cstr_tolerance] = get_mtm_for_constraint<i_t, f_t, move_type>(
+          fj_cpu.view, var_idx, cstr_idx, cstr_coeff, c_lb, c_ub);
+        if (fj_cpu.view.pb.is_integer_var(var_idx)) {
+          new_val = cstr_coeff * sign > 0
+                      ? floor(val + delta + fj_cpu.view.pb.tolerances.integrality_tolerance)
+                      : ceil(val + delta - fj_cpu.view.pb.tolerances.integrality_tolerance);
+        } else {
+          new_val = val + delta;
+        }
+        // fallback
+        if (new_val < get_lower(fj_cpu.h_var_bounds[var_idx]) ||
+            new_val > get_upper(fj_cpu.h_var_bounds[var_idx])) {
+          new_val = cstr_coeff * sign > 0 ? get_lower(fj_cpu.h_var_bounds[var_idx])
+                                          : get_upper(fj_cpu.h_var_bounds[var_idx]);
+        }
+      }
+      if (!isfinite(new_val)) continue;
+      cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val),
+                   "new_val is not within bounds");
+      delta = new_val - val;
+      // more permissive tabu in the case of local minima
+      if (tabu_check(fj_cpu, var_idx, delta, localmin)) continue;
+      if (fabs(delta) < cstr_tol) continue;
+
+      auto move = fj_move_t{var_idx, delta};
+      cuopt_assert(move.var_idx < fj_cpu.h_assignment.size(), "move.var_idx is out of bounds");
+      cuopt_assert(move.var_idx >= 0, "move.var_idx is not positive");
+
+      // candidate_moves.insert(move);
+      fj_staged_score_t score;
+      score                      = compute_score<i_t, f_t>(fj_cpu, var_idx, delta);
+      fj_cpu.cached_mtm_moves[i] = std::make_pair(delta, score);
+      fj_cpu.miss_count++;
+      if (best_score < score) {
+        best_score = score;
+        best_move  = move;
+      }
+    }
+  }
+
+  // also consider BM moves if we have found a feasible solution at least once
+  if (move_type == MTMMoveType::FJ_MTM_VIOLATED &&
+      fj_cpu.h_best_objective < std::numeric_limits<f_t>::infinity() &&
+      fj_cpu.h_incumbent_objective >=
+        fj_cpu.h_best_objective + fj_cpu.settings.parameters.breakthrough_move_epsilon) {
+    for (auto var_idx : fj_cpu.h_objective_vars) {
+      f_t old_val = fj_cpu.h_assignment[var_idx];
+      f_t new_val = get_breakthrough_move<i_t, f_t>(fj_cpu.view, var_idx);
+
+      if (fj_cpu.view.pb.integer_equal(new_val, old_val) || !isfinite(new_val)) continue;
+
+      f_t delta = new_val - old_val;
+
+      // Check if we already have a move for this variable
+      auto move = fj_move_t{var_idx, delta};
+      cuopt_assert(move.var_idx < fj_cpu.h_assignment.size(), "move.var_idx is out of bounds");
+      cuopt_assert(move.var_idx >= 0, "move.var_idx is not positive");
+
+      if (tabu_check(fj_cpu, var_idx, delta)) continue;
+
+      fj_staged_score_t score = compute_score<i_t, f_t>(fj_cpu, var_idx, delta);
+
+      cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val), "");
+      cuopt_assert(isfinite(delta), "");
+
+      if (best_score < score) {
+        best_score = score;
+        best_move  = move;
+      }
+    }
+  }
+
+  return thrust::make_tuple(best_move, best_score);
+}
+
+template <typename i_t, typename f_t>
+static thrust::tuple<fj_move_t, fj_staged_score_t> find_mtm_move_viol(
+  fj_cpu_climber_t<i_t, f_t>& fj_cpu, i_t sample_size = 100, bool localmin = false)
+{
+  timing_raii_t<i_t, f_t> timer(fj_cpu.find_mtm_move_viol_times);
+
+  std::vector<i_t> sampled_cstrs;
+  sampled_cstrs.reserve(sample_size);
+  std::sample(fj_cpu.violated_constraints.begin(),
+              fj_cpu.violated_constraints.end(),
+              std::back_inserter(sampled_cstrs),
+              sample_size,
+              std::mt19937(fj_cpu.settings.seed + fj_cpu.iterations));
+
+  return find_mtm_move<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED>(fj_cpu, sampled_cstrs, localmin);
+}
+
+template <typename i_t, typename f_t>
+static thrust::tuple<fj_move_t, fj_staged_score_t> find_mtm_move_sat(
+  fj_cpu_climber_t<i_t, f_t>& fj_cpu, i_t sample_size = 100)
+{
+  timing_raii_t<i_t, f_t> timer(fj_cpu.find_mtm_move_sat_times);
+
+  std::vector<i_t> sampled_cstrs;
+  sampled_cstrs.reserve(sample_size);
+  std::sample(fj_cpu.satisfied_constraints.begin(),
+              fj_cpu.satisfied_constraints.end(),
+              std::back_inserter(sampled_cstrs),
+              sample_size,
+              std::mt19937(fj_cpu.settings.seed + fj_cpu.iterations));
+
+  return find_mtm_move<i_t, f_t, MTMMoveType::FJ_MTM_SATISFIED>(fj_cpu, sampled_cstrs);
+}
+
+template <typename i_t, typename f_t>
+static void init_lhs(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  cuopt_assert(fj_cpu.h_lhs.size() == fj_cpu.view.pb.n_constraints, "h_lhs size mismatch");
+
+  fj_cpu.violated_constraints.clear();
+  fj_cpu.satisfied_constraints.clear();
+  for (i_t cstr_idx = 0; cstr_idx < fj_cpu.view.pb.n_constraints; ++cstr_idx) {
+    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    f_t lhs                         = 0;
+    for (i_t i = offset_begin; i < offset_end; ++i) {
+      lhs += fj_cpu.h_coefficients[i] * fj_cpu.h_assignment[fj_cpu.h_variables[i]];
+    }
+
+    fj_cpu.h_lhs[cstr_idx] = lhs;
+
+    f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+    f_t new_cost       = fj_cpu.view.excess_score(cstr_idx, lhs);
+    if (new_cost < -cstr_tolerance) {
+      fj_cpu.violated_constraints.insert(cstr_idx);
+    } else {
+      fj_cpu.satisfied_constraints.insert(cstr_idx);
+    }
+  }
+
+  // compute incumbent objective
+  fj_cpu.h_incumbent_objective = thrust::inner_product(
+    fj_cpu.h_assignment.begin(), fj_cpu.h_assignment.end(), fj_cpu.h_obj_coeffs.begin(), 0.);
+}
+
+template <typename i_t, typename f_t>
+static thrust::tuple<fj_move_t, fj_staged_score_t> find_lift_move(
+  fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  timing_raii_t<i_t, f_t> timer(fj_cpu.find_lift_move_times);
+
+  fj_move_t best_move          = fj_move_t{-1, 0};
+  fj_staged_score_t best_score = fj_staged_score_t::zero();
+
+  for (auto var_idx : fj_cpu.h_objective_vars) {
+    cuopt_assert(var_idx < fj_cpu.h_obj_coeffs.size(), "var_idx is out of bounds");
+    cuopt_assert(var_idx >= 0, "var_idx is out of bounds");
+
+    f_t obj_coeff = fj_cpu.h_obj_coeffs[var_idx];
+    f_t delta     = -std::numeric_limits<f_t>::infinity();
+    f_t val       = fj_cpu.h_assignment[var_idx];
+
+    // special path for binary variables
+    if (fj_cpu.h_is_binary_variable[var_idx]) {
+      cuopt_assert(fj_cpu.view.pb.is_integer(val), "binary variable is not integer");
+      cuopt_assert(fj_cpu.view.pb.integer_equal(val, 0) || fj_cpu.view.pb.integer_equal(val, 1),
+                   "Current assignment is not binary!");
+      delta = round(1.0 - 2 * val);
+      // flip move wouldn't improve
+      if (delta * obj_coeff >= 0) continue;
+    } else {
+      f_t lfd_lb                      = get_lower(fj_cpu.h_var_bounds[var_idx]) - val;
+      f_t lfd_ub                      = get_upper(fj_cpu.h_var_bounds[var_idx]) - val;
+      auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+      for (i_t j = offset_begin; j < offset_end; j += 1) {
+        auto cstr_idx      = fj_cpu.view.pb.reverse_constraints[j];
+        auto cstr_coeff    = fj_cpu.view.pb.reverse_coefficients[j];
+        f_t c_lb           = fj_cpu.view.pb.constraint_lower_bounds[cstr_idx];
+        f_t c_ub           = fj_cpu.view.pb.constraint_upper_bounds[cstr_idx];
+        f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
+        cuopt_assert(c_lb <= c_ub, "invalid bounds");
+        cuopt_assert(fj_cpu.view.cstr_satisfied(cstr_idx, fj_cpu.h_lhs[cstr_idx]),
+                     "cstr should be satisfied");
+
+        // Process each bound separately, as both are satified and may both be finite
+        // otherwise range constraints aren't correctly handled
+        for (auto [bound, sign] : {std::make_tuple(c_lb, -1), std::make_tuple(c_ub, 1)}) {
+          auto [delta, slack] =
+            get_mtm_for_bound<i_t, f_t>(fj_cpu.view, var_idx, cstr_idx, cstr_coeff, bound, sign);
+
+          if (cstr_coeff * sign < 0) {
+            if (fj_cpu.view.pb.is_integer_var(var_idx)) delta = ceil(delta);
+          } else {
+            if (fj_cpu.view.pb.is_integer_var(var_idx)) delta = floor(delta);
+          }
+
+          // skip this variable if there is no slack
+          if (fabs(slack) <= cstr_tolerance) {
+            if (cstr_coeff * sign > 0) {
+              lfd_ub = 0;
+            } else {
+              lfd_lb = 0;
+            }
+          } else if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + delta)) {
+            continue;
+          } else {
+            if (cstr_coeff * sign < 0) {
+              lfd_lb = max(lfd_lb, delta);
+            } else {
+              lfd_ub = min(lfd_ub, delta);
+            }
+          }
+        }
+        if (lfd_lb >= lfd_ub) break;
+      }
+
+      // invalid crossing bounds
+      if (lfd_lb >= lfd_ub) { lfd_lb = lfd_ub = 0; }
+
+      if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + lfd_lb)) { lfd_lb = 0; }
+      if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + lfd_ub)) { lfd_ub = 0; }
+
+      // Now that the life move domain is computed, compute the correct lift move
+      cuopt_assert(isfinite(val), "invalid assignment value");
+      delta = obj_coeff < 0 ? lfd_ub : lfd_lb;
+    }
+
+    if (!isfinite(delta)) delta = 0;
+    if (fj_cpu.view.pb.integer_equal(delta, (f_t)0)) continue;
+    if (tabu_check(fj_cpu, var_idx, delta)) continue;
+
+    cuopt_assert(delta * obj_coeff < 0, "lift move doesn't improve the objective!");
+
+    // get the score
+    auto move               = fj_move_t{var_idx, delta};
+    fj_staged_score_t score = fj_staged_score_t::zero();
+    f_t obj_score           = -1 * obj_coeff * delta;  // negated to turn this into a positive score
+    score.base              = round(obj_score);
+
+    if (best_score < score) {
+      best_score = score;
+      best_move  = move;
+    }
+  }
+
+  return thrust::make_tuple(best_move, best_score);
+}
+
+template <typename i_t, typename f_t>
+static void perturb(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  // select N variables, assign them a random value between their bounds
+  std::vector<i_t> sampled_vars;
+  std::sample(fj_cpu.h_objective_vars.begin(),
+              fj_cpu.h_objective_vars.end(),
+              std::back_inserter(sampled_vars),
+              2,
+              std::mt19937(fj_cpu.settings.seed + fj_cpu.iterations));
+  raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);
+
+  for (auto var_idx : sampled_vars) {
+    f_t lb  = ceil(std::max(get_lower(fj_cpu.h_var_bounds[var_idx]), -1e7));
+    f_t ub  = floor(std::min(get_upper(fj_cpu.h_var_bounds[var_idx]), 1e7));
+    f_t val = lb + (ub - lb) * rng.next_double();
+    if (fj_cpu.view.pb.is_integer_var(var_idx)) {
+      val = std::round(val);
+      val = std::min(std::max(val, lb), ub);
+    }
+
+    cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, val),
+                 "value is out of bounds");
+    fj_cpu.h_assignment[var_idx] = val;
+  }
+
+  init_lhs(fj_cpu);
+}
+
+template <typename i_t, typename f_t>
+static void init_fj_cpu(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+                        solution_t<i_t, f_t>& solution,
+                        const std::vector<f_t>& left_weights,
+                        const std::vector<f_t>& right_weights,
+                        f_t objective_weight)
+{
+  auto& problem   = *solution.problem_ptr;
+  auto handle_ptr = solution.handle_ptr;
+
+  // build a cpu-based fj_view_t
+  fj_cpu.view    = typename fj_t<i_t, f_t>::climber_data_t::view_t{};
+  fj_cpu.view.pb = problem.view();
+  fj_cpu.pb_ptr  = &problem;
+  // Get host copies of device data
+  fj_cpu.h_reverse_coefficients =
+    cuopt::host_copy(problem.reverse_coefficients, handle_ptr->get_stream());
+  fj_cpu.h_reverse_constraints =
+    cuopt::host_copy(problem.reverse_constraints, handle_ptr->get_stream());
+  fj_cpu.h_reverse_offsets = cuopt::host_copy(problem.reverse_offsets, handle_ptr->get_stream());
+  fj_cpu.h_coefficients    = cuopt::host_copy(problem.coefficients, handle_ptr->get_stream());
+  fj_cpu.h_offsets         = cuopt::host_copy(problem.offsets, handle_ptr->get_stream());
+  fj_cpu.h_variables       = cuopt::host_copy(problem.variables, handle_ptr->get_stream());
+  fj_cpu.h_obj_coeffs = cuopt::host_copy(problem.objective_coefficients, handle_ptr->get_stream());
+  fj_cpu.h_var_bounds = cuopt::host_copy(problem.variable_bounds, handle_ptr->get_stream());
+  fj_cpu.h_cstr_lb    = cuopt::host_copy(problem.constraint_lower_bounds, handle_ptr->get_stream());
+  fj_cpu.h_cstr_ub    = cuopt::host_copy(problem.constraint_upper_bounds, handle_ptr->get_stream());
+  fj_cpu.h_var_types  = cuopt::host_copy(problem.variable_types, handle_ptr->get_stream());
+  fj_cpu.h_is_binary_variable =
+    cuopt::host_copy(problem.is_binary_variable, handle_ptr->get_stream());
+  fj_cpu.h_binary_indices = cuopt::host_copy(problem.binary_indices, handle_ptr->get_stream());
+
+  fj_cpu.h_cstr_left_weights  = left_weights;
+  fj_cpu.h_cstr_right_weights = right_weights;
+  fj_cpu.max_weight           = 1.0;
+  fj_cpu.h_objective_weight   = objective_weight;
+  fj_cpu.h_assignment         = solution.get_host_assignment();
+  fj_cpu.h_best_assignment    = solution.get_host_assignment();
+  fj_cpu.h_lhs.resize(fj_cpu.pb_ptr->n_constraints);
+  fj_cpu.h_lhs_sumcomp.resize(fj_cpu.pb_ptr->n_constraints, 0);
+  fj_cpu.h_tabu_nodec_until.resize(fj_cpu.pb_ptr->n_variables, 0);
+  fj_cpu.h_tabu_noinc_until.resize(fj_cpu.pb_ptr->n_variables, 0);
+  fj_cpu.h_tabu_lastdec.resize(fj_cpu.pb_ptr->n_variables, 0);
+  fj_cpu.h_tabu_lastinc.resize(fj_cpu.pb_ptr->n_variables, 0);
+  fj_cpu.iterations = 0;
+
+  // set pointers to host copies
+  // technically not 'device_span's but raft doesn't have a universal span.
+  // cuda::std::span?
+  fj_cpu.view.cstr_left_weights =
+    raft::device_span<f_t>(fj_cpu.h_cstr_left_weights.data(), fj_cpu.h_cstr_left_weights.size());
+  fj_cpu.view.cstr_right_weights =
+    raft::device_span<f_t>(fj_cpu.h_cstr_right_weights.data(), fj_cpu.h_cstr_right_weights.size());
+  fj_cpu.view.objective_weight = &fj_cpu.h_objective_weight;
+  fj_cpu.view.incumbent_assignment =
+    raft::device_span<f_t>(fj_cpu.h_assignment.data(), fj_cpu.h_assignment.size());
+  fj_cpu.view.incumbent_lhs = raft::device_span<f_t>(fj_cpu.h_lhs.data(), fj_cpu.h_lhs.size());
+  fj_cpu.view.tabu_nodec_until =
+    raft::device_span<i_t>(fj_cpu.h_tabu_nodec_until.data(), fj_cpu.h_tabu_nodec_until.size());
+  fj_cpu.view.tabu_noinc_until =
+    raft::device_span<i_t>(fj_cpu.h_tabu_noinc_until.data(), fj_cpu.h_tabu_noinc_until.size());
+  fj_cpu.view.tabu_lastdec =
+    raft::device_span<i_t>(fj_cpu.h_tabu_lastdec.data(), fj_cpu.h_tabu_lastdec.size());
+  fj_cpu.view.tabu_lastinc =
+    raft::device_span<i_t>(fj_cpu.h_tabu_lastinc.data(), fj_cpu.h_tabu_lastinc.size());
+  fj_cpu.view.objective_vars =
+    raft::device_span<i_t>(fj_cpu.h_objective_vars.data(), fj_cpu.h_objective_vars.size());
+  fj_cpu.view.incumbent_objective = &fj_cpu.h_incumbent_objective;
+  fj_cpu.view.best_objective      = &fj_cpu.h_best_objective;
+
+  fj_cpu.view.settings = &fj_cpu.settings;
+  fj_cpu.view.pb.constraint_lower_bounds =
+    raft::device_span<f_t>(fj_cpu.h_cstr_lb.data(), fj_cpu.h_cstr_lb.size());
+  fj_cpu.view.pb.constraint_upper_bounds =
+    raft::device_span<f_t>(fj_cpu.h_cstr_ub.data(), fj_cpu.h_cstr_ub.size());
+  fj_cpu.view.pb.variable_bounds = raft::device_span<typename type_2<f_t>::type>(
+    fj_cpu.h_var_bounds.data(), fj_cpu.h_var_bounds.size());
+  fj_cpu.view.pb.variable_types =
+    raft::device_span<var_t>(fj_cpu.h_var_types.data(), fj_cpu.h_var_types.size());
+  fj_cpu.view.pb.is_binary_variable =
+    raft::device_span<i_t>(fj_cpu.h_is_binary_variable.data(), fj_cpu.h_is_binary_variable.size());
+  fj_cpu.view.pb.coefficients =
+    raft::device_span<f_t>(fj_cpu.h_coefficients.data(), fj_cpu.h_coefficients.size());
+  fj_cpu.view.pb.offsets = raft::device_span<i_t>(fj_cpu.h_offsets.data(), fj_cpu.h_offsets.size());
+  fj_cpu.view.pb.variables =
+    raft::device_span<i_t>(fj_cpu.h_variables.data(), fj_cpu.h_variables.size());
+  fj_cpu.view.pb.reverse_coefficients = raft::device_span<f_t>(
+    fj_cpu.h_reverse_coefficients.data(), fj_cpu.h_reverse_coefficients.size());
+  fj_cpu.view.pb.reverse_constraints = raft::device_span<i_t>(fj_cpu.h_reverse_constraints.data(),
+                                                              fj_cpu.h_reverse_constraints.size());
+  fj_cpu.view.pb.reverse_offsets =
+    raft::device_span<i_t>(fj_cpu.h_reverse_offsets.data(), fj_cpu.h_reverse_offsets.size());
+  fj_cpu.view.pb.objective_coefficients =
+    raft::device_span<f_t>(fj_cpu.h_obj_coeffs.data(), fj_cpu.h_obj_coeffs.size());
+  fj_cpu.h_objective_vars.resize(problem.n_variables);
+  auto end = std::copy_if(
+    thrust::counting_iterator<i_t>(0),
+    thrust::counting_iterator<i_t>(problem.n_variables),
+    fj_cpu.h_objective_vars.begin(),
+    [&fj_cpu](i_t idx) { return !fj_cpu.view.pb.integer_equal(fj_cpu.h_obj_coeffs[idx], (f_t)0); });
+  fj_cpu.h_objective_vars.resize(end - fj_cpu.h_objective_vars.begin());
+
+  fj_cpu.h_best_objective = +std::numeric_limits<f_t>::infinity();
+
+  // nnz count
+  fj_cpu.cached_mtm_moves.resize(fj_cpu.h_coefficients.size(),
+                                 std::make_pair(0, fj_staged_score_t::zero()));
+
+  fj_cpu.cached_cstr_bounds.resize(fj_cpu.h_reverse_coefficients.size());
+  for (i_t var_idx = 0; var_idx < (i_t)fj_cpu.view.pb.n_variables; ++var_idx) {
+    auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+    for (i_t i = offset_begin; i < offset_end; ++i) {
+      fj_cpu.cached_cstr_bounds[i] =
+        std::make_pair(fj_cpu.h_cstr_lb[fj_cpu.h_reverse_constraints[i]],
+                       fj_cpu.h_cstr_ub[fj_cpu.h_reverse_constraints[i]]);
+    }
+  }
+
+  fj_cpu.flip_move_computed.resize(fj_cpu.view.pb.n_variables, false);
+  fj_cpu.var_bitmap.resize(fj_cpu.view.pb.n_variables, false);
+  fj_cpu.iter_mtm_vars.reserve(fj_cpu.view.pb.n_variables);
+
+  init_lhs(fj_cpu);
+}
+
+template <typename i_t, typename f_t>
+static void sanity_checks(fj_cpu_climber_t<i_t, f_t>& fj_cpu)
+{
+  // Check that each violated constraint is actually violated and not present in
+  // satisfied_constraints
+  for (const auto& cstr_idx : fj_cpu.violated_constraints) {
+    cuopt_assert(fj_cpu.satisfied_constraints.count(cstr_idx) == 0,
+                 "Violated constraint also in satisfied_constraints");
+    f_t lhs    = fj_cpu.h_lhs[cstr_idx];
+    f_t tol    = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+    f_t excess = fj_cpu.view.excess_score(cstr_idx, lhs);
+    cuopt_assert(excess < -tol, "Constraint in violated_constraints is not actually violated");
+  }
+
+  // Check that each satisfied constraint is actually satisfied and not present in
+  // violated_constraints
+  for (const auto& cstr_idx : fj_cpu.satisfied_constraints) {
+    cuopt_assert(fj_cpu.violated_constraints.count(cstr_idx) == 0,
+                 "Satisfied constraint also in violated_constraints");
+    f_t lhs    = fj_cpu.h_lhs[cstr_idx];
+    f_t tol    = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+    f_t excess = fj_cpu.view.excess_score(cstr_idx, lhs);
+    cuopt_assert(!(excess < -tol), "Constraint in satisfied_constraints is actually violated");
+  }
+
+  // Check that each constraint is in exactly one of violated_constraints or satisfied_constraints
+  for (i_t cstr_idx = 0; cstr_idx < fj_cpu.view.pb.n_constraints; ++cstr_idx) {
+    bool in_viol = fj_cpu.violated_constraints.count(cstr_idx) > 0;
+    bool in_sat  = fj_cpu.satisfied_constraints.count(cstr_idx) > 0;
+    cuopt_assert(
+      in_viol != in_sat,
+      "Constraint must be in exactly one of violated_constraints or satisfied_constraints");
+
+    cuopt_assert(fj_cpu.h_cstr_left_weights[cstr_idx] >= 0, "Weights should be positive or zero");
+    cuopt_assert(fj_cpu.h_cstr_right_weights[cstr_idx] >= 0, "Weights should be positive or zero");
+  }
+  cuopt_assert(fj_cpu.h_objective_weight >= 0, "Objective weight should be positive or zero");
+}
+
+template <typename i_t, typename f_t>
+std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
+  solution_t<i_t, f_t>& solution,
+  const std::vector<f_t>& left_weights,
+  const std::vector<f_t>& right_weights,
+  f_t objective_weight,
+  fj_settings_t settings,
+  bool randomize_params)
+{
+  raft::common::nvtx::range scope("fj_cpu_init");
+
+  auto fj_cpu = std::make_unique<fj_cpu_climber_t<i_t, f_t>>();
+
+  // Initialize fj_cpu with all the data
+  init_fj_cpu(*fj_cpu, solution, left_weights, right_weights, objective_weight);
+  fj_cpu->settings = settings;
+  if (randomize_params) {
+    auto rng                 = std::mt19937(cuopt::seed_generator::get_seed());
+    fj_cpu->mtm_viol_samples = std::uniform_int_distribution<i_t>(15, 50)(rng);
+    fj_cpu->mtm_sat_samples  = std::uniform_int_distribution<i_t>(10, 30)(rng);
+    fj_cpu->nnz_samples      = std::uniform_int_distribution<i_t>(2000, 15000)(rng);
+    fj_cpu->perturb_interval = std::uniform_int_distribution<i_t>(50, 500)(rng);
+  }
+  fj_cpu->settings.seed = cuopt::seed_generator::get_seed();
+  return fj_cpu;  // move
+}
+
+template <typename i_t, typename f_t>
+bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
+{
+  raft::common::nvtx::range scope("fj_cpu");
+
+  i_t local_mins       = 0;
+  auto loop_start      = std::chrono::high_resolution_clock::now();
+  auto time_limit      = std::chrono::milliseconds((int)(in_time_limit * 1000));
+  auto loop_time_start = std::chrono::high_resolution_clock::now();
+  while (!fj_cpu.halted) {
+    // Check if 5 seconds have passed
+    auto now = std::chrono::high_resolution_clock::now();
+    if (in_time_limit < std::numeric_limits<f_t>::infinity() &&
+        now - loop_time_start > time_limit) {
+      CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d\n",
+                      fj_cpu.log_prefix.c_str(),
+                      time_limit.count() / 1000.f,
+                      fj_cpu.iterations);
+      break;
+    }
+
+    fj_move_t move          = fj_move_t{-1, 0};
+    fj_staged_score_t score = fj_staged_score_t::invalid();
+    // Perform lift moves
+    if (fj_cpu.violated_constraints.empty()) { thrust::tie(move, score) = find_lift_move(fj_cpu); }
+    // Regular MTM
+    if (!(score > fj_staged_score_t::zero())) {
+      thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, fj_cpu.mtm_viol_samples);
+    }
+    // try with MTM in satisfied constraints
+    if (fj_cpu.feasible_found && !(score > fj_staged_score_t::zero())) {
+      thrust::tie(move, score) = find_mtm_move_sat(fj_cpu, fj_cpu.mtm_sat_samples);
+    }
+    // if we're in the feasible region but haven't found improvements in the last n iterations,
+    // perturb
+    bool should_perturb = false;
+    if (fj_cpu.violated_constraints.empty() &&
+        fj_cpu.iterations - fj_cpu.last_feasible_entrance_iter > fj_cpu.perturb_interval) {
+      should_perturb                     = true;
+      fj_cpu.last_feasible_entrance_iter = fj_cpu.iterations;
+    }
+
+    if (score > fj_staged_score_t::zero() && !should_perturb) {
+      apply_move(fj_cpu, move.var_idx, move.value, false);
+    } else {
+      // Local Min
+      update_weights(fj_cpu);
+      if (should_perturb) {
+        perturb(fj_cpu);
+        for (auto& cached_move : fj_cpu.cached_mtm_moves)
+          cached_move.first = 0;
+      }
+      thrust::tie(move, score) =
+        find_mtm_move_viol(fj_cpu, 1, true);  // pick a single random violated constraint
+      i_t var_idx = move.var_idx >= 0 ? move.var_idx : 0;
+      f_t delta   = move.var_idx >= 0 ? move.value : 0;
+      apply_move(fj_cpu, var_idx, delta, true);
+      ++local_mins;
+    }
+    if (fj_cpu.iterations % fj_cpu.log_interval == 0) {
+      CUOPT_LOG_TRACE(
+        "%sCPUFJ iteration: %d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, maxw "
+        "%g\n",
+        fj_cpu.log_prefix.c_str(),
+        fj_cpu.iterations,
+        local_mins,
+        fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective),
+        fj_cpu.violated_constraints.size(),
+        fj_cpu.h_objective_weight,
+        fj_cpu.max_weight);
+    }
+    // send current solution to callback every 3000 steps for diversity
+    if (fj_cpu.iterations % fj_cpu.diversity_callback_interval == 0) {
+      if (fj_cpu.diversity_callback) {
+        fj_cpu.diversity_callback(fj_cpu.h_incumbent_objective, fj_cpu.h_assignment);
+      }
+    }
+
+    // Print timing statistics every N iterations
+#if CPUFJ_TIMING_TRACE
+    if (fj_cpu.iterations % fj_cpu.timing_stats_interval == 0 && fj_cpu.iterations > 0) {
+      print_timing_stats(fj_cpu);
+    }
+#endif
+    cuopt_func_call(sanity_checks(fj_cpu));
+    fj_cpu.iterations++;
+  }
+  auto loop_end = std::chrono::high_resolution_clock::now();
+  double total_time =
+    std::chrono::duration_cast<std::chrono::duration<double>>(loop_end - loop_start).count();
+  double avg_time_per_iter = total_time / fj_cpu.iterations;
+  CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms\n",
+                  fj_cpu.log_prefix.c_str(),
+                  avg_time_per_iter * 1000.0);
+
+#if CPUFJ_TIMING_TRACE
+  // Print final timing statistics
+  CUOPT_LOG_TRACE("\n=== Final Timing Statistics ===\n");
+  print_timing_stats(fj_cpu);
+#endif
+
+  return fj_cpu.feasible_found;
+}
+
+#if MIP_INSTANTIATE_FLOAT
+template class fj_t<int, float>;
+#endif
+
+#if MIP_INSTANTIATE_DOUBLE
+template class fj_t<int, double>;
+#endif
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cuh b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
new file mode 100644
index 000000000..41993da02
--- /dev/null
+++ b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <functional>
+#include <unordered_set>
+#include <vector>
+
+#include <mip/feasibility_jump/feasibility_jump.cuh>
+
+namespace cuopt::linear_programming::detail {
+
+// NOTE: this seems an easy pick for reflection/xmacros once this is available (C++26?)
+// Maintaining a single source of truth for all members would be nice
+template <typename i_t, typename f_t>
+struct fj_cpu_climber_t {
+  fj_cpu_climber_t()                                                             = default;
+  fj_cpu_climber_t(const fj_cpu_climber_t<i_t, f_t>& other)                      = delete;
+  fj_cpu_climber_t<i_t, f_t>& operator=(const fj_cpu_climber_t<i_t, f_t>& other) = delete;
+
+  fj_cpu_climber_t(fj_cpu_climber_t<i_t, f_t>&& other)                      = default;
+  fj_cpu_climber_t<i_t, f_t>& operator=(fj_cpu_climber_t<i_t, f_t>&& other) = default;
+
+  problem_t<i_t, f_t>* pb_ptr;
+  fj_settings_t settings;
+  typename fj_t<i_t, f_t>::climber_data_t::view_t view;
+  // Host copies of device data as struct members
+  std::vector<f_t> h_reverse_coefficients;
+  std::vector<i_t> h_reverse_constraints;
+  std::vector<i_t> h_reverse_offsets;
+  std::vector<f_t> h_coefficients;
+  std::vector<i_t> h_offsets;
+  std::vector<i_t> h_variables;
+  std::vector<f_t> h_obj_coeffs;
+  std::vector<typename type_2<f_t>::type> h_var_bounds;
+  std::vector<f_t> h_cstr_lb;
+  std::vector<f_t> h_cstr_ub;
+  std::vector<var_t> h_var_types;
+  std::vector<i_t> h_is_binary_variable;
+  std::vector<i_t> h_objective_vars;
+  std::vector<i_t> h_binary_indices;
+
+  std::vector<i_t> h_tabu_nodec_until;
+  std::vector<i_t> h_tabu_noinc_until;
+  std::vector<i_t> h_tabu_lastdec;
+  std::vector<i_t> h_tabu_lastinc;
+
+  std::vector<f_t> h_lhs;
+  std::vector<f_t> h_lhs_sumcomp;
+  std::vector<f_t> h_cstr_left_weights;
+  std::vector<f_t> h_cstr_right_weights;
+  f_t max_weight;
+  std::vector<f_t> h_assignment;
+  std::vector<f_t> h_best_assignment;
+  f_t h_objective_weight;
+  f_t h_incumbent_objective;
+  f_t h_best_objective;
+  i_t last_feasible_entrance_iter{0};
+  i_t iterations;
+  std::unordered_set<i_t> violated_constraints;
+  std::unordered_set<i_t> satisfied_constraints;
+  bool feasible_found{false};
+
+  // Timing data structures
+  std::vector<double> find_lift_move_times;
+  std::vector<double> find_mtm_move_viol_times;
+  std::vector<double> find_mtm_move_sat_times;
+  std::vector<double> apply_move_times;
+  std::vector<double> update_weights_times;
+  std::vector<double> compute_score_times;
+
+  i_t hit_count{0};
+  i_t miss_count{0};
+
+  i_t candidate_move_hits[3]   = {0};
+  i_t candidate_move_misses[3] = {0};
+
+  // vector<bool> is actually likely beneficial here since we're memory bound
+  std::vector<bool> flip_move_computed;
+  ;
+  // CSR nnz offset -> (delta, score)
+  std::vector<std::pair<f_t, fj_staged_score_t>> cached_mtm_moves;
+
+  // CSC (transposed!) nnz-offset-indexed constraint bounds (lb, ub)
+  // std::pair<f_t, f_t> better compile down to 16 bytes!! GCC do your job!
+  std::vector<std::pair<f_t, f_t>> cached_cstr_bounds;
+
+  std::vector<bool> var_bitmap;
+  std::vector<i_t> iter_mtm_vars;
+
+  i_t mtm_viol_samples{25};
+  i_t mtm_sat_samples{15};
+  i_t nnz_samples{50000};
+  i_t perturb_interval{100};
+
+  i_t log_interval{1000};
+  i_t diversity_callback_interval{3000};
+  i_t timing_stats_interval{5000};
+
+  std::function<void(f_t, const std::vector<f_t>&)> improvement_callback{nullptr};
+  std::function<void(f_t, const std::vector<f_t>&)> diversity_callback{nullptr};
+  std::string log_prefix{""};
+
+  std::atomic<bool> halted{false};
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/feasibility_jump/load_balancing.cuh b/cpp/src/mip/feasibility_jump/load_balancing.cuh
index 67af9c06a..d8c43df16 100644
--- a/cpp/src/mip/feasibility_jump/load_balancing.cuh
+++ b/cpp/src/mip/feasibility_jump/load_balancing.cuh
@@ -139,7 +139,7 @@ __global__ void load_balancing_prepare_iteration(const __grid_constant__
 
     for (i_t i = blockIdx.x + range.first; i < range.second; i += gridDim.x) {
       i_t var_idx = fj.pb.related_variables[i];
-      update_jump_value<i_t, f_t, FJ_MTM_VIOLATED, false>(fj, var_idx);
+      update_jump_value<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, false>(fj, var_idx);
     }
 
     if (FIRST_THREAD) *fj.load_balancing_skip = true;
diff --git a/cpp/src/mip/local_search/local_search.cu b/cpp/src/mip/local_search/local_search.cu
index f0497d382..196c6bdac 100644
--- a/cpp/src/mip/local_search/local_search.cu
+++ b/cpp/src/mip/local_search/local_search.cu
@@ -26,10 +26,85 @@
 #include <utilities/seed_generator.cuh>
 #include <utilities/timer.hpp>
 
+#include <mip/feasibility_jump/fj_cpu.cuh>
+
 #include <cuda_profiler_api.h>
 
+#include <future>
+
 namespace cuopt::linear_programming::detail {
 
+template <typename i_t, typename f_t>
+cpu_fj_thread_t<i_t, f_t>::cpu_fj_thread_t()
+{
+  cpu_worker = std::thread(&cpu_fj_thread_t<i_t, f_t>::cpu_worker_thread, this);
+}
+
+template <typename i_t, typename f_t>
+cpu_fj_thread_t<i_t, f_t>::~cpu_fj_thread_t()
+{
+  if (!cpu_thread_terminate) { kill_cpu_solver(); }
+}
+
+template <typename i_t, typename f_t>
+void cpu_fj_thread_t<i_t, f_t>::cpu_worker_thread()
+{
+  while (!cpu_thread_terminate) {
+    // Wait for start signal
+    {
+      std::unique_lock<std::mutex> lock(cpu_mutex);
+      cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; });
+    }
+
+    if (cpu_thread_terminate) break;
+
+    // Run CPU solver
+    {
+      raft::common::nvtx::range fun_scope("Running CPU FJ");
+      cpu_fj_solution_found = fj_ptr->cpu_solve(*fj_cpu);
+    }
+
+    cpu_thread_should_start = false;
+    cpu_thread_done         = true;
+  }
+}
+
+template <typename i_t, typename f_t>
+void cpu_fj_thread_t<i_t, f_t>::kill_cpu_solver()
+{
+  cpu_thread_terminate = true;
+  if (fj_cpu) fj_cpu->halted = true;
+  cpu_cv.notify_one();
+  cpu_worker.join();
+}
+
+template <typename i_t, typename f_t>
+void cpu_fj_thread_t<i_t, f_t>::start_cpu_solver()
+{
+  cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null");
+  // Reset flags
+  cpu_thread_done         = false;
+  cpu_thread_should_start = true;
+  fj_cpu->halted          = false;
+  cpu_cv.notify_one();
+}
+
+template <typename i_t, typename f_t>
+void cpu_fj_thread_t<i_t, f_t>::stop_cpu_solver()
+{
+  fj_cpu->halted = true;
+}
+
+template <typename i_t, typename f_t>
+bool cpu_fj_thread_t<i_t, f_t>::wait_for_cpu_solver()
+{
+  while (!cpu_thread_done && !cpu_thread_terminate) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  return cpu_fj_solution_found;
+}
+
 template <typename i_t, typename f_t>
 local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context_,
                                          rmm::device_uvector<f_t>& lp_optimal_solution_)
@@ -50,6 +125,174 @@ local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context
     rng(cuopt::seed_generator::get_seed()),
     problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr)
 {
+  for (auto& cpu_fj : ls_cpu_fj) {
+    cpu_fj.fj_ptr = &fj;
+  }
+  for (auto& cpu_fj : scratch_cpu_fj) {
+    cpu_fj.fj_ptr = &fj;
+  }
+  scratch_cpu_fj_on_lp_opt.fj_ptr = &fj;
+}
+
+static double local_search_best_obj       = std::numeric_limits<double>::max();
+static population_t<int, double>* pop_ptr = nullptr;
+
+template <typename i_t, typename f_t>
+void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t>& population)
+{
+  pop_ptr = &population;
+
+  std::vector<f_t> default_weights(context.problem_ptr->n_constraints, 1.);
+
+  solution_t<i_t, f_t> solution(*context.problem_ptr);
+  thrust::fill(solution.handle_ptr->get_thrust_policy(),
+               solution.assignment.begin(),
+               solution.assignment.end(),
+               0.0);
+  solution.clamp_within_bounds();
+  i_t counter = 0;
+  for (auto& cpu_fj : scratch_cpu_fj) {
+    if (counter > 0) solution.assign_random_within_bounds(0.4);
+    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
+                                                      default_weights,
+                                                      default_weights,
+                                                      0.,
+                                                      fj_settings_t{},
+                                                      /*randomize=*/counter > 0);
+
+    cpu_fj.fj_cpu->log_prefix           = "******* scratch " + std::to_string(counter) + ": ";
+    cpu_fj.fj_cpu->improvement_callback = [this, &population, &cpu_fj](
+                                            f_t obj, const std::vector<f_t>& h_vec) {
+      population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
+      if (obj < local_search_best_obj) {
+        CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
+                        context.problem_ptr->get_user_obj_from_solver_obj(obj),
+                        context.problem_ptr->get_user_obj_from_solver_obj(
+                          population.is_feasible() ? population.best_feasible().get_objective()
+                                                   : std::numeric_limits<f_t>::max()));
+        local_search_best_obj = obj;
+      }
+    };
+    counter++;
+  };
+
+  // default weights
+  cudaDeviceSynchronize();
+
+  for (auto& cpu_fj : scratch_cpu_fj) {
+    cpu_fj.start_cpu_solver();
+  }
+
+  solution_t<i_t, f_t> solution_lp(*context.problem_ptr);
+  solution_lp.copy_new_assignment(host_copy(lp_optimal_solution));
+  solution_lp.round_random_nearest(500);
+  scratch_cpu_fj_on_lp_opt.fj_cpu =
+    fj.create_cpu_climber(solution_lp, default_weights, default_weights, 0.);
+  scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: ";
+  scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback =
+    [this, &population](f_t obj, const std::vector<f_t>& h_vec) {
+      population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
+      if (obj < local_search_best_obj) {
+        CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
+                        context.problem_ptr->get_user_obj_from_solver_obj(obj),
+                        context.problem_ptr->get_user_obj_from_solver_obj(
+                          population.is_feasible() ? population.best_feasible().get_objective()
+                                                   : std::numeric_limits<f_t>::max()));
+        local_search_best_obj = obj;
+      }
+    };
+
+  // default weights
+  cudaDeviceSynchronize();
+  scratch_cpu_fj_on_lp_opt.start_cpu_solver();
+}
+
+template <typename i_t, typename f_t>
+void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
+{
+  for (auto& cpu_fj : scratch_cpu_fj) {
+    cpu_fj.kill_cpu_solver();
+  }
+  scratch_cpu_fj_on_lp_opt.kill_cpu_solver();
+}
+
+template <typename i_t, typename f_t>
+bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
+                                           fj_t<i_t, f_t>& in_fj,
+                                           const std::string& source)
+{
+  auto h_weights          = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream());
+  auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream());
+  for (auto& cpu_fj : ls_cpu_fj) {
+    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(
+      solution, h_weights, h_weights, h_objective_weight, fj_settings_t{}, true);
+  }
+
+  auto solution_copy = solution;
+
+  // Start CPU solver in background thread
+  for (auto& cpu_fj : ls_cpu_fj) {
+    cpu_fj.start_cpu_solver();
+  }
+
+  // Run GPU solver and measure execution time
+  auto gpu_fj_start = std::chrono::high_resolution_clock::now();
+  in_fj.solve(solution);
+
+  // Stop CPU solver
+  for (auto& cpu_fj : ls_cpu_fj) {
+    cpu_fj.stop_cpu_solver();
+  }
+
+  auto gpu_fj_end        = std::chrono::high_resolution_clock::now();
+  double gpu_fj_duration = std::chrono::duration<double>(gpu_fj_end - gpu_fj_start).count();
+
+  solution_t<i_t, f_t> solution_cpu(*solution.problem_ptr);
+
+  f_t best_cpu_obj = std::numeric_limits<f_t>::max();
+  // // Wait for CPU solver to finish
+  for (auto& cpu_fj : ls_cpu_fj) {
+    bool cpu_sol_found = cpu_fj.wait_for_cpu_solver();
+    if (cpu_sol_found) {
+      f_t cpu_obj = cpu_fj.fj_cpu->h_best_objective;
+      if (cpu_obj < best_cpu_obj) {
+        best_cpu_obj = cpu_obj;
+        solution_cpu.copy_new_assignment(cpu_fj.fj_cpu->h_best_assignment);
+        solution_cpu.compute_feasibility();
+      }
+    }
+  }
+  bool cpu_sol_found = best_cpu_obj < std::numeric_limits<f_t>::max();
+
+  bool gpu_feasible = solution.get_feasible();
+  bool cpu_feasible = cpu_sol_found && solution_cpu.get_feasible();
+
+  static std::unordered_map<std::string, int> total_calls;
+  static std::unordered_map<std::string, int> cpu_better;
+
+  CUOPT_LOG_DEBUG("GPU FJ returns feas %d, obj %g", gpu_feasible, solution.get_user_objective());
+  CUOPT_LOG_DEBUG("CPU FJ returns feas %d, obj %g, stats %d/%d",
+                  cpu_feasible,
+                  solution_cpu.get_user_objective(),
+                  total_calls[source],
+                  cpu_better[source]);
+
+  total_calls[source]++;
+  if (cpu_feasible && !gpu_feasible ||
+      (cpu_feasible && solution_cpu.get_objective() < solution.get_objective())) {
+    CUOPT_LOG_DEBUG(
+      "CPU FJ returns better solution! cpu_obj %g, gpu_obj %g, stats %d/%d, source %s",
+      solution_cpu.get_user_objective(),
+      solution.get_user_objective(),
+      total_calls[source],
+      cpu_better[source],
+      source.c_str());
+    solution.copy_from(solution_cpu);
+    cpu_better[source]++;
+  }
+  solution.compute_feasibility();
+
+  return cpu_feasible;
 }
 
 template <typename i_t, typename f_t>
@@ -73,7 +316,7 @@ void local_search_t<i_t, f_t>::generate_fast_solution(solution_t<i_t, f_t>& solu
     if (timer.check_time_limit()) { return; };
     fj.settings.time_limit = std::min(3., timer.remaining_time());
     // run fj on the solution
-    fj.solve(solution);
+    do_fj_solve(solution, fj, "fast");
     // TODO check if FJ returns the same solution
     // check if the solution is feasible
     if (solution.compute_feasibility()) { return; }
@@ -136,7 +379,7 @@ bool local_search_t<i_t, f_t>::run_fj_until_timer(solution_t<i_t, f_t>& solution
   fj.settings.update_weights         = false;
   fj.settings.feasibility_run        = false;
   fj.copy_weights(weights, solution.handle_ptr);
-  fj.solve(solution);
+  do_fj_solve(solution, fj, "until_timer");
   CUOPT_LOG_DEBUG("Initial FJ feasibility done");
   is_feasible = solution.compute_feasibility();
   if (fj.settings.feasibility_run || timer.check_time_limit()) { return is_feasible; }
@@ -150,6 +393,12 @@ bool local_search_t<i_t, f_t>::run_fj_annealing(solution_t<i_t, f_t>& solution,
 {
   auto prev_settings = fj.settings;
 
+  solution.compute_feasibility();
+  CUOPT_LOG_DEBUG("Running FJ Annealing on solution with obj %g/%g, feas? %d",
+                  solution.get_user_objective(),
+                  solution.get_objective(),
+                  solution.get_feasible());
+
   // run in FEASIBLE_FIRST to priorize feasibility-improving moves
   fj.settings.n_of_minimums_for_exit                    = ls_config.n_local_mins;
   fj.settings.mode                                      = fj_mode_t::EXIT_NON_IMPROVING;
@@ -159,7 +408,7 @@ bool local_search_t<i_t, f_t>::run_fj_annealing(solution_t<i_t, f_t>& solution,
   fj.settings.parameters.allow_infeasibility_iterations = 100;
   fj.settings.update_weights                            = 1;
   fj.settings.baseline_objective_for_longer_run         = ls_config.best_objective_of_parents;
-  fj.solve(solution);
+  do_fj_solve(solution, fj, "annealing");
   bool is_feasible = solution.compute_feasibility();
 
   fj.settings = prev_settings;
@@ -225,7 +474,7 @@ bool local_search_t<i_t, f_t>::check_fj_on_lp_optimal(solution_t<i_t, f_t>& solu
   fj.settings.update_weights         = true;
   fj.settings.feasibility_run        = false;
   fj.settings.time_limit             = std::min(30., timer.remaining_time());
-  fj.solve(solution);
+  do_fj_solve(solution, fj, "on_lp_optimal");
   return solution.get_feasible();
 }
 
@@ -242,7 +491,7 @@ bool local_search_t<i_t, f_t>::run_fj_on_zero(solution_t<i_t, f_t>& solution, ti
   fj.settings.update_weights         = true;
   fj.settings.feasibility_run        = false;
   fj.settings.time_limit             = std::min(30., timer.remaining_time());
-  bool is_feasible                   = fj.solve(solution);
+  bool is_feasible                   = do_fj_solve(solution, fj, "on_zero");
   return is_feasible;
 }
 
diff --git a/cpp/src/mip/local_search/local_search.cuh b/cpp/src/mip/local_search/local_search.cuh
index d878b4b55..2a30db2b8 100644
--- a/cpp/src/mip/local_search/local_search.cuh
+++ b/cpp/src/mip/local_search/local_search.cuh
@@ -24,6 +24,12 @@
 #include <mip/solver.cuh>
 #include <utilities/timer.hpp>
 
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
 namespace cuopt::linear_programming::detail {
 
 // make sure RANDOM is always the last
@@ -40,18 +46,44 @@ struct ls_config_t {
   f_t best_objective_of_parents{std::numeric_limits<f_t>::lowest()};
   i_t n_local_mins_for_line_segment       = 50;
   i_t n_points_to_search_for_line_segment = 5;
-  i_t n_local_mins                        = 250;
+  i_t n_local_mins                        = 2500;
   i_t iteration_limit_for_line_segment    = 20 * n_local_mins_for_line_segment;
   i_t iteration_limit                     = 20 * n_local_mins;
   ls_method_t ls_method                   = ls_method_t::RANDOM;
 };
 
+template <typename i_t, typename f_t>
+struct cpu_fj_thread_t {
+  cpu_fj_thread_t();
+  ~cpu_fj_thread_t();
+
+  void cpu_worker_thread();
+  void start_cpu_solver();
+  void stop_cpu_solver();
+  bool wait_for_cpu_solver();  // return feasibility
+  void kill_cpu_solver();
+
+  std::thread cpu_worker;
+  std::mutex cpu_mutex;
+  std::condition_variable cpu_cv;
+  std::atomic<bool> should_stop{false};
+  std::atomic<bool> cpu_thread_should_start{false};
+  std::atomic<bool> cpu_thread_done{false};
+  std::atomic<bool> cpu_thread_terminate{false};
+  bool cpu_fj_solution_found{false};
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu;
+  fj_t<i_t, f_t>* fj_ptr{nullptr};
+};
+
 template <typename i_t, typename f_t>
 class local_search_t {
  public:
   local_search_t() = delete;
   local_search_t(mip_solver_context_t<i_t, f_t>& context,
                  rmm::device_uvector<f_t>& lp_optimal_solution_);
+
+  void start_cpufj_scratch_threads(population_t<i_t, f_t>& population);
+  void stop_cpufj_scratch_threads();
   void generate_fast_solution(solution_t<i_t, f_t>& solution, timer_t timer);
   bool generate_solution(solution_t<i_t, f_t>& solution,
                          bool perturb,
@@ -79,6 +111,10 @@ class local_search_t {
               timer_t timer,
               population_t<i_t, f_t>* population_ptr = nullptr);
   void resize_vectors(problem_t<i_t, f_t>& problem, const raft::handle_t* handle_ptr);
+
+  bool do_fj_solve(solution_t<i_t, f_t>& solution, fj_t<i_t, f_t>& fj, const std::string& source);
+
+  i_t ls_threads() const { return ls_cpu_fj.size() + scratch_cpu_fj.size(); }
   void save_solution_and_add_cutting_plane(solution_t<i_t, f_t>& solution,
                                            rmm::device_uvector<f_t>& best_solution,
                                            f_t& best_objective);
@@ -97,11 +133,14 @@ class local_search_t {
   bool lp_optimal_exists{false};
   rmm::device_uvector<f_t> fj_sol_on_lp_opt;
   fj_t<i_t, f_t> fj;
-  // fj_tree_t<i_t, f_t> fj_tree;
   constraint_prop_t<i_t, f_t> constraint_prop;
   line_segment_search_t<i_t, f_t> line_segment_search;
   feasibility_pump_t<i_t, f_t> fp;
   std::mt19937 rng;
+
+  std::array<cpu_fj_thread_t<i_t, f_t>, 8> ls_cpu_fj;
+  std::array<cpu_fj_thread_t<i_t, f_t>, 1> scratch_cpu_fj;
+  cpu_fj_thread_t<i_t, f_t> scratch_cpu_fj_on_lp_opt;
   problem_t<i_t, f_t> problem_with_objective_cut;
   bool cutting_plane_added_for_active_run{false};
 };
diff --git a/cpp/src/mip/local_search/rounding/simple_rounding.cu b/cpp/src/mip/local_search/rounding/simple_rounding.cu
index 17c8793e3..bc7f9c58f 100644
--- a/cpp/src/mip/local_search/rounding/simple_rounding.cu
+++ b/cpp/src/mip/local_search/rounding/simple_rounding.cu
@@ -82,12 +82,44 @@ bool check_brute_force_rounding(solution_t<i_t, f_t>& solution)
   return false;
 }
 
+template <typename i_t, typename f_t>
+bool invoke_simple_rounding(solution_t<i_t, f_t>& solution)
+{
+  solution.compute_feasibility();
+
+  solution_t<i_t, f_t> sol_copy(*solution.problem_ptr);
+  sol_copy.copy_from(solution);
+
+  rmm::device_scalar<bool> successful(true, solution.handle_ptr->get_stream());
+  i_t TPB = 128;
+  simple_rounding_kernel<i_t, f_t>
+    <<<2048, TPB, 0, solution.handle_ptr->get_stream()>>>(solution.view(), successful.data());
+  if (!successful.value(solution.handle_ptr->get_stream())) {
+    CUOPT_LOG_DEBUG("Simple rounding failed");
+    solution.copy_from(sol_copy);
+    return false;
+  } else {
+    solution.compute_feasibility();
+    CUOPT_LOG_DEBUG("Simple rounding successful");
+    CUOPT_LOG_DEBUG("objective %g, feas %d, excess %g, int violation %g",
+                    solution.get_user_objective(),
+                    solution.get_feasible(),
+                    solution.get_total_excess(),
+                    solution.compute_max_int_violation());
+    return true;
+  }
+}
+
 template <typename i_t, typename f_t>
 void invoke_round_nearest(solution_t<i_t, f_t>& solution)
 {
   i_t TPB                     = 128;
   bool brute_force_found_feas = check_brute_force_rounding(solution);
   if (brute_force_found_feas) { return; }
+
+  bool simple_round = invoke_simple_rounding(solution);
+  if (simple_round) { return; }
+
   i_t n_blocks = (solution.problem_ptr->n_integer_vars + TPB - 1) / TPB;
   nearest_rounding_kernel<i_t, f_t><<<n_blocks, TPB, 0, solution.handle_ptr->get_stream()>>>(
     solution.view(), cuopt::seed_generator::get_seed());
@@ -153,6 +185,7 @@ void invoke_correct_integers(solution_t<i_t, f_t>& solution, f_t tol)
   template void invoke_random_round_nearest<int, F_TYPE>(solution_t<int, F_TYPE> & solution, \
                                                          int n_target_random_rounds);        \
   template void invoke_round_nearest<int, F_TYPE>(solution_t<int, F_TYPE> & solution);       \
+  template bool invoke_simple_rounding<int, F_TYPE>(solution_t<int, F_TYPE> & solution);     \
   template void invoke_correct_integers<int, F_TYPE>(solution_t<int, F_TYPE> & solution,     \
                                                      F_TYPE tol);
 
diff --git a/cpp/src/mip/local_search/rounding/simple_rounding.cuh b/cpp/src/mip/local_search/rounding/simple_rounding.cuh
index 5e1edd1fe..47416c3ff 100644
--- a/cpp/src/mip/local_search/rounding/simple_rounding.cuh
+++ b/cpp/src/mip/local_search/rounding/simple_rounding.cuh
@@ -24,6 +24,9 @@ namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 void invoke_round_nearest(solution_t<i_t, f_t>& solution);
 
+template <typename i_t, typename f_t>
+bool invoke_simple_rounding(solution_t<i_t, f_t>& solution);
+
 template <typename i_t, typename f_t>
 void invoke_random_round_nearest(solution_t<i_t, f_t>& solution, i_t n_target_random_rounds);
 
diff --git a/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh b/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
index 1c0103110..bc0e2157d 100644
--- a/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
+++ b/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
@@ -25,6 +25,58 @@
 
 namespace cuopt::linear_programming::detail {
 
+template <typename i_t, typename f_t>
+__global__ void simple_rounding_kernel(typename solution_t<i_t, f_t>::view_t solution,
+                                       bool* successful)
+{
+  if (TH_ID_X >= solution.problem.n_integer_vars) { return; }
+  i_t var_id   = solution.problem.integer_indices[TH_ID_X];
+  f_t curr_val = solution.assignment[var_id];
+  if (solution.problem.is_integer(curr_val)) { return; }
+
+  i_t up_locks   = 0;
+  i_t down_locks = 0;
+
+  auto [offset_begin, offset_end] = solution.problem.reverse_range_for_var(var_id);
+  for (i_t i = offset_begin; i < offset_end; i += 1) {
+    auto cstr_idx   = solution.problem.reverse_constraints[i];
+    auto cstr_coeff = solution.problem.reverse_coefficients[i];
+
+    // boxed constraint. can't be rounded safely
+    if (std::isfinite(solution.problem.constraint_lower_bounds[cstr_idx]) &&
+        std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx])) {
+      up_locks += 1;
+      down_locks += 1;
+      continue;
+    }
+
+    f_t sign = std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx]) ? 1 : -1;
+
+    if (cstr_coeff * sign > 0) {
+      up_locks += 1;
+    } else {
+      down_locks += 1;
+    }
+  }
+
+  bool can_round_up   = up_locks == 0;
+  bool can_round_down = down_locks == 0;
+
+  if (can_round_up && can_round_down) {
+    if (solution.problem.objective_coefficients[var_id] > 0) {
+      solution.assignment[var_id] = floor(curr_val);
+    } else {
+      solution.assignment[var_id] = ceil(curr_val);
+    }
+  } else if (can_round_up) {
+    solution.assignment[var_id] = ceil(curr_val);
+  } else if (can_round_down) {
+    solution.assignment[var_id] = floor(curr_val);
+  } else {
+    *successful = false;
+  }
+}
+
 // rounds each integer variable to the nearest integer value that doesn't violate the bounds
 template <typename i_t, typename f_t>
 __global__ void nearest_rounding_kernel(typename solution_t<i_t, f_t>::view_t solution,
diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu
index 598534c61..aa9407452 100644
--- a/cpp/src/mip/problem/problem.cu
+++ b/cpp/src/mip/problem/problem.cu
@@ -85,7 +85,7 @@ void problem_t<i_t, f_t>::op_problem_cstr_body(const optimization_problem_t<i_t,
   check_bounds_sanity(*this);
 
   // Check before any modifications
-  check_problem_representation(false, false);
+  cuopt_func_call(check_problem_representation(false, false));
   // If maximization problem, convert the problem
   if (maximize) convert_to_maximization_problem(*this);
   if (is_mip) {
@@ -97,7 +97,7 @@ void problem_t<i_t, f_t>::op_problem_cstr_body(const optimization_problem_t<i_t,
   }
   compute_transpose_of_problem();
   // Check after modifications
-  check_problem_representation(true, is_mip);
+  cuopt_func_call(check_problem_representation(true, is_mip));
   combine_constraint_bounds<i_t, f_t>(*this, combined_bounds);
 }
 
@@ -712,7 +712,7 @@ void problem_t<i_t, f_t>::recompute_auxilliary_data(bool check_representation)
   // TODO: speedup compute related variables
   const double time_limit = 30.;
   compute_related_variables(time_limit);
-  if (check_representation) check_problem_representation(true);
+  if (check_representation) cuopt_func_call(check_problem_representation(true));
 }
 
 template <typename i_t, typename f_t>
@@ -1251,7 +1251,7 @@ void problem_t<i_t, f_t>::remove_given_variables(problem_t<i_t, f_t>& original_p
   combine_constraint_bounds<i_t, f_t>(*this, combined_bounds);
   handle_ptr->sync_stream();
   recompute_auxilliary_data();
-  check_problem_representation(true);
+  cuopt_func_call(check_problem_representation(true));
 }
 
 template <typename i_t, typename f_t>
@@ -1281,7 +1281,7 @@ void problem_t<i_t, f_t>::compute_integer_fixed_problem()
   thrust::fill(handle_ptr->get_thrust_policy(), assignment.begin(), assignment.end(), 0.);
   integer_fixed_problem = std::make_shared<problem_t<i_t, f_t>>(get_problem_after_fixing_vars(
     assignment, integer_indices, integer_fixed_variable_map, handle_ptr));
-  integer_fixed_problem->check_problem_representation(true);
+  cuopt_func_call(integer_fixed_problem->check_problem_representation(true));
   integer_fixed_problem->lp_state.resize(*integer_fixed_problem, handle_ptr->get_stream());
 }
 
@@ -1306,7 +1306,7 @@ void problem_t<i_t, f_t>::fill_integer_fixed_problem(rmm::device_uvector<f_t>& a
   integer_fixed_problem->fix_given_variables(*this, assignment, integer_indices, handle_ptr);
   combine_constraint_bounds<i_t, f_t>(*integer_fixed_problem,
                                       integer_fixed_problem->combined_bounds);
-  integer_fixed_problem->check_problem_representation(true);
+  cuopt_func_call(integer_fixed_problem->check_problem_representation(true));
 }
 
 template <typename i_t, typename f_t>
@@ -1445,7 +1445,7 @@ void problem_t<i_t, f_t>::preprocess_problem()
   standardize_bounds(variable_constraint_map, *this);
   compute_csr(variable_constraint_map, *this);
   compute_transpose_of_problem();
-  check_problem_representation(true, false);
+  cuopt_func_call(check_problem_representation(true, false));
   presolve_data.initialize_var_mapping(*this, handle_ptr);
   integer_indices.resize(n_variables, handle_ptr->get_stream());
   is_binary_variable.resize(n_variables, handle_ptr->get_stream());
@@ -1455,7 +1455,7 @@ void problem_t<i_t, f_t>::preprocess_problem()
   std::iota(reverse_original_ids.begin(), reverse_original_ids.end(), 0);
   compute_n_integer_vars();
   compute_binary_var_table();
-  check_problem_representation(true);
+  cuopt_func_call(check_problem_representation(true));
   preprocess_called = true;
 }
 
@@ -1556,7 +1556,7 @@ void problem_t<i_t, f_t>::get_host_user_problem(
   }
 }
 template <typename i_t, typename f_t>
-f_t problem_t<i_t, f_t>::get_user_obj_from_solver_obj(f_t solver_obj)
+f_t problem_t<i_t, f_t>::get_user_obj_from_solver_obj(f_t solver_obj) const
 {
   return presolve_data.objective_scaling_factor * (solver_obj + presolve_data.objective_offset);
 }
diff --git a/cpp/src/mip/problem/problem.cuh b/cpp/src/mip/problem/problem.cuh
index f9c147b89..c210c3cfa 100644
--- a/cpp/src/mip/problem/problem.cuh
+++ b/cpp/src/mip/problem/problem.cuh
@@ -99,7 +99,7 @@ class problem_t {
                                bool resize_to_original_problem = true);
   void post_process_solution(solution_t<i_t, f_t>& solution);
   void compute_transpose_of_problem();
-  f_t get_user_obj_from_solver_obj(f_t solver_obj);
+  f_t get_user_obj_from_solver_obj(f_t solver_obj) const;
   void compute_integer_fixed_problem();
   void fill_integer_fixed_problem(rmm::device_uvector<f_t>& assignment,
                                   const raft::handle_t* handle_ptr);
@@ -116,23 +116,23 @@ class problem_t {
   void compute_vars_with_objective_coeffs();
 
   struct view_t {
-    DI std::pair<i_t, i_t> reverse_range_for_var(i_t v) const
+    HDI std::pair<i_t, i_t> reverse_range_for_var(i_t v) const
     {
       cuopt_assert(v >= 0 && v < n_variables, "Variable should be within the range");
       return std::make_pair(reverse_offsets[v], reverse_offsets[v + 1]);
     }
 
-    DI std::pair<i_t, i_t> range_for_constraint(i_t c) const
+    HDI std::pair<i_t, i_t> range_for_constraint(i_t c) const
     {
       return std::make_pair(offsets[c], offsets[c + 1]);
     }
 
-    DI std::pair<i_t, i_t> range_for_related_vars(i_t v) const
+    HDI std::pair<i_t, i_t> range_for_related_vars(i_t v) const
     {
       return std::make_pair(related_variables_offsets[v], related_variables_offsets[v + 1]);
     }
 
-    DI bool check_variable_within_bounds(i_t v, f_t val) const
+    HDI bool check_variable_within_bounds(i_t v, f_t val) const
     {
       const f_t int_tol = tolerances.integrality_tolerance;
       auto bounds       = variable_bounds[v];
@@ -141,20 +141,20 @@ class problem_t {
       return within_bounds;
     }
 
-    DI bool is_integer_var(i_t v) const { return var_t::INTEGER == variable_types[v]; }
+    HDI bool is_integer_var(i_t v) const { return var_t::INTEGER == variable_types[v]; }
 
     // check if the variable is integer according to the tolerances
     // specified for this problem
-    DI bool is_integer(f_t val) const
+    HDI bool is_integer(f_t val) const
     {
       return raft::abs(round(val) - (val)) <= tolerances.integrality_tolerance;
     }
-    DI bool integer_equal(f_t val1, f_t val2) const
+    HDI bool integer_equal(f_t val1, f_t val2) const
     {
       return raft::abs(val1 - val2) <= tolerances.integrality_tolerance;
     }
 
-    DI f_t get_random_for_var(i_t v, raft::random::PCGenerator& rng) const
+    HDI f_t get_random_for_var(i_t v, raft::random::PCGenerator& rng) const
     {
       cuopt_assert(var_t::INTEGER != variable_types[v],
                    "Random value can only be called on continuous values");
diff --git a/cpp/src/mip/solution/solution.cu b/cpp/src/mip/solution/solution.cu
index 7c95aaeed..cdffa37b0 100644
--- a/cpp/src/mip/solution/solution.cu
+++ b/cpp/src/mip/solution/solution.cu
@@ -287,6 +287,13 @@ void solution_t<i_t, f_t>::compute_constraints()
 {
   if (problem_ptr->n_constraints == 0) { return; }
 
+  cuopt_assert(problem_ptr->n_constraints == problem_ptr->constraint_upper_bounds.size(),
+               "invalid assignment size");
+  // TODO: investigate why sometimes the sizes are incorrectly set before the kernel call
+  constraint_value.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
+  lower_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
+  upper_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
+
   i_t TPB = 64;
   compute_constraint_values<i_t, f_t>
     <<<problem_ptr->n_constraints, TPB, 0, handle_ptr->get_stream()>>>(view());
@@ -372,6 +379,13 @@ bool solution_t<i_t, f_t>::round_random_nearest(i_t n_target_random_rounds)
   return compute_feasibility();
 }
 
+template <typename i_t, typename f_t>
+bool solution_t<i_t, f_t>::round_simple()
+{
+  invoke_simple_rounding(*this);
+  return compute_feasibility();
+}
+
 template <typename i_t, typename f_t>
 void solution_t<i_t, f_t>::correct_integer_precision()
 {
diff --git a/cpp/src/mip/solution/solution.cuh b/cpp/src/mip/solution/solution.cuh
index 0774bdef2..956434c90 100644
--- a/cpp/src/mip/solution/solution.cuh
+++ b/cpp/src/mip/solution/solution.cuh
@@ -62,6 +62,7 @@ class solution_t {
   bool round_nearest();
   // rounds integers to random if fractionality is between 0.25 and 0.75. otherwise, to nearest
   bool round_random_nearest(i_t n_target_random_rounds);
+  bool round_simple();
   // makes the approximate integer values up to INTEGRALITY TOLERANCE whole integers
   void correct_integer_precision();
   // does a reduction and returns if the current solution is feasible
diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu
index 7941f41dc..22638c4f9 100644
--- a/cpp/src/mip/solve.cu
+++ b/cpp/src/mip/solve.cu
@@ -90,6 +90,8 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
     solution.compute_objective();  // just to ensure h_user_obj is set
     auto stats           = solver_stats_t<i_t, f_t>{};
     stats.solution_bound = solution.get_user_objective();
+    // log the objective for scripts which need it
+    CUOPT_LOG_INFO("Best feasible: %f", solution.get_user_objective());
     return solution.get_solution(true, stats, false);
   }
   // problem contains unpreprocessed data
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index ea3272419..a9b7615ed 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -73,7 +73,7 @@ struct branch_and_bound_solution_helper_t {
 
   void solution_callback(std::vector<f_t>& solution, f_t objective)
   {
-    dm->population.add_external_solution(solution, objective);
+    dm->population.add_external_solution(solution, objective, solution_origin_t::BRANCH_AND_BOUND);
   }
 
   void set_simplex_solution(std::vector<f_t>& solution,
@@ -175,6 +175,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     if (context.settings.num_cpu_threads != -1) {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
+    CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads);
 
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
diff --git a/cpp/tests/mip/termination_test.cu b/cpp/tests/mip/termination_test.cu
index ce2003c40..90221d3f1 100644
--- a/cpp/tests/mip/termination_test.cu
+++ b/cpp/tests/mip/termination_test.cu
@@ -104,9 +104,9 @@ TEST(termination_status, optimality_test)
 // Ensure the lower bound on maximization problems when BB times out has the right sign
 TEST(termination_status, lower_bound_bb_timeout)
 {
-  auto [termination_status, obj_val, lb] = test_mps_file("mip/cod105_max.mps", 0.5, false);
+  auto [termination_status, obj_val, lb] = test_mps_file("mip/cod105_max.mps", 5.0, false);
   EXPECT_EQ(termination_status, mip_termination_status_t::FeasibleFound);
-  EXPECT_EQ(obj_val, 12);
+  EXPECT_GE(obj_val, 6);
   EXPECT_GE(lb, obj_val);
 }
 
diff --git a/dependencies.yaml b/dependencies.yaml
index d153a0291..ee479e520 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -393,7 +393,6 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - fastapi
-          - httpx
           - *jsonref
           - *msgpack_numpy
           - *pandas
diff --git a/docs/cuopt/source/resources.rst b/docs/cuopt/source/resources.rst
index 9121800ee..b20ef4a24 100644
--- a/docs/cuopt/source/resources.rst
+++ b/docs/cuopt/source/resources.rst
@@ -13,6 +13,9 @@ cuOpt Examples and Tutorials Videos
 .. dropdown:: Tutorial List
 
     - `Quick Start to GPU-Accelerated Large-Scale Logistics and Supply Chain Optimization with NVIDIA cuOpt <https://www.youtube.com/watch?v=CVtqcUmW9nU>`_
+    - `Accelerated MILP for Supply Chain, Logistics & Planning Optimization — Quick Start with NVIDIA cuOpt <https://www.youtube.com/watch?v=jnDPQeUO0CM>`_
+    - `Solving Vehicle Routing Problems — Hands-On with Open Source NVIDIA cuOpt <https://www.youtube.com/watch?v=kSATyJrELUI>`_
+
 
 `Test cuOpt with NVIDIA Launchable <https://brev.nvidia.com/launchable/deploy?launchableID=env-2qIG6yjGKDtdMSjXHcuZX12mDNJ>`_
 ------------------------------------------------------------------------------------------------------------------------------
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index 83e764f23..f84297589 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -35,7 +35,6 @@ dependencies = [
     "cuopt==25.10.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "fastapi",
-    "httpx",
     "jsonref==1.1.0",
     "msgpack-numpy==0.4.8",
     "msgpack==1.1.0",