diff --git a/ci/build_wheel_cuopt.sh b/ci/build_wheel_cuopt.sh index cd49b6ffb..0a9a8edff 100755 --- a/ci/build_wheel_cuopt.sh +++ b/ci/build_wheel_cuopt.sh @@ -20,6 +20,9 @@ set -euo pipefail source rapids-init-pip +# Install cudss +bash ci/utils/install_cudss.sh + package_dir="python/cuopt" export SKBUILD_CMAKE_ARGS="-DCUOPT_BUILD_WHEELS=ON;-DDISABLE_DEPRECATION_WARNINGS=ON"; diff --git a/ci/build_wheel_libcuopt.sh b/ci/build_wheel_libcuopt.sh index 7651b52f8..88d6ebf01 100755 --- a/ci/build_wheel_libcuopt.sh +++ b/ci/build_wheel_libcuopt.sh @@ -34,6 +34,24 @@ else echo "Building in release mode" fi +# Install cudss +bash ci/utils/install_cudss.sh + +# Clean metadata & install cudss +if command -v dnf &> /dev/null; then + dnf clean all + # Adding static library just to please CMAKE requirements + dnf -y install libcudss0-static-cuda-12 libcudss0-devel-cuda-12 libcudss0-cuda-12 +elif command -v apt-get &> /dev/null; then + apt-get update + apt-get install -y libcudss-devel +else + echo "Neither dnf nor apt-get found. Cannot install cudss dependencies." + exit 1 +fi +# dnf -y install libcudss0-devel-cuda-12 + + rapids-logger "Generating build requirements" CUOPT_MPS_PARSER_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cuopt_mps_parser" rapids-download-wheels-from-github python) @@ -62,6 +80,7 @@ EXCLUDE_ARGS=( --exclude "libraft.so" --exclude "libcublas.so.*" --exclude "libcublasLt.so.*" + --exclude "libcudss.so.*" --exclude "libcurand.so.*" --exclude "libcusolver.so.*" --exclude "libcusparse.so.*" diff --git a/ci/utils/install_cudss.sh b/ci/utils/install_cudss.sh new file mode 100755 index 000000000..d885ee385 --- /dev/null +++ b/ci/utils/install_cudss.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Clean metadata & install cudss +if command -v dnf &> /dev/null; then + dnf clean all + # Adding static library just to please CMAKE requirements + dnf -y install libcudss0-static-cuda-12 libcudss0-devel-cuda-12 libcudss0-cuda-12 +elif command -v apt-get &> /dev/null; then + apt-get update + apt-get install -y libcudss-devel +else + echo "Neither dnf nor apt-get found. Cannot install cudss dependencies." + exit 1 +fi diff --git a/cmake/FindCUDSS.cmake b/cmake/FindCUDSS.cmake new file mode 100644 index 000000000..9e1e110cb --- /dev/null +++ b/cmake/FindCUDSS.cmake @@ -0,0 +1,42 @@ +# ============================================================================= +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +if (CUDSS_INCLUDE AND CUDSS_LIBRARIES) + set(CUDSS_FIND_QUIETLY TRUE) +endif (CUDSS_INCLUDE AND CUDSS_LIBRARIES) + +message(STATUS "CUDSS DIR = $ENV{CUDSS_DIR}") + + +find_path(CUDSS_INCLUDE + NAMES + cudss.h + PATHS + $ENV{CUDSS_DIR}/include + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + cudss +) + + +find_library(CUDSS_LIBRARIES cudss PATHS $ENV{CUDSS_DIR}/lib ${LIB_INSTALL_DIR}) + +message(STATUS "CUDSS_INCLUDE = ${CUDSS_INCLUDE}") +message(STATUS "CUDSS_LIBRARIES = ${CUDSS_LIBRARIES}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CHOLMOD DEFAULT_MSG + CUDSS_INCLUDE CUDSS_LIBRARIES) + +mark_as_advanced(CUDSS_INCLUDE CUDSS_LIBRARIES) diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml index ba6b6945e..da85d2c05 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-128_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - httpx - ipython - jsonref==1.1.0 +- libcudss-dev - libcurand-dev - libcusolver-dev - libcusparse-dev diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index f91c22957..ec173cfa6 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - httpx - ipython - jsonref==1.1.0 +- libcudss-dev - libcurand-dev - libcusolver-dev - libcusparse-dev diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml index 9924107d2..d8701f645 100644 --- a/conda/recipes/libcuopt/recipe.yaml +++ b/conda/recipes/libcuopt/recipe.yaml @@ -66,6 +66,7 @@ cache: - librmm =${{ dep_minor_version }} - rapids-logger =0.1 - cuda-nvtx-dev + - libcudss-dev - libcurand-dev - libcusparse-dev - cuda-cudart-dev @@ -133,6 +134,7 @@ outputs: - rapids-logger =0.1 - librmm =${{ dep_minor_version }} - cuda-cudart-dev + - libcudss-dev - libcublas - libcusparse-dev run: @@ -183,6 +185,7 @@ outputs: - gmock ${{ gtest_version }} - gtest ${{ gtest_version }} - cuda-cudart-dev + - libcudss-dev - libcublas - libcusparse-dev run: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 53071afae..f59825e7c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -30,6 +30,9 @@ include(rapids-find) rapids_cuda_init_architectures(CUOPT) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake") +message(STATUS "CMAKE_MODULE_PATH = ${CMAKE_MODULE_PATH}") + project( CUOPT VERSION 25.10.00 @@ -180,6 +183,8 @@ include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger(BUILD_EXPORT_SET cuopt-exports INSTALL_EXPORT_SET cuopt-exports) create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt) +find_package(CUDSS REQUIRED) + if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) endif() @@ -222,6 +227,7 @@ target_link_options(cuopt PRIVATE "${CUOPT_BINARY_DIR}/fatbin.ld") add_library(cuopt::cuopt ALIAS cuopt) # ################################################################################################## # - include paths --------------------------------------------------------------------------------- +message(STATUS "target include directories CUDSS_INCLUDES = ${CUDSS_INCLUDE}") target_include_directories(cuopt SYSTEM PRIVATE "${papilo_SOURCE_DIR}/src" "${papilo_BINARY_DIR}") @@ -229,12 +235,14 @@ target_include_directories(cuopt PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${CUDSS_INCLUDE}" PUBLIC "$" "$" "$" INTERFACE "$" + ${CUDSS_INCLUDE} ) # ################################################################################################## @@ -250,6 +258,23 @@ list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser) set(CMAKE_LIBRARY_PATH ${CMAKE_CURRENT_BINARY_DIR}/libmps_parser/) +find_package(cudss REQUIRED CONFIG) + +# Print all details of the cudss package +message(STATUS "cudss package details:") +message(STATUS " cudss_FOUND: ${cudss_FOUND}") +message(STATUS " cudss_VERSION: ${cudss_VERSION}") +message(STATUS " cudss_INCLUDE_DIRS: ${cudss_INCLUDE_DIRS}") +message(STATUS " cudss_LIBRARY_DIR: ${cudss_LIBRARY_DIR}") +message(STATUS " cudss_LIBRARIES: ${cudss_LIBRARIES}") +message(STATUS " cudss_DEFINITIONS: ${cudss_DEFINITIONS}") +message(STATUS " cudss_COMPILE_OPTIONS: ${cudss_COMPILE_OPTIONS}") +message(STATUS " cudss_LINK_OPTIONS: ${cudss_LINK_OPTIONS}") + +# Use the specific cudss library version to avoid symlink issues +set(CUDSS_LIB_FILE "${cudss_LIBRARY_DIR}/libcudss.so.0") +message(STATUS "Using cudss library: ${CUDSS_LIB_FILE}") + target_link_libraries(cuopt PUBLIC @@ -260,6 +285,7 @@ target_link_libraries(cuopt CCCL::CCCL raft::raft cuopt::mps_parser + ${CUDSS_LIB_FILE} PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} ) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 72d05a6ad..ab1552387 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -50,6 +50,7 @@ #define CUOPT_LOG_FILE "log_file" #define CUOPT_LOG_TO_CONSOLE "log_to_console" #define CUOPT_CROSSOVER "crossover" +#define CUOPT_USE_CUDSS "use_cudss" #define CUOPT_PRESOLVE "presolve" #define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance" #define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance" @@ -105,6 +106,7 @@ #define CUOPT_METHOD_CONCURRENT 0 #define CUOPT_METHOD_PDLP 1 #define CUOPT_METHOD_DUAL_SIMPLEX 2 +#define CUOPT_METHOD_BARRIER 3 /* @brief Status codes constants */ #define CUOPT_SUCCESS 0 diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index f4e515288..9e9d6a93b 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -65,7 +65,8 @@ enum pdlp_solver_mode_t : int { enum method_t : int { Concurrent = CUOPT_METHOD_CONCURRENT, PDLP = CUOPT_METHOD_PDLP, - DualSimplex = CUOPT_METHOD_DUAL_SIMPLEX + DualSimplex = CUOPT_METHOD_DUAL_SIMPLEX, + Barrier = CUOPT_METHOD_BARRIER }; template @@ -207,6 +208,7 @@ class pdlp_solver_settings_t { std::string user_problem_file{""}; bool per_constraint_residual{false}; bool crossover{false}; + bool use_cudss{false}; bool save_best_primal_so_far{false}; bool first_primal_feasible{false}; bool presolve{false}; diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt index d85471f9b..e3586e5f8 100644 --- a/cpp/src/dual_simplex/CMakeLists.txt +++ b/cpp/src/dual_simplex/CMakeLists.txt @@ -14,6 +14,7 @@ # limitations under the License. set(DUAL_SIMPLEX_SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/barrier.cpp ${CMAKE_CURRENT_SOURCE_DIR}/basis_solves.cpp ${CMAKE_CURRENT_SOURCE_DIR}/basis_updates.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bound_flipping_ratio_test.cpp @@ -36,5 +37,7 @@ set(DUAL_SIMPLEX_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp) +set_source_files_properties(${DUAL_SIMPLEX_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") + set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${DUAL_SIMPLEX_SRC_FILES} PARENT_SCOPE) diff --git a/cpp/src/dual_simplex/barrier.cpp b/cpp/src/dual_simplex/barrier.cpp new file mode 100644 index 000000000..caa2218ef --- /dev/null +++ b/cpp/src/dual_simplex/barrier.cpp @@ -0,0 +1,1298 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +class iteration_data_t { + public: + iteration_data_t(const lp_problem_t& lp, + i_t num_upper_bounds, + const simplex_solver_settings_t& settings) + : upper_bounds(num_upper_bounds), + c(lp.objective), + b(lp.rhs), + w(num_upper_bounds), + x(lp.num_cols), + y(lp.num_rows), + v(num_upper_bounds), + z(lp.num_cols), + w_save(num_upper_bounds), + x_save(lp.num_cols), + y_save(lp.num_rows), + v_save(num_upper_bounds), + z_save(lp.num_cols), + diag(lp.num_cols), + inv_diag(lp.num_cols), + inv_sqrt_diag(lp.num_cols), + AD(lp.num_cols, lp.num_rows, 0), + AT(lp.num_rows, lp.num_cols, 0), + ADAT(lp.num_rows, lp.num_rows, 0), + primal_residual(lp.num_rows), + bound_residual(num_upper_bounds), + dual_residual(lp.num_cols), + complementarity_xz_residual(lp.num_cols), + complementarity_wv_residual(num_upper_bounds), + primal_rhs(lp.num_rows), + bound_rhs(num_upper_bounds), + dual_rhs(lp.num_cols), + complementarity_xz_rhs(lp.num_cols), + complementarity_wv_rhs(num_upper_bounds), + dw_aff(num_upper_bounds), + dx_aff(lp.num_cols), + dy_aff(lp.num_rows), + dv_aff(num_upper_bounds), + dz_aff(lp.num_cols), + dw(num_upper_bounds), + dx(lp.num_cols), + dy(lp.num_rows), + dv(num_upper_bounds), + dz(lp.num_cols), + has_factorization(false) + { + // Create the upper bounds vector + n_upper_bounds = 0; + for (i_t j = 0; j < lp.num_cols; j++) { + if (lp.upper[j] < inf) { upper_bounds[n_upper_bounds++] = j; } + } + settings.log.printf("n_upper_bounds %d\n", n_upper_bounds); + // Form A*Dinv*A' + diag.set_scalar(1.0); + if (n_upper_bounds > 0) { + for (i_t k = 0; k < n_upper_bounds; k++) { + i_t j = upper_bounds[k]; + diag[j] = 2.0; + } + } + inv_diag.set_scalar(1.0); + if (n_upper_bounds > 0) { diag.inverse(inv_diag); } + inv_sqrt_diag.set_scalar(1.0); + if (n_upper_bounds > 0) { inv_diag.sqrt(inv_sqrt_diag); } + + column_density(lp.A, settings); + + // Copy A into AD + AD = lp.A; + // Perform column scaling on A to get AD^(-1) + AD.scale_columns(inv_diag); + + // Form A*Dinv*A' + float64_t start_form_aat = tic(); + lp.A.transpose(AT); + multiply(AD, AT, ADAT); + + float64_t aat_time = toc(start_form_aat); + settings.log.printf("AAT time %.2fs\n", aat_time); + settings.log.printf("AAT nonzeros %e\n", static_cast(ADAT.col_start[lp.num_rows])); + + chol = std::make_unique>(settings, lp.num_rows); + chol->set_positive_definite(false); + + // Perform symbolic analysis + chol->analyze(ADAT); + } + + void to_solution(const lp_problem_t& lp, + i_t iterations, + f_t objective, + f_t user_objective, + f_t primal_residual, + f_t dual_residual, + lp_solution_t& solution) + { + solution.x = x; + solution.y = y; + dense_vector_t z_tilde(z.size()); + scatter_upper_bounds(v, z_tilde); + z_tilde.axpy(1.0, z, -1.0); + solution.z = z_tilde; + + dense_vector_t dual_res = z_tilde; + dual_res.axpy(-1.0, lp.objective, 1.0); + matrix_transpose_vector_multiply(lp.A, 1.0, solution.y, 1.0, dual_res); + f_t dual_residual_norm = vector_norm_inf(dual_res); + printf("Solution Dual residual: %e\n", dual_residual_norm); + + solution.iterations = iterations; + solution.objective = objective; + solution.user_objective = user_objective; + solution.l2_primal_residual = primal_residual; + solution.l2_dual_residual = dual_residual; + } + + void column_density(const csc_matrix_t& A, + const simplex_solver_settings_t& settings) + { + const i_t m = A.m; + const i_t n = A.n; + if (n < 2) { return; } + std::vector density(n); + for (i_t j = 0; j < n; j++) { + const i_t nnz = A.col_start[j + 1] - A.col_start[j]; + density[j] = static_cast(nnz); + } + + std::vector permutation(n); + std::iota(permutation.begin(), permutation.end(), 0); + std::sort(permutation.begin(), permutation.end(), [&density](i_t i, i_t j) { + return density[i] < density[j]; + }); + + std::vector columns_to_keep; + columns_to_keep.reserve(n); + + f_t nnz = density[permutation[0]]; + columns_to_keep.push_back(permutation[0]); + for (i_t i = 1; i < n; i++) { + const i_t j = permutation[i]; + const f_t density_j = density[j]; + + if (i > n - 5 && density_j > 10.0) { + settings.log.printf("Dense column %6d nz %6d density %.2e nnz %.2e\n", + j, + static_cast(density_j), + density_j / m, + nnz + density_j * density_j); + } + nnz += density_j * density_j; + + columns_to_keep.push_back(j); + } + } + + void scatter_upper_bounds(const dense_vector_t& y, dense_vector_t& z) + { + if (n_upper_bounds > 0) { + for (i_t k = 0; k < n_upper_bounds; k++) { + i_t j = upper_bounds[k]; + z[j] = y[k]; + } + } + } + + void gather_upper_bounds(const dense_vector_t& z, dense_vector_t& y) + { + if (n_upper_bounds > 0) { + for (i_t k = 0; k < n_upper_bounds; k++) { + i_t j = upper_bounds[k]; + y[k] = z[j]; + } + } + } + + void preconditioned_conjugate_gradient(const simplex_solver_settings_t& settings, + const dense_vector_t& b, + f_t tolerance, + dense_vector_t& xinout) const + { + dense_vector_t residual = b; + dense_vector_t y(ADAT.n); + + dense_vector_t x = xinout; + + const csc_matrix_t& A = ADAT; + + // r = A*x - b + matrix_vector_multiply(A, 1.0, x, -1.0, residual); + + // Solve M y = r for y + chol->solve(residual, y); + + dense_vector_t p = y; + p.multiply_scalar(-1.0); + + dense_vector_t Ap(A.n); + i_t iter = 0; + f_t norm_residual = vector_norm2(residual); + f_t initial_norm_residual = norm_residual; + settings.log.printf("PCG initial residual 2-norm %e inf-norm %e\n", + norm_residual, + vector_norm_inf(residual)); + + f_t rTy = residual.inner_product(y); + + while (norm_residual > tolerance && iter < 100) { + // Compute Ap = A * p + matrix_vector_multiply(A, 1.0, p, 0.0, Ap); + + // Compute alpha = (r^T * y) / (p^T * Ap) + f_t alpha = rTy / p.inner_product(Ap); + + // Update residual = residual + alpha * Ap + residual.axpy(alpha, Ap, 1.0); + + f_t new_residual = vector_norm2(residual); + if (new_residual > 1.1 * norm_residual || new_residual > 1.1 * initial_norm_residual) { + settings.log.printf( + "PCG residual increased by more than 10%%. New %e > %e\n", new_residual, norm_residual); + break; + } + norm_residual = new_residual; + + // Update x = x + alpha * p + x.axpy(alpha, p, 1.0); + + // residual = A*x - b + residual = b; + matrix_vector_multiply(A, 1.0, x, -1.0, residual); + norm_residual = vector_norm2(residual); + + // Solve M y = r for y + chol->solve(residual, y); + + // Compute beta = (r_+^T y_+) / (r^T y) + f_t r1Ty1 = residual.inner_product(y); + f_t beta = r1Ty1 / rTy; + + rTy = r1Ty1; + + // Update p = -y + beta * p + p.axpy(-1.0, y, beta); + + iter++; + settings.log.printf("PCG iter %3d 2-norm_residual %.2e inf-norm_residual %.2e\n", + iter, + norm_residual, + vector_norm_inf(residual)); + } + + residual = b; + matrix_vector_multiply(A, 1.0, x, -1.0, residual); + norm_residual = vector_norm2(residual); + if (norm_residual < initial_norm_residual) { + settings.log.printf("PCG improved residual 2-norm %.2e/%.2e in %d iterations\n", + norm_residual, + initial_norm_residual, + iter); + xinout = x; + } else { + settings.log.printf("Rejecting PCG solution\n"); + } + } + + i_t n_upper_bounds; + dense_vector_t upper_bounds; + dense_vector_t c; + dense_vector_t b; + + dense_vector_t w; + dense_vector_t x; + dense_vector_t y; + dense_vector_t v; + dense_vector_t z; + + dense_vector_t w_save; + dense_vector_t x_save; + dense_vector_t y_save; + dense_vector_t v_save; + dense_vector_t z_save; + + dense_vector_t diag; + dense_vector_t inv_diag; + dense_vector_t inv_sqrt_diag; + + csc_matrix_t AD; + csc_matrix_t AT; + csc_matrix_t ADAT; + + std::unique_ptr> chol; + + bool has_factorization; + + dense_vector_t primal_residual; + dense_vector_t bound_residual; + dense_vector_t dual_residual; + dense_vector_t complementarity_xz_residual; + dense_vector_t complementarity_wv_residual; + + dense_vector_t primal_rhs; + dense_vector_t bound_rhs; + dense_vector_t dual_rhs; + dense_vector_t complementarity_xz_rhs; + dense_vector_t complementarity_wv_rhs; + + dense_vector_t dw_aff; + dense_vector_t dx_aff; + dense_vector_t dy_aff; + dense_vector_t dv_aff; + dense_vector_t dz_aff; + + dense_vector_t dw; + dense_vector_t dx; + dense_vector_t dy; + dense_vector_t dv; + dense_vector_t dz; +}; + +template +void barrier_solver_t::initial_point(iteration_data_t& data) +{ + // Perform a numerical factorization + i_t status = data.chol->factorize(data.ADAT); + if (status != 0) { + settings.log.printf("Initial factorization failed\n"); + return; + } + + // rhs_x <- b + dense_vector_t rhs_x(lp.rhs); + + dense_vector_t Fu(lp.num_cols); + data.gather_upper_bounds(lp.upper, Fu); + + dense_vector_t DinvFu(lp.num_cols); // DinvFu = Dinv * Fu + data.inv_diag.pairwise_product(Fu, DinvFu); + + // rhs_x <- A * Dinv * F * u - b + matrix_vector_multiply(lp.A, 1.0, DinvFu, -1.0, rhs_x); + + // Solve A*Dinv*A'*q = A*Dinv*F*u - b + dense_vector_t q(lp.num_rows); + settings.log.printf("||rhs_x|| = %e\n", vector_norm2(rhs_x)); + i_t solve_status = data.chol->solve(rhs_x, q); + settings.log.printf("Initial solve status %d\n", solve_status); + settings.log.printf("||q|| = %e\n", vector_norm2(q)); + + // rhs_x <- A*Dinv*A'*q - rhs_x + matrix_vector_multiply(data.ADAT, 1.0, q, -1.0, rhs_x); + settings.log.printf("|| A*Dinv*A'*q - (A*Dinv*F*u - b) || = %e\n", vector_norm2(rhs_x)); + + // x = Dinv*(F*u - A'*q) + data.inv_diag.pairwise_product(Fu, data.x); + // Fu <- -1.0 * A' * q + 1.0 * Fu + matrix_transpose_vector_multiply(lp.A, -1.0, q, 1.0, Fu); + + // x <- Dinv * (F*u - A'*q) + data.inv_diag.pairwise_product(Fu, data.x); + + // w <- E'*u - E'*x + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.w[k] = lp.upper[j] - data.x[j]; + } + } + + // Verify A*x = b + data.primal_residual = lp.rhs; + matrix_vector_multiply(lp.A, 1.0, data.x, -1.0, data.primal_residual); + settings.log.printf("||b - A * x||: %e\n", vector_norm2(data.primal_residual)); + + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j]; + } + settings.log.printf("|| u - w - x||: %e\n", vector_norm2(data.bound_residual)); + } + + // First compute rhs = A*Dinv*c + dense_vector_t rhs(lp.num_rows); + dense_vector_t Dinvc(lp.num_cols); + data.inv_diag.pairwise_product(lp.objective, Dinvc); + // rhs = 1.0 * A * Dinv * c + matrix_vector_multiply(lp.A, 1.0, Dinvc, 0.0, rhs); + + // Solve A*Dinv*A'*q = A*Dinv*c + data.chol->solve(rhs, data.y); + + // z = Dinv*(c - A'*y) + dense_vector_t cmATy = data.c; + matrix_transpose_vector_multiply(lp.A, -1.0, data.y, 1.0, cmATy); + // z <- Dinv * (c - A'*y) + data.inv_diag.pairwise_product(cmATy, data.z); + + // v = -E'*z + data.gather_upper_bounds(data.z, data.v); + data.v.multiply_scalar(-1.0); + + // Verify A'*y + z - E*v = c + data.z.pairwise_subtract(data.c, data.dual_residual); + matrix_transpose_vector_multiply(lp.A, 1.0, data.y, 1.0, data.dual_residual); + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.dual_residual[j] -= data.v[k]; + } + } + settings.log.printf("||A^T y + z - E*v - c ||: %e\n", vector_norm2(data.dual_residual)); + + // Make sure (w, x, v, z) > 0 + float64_t epsilon_adjust = 10.0; + data.w.ensure_positive(epsilon_adjust); + data.x.ensure_positive(epsilon_adjust); + data.v.ensure_positive(epsilon_adjust); + data.z.ensure_positive(epsilon_adjust); +} + +template +void barrier_solver_t::compute_residuals(const dense_vector_t& w, + const dense_vector_t& x, + const dense_vector_t& y, + const dense_vector_t& v, + const dense_vector_t& z, + iteration_data_t& data) +{ + // Compute primal_residual = b - A*x + data.primal_residual = lp.rhs; + matrix_vector_multiply(lp.A, -1.0, x, 1.0, data.primal_residual); + + // Compute bound_residual = E'*u - w - E'*x + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.bound_residual[k] = lp.upper[j] - w[k] - x[j]; + } + } + + // Compute dual_residual = c - A'*y - z + E*v + data.c.pairwise_subtract(z, data.dual_residual); + matrix_transpose_vector_multiply(lp.A, -1.0, y, 1.0, data.dual_residual); + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.dual_residual[j] += v[k]; + } + } + + // Compute complementarity_xz_residual = x.*z + x.pairwise_product(z, data.complementarity_xz_residual); + + // Compute complementarity_wv_residual = w.*v + w.pairwise_product(v, data.complementarity_wv_residual); +} + +template +void barrier_solver_t::compute_residual_norms(const dense_vector_t& w, + const dense_vector_t& x, + const dense_vector_t& y, + const dense_vector_t& v, + const dense_vector_t& z, + iteration_data_t& data, + f_t& primal_residual_norm, + f_t& dual_residual_norm, + f_t& complementarity_residual_norm) +{ + compute_residuals(w, x, y, v, z, data); + primal_residual_norm = std::max(vector_norm_inf(data.primal_residual), + vector_norm_inf(data.bound_residual)); + dual_residual_norm = vector_norm_inf(data.dual_residual); + complementarity_residual_norm = + std::max(vector_norm_inf(data.complementarity_xz_residual), + vector_norm_inf(data.complementarity_wv_residual)); +} + +template +f_t barrier_solver_t::max_step_to_boundary(const dense_vector_t& x, + const dense_vector_t& dx, + i_t& index) const +{ + float64_t max_step = 1.0; + index = -1; + for (i_t i = 0; i < x.size(); i++) { + // x_i + alpha * dx_i >= 0, x_i >= 0, alpha >= 0 + // We only need to worry about the case where dx_i < 0 + // alpha * dx_i >= -x_i => alpha <= -x_i / dx_i + if (dx[i] < 0.0) { + const f_t ratio = -x[i] / dx[i]; + if (ratio < max_step) { + max_step = ratio; + index = i; + } + } + } + return max_step; +} + +template +f_t barrier_solver_t::max_step_to_boundary(const dense_vector_t& x, + const dense_vector_t& dx) const +{ + i_t index; + return max_step_to_boundary(x, dx, index); +} + +template +i_t barrier_solver_t::compute_search_direction(iteration_data_t& data, + dense_vector_t& dw, + dense_vector_t& dx, + dense_vector_t& dy, + dense_vector_t& dv, + dense_vector_t& dz, + f_t& max_residual) +{ + const bool debug = true; + // Solves the linear system + // + // dw dx dy dv dz + // [ 0 A 0 0 0 ] [ dw ] = [ rp ] + // [ I E' 0 0 0 ] [ dx ] [ rw ] + // [ 0 0 A' -E I ] [ dy ] [ rd ] + // [ 0 Z 0 0 X ] [ dv ] [ rxz ] + // [ V 0 0 W 0 ] [ dz ] [ rwv ] + + max_residual = 0.0; + // diag = z ./ x + data.z.pairwise_divide(data.x, data.diag); + // diag = z ./ x + E * (v ./ w) * E' + if (data.n_upper_bounds > 0) { + for (i_t k = 0; k < data.n_upper_bounds; k++) { + i_t j = data.upper_bounds[k]; + data.diag[j] += data.v[k] / data.w[k]; + } + } + + // inv_diag = 1.0 ./ diag + data.diag.inverse(data.inv_diag); + + // inv_sqrt_diag <- sqrt(inv_diag) + data.inv_diag.sqrt(data.inv_sqrt_diag); + + // Form A*D*A' + if (!data.has_factorization) { + data.AD = lp.A; + data.AD.scale_columns(data.inv_diag); + // compute ADAT = A Dinv * A^T + multiply(data.AD, data.AT, data.ADAT); + + // factorize + i_t status = data.chol->factorize(data.ADAT); + if (status < 0) { + settings.log.printf("Factorization failed.\n"); + return -1; + } + data.has_factorization = true; + } + + // Compute h = primal_rhs + A*inv_diag*(dual_rhs - complementarity_xz_rhs ./ x + + // E*((complementarity_wv_rhs - v .* bound_rhs) ./ w) ) + dense_vector_t tmp1(data.n_upper_bounds); + dense_vector_t tmp2(data.n_upper_bounds); + dense_vector_t tmp3(lp.num_cols); + dense_vector_t tmp4(lp.num_cols); + + // tmp2 <- v .* bound_rhs + data.v.pairwise_product(data.bound_rhs, tmp2); + // tmp2 <- complementarity_wv_rhs - v .* bound_rhs + tmp2.axpy(1.0, data.complementarity_wv_rhs, -1.0); + // tmp1 <- (complementarity_wv_rhs - v .* bound_rhs) ./ w + tmp2.pairwise_divide(data.w, tmp1); + tmp3.set_scalar(0.0); + // tmp3 <- E * tmp1 + data.scatter_upper_bounds(tmp1, tmp3); + + // tmp4 <- complementarity_xz_rhs ./ x + data.complementarity_xz_rhs.pairwise_divide(data.x, tmp4); + // tmp3 <- -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs) ./ w) + tmp3.axpy(-1.0, tmp4, 1.0); + // tmp3 <- dual_rhs -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs) + // ./ w) + tmp3.axpy(1.0, data.dual_rhs, 1.0); + // r1 <- dual_rhs -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs) + // ./ w) + dense_vector_t r1 = tmp3; + dense_vector_t r1_prime = r1; + // tmp4 <- inv_diag * ( dual_rhs -complementarity_xz_rhs ./ x + E *((complementarity_wv_rhs - v .* + // bound_rhs) ./ w)) + data.inv_diag.pairwise_product(tmp3, tmp4); + + dense_vector_t h = data.primal_rhs; + // h <- 1.0 * A * tmp4 + primal_rhs + matrix_vector_multiply(lp.A, 1.0, tmp4, 1.0, h); + + // Solve A D^{-1} A^T dy = h + i_t solve_status = data.chol->solve(h, dy); + if (solve_status < 0) { + settings.log.printf("Linear solve failed\n"); + return -1; + } + + // y_residual <- ADAT*dy - h + dense_vector_t y_residual = h; + matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, y_residual); + const f_t y_residual_norm = vector_norm_inf(y_residual); + max_residual = std::max(max_residual, y_residual_norm); + if (y_residual_norm > 1e-2) { + settings.log.printf("|| h || = %.2e\n", vector_norm_inf(h)); + settings.log.printf("||ADAT*dy - h|| = %.2e\n", y_residual_norm); + } + if (y_residual_norm > 10) { return -1; } + + if (0 && y_residual_norm > 1e-10) { + settings.log.printf("Running PCG to improve residual 2-norm %e inf-norm %e\n", + vector_norm2(y_residual), + y_residual_norm); + data.preconditioned_conjugate_gradient( + settings, h, std::min(y_residual_norm / 100.0, 1e-6), dy); + y_residual = h; + matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, y_residual); + const f_t y_residual_norm_pcg = vector_norm_inf(y_residual); + settings.log.printf("PCG improved residual || ADAT * dy - h || = %.2e\n", y_residual_norm_pcg); + } + + // dx = dinv .* (A'*dy - dual_rhs + complementarity_xz_rhs ./ x - E *((complementarity_wv_rhs - v + // .* bound_rhs) ./ w)) + // r1 <- A'*dy - r1 + matrix_transpose_vector_multiply(lp.A, 1.0, dy, -1.0, r1); + // dx <- dinv .* r1 + data.inv_diag.pairwise_product(r1, dx); + + // dx_residual <- D * dx - A'*dy - r1 + dense_vector_t dx_residual(lp.num_cols); + // dx_residual <- D*dx + data.diag.pairwise_product(dx, dx_residual); + // dx_residual <- -A'*dy + D*dx + matrix_transpose_vector_multiply(lp.A, -1.0, dy, 1.0, dx_residual); + // dx_residual <- D*dx - A'*dy + r1 + dx_residual.axpy(1.0, r1_prime, 1.0); + if (debug) { + const f_t dx_residual_norm = vector_norm_inf(dx_residual); + max_residual = std::max(max_residual, dx_residual_norm); + if (dx_residual_norm > 1e-2) { + settings.log.printf("|| D * dx - A'*y + r1 || = %.2e\n", dx_residual_norm); + } + } + + dense_vector_t dx_residual_2(lp.num_cols); + // dx_residual_2 <- D^-1 * (A'*dy - r1) + data.inv_diag.pairwise_product(r1, dx_residual_2); + // dx_residual_2 <- D^-1 * (A'*dy - r1) - dx + dx_residual_2.axpy(-1.0, dx, 1.0); + // dx_residual_2 <- D^-1 * (A'*dy - r1) - dx + const f_t dx_residual_2_norm = vector_norm_inf(dx_residual_2); + max_residual = std::max(max_residual, dx_residual_2_norm); + if (dx_residual_2_norm > 1e-2) { + settings.log.printf("|| D^-1 (A'*dy - r1) - dx || = %.2e\n", dx_residual_2_norm); + } + + dense_vector_t dx_residual_5(lp.num_cols); + dense_vector_t dx_residual_6(lp.num_rows); + // dx_residual_5 <- D^-1 * (A'*dy - r1) + data.inv_diag.pairwise_product(r1, dx_residual_5); + // dx_residual_6 <- A * D^-1 (A'*dy - r1) + matrix_vector_multiply(lp.A, 1.0, dx_residual_5, 0.0, dx_residual_6); + // dx_residual_6 <- A * D^-1 (A'*dy - r1) - A * dx + matrix_vector_multiply(lp.A, -1.0, dx, 1.0, dx_residual_6); + const f_t dx_residual_6_norm = vector_norm_inf(dx_residual_6); + max_residual = std::max(max_residual, dx_residual_6_norm); + if (dx_residual_6_norm > 1e-2) { + settings.log.printf("|| A * D^-1 (A'*dy - r1) - A * dx || = %.2e\n", dx_residual_6_norm); + } + + dense_vector_t dx_residual_3(lp.num_cols); + // dx_residual_3 <- D^-1 * r1 + data.inv_diag.pairwise_product(r1_prime, dx_residual_3); + dense_vector_t dx_residual_4(lp.num_rows); + // dx_residual_4 <- A * D^-1 * r1 + matrix_vector_multiply(lp.A, 1.0, dx_residual_3, 0.0, dx_residual_4); + // dx_residual_4 <- A * D^-1 * r1 + A * dx + matrix_vector_multiply(lp.A, 1.0, dx, 1.0, dx_residual_4); + // dx_residual_4 <- - A * D^-1 * r1 - A * dx + ADAT * dy + +#if CHECK_FORM_ADAT + csc_matrix_t ADinv = lp.A; + ADinv.scale_columns(data.inv_diag); + csc_matrix_t ADinvAT(lp.num_rows, lp.num_rows, 1); + csc_matrix_t Atranspose(1, 1, 0); + lp.A.transpose(Atranspose); + multiply(ADinv, Atranspose, ADinvAT); + matrix_vector_multiply(ADinvAT, 1.0, dy, -1.0, dx_residual_4); + const f_t dx_residual_4_norm = vector_norm_inf(dx_residual_4); + max_residual = std::max(max_residual, dx_residual_4_norm); + if (dx_residual_4_norm > 1e-2) { + settings.log.printf("|| ADAT * dy - A * D^-1 * r1 - A * dx || = %.2e\n", dx_residual_4_norm); + } + + csc_matrix_t C(lp.num_rows, lp.num_rows, 1); + add(ADinvAT, data.ADAT, 1.0, -1.0, C); + const f_t matrix_residual = C.norm1(); + max_residual = std::max(max_residual, matrix_residual); + if (matrix_residual > 1e-2) { + settings.log.printf("|| AD^{-1/2} D^{-1} A^T + E - A D^{-1} A^T|| = %.2e\n", matrix_residual); + } +#endif + + dense_vector_t dx_residual_7 = h; + matrix_vector_multiply(data.ADAT, 1.0, dy, -1.0, dx_residual_7); + const f_t dx_residual_7_norm = vector_norm_inf(dx_residual_7); + max_residual = std::max(max_residual, dx_residual_7_norm); + if (dx_residual_7_norm > 1e-2) { + settings.log.printf("|| A D^{-1} A^T * dy - h || = %.2e\n", dx_residual_7_norm); + } + + // x_residual <- A * dx - primal_rhs + dense_vector_t x_residual = data.primal_rhs; + matrix_vector_multiply(lp.A, 1.0, dx, -1.0, x_residual); + if (debug) { + const f_t x_residual_norm = vector_norm_inf(x_residual); + max_residual = std::max(max_residual, x_residual_norm); + if (x_residual_norm > 1e-2) { + settings.log.printf("|| A * dx - rp || = %.2e\n", x_residual_norm); + } + } + + // dz = (complementarity_xz_rhs - z.* dx) ./ x; + // tmp3 <- z .* dx + data.z.pairwise_product(dx, tmp3); + // tmp3 <- 1.0 * complementarity_xz_rhs - tmp3 + tmp3.axpy(1.0, data.complementarity_xz_rhs, -1.0); + // dz <- tmp3 ./ x + tmp3.pairwise_divide(data.x, dz); + + // xz_residual <- z .* dx + x .* dz - complementarity_xz_rhs + dense_vector_t xz_residual = data.complementarity_xz_rhs; + dense_vector_t zdx(lp.num_cols); + dense_vector_t xdz(lp.num_cols); + data.z.pairwise_product(dx, zdx); + data.x.pairwise_product(dz, xdz); + xz_residual.axpy(1.0, zdx, -1.0); + xz_residual.axpy(1.0, xdz, 1.0); + if (debug) { + const f_t xz_residual_norm = vector_norm_inf(xz_residual); + max_residual = std::max(max_residual, xz_residual_norm); + if (xz_residual_norm > 1e-2) { + settings.log.printf("|| Z dx + X dz - rxz || = %.2e\n", xz_residual_norm); + } + } + + // dv = (v .* E' dx + complementarity_wv_rhs - v .* bound_rhs) ./ w + // tmp1 <- E' * dx + data.gather_upper_bounds(dx, tmp1); + // tmp2 <- v .* E' * dx + data.v.pairwise_product(tmp1, tmp2); + // tmp1 <- v .* bound_rhs + data.v.pairwise_product(data.bound_rhs, tmp1); + // tmp1 <- v .* E' * dx - v . * bound_rhs + tmp1.axpy(1.0, tmp2, -1.0); + // tmp1 <- v .* E' * dx + complementarity_wv_rhs - v.* bound_rhs + tmp1.axpy(1.0, data.complementarity_wv_rhs, 1.0); + // dv <- tmp1 ./ w + tmp1.pairwise_divide(data.w, dv); + + dense_vector_t dv_residual(data.n_upper_bounds); + dense_vector_t dv_residual_2(data.n_upper_bounds); + // dv_residual <- E'*dx + data.gather_upper_bounds(dx, dv_residual); + // dv_residual_2 <- v .* E' * dx + data.v.pairwise_product(dv_residual, dv_residual_2); + // dv_residual_2 <- -v .* E' * dx + dv_residual_2.multiply_scalar(-1.0); + // dv_residual_ <- W .* dv + data.w.pairwise_product(dv, dv_residual); + // dv_residual <- -v .* E' * dx + w .* dv + dv_residual.axpy(1.0, dv_residual_2, 1.0); + // dv_residual <- -v .* E' * dx + w .* dv - complementarity_wv_rhs + dv_residual.axpy(-1.0, data.complementarity_wv_rhs, 1.0); + // dv_residual_2 <- V * bound_rhs + data.v.pairwise_product(data.bound_rhs, dv_residual_2); + // dv_residual <- -v .* E' * dx + w .* dv - complementarity_wv_rhs + v .* bound_rhs + dv_residual.axpy(1.0, dv_residual_2, 1.0); + if (debug) { + const f_t dv_residual_norm = vector_norm_inf(dv_residual); + max_residual = std::max(max_residual, dv_residual_norm); + if (dv_residual_norm > 1e-2) { + settings.log.printf( + "|| -v .* E' * dx + w .* dv - complementarity_wv_rhs - v .* bound_rhs || = %.2e\n", + dv_residual_norm); + } + } + + // dual_residual <- A' * dy - E * dv + dz - dual_rhs + dense_vector_t dual_residual(lp.num_cols); + // dual_residual <- E * dv + data.scatter_upper_bounds(dv, dual_residual); + // dual_residual <- A' * dy - E * dv + matrix_transpose_vector_multiply(lp.A, 1.0, dy, -1.0, dual_residual); + // dual_residual <- A' * dy - E * dv + dz + dual_residual.axpy(1.0, dz, 1.0); + // dual_residual <- A' * dy - E * dv + dz - dual_rhs + dual_residual.axpy(-1.0, data.dual_rhs, 1.0); + if (debug) { + const f_t dual_residual_norm = vector_norm_inf(dual_residual); + max_residual = std::max(max_residual, dual_residual_norm); + if (dual_residual_norm > 1e-2) { + settings.log.printf("|| A' * dy - E * dv + dz - dual_rhs || = %.2e\n", dual_residual_norm); + } + } + + // dw = bound_rhs - E'*dx + data.gather_upper_bounds(dx, tmp1); + dw = data.bound_rhs; + dw.axpy(-1.0, tmp1, 1.0); + // dw_residual <- dw + E'*dx - bound_rhs + dense_vector_t dw_residual(data.n_upper_bounds); + data.gather_upper_bounds(dx, dw_residual); + dw_residual.axpy(1.0, dw, 1.0); + dw_residual.axpy(-1.0, data.bound_rhs, 1.0); + if (debug) { + const f_t dw_residual_norm = vector_norm_inf(dw_residual); + max_residual = std::max(max_residual, dw_residual_norm); + if (dw_residual_norm > 1e-2) { + settings.log.printf("|| dw + E'*dx - bound_rhs || = %.2e\n", dw_residual_norm); + } + } + + // wv_residual <- v .* dw + w .* dv - complementarity_wv_rhs + dense_vector_t wv_residual = data.complementarity_wv_rhs; + dense_vector_t vdw(data.n_upper_bounds); + dense_vector_t wdv(data.n_upper_bounds); + data.v.pairwise_product(dw, vdw); + data.w.pairwise_product(dv, wdv); + wv_residual.axpy(1.0, vdw, -1.0); + wv_residual.axpy(1.0, wdv, 1.0); + if (debug) { + const f_t wv_residual_norm = vector_norm_inf(wv_residual); + max_residual = std::max(max_residual, wv_residual_norm); + if (wv_residual_norm > 1e-2) { + settings.log.printf("|| V dw + W dv - rwv || = %.2e\n", wv_residual_norm); + } + } + + return 0; +} + +template +lp_status_t barrier_solver_t::check_for_suboptimal_solution( + const barrier_solver_settings_t& options, + iteration_data_t& data, + i_t iter, + f_t norm_b, + f_t norm_c, + f_t& primal_objective, + f_t& relative_primal_residual, + f_t& relative_dual_residual, + f_t& relative_complementarity_residual, + lp_solution_t& solution) +{ + if (relative_primal_residual < 100 * options.feasibility_tol && + relative_dual_residual < 100 * options.optimality_tol && + relative_complementarity_residual < 100 * options.complementarity_tol) { + settings.log.printf("Suboptimal solution found\n"); + data.to_solution(lp, + iter, + primal_objective, + primal_objective + lp.obj_constant, + vector_norm2(data.primal_residual), + vector_norm2(data.dual_residual), + solution); + return lp_status_t::OPTIMAL; // TODO: Barrier should probably have a separate suboptimal status + } + f_t primal_residual_norm; + f_t dual_residual_norm; + f_t complementarity_residual_norm; + compute_residual_norms(data.w_save, + data.x_save, + data.y_save, + data.v_save, + data.z_save, + data, + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm); + primal_objective = data.c.inner_product(data.x_save); + relative_primal_residual = primal_residual_norm / (1.0 + norm_b); + relative_dual_residual = dual_residual_norm / (1.0 + norm_c); + relative_complementarity_residual = + complementarity_residual_norm / (1.0 + std::abs(primal_objective)); + + if (relative_primal_residual < 100 * options.feasibility_tol && + relative_dual_residual < 100 * options.optimality_tol && + relative_complementarity_residual < 100 * options.complementarity_tol) { + settings.log.printf("Restoring previous solution: primal %.2e dual %.2e complementarity %.2e\n", + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm); + data.to_solution(lp, + iter, + primal_objective, + primal_objective + lp.obj_constant, + vector_norm2(data.primal_residual), + vector_norm2(data.dual_residual), + solution); + settings.log.printf("Suboptimal solution found\n"); + return lp_status_t::OPTIMAL; // TODO: Barrier should probably have a separate suboptimal status + } else { + settings.log.printf("Primal residual %.2e dual residual %.2e complementarity residual %.2e\n", + relative_primal_residual, + relative_dual_residual, + relative_complementarity_residual); + } + settings.log.printf("Search direction computation failed\n"); + return lp_status_t::NUMERICAL_ISSUES; +} + +template +lp_status_t barrier_solver_t::solve(const barrier_solver_settings_t& options, + lp_solution_t& solution) +{ + float64_t start_time = tic(); + lp_status_t status = lp_status_t::UNSET; + + i_t n = lp.num_cols; + i_t m = lp.num_rows; + + solution.resize(m, n); + settings.log.printf( + "Barrier solver: %d constraints, %d variables, %ld nonzeros\n", m, n, lp.A.col_start[n]); + + // Compute the number of free variables + i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2; + settings.log.printf("%d free variables\n", num_free_variables); + + // Compute the number of upper bounds + i_t num_upper_bounds = 0; + for (i_t j = 0; j < n; j++) { + if (lp.upper[j] < inf) { num_upper_bounds++; } + } + iteration_data_t data(lp, num_upper_bounds, settings); + if (toc(start_time) > settings.time_limit) { + settings.log.printf("Barrier time limit exceeded\n"); + return lp_status_t::TIME_LIMIT; + } + settings.log.printf("%d finite upper bounds\n", num_upper_bounds); + + initial_point(data); + if (toc(start_time) > settings.time_limit) { + settings.log.printf("Barrier time limit exceeded\n"); + return lp_status_t::TIME_LIMIT; + } + if (settings.concurrent_halt != nullptr && + settings.concurrent_halt->load(std::memory_order_acquire) == 1) { + settings.log.printf("Barrier solver halted\n"); + return lp_status_t::CONCURRENT_LIMIT; + } + compute_residuals(data.w, data.x, data.y, data.v, data.z, data); + + f_t primal_residual_norm = std::max(vector_norm_inf(data.primal_residual), + vector_norm_inf(data.bound_residual)); + f_t dual_residual_norm = vector_norm_inf(data.dual_residual); + f_t complementarity_residual_norm = + std::max(vector_norm_inf(data.complementarity_xz_residual), + vector_norm_inf(data.complementarity_wv_residual)); + f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / + (static_cast(n) + static_cast(num_upper_bounds)); + + f_t norm_b = vector_norm_inf(data.b); + f_t norm_c = vector_norm_inf(data.c); + + f_t primal_objective = data.c.inner_product(data.x); + + f_t relative_primal_residual = primal_residual_norm / (1.0 + norm_b); + f_t relative_dual_residual = dual_residual_norm / (1.0 + norm_c); + f_t relative_complementarity_residual = + complementarity_residual_norm / (1.0 + std::abs(primal_objective)); + + dense_vector_t restrict_u(num_upper_bounds); + dense_vector_t upper(lp.upper); + data.gather_upper_bounds(upper, restrict_u); + f_t dual_objective = data.b.inner_product(data.y) - restrict_u.inner_product(data.v); + + i_t iter = 0; + settings.log.printf( + " Objective Residual Step-Length " + "Time\n"); + settings.log.printf( + "Iter Primal Dual Primal Dual Compl. Primal Dual " + "Elapsed\n"); + float64_t elapsed_time = toc(start_time); + settings.log.printf("%2d %+.12e %+.12e %.2e %.2e %.2e %.1f\n", + iter, + primal_objective, + dual_objective, + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm, + elapsed_time); + + bool converged = primal_residual_norm < options.feasibility_tol && + dual_residual_norm < options.optimality_tol && + complementarity_residual_norm < options.complementarity_tol; + + data.w_save = data.w; + data.x_save = data.x; + data.y_save = data.y; + data.v_save = data.v; + data.z_save = data.z; + + const i_t iteration_limit = std::min(options.iteration_limit, settings.iteration_limit); + + while (iter < iteration_limit) { + if (toc(start_time) > settings.time_limit) { + settings.log.printf("Barrier time limit exceeded\n"); + return lp_status_t::TIME_LIMIT; + } + if (settings.concurrent_halt != nullptr && + settings.concurrent_halt->load(std::memory_order_acquire) == 1) { + settings.log.printf("Barrier solver halted\n"); + return lp_status_t::CONCURRENT_LIMIT; + } + // Compute the affine step + data.primal_rhs = data.primal_residual; + data.bound_rhs = data.bound_residual; + data.dual_rhs = data.dual_residual; + data.complementarity_xz_rhs = data.complementarity_xz_residual; + data.complementarity_wv_rhs = data.complementarity_wv_residual; + // x.*z -> -x .* z + data.complementarity_xz_rhs.multiply_scalar(-1.0); + // w.*v -> -w .* v + data.complementarity_wv_rhs.multiply_scalar(-1.0); + + f_t max_affine_residual = 0.0; + i_t status = compute_search_direction( + data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual); + if (status < 0) { + return check_for_suboptimal_solution(options, + data, + iter, + norm_b, + norm_c, + primal_objective, + relative_primal_residual, + relative_dual_residual, + relative_complementarity_residual, + solution); + } + if (toc(start_time) > settings.time_limit) { + settings.log.printf("Barrier time limit exceeded\n"); + return lp_status_t::TIME_LIMIT; + } + if (settings.concurrent_halt != nullptr && + settings.concurrent_halt->load(std::memory_order_acquire) == 1) { + settings.log.printf("Barrier solver halted\n"); + return lp_status_t::CONCURRENT_LIMIT; + } + + f_t step_primal_aff = std::min(max_step_to_boundary(data.w, data.dw_aff), + max_step_to_boundary(data.x, data.dx_aff)); + f_t step_dual_aff = std::min(max_step_to_boundary(data.v, data.dv_aff), + max_step_to_boundary(data.z, data.dz_aff)); + + // w_aff = w + step_primal_aff * dw_aff + // x_aff = x + step_primal_aff * dx_aff + // v_aff = v + step_dual_aff * dv_aff + // z_aff = z + step_dual_aff * dz_aff + dense_vector_t w_aff = data.w; + dense_vector_t x_aff = data.x; + dense_vector_t v_aff = data.v; + dense_vector_t z_aff = data.z; + w_aff.axpy(step_primal_aff, data.dw_aff, 1.0); + x_aff.axpy(step_primal_aff, data.dx_aff, 1.0); + v_aff.axpy(step_dual_aff, data.dv_aff, 1.0); + z_aff.axpy(step_dual_aff, data.dz_aff, 1.0); + + dense_vector_t complementarity_xz_aff(lp.num_cols); + dense_vector_t complementarity_wv_aff(num_upper_bounds); + x_aff.pairwise_product(z_aff, complementarity_xz_aff); + w_aff.pairwise_product(v_aff, complementarity_wv_aff); + + f_t complementarity_aff_sum = complementarity_xz_aff.sum() + complementarity_wv_aff.sum(); + f_t mu_aff = + (complementarity_aff_sum) / (static_cast(n) + static_cast(num_upper_bounds)); + f_t sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); + f_t new_mu = sigma * mu_aff; + + // Compute the combined centering corrector step + data.primal_rhs.set_scalar(0.0); + data.bound_rhs.set_scalar(0.0); + data.dual_rhs.set_scalar(0.0); + // complementarity_xz_rhs = -dx_aff .* dz_aff + sigma * mu + data.dx_aff.pairwise_product(data.dz_aff, data.complementarity_xz_rhs); + data.complementarity_xz_rhs.multiply_scalar(-1.0); + data.complementarity_xz_rhs.add_scalar(new_mu); + + // complementarity_wv_rhs = -dw_aff .* dv_aff + sigma * mu + data.dw_aff.pairwise_product(data.dv_aff, data.complementarity_wv_rhs); + data.complementarity_wv_rhs.multiply_scalar(-1.0); + data.complementarity_wv_rhs.add_scalar(new_mu); + + f_t max_corrector_residual = 0.0; + status = compute_search_direction( + data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual); + if (status < 0) { + return check_for_suboptimal_solution(options, + data, + iter, + norm_b, + norm_c, + primal_objective, + relative_primal_residual, + relative_dual_residual, + relative_complementarity_residual, + solution); + } + data.has_factorization = false; + if (toc(start_time) > settings.time_limit) { + settings.log.printf("Barrier time limit exceeded\n"); + return lp_status_t::TIME_LIMIT; + } + if (settings.concurrent_halt != nullptr && + settings.concurrent_halt->load(std::memory_order_acquire) == 1) { + settings.log.printf("Barrier solver halted\n"); + return lp_status_t::CONCURRENT_LIMIT; + } + + // dw = dw_aff + dw_cc + // dx = dx_aff + dx_cc + // dy = dy_aff + dy_cc + // dv = dv_aff + dv_cc + // dz = dz_aff + dz_cc + // Note: dw_cc - dz_cc are stored in dw - dz + data.dw.axpy(1.0, data.dw_aff, 1.0); + data.dx.axpy(1.0, data.dx_aff, 1.0); + data.dy.axpy(1.0, data.dy_aff, 1.0); + data.dv.axpy(1.0, data.dv_aff, 1.0); + data.dz.axpy(1.0, data.dz_aff, 1.0); + + f_t max_step_primal = + std::min(max_step_to_boundary(data.w, data.dw), max_step_to_boundary(data.x, data.dx)); + f_t max_step_dual = + std::min(max_step_to_boundary(data.v, data.dv), max_step_to_boundary(data.z, data.dz)); + + f_t step_primal = options.step_scale * max_step_primal; + f_t step_dual = options.step_scale * max_step_dual; + + data.w.axpy(step_primal, data.dw, 1.0); + data.x.axpy(step_primal, data.dx, 1.0); + data.y.axpy(step_dual, data.dy, 1.0); + data.v.axpy(step_dual, data.dv, 1.0); + data.z.axpy(step_dual, data.dz, 1.0); + + // Handle free variables + if (num_free_variables > 0) { + for (i_t k = 0; k < 2 * num_free_variables; k += 2) { + i_t u = presolve_info.free_variable_pairs[k]; + i_t v = presolve_info.free_variable_pairs[k + 1]; + f_t u_val = data.x[u]; + f_t v_val = data.x[v]; + f_t min_val = std::min(u_val, v_val); + f_t eta = options.step_scale * min_val; + data.x[u] -= eta; + data.x[v] -= eta; + } + } + + compute_residual_norms(data.w, + data.x, + data.y, + data.v, + data.z, + data, + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm); + + mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / + (static_cast(n) + static_cast(num_upper_bounds)); + + primal_objective = data.c.inner_product(data.x); + dual_objective = data.b.inner_product(data.y) - restrict_u.inner_product(data.v); + + relative_primal_residual = primal_residual_norm / (1.0 + norm_b); + relative_dual_residual = dual_residual_norm / (1.0 + norm_c); + relative_complementarity_residual = + complementarity_residual_norm / (1.0 + std::abs(primal_objective)); + + if (relative_primal_residual < 100 * options.feasibility_tol && + relative_dual_residual < 100 * options.optimality_tol && + relative_complementarity_residual < 100 * options.complementarity_tol) { + data.w_save = data.w; + data.x_save = data.x; + data.y_save = data.y; + data.v_save = data.v; + data.z_save = data.z; + } + + iter++; + elapsed_time = toc(start_time); + + if (primal_objective != primal_objective || dual_objective != dual_objective) { + settings.log.printf("Numerical error in objective\n"); + return lp_status_t::NUMERICAL_ISSUES; + } + + settings.log.printf("%2d %+.12e %+.12e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.1f\n", + iter, + primal_objective + lp.obj_constant, + dual_objective + lp.obj_constant, + relative_primal_residual, + relative_dual_residual, + relative_complementarity_residual, + step_primal, + step_dual, + std::min(data.complementarity_xz_residual.minimum(), + data.complementarity_wv_residual.minimum()), + mu, + std::max(max_affine_residual, max_corrector_residual), + elapsed_time); + + bool primal_feasible = relative_primal_residual < options.feasibility_tol; + bool dual_feasible = relative_dual_residual < options.optimality_tol; + bool small_gap = relative_complementarity_residual < options.complementarity_tol; + + converged = primal_feasible && dual_feasible && small_gap; + + if (converged) { + settings.log.printf( + "Optimal solution found in %d iterations and %.2fs\n", iter, toc(start_time)); + settings.log.printf("Objective %+.8e\n", primal_objective + lp.obj_constant); + settings.log.printf("Primal infeasibility (abs/rel): %8.2e/%8.2e\n", + primal_residual_norm, + relative_primal_residual); + settings.log.printf("Dual infeasibility (abs/rel): %8.2e/%8.2e\n", + dual_residual_norm, + relative_dual_residual); + settings.log.printf("Complementarity gap (abs/rel): %8.2e/%8.2e\n", + complementarity_residual_norm, + relative_complementarity_residual); + data.to_solution(lp, + iter, + primal_objective, + primal_objective + lp.obj_constant, + primal_residual_norm, + dual_residual_norm, + solution); + return lp_status_t::OPTIMAL; + } + } + data.to_solution(lp, + iter, + primal_objective, + primal_objective + lp.obj_constant, + vector_norm2(data.primal_residual), + vector_norm2(data.dual_residual), + solution); + return lp_status_t::ITERATION_LIMIT; +} + +#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE +template class barrier_solver_t; +template class sparse_cholesky_base_t; +template class sparse_cholesky_cudss_t; +template class iteration_data_t; +template class barrier_solver_settings_t; +#endif + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/barrier.hpp b/cpp/src/dual_simplex/barrier.hpp new file mode 100644 index 000000000..b8ec5fed6 --- /dev/null +++ b/cpp/src/dual_simplex/barrier.hpp @@ -0,0 +1,100 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "dual_simplex/dense_vector.hpp" +#include "dual_simplex/presolve.hpp" +#include "dual_simplex/simplex_solver_settings.hpp" +#include "dual_simplex/solution.hpp" +#include "dual_simplex/solve.hpp" +#include "dual_simplex/sparse_cholesky.hpp" +#include "dual_simplex/sparse_matrix.hpp" +#include "dual_simplex/tic_toc.hpp" + +namespace cuopt::linear_programming::dual_simplex { + +template +struct barrier_solver_settings_t { + f_t feasibility_tol = 1e-8; + f_t optimality_tol = 1e-8; + f_t complementarity_tol = 1e-8; + i_t iteration_limit = 1000; + f_t step_scale = 0.9; +}; + +template +class iteration_data_t; // Forward declare + +template +class barrier_solver_t { + public: + barrier_solver_t(const lp_problem_t& lp, + const presolve_info_t& presolve, + const simplex_solver_settings_t& settings) + : lp(lp), settings(settings), presolve_info(presolve) + { + } + lp_status_t solve(const barrier_solver_settings_t& options, + lp_solution_t& solution); + + private: + void initial_point(iteration_data_t& data); + void compute_residual_norms(const dense_vector_t& w, + const dense_vector_t& x, + const dense_vector_t& y, + const dense_vector_t& v, + const dense_vector_t& z, + iteration_data_t& data, + f_t& primal_residual_norm, + f_t& dual_residual_norm, + f_t& complementarity_residual_norm); + void compute_residuals(const dense_vector_t& w, + const dense_vector_t& x, + const dense_vector_t& y, + const dense_vector_t& v, + const dense_vector_t& z, + iteration_data_t& data); + f_t max_step_to_boundary(const dense_vector_t& x, + const dense_vector_t& dx, + i_t& index) const; + f_t max_step_to_boundary(const dense_vector_t& x, + const dense_vector_t& dx) const; + i_t compute_search_direction(iteration_data_t& data, + dense_vector_t& dw, + dense_vector_t& dx, + dense_vector_t& dy, + dense_vector_t& dv, + dense_vector_t& dz, + f_t& max_residual); + + lp_status_t check_for_suboptimal_solution(const barrier_solver_settings_t& options, + iteration_data_t& data, + i_t iter, + f_t norm_b, + f_t norm_c, + f_t& primal_objective, + f_t& relative_primal_residual, + f_t& relative_dual_residual, + f_t& relative_complementarity_residual, + lp_solution_t& solution); + + const lp_problem_t& lp; + const simplex_solver_settings_t& settings; + const presolve_info_t& presolve_info; +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/dense_vector.hpp b/cpp/src/dual_simplex/dense_vector.hpp new file mode 100644 index 000000000..b8e5cf114 --- /dev/null +++ b/cpp/src/dual_simplex/dense_vector.hpp @@ -0,0 +1,181 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "dual_simplex/types.hpp" + +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +class dense_vector_t : public std::vector { + public: + dense_vector_t(i_t n) { this->resize(n, 0.0); } + dense_vector_t(const std::vector& in) + { + this->resize(in.size()); + for (i_t i = 0; i < in.size(); i++) { + (*this)[i] = in[i]; + } + } + + f_t minimum() const + { + const i_t n = this->size(); + f_t min_x = inf; + for (i_t i = 0; i < n; i++) { + min_x = std::min(min_x, (*this)[i]); + } + return min_x; + } + + f_t maximum() const + { + const i_t n = this->size(); + f_t max_x = -inf; + for (i_t i = 0; i < n; i++) { + max_x = std::max(max_x, (*this)[i]); + } + return max_x; + } + + // b <- sqrt(a) + void sqrt(dense_vector_t& b) const + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + b[i] = std::sqrt((*this)[i]); + } + } + // a <- a + alpha + void add_scalar(f_t alpha) + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + (*this)[i] += alpha; + } + } + // a <- alpha * e, e = (1, 1, ..., 1) + void set_scalar(f_t alpha) + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + (*this)[i] = alpha; + } + } + // a <- alpha * a + void multiply_scalar(f_t alpha) + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + (*this)[i] *= alpha; + } + } + f_t sum() const + { + f_t sum = 0.0; + const i_t n = this->size(); + for (i_t i = 0; i < n; ++i) { + sum += (*this)[i]; + } + return sum; + } + + f_t inner_product(dense_vector_t& b) const + { + f_t dot = 0.0; + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + dot += (*this)[i] * b[i]; + } + return dot; + } + + // c <- a .* b + void pairwise_product(const dense_vector_t& b, dense_vector_t& c) const + { + const i_t n = this->size(); + if (b.size() != n) { + printf("Error: b.size() %d != n %d\n", b.size(), n); + exit(1); + } + if (c.size() != n) { + printf("Error: c.size() %d != n %d\n", c.size(), n); + exit(1); + } + for (i_t i = 0; i < n; i++) { + c[i] = (*this)[i] * b[i]; + } + } + // c <- a ./ b + void pairwise_divide(const dense_vector_t& b, dense_vector_t& c) const + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + c[i] = (*this)[i] / b[i]; + } + } + + // c <- a - b + void pairwise_subtract(const dense_vector_t& b, dense_vector_t& c) const + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + c[i] = (*this)[i] - b[i]; + } + } + + // y <- alpha * x + beta * y + void axpy(f_t alpha, const dense_vector_t& x, f_t beta) + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + (*this)[i] = alpha * x[i] + beta * (*this)[i]; + } + } + + void ensure_positive(f_t epsilon_adjust) + { + const f_t mix_x = minimum(); + if (mix_x <= 0.0) { + const f_t delta_x = -mix_x + epsilon_adjust; + add_scalar(delta_x); + } + } + + void bound_away_from_zero(f_t epsilon_adjust) + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + if ((*this)[i] < epsilon_adjust) { (*this)[i] = epsilon_adjust; } + } + } + + // b <- 1.0 /a + void inverse(dense_vector_t& b) const + { + const i_t n = this->size(); + for (i_t i = 0; i < n; i++) { + b[i] = 1.0 / (*this)[i]; + } + } +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index 457da9113..bb113e4ba 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -219,9 +220,10 @@ template i_t remove_rows(lp_problem_t& problem, const std::vector& row_sense, csr_matrix_t& Arow, - std::vector& row_marker) + std::vector& row_marker, + bool error_on_nonzero_rhs) { - constexpr bool verbose = false; + constexpr bool verbose = true; if (verbose) { printf("Removing rows %d %ld\n", Arow.m, row_marker.size()); } csr_matrix_t Aout; Arow.remove_rows(row_marker, Aout); @@ -236,7 +238,7 @@ i_t remove_rows(lp_problem_t& problem, new_rhs[row_count] = problem.rhs[i]; row_count++; } else { - if (problem.rhs[i] != 0.0) { + if (error_on_nonzero_rhs && problem.rhs[i] != 0.0) { if (verbose) { printf( "Error nonzero rhs %e for zero row %d sense %c\n", problem.rhs[i], i, row_sense[i]); @@ -273,7 +275,7 @@ i_t remove_empty_rows(lp_problem_t& problem, row_marker[i] = 0; } } - const i_t retval = remove_rows(problem, row_sense, Arow, row_marker); + const i_t retval = remove_rows(problem, row_sense, Arow, row_marker, true); return retval; } @@ -522,14 +524,11 @@ i_t find_dependent_rows(lp_problem_t& problem, csc_matrix_t U(m, m, nz); std::vector pinv(n); std::vector q(m); - for (i_t i = 0; i < m; ++i) { - q[i] = i; - } - std::optional> optional_q = q; - // TODO: Replace with right looking LU in crossover PR - // i_t pivots = left_looking_lu(C, settings, 1e-13, optional_q, L, U, pinv); - i_t pivots = 0; + + i_t pivots = right_looking_lu_row_permutation_only(C, settings, 1e-13, tic(), q, pinv); + if (pivots < m) { + settings.log.printf("Found %d dependent rows\n", m - pivots); const i_t num_dependent = m - pivots; std::vector independent_rhs(pivots); std::vector dependent_rhs(num_dependent); @@ -537,7 +536,7 @@ i_t find_dependent_rows(lp_problem_t& problem, i_t ind_count = 0; i_t dep_count = 0; for (i_t i = 0; i < m; ++i) { - i_t row = (*optional_q)[i]; + i_t row = q[i]; if (i < pivots) { dependent_rows[row] = 0; independent_rhs[ind_count++] = problem.rhs[row]; @@ -548,6 +547,7 @@ i_t find_dependent_rows(lp_problem_t& problem, } } +#if 0 std::vector z = independent_rhs; // Solve U1^T z = independent_rhs for (i_t k = 0; k < pivots; ++k) { @@ -579,6 +579,9 @@ i_t find_dependent_rows(lp_problem_t& problem, problem.rhs[dependent_row_list[k]] = 0.0; } } +#endif + } else { + settings.log.printf("No dependent rows found\n"); } return pivots; } @@ -694,27 +697,12 @@ void convert_user_problem(const user_problem_t& user_problem, bound_strengthening(row_sense, settings, problem); } - // The original problem may have a variable without a lower bound - // but a finite upper bound - // -inf < x_j <= u_j - i_t no_lower_bound = 0; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -INFINITY && problem.upper[j] < INFINITY) { no_lower_bound++; } - } - - // The original problem may have nonzero lower bounds - // 0 != l_j <= x_j <= u_j - i_t nonzero_lower_bounds = 0; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] != 0.0 && problem.lower[j] > -INFINITY) { nonzero_lower_bounds++; } - } - if (less_rows > 0) { convert_less_than_to_equal(user_problem, row_sense, problem, less_rows, new_slacks); } // Add artifical variables - add_artifical_variables(problem, equality_rows, new_slacks); + if (!settings.barrier_presolve) { add_artifical_variables(problem, equality_rows, new_slacks); } } template @@ -726,11 +714,115 @@ i_t presolve(const lp_problem_t& original, problem = original; std::vector row_sense(problem.num_rows, '='); + // The original problem may have a variable without a lower bound + // but a finite upper bound + // -inf < x_j <= u_j + i_t no_lower_bound = 0; + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] == -inf && problem.upper[j] < inf) { no_lower_bound++; } + } + settings.log.printf("%d variables with no lower bound\n", no_lower_bound); + + // The original problem may have nonzero lower bounds + // 0 != l_j <= x_j <= u_j + i_t nonzero_lower_bounds = 0; + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; } + } + if (settings.barrier_presolve && nonzero_lower_bounds > 0) { + settings.log.printf("Transforming %ld nonzero lower bound\n", nonzero_lower_bounds); + presolve_info.removed_lower_bounds.resize(problem.num_cols); + // We can construct a new variable: x'_j = x_j - l_j or x_j = x'_j + l_j + // than we have 0 <= x'_j <= u_j - l_j + // Constraints in the form: + // sum_{k != j} a_ik x_k + a_ij * x_j {=, <=} beta_i + // become + // sum_{k != j} a_ik x_k + a_ij * (x'_j + l_j) {=, <=} beta_i + // or + // sum_{k != j} a_ik x_k + a_ij * x'_j {=, <=} beta_i - a_{ij} l_j + // + // the cost function + // sum_{k != j} c_k x_k + c_j * x_j + // becomes + // sum_{k != j} c_k x_k + c_j (x'_j + l_j) + // + // so we get the constant term c_j * l_j + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { + for (i_t p = problem.A.col_start[j]; p < problem.A.col_start[j + 1]; p++) { + i_t i = problem.A.i[p]; + f_t aij = problem.A.x[p]; + problem.rhs[i] -= aij * problem.lower[j]; + } + problem.obj_constant += problem.objective[j] * problem.lower[j]; + problem.upper[j] -= problem.lower[j]; + presolve_info.removed_lower_bounds[j] = problem.lower[j]; + problem.lower[j] = 0.0; + } + } + } + + // Check for free variables i_t free_variables = 0; for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -INFINITY && problem.upper[j] == INFINITY) { free_variables++; } + if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } + } + if (free_variables > 0) { + settings.log.printf("%d free variables\n", free_variables); + + // We have a variable x_j: with -inf < x_j < inf + // we create new variables v and w with 0 <= v, w and x_j = v - w + // Constraints + // sum_{k != j} a_ik x_k + a_ij x_j {=, <=} beta + // become + // sum_{k != j} a_ik x_k + aij v - a_ij w {=, <=} beta + // + // The cost function + // sum_{k != j} c_k x_k + c_j x_j + // becomes + // sum_{k != j} c_k x_k + c_j v - c_j w + + i_t num_cols = problem.num_cols + free_variables; + i_t nnz = problem.A.col_start[problem.num_cols]; + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { + nnz += (problem.A.col_start[j + 1] - problem.A.col_start[j]); + } + } + + problem.A.col_start.resize(num_cols + 1); + problem.A.i.resize(nnz); + problem.A.x.resize(nnz); + problem.lower.resize(num_cols); + problem.upper.resize(num_cols); + problem.objective.resize(num_cols); + + presolve_info.free_variable_pairs.resize(free_variables * 2); + i_t pair_count = 0; + i_t q = problem.A.col_start[problem.num_cols]; + i_t col = problem.num_cols; + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { + for (i_t p = problem.A.col_start[j]; p < problem.A.col_start[j + 1]; p++) { + i_t i = problem.A.i[p]; + f_t aij = problem.A.x[p]; + problem.A.i[q] = i; + problem.A.x[q] = -aij; + q++; + } + problem.lower[col] = 0.0; + problem.upper[col] = inf; + problem.objective[col] = -problem.objective[j]; + presolve_info.free_variable_pairs[pair_count++] = j; + presolve_info.free_variable_pairs[pair_count++] = col; + problem.A.col_start[++col] = q; + problem.lower[j] = 0.0; + } + } + assert(problem.A.col_start[num_cols] == nnz); + problem.A.n = num_cols; + problem.num_cols = num_cols; } - if (free_variables > 0) { settings.log.printf("%d free variables\n", free_variables); } // Check for empty rows i_t num_empty_rows = 0; @@ -760,11 +852,12 @@ i_t presolve(const lp_problem_t& original, } // Check for dependent rows - constexpr bool check_dependent_rows = false; + bool check_dependent_rows = false; // settings.barrier; if (check_dependent_rows) { std::vector dependent_rows; constexpr i_t kOk = -1; i_t infeasible; + f_t dependent_row_start = tic(); const i_t independent_rows = find_dependent_rows(problem, settings, dependent_rows, infeasible); if (infeasible != kOk) { settings.log.printf("Found problem infeasible in presolve\n"); @@ -775,8 +868,9 @@ i_t presolve(const lp_problem_t& original, settings.log.printf("%d dependent rows\n", num_dependent_rows); csr_matrix_t Arow; problem.A.to_compressed_row(Arow); - remove_rows(problem, row_sense, Arow, dependent_rows); + remove_rows(problem, row_sense, Arow, dependent_rows, false); } + settings.log.printf("Dependent row check in %.2fs\n", toc(dependent_row_start)); } assert(problem.num_rows == problem.A.m); assert(problem.num_cols == problem.A.n); @@ -1012,25 +1106,50 @@ void uncrush_solution(const presolve_info_t& presolve_info, if (presolve_info.removed_variables.size() == 0) { uncrushed_x = crushed_x; uncrushed_z = crushed_z; - return; - } + } else { + printf("Presolve info removed variables %d\n", presolve_info.removed_variables.size()); + // We removed some variables, so we need to map the crushed solution back to the original + // variables + const i_t n = presolve_info.removed_variables.size() + presolve_info.remaining_variables.size(); + uncrushed_x.resize(n); + uncrushed_z.resize(n); + + i_t k = 0; + for (const i_t j : presolve_info.remaining_variables) { + uncrushed_x[j] = crushed_x[k]; + uncrushed_z[j] = crushed_z[k]; + k++; + } - const i_t n = presolve_info.removed_variables.size() + presolve_info.remaining_variables.size(); - uncrushed_x.resize(n); - uncrushed_z.resize(n); + k = 0; + for (const i_t j : presolve_info.removed_variables) { + uncrushed_x[j] = presolve_info.removed_values[k]; + uncrushed_z[j] = presolve_info.removed_reduced_costs[k]; + k++; + } + } - i_t k = 0; - for (const i_t j : presolve_info.remaining_variables) { - uncrushed_x[j] = crushed_x[k]; - uncrushed_z[j] = crushed_z[k]; - k++; + const i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2; + if (num_free_variables > 0) { + printf("Presolve info free variables %d\n", num_free_variables); + // We added free variables so we need to map the crushed solution back to the original variables + for (i_t k = 0; k < 2 * num_free_variables; k += 2) { + const i_t u = presolve_info.free_variable_pairs[k]; + const i_t v = presolve_info.free_variable_pairs[k + 1]; + uncrushed_x[u] -= uncrushed_x[v]; + } + const i_t n = uncrushed_x.size(); + uncrushed_x.resize(n - num_free_variables); + uncrushed_z.resize(n - num_free_variables); } - k = 0; - for (const i_t j : presolve_info.removed_variables) { - uncrushed_x[j] = presolve_info.removed_values[k]; - uncrushed_z[j] = presolve_info.removed_reduced_costs[k]; - k++; + if (presolve_info.removed_lower_bounds.size() > 0) { + printf("Presolve info removed lower bounds %d\n", presolve_info.removed_lower_bounds.size()); + // We removed some lower bounds so we need to map the crushed solution back to the original + // variables + for (i_t j = 0; j < uncrushed_x.size(); j++) { + uncrushed_x[j] += presolve_info.removed_lower_bounds[j]; + } } } diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp index 7a307e6f7..b528578de 100644 --- a/cpp/src/dual_simplex/presolve.hpp +++ b/cpp/src/dual_simplex/presolve.hpp @@ -59,6 +59,10 @@ struct presolve_info_t { std::vector removed_values; // values of the removed reduced costs std::vector removed_reduced_costs; + // Free variable pairs + std::vector free_variable_pairs; + // Removed lower bounds + std::vector removed_lower_bounds; }; template diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index a51ed19bc..c14d09ab1 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -56,6 +56,10 @@ struct simplex_solver_settings_t { use_left_looking_lu(false), eliminate_singletons(true), print_presolve_stats(true), + barrier_presolve(false), + use_cudss(false), + barrier(false), + crossover(false), refactor_frequency(100), iteration_log_frequency(1000), first_iteration_log(2), @@ -99,8 +103,12 @@ struct simplex_solver_settings_t { bool relaxation; // true to only solve the LP relaxation of a MIP bool use_left_looking_lu; // true to use left looking LU factorization, false to use right looking - bool eliminate_singletons; // true to eliminate singletons from the basis - bool print_presolve_stats; // true to print presolve stats + bool eliminate_singletons; // true to eliminate singletons from the basis + bool print_presolve_stats; // true to print presolve stats + bool barrier_presolve; // true to use barrier presolve + bool use_cudss; // true to use cuDSS for sparse Cholesky factorization, false to use cholmod + bool barrier; // true to use barrier method, false to use dual simplex method + bool crossover; // true to do crossover, false to not i_t refactor_frequency; // number of basis updates before refactorization i_t iteration_log_frequency; // number of iterations between log updates i_t first_iteration_log; // number of iterations to log at beginning of solve diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index e665bae97..e3a43307c 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -17,7 +17,9 @@ #include +#include #include +#include #include #include #include @@ -236,6 +238,177 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original return lp_status; } +template +lp_status_t solve_linear_program_with_barrier(const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, + lp_solution_t& solution) +{ + f_t start_time = tic(); + lp_problem_t original_lp(1, 1, 1); + + // Convert the user problem to a linear program with only equality constraints + std::vector new_slacks; + simplex_solver_settings_t barrier_settings = settings; + barrier_settings.barrier_presolve = true; + convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks); + lp_solution_t lp_solution(original_lp.num_rows, original_lp.num_cols); + + // Presolve the linear program + presolve_info_t presolve_info; + lp_problem_t presolved_lp(1, 1, 1); + const i_t ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); + if (ok == -1) { return lp_status_t::INFEASIBLE; } + + // Apply columns scaling to the presolve LP + lp_problem_t barrier_lp( + presolved_lp.num_rows, presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); + std::vector column_scales; + column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); + + // Solve using barrier + lp_solution_t barrier_solution(barrier_lp.num_rows, barrier_lp.num_cols); + barrier_solver_t barrier_solver(barrier_lp, presolve_info, barrier_settings); + barrier_solver_settings_t barrier_solver_settings; + lp_status_t barrier_status = barrier_solver.solve(barrier_solver_settings, barrier_solution); + if (barrier_status == lp_status_t::OPTIMAL) { + std::vector scaled_residual = barrier_lp.rhs; + matrix_vector_multiply(barrier_lp.A, 1.0, barrier_solution.x, -1.0, scaled_residual); + f_t scaled_primal_residual = vector_norm_inf(scaled_residual); + settings.log.printf("Scaled Primal residual: %e\n", scaled_primal_residual); + + std::vector scaled_dual_residual = barrier_solution.z; + for (i_t j = 0; j < scaled_dual_residual.size(); ++j) { + scaled_dual_residual[j] -= barrier_lp.objective[j]; + } + matrix_transpose_vector_multiply( + barrier_lp.A, 1.0, barrier_solution.y, 1.0, scaled_dual_residual); + f_t scaled_dual_residual_norm = vector_norm_inf(scaled_dual_residual); + settings.log.printf("Scaled Dual residual: %e\n", scaled_dual_residual_norm); + + // Unscale the solution + std::vector unscaled_x(barrier_lp.num_cols); + std::vector unscaled_z(barrier_lp.num_cols); + unscale_solution( + column_scales, barrier_solution.x, barrier_solution.z, unscaled_x, unscaled_z); + + std::vector residual = presolved_lp.rhs; + matrix_vector_multiply(presolved_lp.A, 1.0, unscaled_x, -1.0, residual); + f_t primal_residual = vector_norm_inf(residual); + settings.log.printf("Unscaled Primal residual: %e\n", primal_residual); + + std::vector unscaled_dual_residual = unscaled_z; + for (i_t j = 0; j < unscaled_dual_residual.size(); ++j) { + unscaled_dual_residual[j] -= presolved_lp.objective[j]; + } + matrix_transpose_vector_multiply( + presolved_lp.A, 1.0, barrier_solution.y, 1.0, unscaled_dual_residual); + f_t unscaled_dual_residual_norm = vector_norm_inf(unscaled_dual_residual); + settings.log.printf("Unscaled Dual residual: %e\n", unscaled_dual_residual_norm); + + // Undo presolve + lp_solution.y = barrier_solution.y; + uncrush_solution(presolve_info, unscaled_x, unscaled_z, lp_solution.x, lp_solution.z); + + std::vector post_solve_residual = original_lp.rhs; + matrix_vector_multiply(original_lp.A, 1.0, lp_solution.x, -1.0, post_solve_residual); + f_t post_solve_primal_residual = vector_norm_inf(post_solve_residual); + settings.log.printf("Post-solve Primal residual: %e\n", post_solve_primal_residual); + + std::vector post_solve_dual_residual = lp_solution.z; + for (i_t j = 0; j < post_solve_dual_residual.size(); ++j) { + post_solve_dual_residual[j] -= original_lp.objective[j]; + } + matrix_transpose_vector_multiply( + original_lp.A, 1.0, lp_solution.y, 1.0, post_solve_dual_residual); + f_t post_solve_dual_residual_norm = vector_norm_inf(post_solve_dual_residual); + settings.log.printf("Post-solve Dual residual: %e\n", post_solve_dual_residual_norm); + + uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); + uncrush_dual_solution( + user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z); + solution.objective = barrier_solution.objective; + solution.user_objective = barrier_solution.user_objective; + solution.l2_primal_residual = barrier_solution.l2_primal_residual; + solution.l2_dual_residual = barrier_solution.l2_dual_residual; + solution.iterations = barrier_solution.iterations; + } + + // If we aren't doing crossover, we're done + if (!settings.crossover) { return barrier_status; } + + if (settings.crossover && barrier_status == lp_status_t::OPTIMAL) { + // Check to see if we need to add artifical variables + csc_matrix_t Atranspose(1, 1, 1); + original_lp.A.transpose(Atranspose); + std::vector artificial_variables; + artificial_variables.reserve(original_lp.num_rows); + for (i_t i = 0; i < original_lp.num_rows; ++i) { + const i_t row_start = Atranspose.col_start[i]; + const i_t row_end = Atranspose.col_start[i + 1]; + bool found_slack = false; + for (i_t p = row_start; p < row_end; ++p) { + const i_t j = Atranspose.i[p]; + const i_t nz_col = original_lp.A.col_start[j + 1] - original_lp.A.col_start[j]; + if (nz_col == 1) { + found_slack = true; + break; + } + } + if (!found_slack) { + // settings.log.printf("No slack found for row %d\n", i); + artificial_variables.push_back(i); + } + } + if (artificial_variables.size() > 0) { + settings.log.printf("Adding %ld artificial variables\n", artificial_variables.size()); + const i_t additional_vars = artificial_variables.size(); + const i_t new_cols = original_lp.num_cols + additional_vars; + i_t col = original_lp.num_cols; + i_t nz = original_lp.A.col_start[col]; + const i_t new_nz = nz + additional_vars; + original_lp.A.col_start.resize(new_cols + 1); + original_lp.A.x.resize(new_nz); + original_lp.A.i.resize(new_nz); + original_lp.objective.resize(new_cols); + original_lp.lower.resize(new_cols); + original_lp.upper.resize(new_cols); + lp_solution.x.resize(new_cols); + lp_solution.z.resize(new_cols); + for (i_t i : artificial_variables) { + original_lp.A.x[nz] = 1.0; + original_lp.A.i[nz] = i; + original_lp.objective[col] = 0.0; + original_lp.lower[col] = 0.0; + original_lp.upper[col] = 0.0; + lp_solution.x[col] = 0.0; + lp_solution.z[col] = 0.0; + nz++; + col++; + original_lp.A.col_start[col] = nz; + } + original_lp.A.n = new_cols; + original_lp.num_cols = new_cols; + printf("nz %d =? new_nz %d =? Acol %d, num_cols %d =? new_cols %d x size %ld z size %ld\n", + nz, + new_nz, + original_lp.A.col_start[original_lp.num_cols], + original_lp.num_cols, + new_cols, + lp_solution.x.size(), + lp_solution.z.size()); + } + + // Run crossover + lp_solution_t crossover_solution(original_lp.num_rows, original_lp.num_cols); + std::vector vstatus(original_lp.num_cols); + crossover_status_t crossover_status = crossover( + original_lp, barrier_settings, lp_solution, start_time, crossover_solution, vstatus); + settings.log.printf("Crossover status: %d\n", crossover_status); + if (crossover_status == crossover_status_t::OPTIMAL) { barrier_status = lp_status_t::OPTIMAL; } + } + return barrier_status; +} + template lp_status_t solve_linear_program(const user_problem_t& user_problem, const simplex_solver_settings_t& settings, @@ -342,6 +515,11 @@ template lp_status_t solve_linear_program_advanced( std::vector& vstatus, std::vector& edge_norms); +template lp_status_t solve_linear_program_with_barrier( + const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, + lp_solution_t& solution); + template lp_status_t solve_linear_program(const user_problem_t& user_problem, const simplex_solver_settings_t& settings, lp_solution_t& solution); diff --git a/cpp/src/dual_simplex/solve.hpp b/cpp/src/dual_simplex/solve.hpp index f20844d6e..a54acf697 100644 --- a/cpp/src/dual_simplex/solve.hpp +++ b/cpp/src/dual_simplex/solve.hpp @@ -56,6 +56,11 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original std::vector& vstatus, std::vector& edge_norms); +template +lp_status_t solve_linear_program_with_barrier(const user_problem_t& user_problem, + const simplex_solver_settings_t& settings, + lp_solution_t& solution); + template lp_status_t solve_linear_program(const user_problem_t& user_problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/sparse_cholesky.hpp b/cpp/src/dual_simplex/sparse_cholesky.hpp new file mode 100644 index 000000000..4b777584f --- /dev/null +++ b/cpp/src/dual_simplex/sparse_cholesky.hpp @@ -0,0 +1,372 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "dual_simplex/dense_vector.hpp" +#include "dual_simplex/simplex_solver_settings.hpp" +#include "dual_simplex/sparse_matrix.hpp" +#include "dual_simplex/tic_toc.hpp" + +#include + +#include "cudss.h" + +namespace cuopt::linear_programming::dual_simplex { + +template +class sparse_cholesky_base_t { + public: + virtual ~sparse_cholesky_base_t() = default; + virtual i_t analyze(const csc_matrix_t& A_in) = 0; + virtual i_t factorize(const csc_matrix_t& A_in) = 0; + virtual i_t solve(const dense_vector_t& b, dense_vector_t& x) = 0; + virtual void set_positive_definite(bool positive_definite) = 0; +}; + +#define CUDSS_EXAMPLE_FREE \ + do { \ + } while (0) + +#define CUDA_CALL_AND_CHECK(call, msg) \ + do { \ + cuda_error = call; \ + if (cuda_error != cudaSuccess) { \ + printf("FAILED: CUDA API returned error = %d, details: " #msg "\n", cuda_error); \ + CUDSS_EXAMPLE_FREE; \ + return -1; \ + } \ + } while (0); + +#define CUDA_CALL_AND_CHECK_EXIT(call, msg) \ + do { \ + cuda_error = call; \ + if (cuda_error != cudaSuccess) { \ + printf("FAILED: CUDA API returned error = %d, details: " #msg "\n", cuda_error); \ + CUDSS_EXAMPLE_FREE; \ + exit(-1); \ + } \ + } while (0); + +#define CUDSS_CALL_AND_CHECK(call, status, msg) \ + do { \ + status = call; \ + if (status != CUDSS_STATUS_SUCCESS) { \ + printf("FAILED: CUDSS call ended unsuccessfully with status = %d, details: " #msg "\n", \ + status); \ + CUDSS_EXAMPLE_FREE; \ + return -2; \ + } \ + } while (0); + +#define CUDSS_CALL_AND_CHECK_EXIT(call, status, msg) \ + do { \ + status = call; \ + if (status != CUDSS_STATUS_SUCCESS) { \ + printf("FAILED: CUDSS call ended unsuccessfully with status = %d, details: " #msg "\n", \ + status); \ + CUDSS_EXAMPLE_FREE; \ + exit(-2); \ + } \ + } while (0); + +template +class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { + public: + sparse_cholesky_cudss_t(const simplex_solver_settings_t& settings, i_t size) + : n(size), nnz(-1), first_factor(true), positive_definite(true), settings_(settings) + { + int major, minor, patch; + cudssGetProperty(MAJOR_VERSION, &major); + cudssGetProperty(MINOR_VERSION, &minor); + cudssGetProperty(PATCH_LEVEL, &patch); + settings.log.printf("CUDSS Version %d.%d.%d\n", major, minor, patch); + + cuda_error = cudaSuccess; + status = CUDSS_STATUS_SUCCESS; + CUDA_CALL_AND_CHECK_EXIT(cudaStreamCreate(&stream), "cudaStreamCreate"); + CUDSS_CALL_AND_CHECK_EXIT(cudssCreate(&handle), status, "cudssCreate"); + CUDSS_CALL_AND_CHECK_EXIT(cudssConfigCreate(&solverConfig), status, "cudssConfigCreate"); + CUDSS_CALL_AND_CHECK_EXIT(cudssDataCreate(handle, &solverData), status, "cudssDataCreate"); + +#ifdef USE_AMD + // Tell cuDSS to use AMD + cudssAlgType_t reorder_alg = CUDSS_ALG_3; + CUDSS_CALL_AND_CHECK_EXIT( + cudssConfigSet( + solverConfig, CUDSS_CONFIG_REORDERING_ALG, &reorder_alg, sizeof(cudssAlgType_t)), + status, + "cudssConfigSet for reordering alg"); +#endif + +#if 0 + int32_t ir_n_steps = 2; + CUDSS_CALL_AND_CHECK_EXIT(cudssConfigSet(solverConfig, CUDSS_CONFIG_IR_N_STEPS, + &ir_n_steps, sizeof(int32_t)), status, "cudssConfigSet for ir n steps"); +#endif + + // Device pointers + csr_offset_d = nullptr; + csr_columns_d = nullptr; + csr_values_d = nullptr; + x_values_d = nullptr; + b_values_d = nullptr; + CUDA_CALL_AND_CHECK_EXIT(cudaMalloc(&x_values_d, n * sizeof(f_t)), "cudaMalloc for x_values"); + CUDA_CALL_AND_CHECK_EXIT(cudaMalloc(&b_values_d, n * sizeof(f_t)), "cudaMalloc for b_values"); + + i_t ldb = n; + i_t ldx = n; + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixCreateDn(&cudss_b, n, 1, ldb, b_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR), + status, + "cudssMatrixCreateDn for b"); + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixCreateDn(&cudss_x, n, 1, ldx, x_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR), + status, + "cudssMatrixCreateDn for x"); + } + ~sparse_cholesky_cudss_t() override + { + cudaFree(csr_values_d); + cudaFree(csr_columns_d); + cudaFree(csr_offset_d); + + cudaFree(x_values_d); + cudaFree(b_values_d); + + CUDSS_CALL_AND_CHECK_EXIT(cudssMatrixDestroy(A), status, "cudssMatrixDestroy for A"); + + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixDestroy(cudss_x), status, "cudssMatrixDestroy for cudss_x"); + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixDestroy(cudss_b), status, "cudssMatrixDestroy for cudss_b"); + CUDSS_CALL_AND_CHECK_EXIT(cudssDataDestroy(handle, solverData), status, "cudssDataDestroy"); + CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy"); + CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy"); + CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + CUDA_CALL_AND_CHECK_EXIT(cudaStreamDestroy(stream), "cudaStreamDestroy"); + } + i_t analyze(const csc_matrix_t& A_in) override + { + csr_matrix_t Arow; +#ifdef WRITE_MATRIX_MARKET + FILE* fid = fopen("A.mtx", "w"); + A_in.write_matrix_market(fid); + fclose(fid); +#endif + A_in.to_compressed_row(Arow); + + nnz = A_in.col_start[A_in.n]; + + CUDA_CALL_AND_CHECK(cudaMalloc(&csr_offset_d, (n + 1) * sizeof(i_t)), + "cudaMalloc for csr_offset"); + CUDA_CALL_AND_CHECK(cudaMalloc(&csr_columns_d, nnz * sizeof(i_t)), + "cudaMalloc for csr_columns"); + CUDA_CALL_AND_CHECK(cudaMalloc(&csr_values_d, nnz * sizeof(f_t)), "cudaMalloc for csr_values"); + + CUDA_CALL_AND_CHECK( + cudaMemcpy( + csr_offset_d, Arow.row_start.data(), (n + 1) * sizeof(i_t), cudaMemcpyHostToDevice), + "cudaMemcpy for csr_offset"); + CUDA_CALL_AND_CHECK( + cudaMemcpy(csr_columns_d, Arow.j.data(), nnz * sizeof(i_t), cudaMemcpyHostToDevice), + "cudaMemcpy for csr_columns"); + CUDA_CALL_AND_CHECK( + cudaMemcpy(csr_values_d, Arow.x.data(), nnz * sizeof(f_t), cudaMemcpyHostToDevice), + "cudaMemcpy for csr_values"); + + if (!first_factor) { + CUDSS_CALL_AND_CHECK(cudssMatrixDestroy(A), status, "cudssMatrixDestroy for A"); + } + + CUDSS_CALL_AND_CHECK( + cudssMatrixCreateCsr(&A, + n, + n, + nnz, + csr_offset_d, + nullptr, + csr_columns_d, + csr_values_d, + CUDA_R_32I, + CUDA_R_64F, + positive_definite ? CUDSS_MTYPE_SPD : CUDSS_MTYPE_SYMMETRIC, + CUDSS_MVIEW_FULL, + CUDSS_BASE_ZERO), + status, + "cudssMatrixCreateCsr"); + + // Perform symbolic analysis + f_t start_analysis = tic(); + + CUDSS_CALL_AND_CHECK( + cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b), + status, + "cudssExecute for reordering"); + + f_t reorder_time = toc(start_analysis); + settings_.log.printf("Reordering time %.2fs\n", reorder_time); + + f_t start_symbolic = tic(); + + CUDSS_CALL_AND_CHECK( + cudssExecute( + handle, CUDSS_PHASE_SYMBOLIC_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b), + status, + "cudssExecute for symbolic factorization"); + + f_t symbolic_time = toc(start_symbolic); + f_t analysis_time = toc(start_analysis); + settings_.log.printf("Symbolic factorization time %.2fs\n", symbolic_time); + settings_.log.printf("Symbolic time %.2fs\n", analysis_time); + int64_t lu_nz = 0; + size_t size_written = 0; + CUDSS_CALL_AND_CHECK( + cudssDataGet(handle, solverData, CUDSS_DATA_LU_NNZ, &lu_nz, sizeof(int64_t), &size_written), + status, + "cudssDataGet for LU_NNZ"); + settings_.log.printf("Symbolic nonzeros in factor %e\n", static_cast(lu_nz) / 2.0); + // TODO: Is there any way to get nonzeros in the factors? + // TODO: Is there any way to get flops for the factorization? + + return 0; + } + i_t factorize(const csc_matrix_t& A_in) override + { + csr_matrix_t Arow; + A_in.to_compressed_row(Arow); + + if (nnz != A_in.col_start[A_in.n]) { + settings_.log.printf( + "Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, A_in.col_start[A_in.n]); + exit(1); + } + + CUDA_CALL_AND_CHECK( + cudaMemcpy(csr_values_d, Arow.x.data(), nnz * sizeof(f_t), cudaMemcpyHostToDevice), + "cudaMemcpy for csr_values"); + + CUDSS_CALL_AND_CHECK( + cudssMatrixSetValues(A, csr_values_d), status, "cudssMatrixSetValues for A"); + + f_t start_numeric = tic(); + CUDSS_CALL_AND_CHECK( + cudssExecute( + handle, CUDSS_PHASE_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b), + status, + "cudssExecute for factorization"); + + f_t numeric_time = toc(start_numeric); + + int info; + size_t sizeWritten = 0; + CUDSS_CALL_AND_CHECK( + cudssDataGet(handle, solverData, CUDSS_DATA_INFO, &info, sizeof(info), &sizeWritten), + status, + "cudssDataGet for info"); + if (info != 0) { + settings_.log.printf("Factorization failed info %d\n", info); + return -1; + } + + if (first_factor) { + settings_.log.printf("Factor time %.2fs\n", numeric_time); + first_factor = false; + } + if (status != CUDSS_STATUS_SUCCESS) { + settings_.log.printf("cuDSS Factorization failed\n"); + return -1; + } + return 0; + } + + i_t solve(const dense_vector_t& b, dense_vector_t& x) override + { + if (b.size() != n) { + settings_.log.printf("Error: b.size() %d != n %d\n", b.size(), n); + exit(1); + } + if (x.size() != n) { + settings_.log.printf("Error: x.size() %d != n %d\n", x.size(), n); + exit(1); + } + CUDA_CALL_AND_CHECK(cudaMemcpy(b_values_d, b.data(), n * sizeof(f_t), cudaMemcpyHostToDevice), + "cudaMemcpy for b_values"); + CUDA_CALL_AND_CHECK(cudaMemcpy(x_values_d, x.data(), n * sizeof(f_t), cudaMemcpyHostToDevice), + "cudaMemcpy for x_values"); + + CUDSS_CALL_AND_CHECK( + cudssMatrixSetValues(cudss_b, b_values_d), status, "cudssMatrixSetValues for b"); + CUDSS_CALL_AND_CHECK( + cudssMatrixSetValues(cudss_x, x_values_d), status, "cudssMatrixSetValues for x"); + + i_t ldb = n; + i_t ldx = n; + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixCreateDn(&cudss_b, n, 1, ldb, b_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR), + status, + "cudssMatrixCreateDn for b"); + CUDSS_CALL_AND_CHECK_EXIT( + cudssMatrixCreateDn(&cudss_x, n, 1, ldx, x_values_d, CUDA_R_64F, CUDSS_LAYOUT_COL_MAJOR), + status, + "cudssMatrixCreateDn for x"); + + CUDSS_CALL_AND_CHECK( + cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig, solverData, A, cudss_x, cudss_b), + status, + "cudssExecute for solve"); + + CUDA_CALL_AND_CHECK(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + + CUDA_CALL_AND_CHECK(cudaMemcpy(x.data(), x_values_d, n * sizeof(f_t), cudaMemcpyDeviceToHost), + "cudaMemcpy for x"); + + for (i_t i = 0; i < n; i++) { + if (x[i] != x[i]) { return -1; } + } + + return 0; + } + + void set_positive_definite(bool positive_definite) override + { + this->positive_definite = positive_definite; + } + + private: + i_t n; + i_t nnz; + bool first_factor; + bool positive_definite; + cudaError_t cuda_error; + cudssStatus_t status; + cudaStream_t stream; + cudssHandle_t handle; + cudssConfig_t solverConfig; + cudssData_t solverData; + cudssMatrix_t A; + cudssMatrix_t cudss_x; + cudssMatrix_t cudss_b; + i_t* csr_offset_d; + i_t* csr_columns_d; + f_t* csr_values_d; + f_t* x_values_d; + f_t* b_values_d; + + const simplex_solver_settings_t& settings_; +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp index dc4df3990..22d286cda 100644 --- a/cpp/src/dual_simplex/sparse_matrix.cpp +++ b/cpp/src/dual_simplex/sparse_matrix.cpp @@ -372,12 +372,40 @@ void csc_matrix_t::print_matrix(FILE* fid) const fprintf(fid, "A = sparse(ijx(:, 1), ijx(:, 2), ijx(:, 3), %d, %d);\n", this->m, this->n); } +template +void csc_matrix_t::write_matrix_market(FILE* fid) const +{ + fprintf(fid, "%%%%MatrixMarket matrix coordinate real general\n"); + fprintf(fid, "%d %d %d\n", this->m, this->n, this->col_start[this->n]); + for (i_t j = 0; j < this->n; ++j) { + const i_t col_beg = this->col_start[j]; + const i_t col_end = this->col_start[j + 1]; + for (i_t p = col_beg; p < col_end; ++p) { + fprintf(fid, "%d %d %.16e\n", this->i[p] + 1, j + 1, this->x[p]); + } + } +} + template void csc_matrix_t::print_matrix() const { this->print_matrix(stdout); } +template +void csc_matrix_t::scale_columns(const std::vector& scale) +{ + const i_t n = this->n; + assert(scale.size() == n); + for (i_t j = 0; j < n; ++j) { + const i_t col_start = this->col_start[j]; + const i_t col_end = this->col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + this->x[p] *= scale[j]; + } + } +} + template i_t scatter(const csc_matrix_t& A, i_t j, diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp index 1fa898b61..d09a88381 100644 --- a/cpp/src/dual_simplex/sparse_matrix.hpp +++ b/cpp/src/dual_simplex/sparse_matrix.hpp @@ -87,9 +87,15 @@ class csc_matrix_t { // Prints the matrix to a file void print_matrix(FILE* fid) const; + // Writes the matrix to a file in Matrix Market format + void write_matrix_market(FILE* fid) const; + // Compute || A ||_1 = max_j (sum {i = 1 to m} | A(i, j) | ) f_t norm1() const; + // Perform column scaling of the matrix + void scale_columns(const std::vector& scale); + i_t m; // number of rows i_t n; // number of columns i_t nz_max; // maximum number of entries diff --git a/cpp/src/linear_programming/solve.cu b/cpp/src/linear_programming/solve.cu index 504d87bd3..ebe20f4b2 100644 --- a/cpp/src/linear_programming/solve.cu +++ b/cpp/src/linear_programming/solve.cu @@ -287,13 +287,84 @@ optimization_problem_solution_t convert_dual_simplex_sol( if (termination_status != pdlp_termination_status_t::Optimal && termination_status != pdlp_termination_status_t::TimeLimit && termination_status != pdlp_termination_status_t::ConcurrentLimit) { - CUOPT_LOG_INFO("Dual simplex status %s", sol.get_termination_status_string().c_str()); + CUOPT_LOG_INFO("Solve status %s", sol.get_termination_status_string().c_str()); } problem.handle_ptr->sync_stream(); return sol; } +template +std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t> +run_barrier(dual_simplex::user_problem_t& user_problem, + pdlp_solver_settings_t const& settings) +{ + auto start_solver = std::chrono::high_resolution_clock::now(); + + f_t norm_user_objective = dual_simplex::vector_norm2(user_problem.objective); + f_t norm_rhs = dual_simplex::vector_norm2(user_problem.rhs); + + dual_simplex::simplex_solver_settings_t dual_simplex_settings; + dual_simplex_settings.time_limit = settings.time_limit; + dual_simplex_settings.iteration_limit = settings.iteration_limit; + dual_simplex_settings.concurrent_halt = settings.concurrent_halt; + dual_simplex_settings.use_cudss = settings.use_cudss; + dual_simplex_settings.barrier = true; + dual_simplex_settings.crossover = settings.crossover; + if (dual_simplex_settings.concurrent_halt != nullptr) { + // Don't show the dual simplex log in concurrent mode. Show the PDLP log instead + dual_simplex_settings.log.log = false; + } + + dual_simplex::lp_solution_t solution(user_problem.num_rows, user_problem.num_cols); + auto status = dual_simplex::solve_linear_program_with_barrier( + user_problem, dual_simplex_settings, solution); + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start_solver); + + CUOPT_LOG_INFO("Barrier finished in %.2f seconds", duration.count() / 1000.0); + + if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL || + status == dual_simplex::lp_status_t::UNBOUNDED || + status == dual_simplex::lp_status_t::INFEASIBLE)) { + // We finished. Tell PDLP to stop if it is still running. + settings.concurrent_halt->store(1, std::memory_order_release); + } + + return {std::move(solution), status, duration.count() / 1000.0, norm_user_objective, norm_rhs}; +} + +template +optimization_problem_solution_t run_barrier( + detail::problem_t& problem, pdlp_solver_settings_t const& settings) +{ + // Convert data structures to dual simplex format and back + dual_simplex::user_problem_t dual_simplex_problem = + cuopt_problem_to_simplex_problem(problem); + auto sol_dual_simplex = run_barrier(dual_simplex_problem, settings); + return convert_dual_simplex_sol(problem, + std::get<0>(sol_dual_simplex), + std::get<1>(sol_dual_simplex), + std::get<2>(sol_dual_simplex), + std::get<3>(sol_dual_simplex), + std::get<4>(sol_dual_simplex)); +} + +template +void run_barrier_thread( + dual_simplex::user_problem_t& problem, + pdlp_solver_settings_t const& settings, + std::unique_ptr< + std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>>& + sol_ptr) +{ + // We will return the solution from the thread as a unique_ptr + sol_ptr = std::make_unique< + std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>>( + run_barrier(problem, settings)); +} + template std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t> run_dual_simplex(dual_simplex::user_problem_t& user_problem, @@ -499,12 +570,25 @@ optimization_problem_solution_t run_concurrent( std::ref(settings_pdlp), std::ref(sol_dual_simplex_ptr)); + dual_simplex::user_problem_t barrier_problem = dual_simplex_problem; + // Create a thread for barrier + std::unique_ptr< + std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>> + sol_barrier_ptr; + std::thread barrier_thread(run_barrier_thread, + std::ref(barrier_problem), + std::ref(settings_pdlp), + std::ref(sol_barrier_ptr)); + // Run pdlp in the main thread auto sol_pdlp = run_pdlp(problem, settings_pdlp, is_batch_mode); // Wait for dual simplex thread to finish dual_simplex_thread.join(); + // Wait for barrier thread to finish + barrier_thread.join(); + // copy the dual simplex solution to the device auto sol_dual_simplex = convert_dual_simplex_sol(problem, std::get<0>(*sol_dual_simplex_ptr), @@ -513,6 +597,14 @@ optimization_problem_solution_t run_concurrent( std::get<3>(*sol_dual_simplex_ptr), std::get<4>(*sol_dual_simplex_ptr)); + // copy the barrier solution to the device + auto sol_barrier = convert_dual_simplex_sol(problem, + std::get<0>(*sol_barrier_ptr), + std::get<1>(*sol_barrier_ptr), + std::get<2>(*sol_barrier_ptr), + std::get<3>(*sol_barrier_ptr), + std::get<4>(*sol_barrier_ptr)); + f_t end_time = dual_simplex::toc(start_time); CUOPT_LOG_INFO("Concurrent time: %.3fs", end_time); // Check status to see if we should return the pdlp solution or the dual simplex solution @@ -528,6 +620,16 @@ optimization_problem_solution_t run_concurrent( sol_pdlp.get_additional_termination_information().number_of_steps_taken, end_time); return sol_pdlp; + } else if (sol_barrier.get_termination_status() == pdlp_termination_status_t::Optimal) { + CUOPT_LOG_INFO("Solved with barrier"); + sol_pdlp.copy_from(op_problem.get_handle_ptr(), sol_barrier); + sol_pdlp.set_solve_time(end_time); + CUOPT_LOG_INFO("Status: %s Objective: %.8e Iterations: %d Time: %.3fs", + sol_pdlp.get_termination_status_string().c_str(), + sol_pdlp.get_objective_value(), + sol_pdlp.get_additional_termination_information().number_of_steps_taken, + end_time); + return sol_pdlp; } else if (sol_pdlp.get_termination_status() == pdlp_termination_status_t::Optimal) { CUOPT_LOG_INFO("Solved with PDLP"); return sol_pdlp; @@ -549,6 +651,8 @@ optimization_problem_solution_t solve_lp_with_method( { if (settings.method == method_t::DualSimplex) { return run_dual_simplex(problem, settings); + } else if (settings.method == method_t::Barrier) { + return run_barrier(problem, settings); } else if (settings.method == method_t::Concurrent) { return run_concurrent(op_problem, problem, settings, is_batch_mode); } else { diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index a049d0d09..c097958dd 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -89,7 +89,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings int_parameters = { {CUOPT_ITERATION_LIMIT, &pdlp_settings.iteration_limit, 0, std::numeric_limits::max(), std::numeric_limits::max()}, {CUOPT_PDLP_SOLVER_MODE, reinterpret_cast(&pdlp_settings.pdlp_solver_mode), CUOPT_PDLP_SOLVER_MODE_STABLE1, CUOPT_PDLP_SOLVER_MODE_FAST1, CUOPT_PDLP_SOLVER_MODE_STABLE2}, - {CUOPT_METHOD, reinterpret_cast(&pdlp_settings.method), CUOPT_METHOD_CONCURRENT, CUOPT_METHOD_DUAL_SIMPLEX, CUOPT_METHOD_CONCURRENT}, + {CUOPT_METHOD, reinterpret_cast(&pdlp_settings.method), CUOPT_METHOD_CONCURRENT, CUOPT_METHOD_BARRIER, CUOPT_METHOD_CONCURRENT}, {CUOPT_NUM_CPU_THREADS, &mip_settings.num_cpu_threads, -1, std::numeric_limits::max(), -1} }; @@ -105,6 +105,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_LOG_TO_CONSOLE, &pdlp_settings.log_to_console, true}, {CUOPT_LOG_TO_CONSOLE, &mip_settings.log_to_console, true}, {CUOPT_CROSSOVER, &pdlp_settings.crossover, false}, + {CUOPT_USE_CUDSS, &pdlp_settings.use_cudss, false}, {CUOPT_PRESOLVE, &pdlp_settings.presolve, false}, {CUOPT_PRESOLVE, &mip_settings.presolve, true} }; diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index 0f2117991..44f6e8083 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -33,15 +33,25 @@ #include #include +#include + #include #include #include namespace cuopt::linear_programming::detail { +void force_link_cudss() +{ + cudssHandle_t handle; + cudssCreate(&handle); + cudssDestroy(handle); +} + // This serves as both a warm up but also a mandatory initial call to setup cuSparse and cuBLAS static void init_handler(const raft::handle_t* handle_ptr) { + force_link_cudss(); // Init cuBlas / cuSparse context here to avoid having it during solving time RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); diff --git a/dependencies.yaml b/dependencies.yaml index 07de52533..1da064fed 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -691,6 +691,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev + - libcudss-dev - cuda-nvtx-dev - matrix: cuda: "12.*" @@ -698,6 +699,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev + - libcudss-dev - cuda-nvtx-dev - cuda-nvvm - cuda-crt @@ -714,6 +716,7 @@ dependencies: - nvidia-curand-cu12 - nvidia-cusparse-cu12 - nvidia-cusolver-cu12 + - nvidia-cudss-cu12 - nvidia-nvtx-cu12 # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels # (e.g. for DLFW and pip devcontainers) @@ -728,6 +731,7 @@ dependencies: - nvidia-curand - nvidia-cusparse - nvidia-cusolver + - nvidia-cudss - nvidia-nvtx develop: common: diff --git a/python/libcuopt/CMakeLists.txt b/python/libcuopt/CMakeLists.txt index b99770887..5e76ce87d 100644 --- a/python/libcuopt/CMakeLists.txt +++ b/python/libcuopt/CMakeLists.txt @@ -69,6 +69,7 @@ set(rpaths "$ORIGIN/../../rapids_logger/lib64" "$ORIGIN/../../librmm/lib64" "$ORIGIN/../../nvidia/cublas/lib" + "$ORIGIN/../../nvidia/cu12/lib" "$ORIGIN/../../nvidia/curand/lib" "$ORIGIN/../../nvidia/cusolver/lib" "$ORIGIN/../../nvidia/cusparse/lib" diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml index 908d9e28c..7e768fdd4 100644 --- a/python/libcuopt/pyproject.toml +++ b/python/libcuopt/pyproject.toml @@ -46,6 +46,7 @@ dependencies = [ "cuopt-mps-parser==25.10.*,>=0.0.0a0", "librmm==25.10.*,>=0.0.0a0", "nvidia-cublas", + "nvidia-cudss", "nvidia-curand", "nvidia-cusolver", "nvidia-cusparse",