diff --git a/cpp/src/mip/presolve/bounds_presolve.cu b/cpp/src/mip/presolve/bounds_presolve.cu index 45fee622e..72440cd9a 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cu +++ b/cpp/src/mip/presolve/bounds_presolve.cu @@ -202,6 +202,7 @@ termination_criterion_t bound_presolve_t::bound_update_loop(problem_t< } pb.handle_ptr->sync_stream(); calculate_infeasible_redundant_constraints(pb); + solve_iter = iter; return criteria; } diff --git a/cpp/src/mip/presolve/bounds_presolve.cuh b/cpp/src/mip/presolve/bounds_presolve.cuh index 9a25b05e9..84853a781 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cuh +++ b/cpp/src/mip/presolve/bounds_presolve.cuh @@ -86,6 +86,7 @@ class bound_presolve_t { i_t infeas_constraints_count = 0; i_t redund_constraints_count = 0; probing_cache_t probing_cache; + i_t solve_iter; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu index 0a497d466..4b65de9c2 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu @@ -245,6 +245,7 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.cnst_bin_offsets, problem.offsets); + RAFT_CHECK_CUDA(stream_heavy_cnst); num_blocks_heavy_vars = create_heavy_item_block_segments(stream_heavy_vars, heavy_vars_vertex_ids, @@ -253,49 +254,33 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.vars_bin_offsets, problem.reverse_offsets); + RAFT_CHECK_CUDA(stream_heavy_vars); tmp_act.resize(2 * num_blocks_heavy_cnst, stream_heavy_cnst); tmp_bnd.resize(2 * num_blocks_heavy_vars, stream_heavy_vars); - std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); + std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = + sub_warp_meta(stream, warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); - std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); + std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = + sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); - stream.synchronize(); - streams.sync_all_issued(); + RAFT_CHECK_CUDA(stream); + streams.sync_test_all_issued(); if (!calc_slack_erase_inf_cnst_graph_created) { - bool erase_inf_cnst = true; - calc_slack_erase_inf_cnst_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_erase_inf_cnst_graph, - calc_slack_erase_inf_cnst_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(true); + calc_slack_erase_inf_cnst_graph_created = true; } if (!calc_slack_graph_created) { - bool erase_inf_cnst = false; - calc_slack_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_graph, - calc_slack_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(false); + calc_slack_graph_created = true; } if (!upd_bnd_graph_created) { - upd_bnd_graph_created = build_graph( - streams, - handle_ptr, - upd_bnd_graph, - upd_bnd_exec, - [this]() { this->calculate_bounds_update_graph(true); }, - [this]() { this->calculate_bounds_update_graph(); }); + create_bounds_update_graph(); + upd_bnd_graph_created = true; } } @@ -368,6 +353,116 @@ void load_balanced_bounds_presolve_t::calculate_activity_graph(bool er dry_run); } +template +void load_balanced_bounds_presolve_t::create_bounds_update_graph() +{ + using f_t2 = typename type_2::type; + cudaGraph_t upd_graph; + cudaGraphCreate(&upd_graph, 0); + cudaGraphNode_t bounds_changed_node; + { + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + cudaGraphAddMemcpyNode(&bounds_changed_node, upd_graph, NULL, 0, &memcpyParams); + } + + auto bounds_update_view = get_bounds_update_view(*pb); + + create_update_bounds_heavy_vars(upd_graph, + bounds_changed_node, + bounds_update_view, + make_span_2(tmp_bnd), + heavy_vars_vertex_ids, + heavy_vars_pseudo_block_ids, + heavy_vars_block_segments, + pb->vars_bin_offsets, + heavy_degree_cutoff, + num_blocks_heavy_vars); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, bounds_update_view, pb->vars_bin_offsets, heavy_degree_cutoff); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + bounds_update_view, + is_vars_sub_warp_single_bin, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets, + pb->vars_bin_offsets); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void load_balanced_bounds_presolve_t::create_constraint_slack_graph(bool erase_inf_cnst) +{ + using f_t2 = typename type_2::type; + cudaGraph_t cnst_slack_graph; + cudaGraphCreate(&cnst_slack_graph, 0); + + cudaGraphNode_t set_bounds_changed_node; + { + // TODO : Investigate why memset node is not captured manually + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + cudaGraphAddMemcpyNode(&set_bounds_changed_node, cnst_slack_graph, NULL, 0, &memcpyParams); + } + + auto activity_view = get_activity_view(*pb); + + create_activity_heavy_cnst(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + make_span_2(tmp_act), + heavy_cnst_vertex_ids, + heavy_cnst_pseudo_block_ids, + heavy_cnst_block_segments, + pb->cnst_bin_offsets, + heavy_degree_cutoff, + num_blocks_heavy_cnst, + erase_inf_cnst); + create_activity_per_block(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + pb->cnst_bin_offsets, + heavy_degree_cutoff, + erase_inf_cnst); + create_activity_sub_warp(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + is_cnst_sub_warp_single_bin, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + pb->cnst_bin_offsets, + erase_inf_cnst); + if (erase_inf_cnst) { + cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0); + } else { + cudaGraphInstantiate(&calc_slack_exec, cnst_slack_graph, NULL, NULL, 0); + } +} + template void load_balanced_bounds_presolve_t::calculate_bounds_update_graph(bool dry_run) { @@ -401,12 +496,13 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack_iter( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { // writes nans to constraint activities that are infeasible //-> less expensive checks for update bounds step raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_erase_inf_cnst_exec, handle_ptr->get_stream()); - handle_ptr->sync_stream(); } infeas_cnst_slack_set_to_nan = true; RAFT_CHECK_CUDA(handle_ptr->get_stream()); @@ -416,6 +512,8 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_exec, handle_ptr->get_stream()); @@ -428,13 +526,10 @@ template bool load_balanced_bounds_presolve_t::update_bounds_from_slack( const raft::handle_t* handle_ptr) { - i_t h_bounds_changed; - bounds_changed.set_value_to_zero_async(handle_ptr->get_stream()); - + // bounds_changed is copied to h_bounds_changed in upd_bnd_exec { raft::common::nvtx::range scope("upd_cuda_task_graph"); cudaGraphLaunch(upd_bnd_exec, handle_ptr->get_stream()); - h_bounds_changed = bounds_changed.value(handle_ptr->get_stream()); } RAFT_CHECK_CUDA(handle_ptr->get_stream()); constexpr i_t zero = 0; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh index 19aef04f8..42736b3a0 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh @@ -212,6 +212,8 @@ class load_balanced_bounds_presolve_t { activity_view_t get_activity_view(const load_balanced_problem_t& pb); bounds_update_view_t get_bounds_update_view(const load_balanced_problem_t& pb); + void create_bounds_update_graph(); + void create_constraint_slack_graph(bool erase_inf_cnst); rmm::cuda_stream main_stream; rmm::cuda_stream act_stream; @@ -221,6 +223,7 @@ class load_balanced_bounds_presolve_t { const load_balanced_problem_t* pb; rmm::device_scalar bounds_changed; + i_t h_bounds_changed; rmm::device_uvector cnst_slack; rmm::device_uvector vars_bnd; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh index 53a76536b..7eb2b41a9 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -108,6 +108,7 @@ i_t create_heavy_item_block_segments(rmm::cuda_stream_view stream, // Inclusive scan so that each block can determine which item it belongs to item_block_segments.set_element_to_zero_async(0, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), calc_blocks_per_vertex_iter, calc_blocks_per_vertex_iter + heavy_id_count, @@ -156,6 +157,7 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams, { if (num_blocks_heavy_cnst != 0) { auto heavy_cnst_stream = streams.get_stream(); + RAFT_CHECK_CUDA(heavy_cnst_stream); // TODO : Check heavy_cnst_block_segments size for profiling if (!dry_run) { auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); @@ -167,15 +169,18 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams, heavy_degree_cutoff, view, tmp_cnst_act); + RAFT_CHECK_CUDA(heavy_cnst_stream); auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; if (erase_inf_cnst) { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } else { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } } } @@ -201,9 +206,11 @@ void calc_activity_per_block(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } else { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } } } @@ -260,9 +267,11 @@ void calc_activity_sub_warp(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } else { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } } } @@ -303,10 +312,12 @@ void calc_activity_sub_warp(managed_stream_pool& streams, lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } else { lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } } } @@ -358,44 +369,310 @@ void calc_activity_sub_warp(managed_stream_pool& streams, } } -/// BOUNDS UPDATE +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 32; + auto cnst_per_block = block_dim / threads_per_constraint; + auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end); -template -void upd_bounds_heavy_vars(managed_stream_pool& streams, - bounds_update_view_t view, - raft::device_span tmp_vars_bnd, - const rmm::device_uvector& heavy_vars_vertex_ids, - const rmm::device_uvector& heavy_vars_pseudo_block_ids, - const rmm::device_uvector& heavy_vars_block_segments, - const std::vector& vars_bin_offsets, - i_t heavy_degree_cutoff, - i_t num_blocks_heavy_vars, - bool dry_run = false) + auto block_count = raft::ceildiv(cnst_id_end - cnst_id_beg, cnst_per_block); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + void* kernelArgs[] = {&cnst_id_beg, &cnst_id_end, &view}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) { - if (num_blocks_heavy_vars != 0) { - auto heavy_vars_stream = streams.get_stream(); - // TODO : Check heavy_vars_block_segments size for profiling - if (!dry_run) { - auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); - lb_upd_bnd_heavy_kernel - <<>>( - heavy_vars_beg_id, - make_span(heavy_vars_vertex_ids), - make_span(heavy_vars_pseudo_block_ids), - heavy_degree_cutoff, - view, - tmp_vars_bnd); - auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id; - finalize_upd_bnd_kernel<<>>( - heavy_vars_beg_id, make_span(heavy_vars_block_segments), tmp_vars_bnd, view); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, degree, degree, cnst_bin_offsets, erase_inf_cnst); +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(cnst_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + auto warp_cnst_offsets_span = make_span(warp_cnst_offsets); + auto warp_cnst_id_offsets_span = make_span(warp_cnst_id_offsets); + + void* kernelArgs[] = {&view, &warp_cnst_offsets_span, &warp_cnst_id_offsets_span}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + bool is_cnst_sub_warp_single_bin, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 4, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 2, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, cnst_bin_offsets, erase_inf_cnst); + } else { + if (is_cnst_sub_warp_single_bin) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 64, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 32, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, 4, cnst_bin_offsets, erase_inf_cnst); + } else { + create_activity_sub_warp(act_graph, + set_bounds_changed_node, + view, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + erase_inf_cnst); + } + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t degree_beg, + i_t degree_end, + bool erase_inf_cnst) +{ + static_assert(block_dim <= 1024, "Cannot launch kernel with more than 1024 threads"); + + auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end); + + auto block_count = cnst_id_end - cnst_id_beg; + if (block_count > 0) { + cudaGraphNode_t act_block_node; + void* kernelArgs[] = {&cnst_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; + } + + cudaGraphAddKernelNode(&act_block_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_block_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 32, 32, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 64, 64, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 128, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 256, 256, erase_inf_cnst); + } else { + //[1024, heavy_degree_cutoff/2] -> 1024 block size + create_activity_per_block(act_graph, + set_bounds_changed_node, + view, + cnst_bin_offsets, + 1024, + heavy_degree_cutoff / 2, + erase_inf_cnst); + //[512, 512] -> 128 block size + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 512, erase_inf_cnst); + } +} + +template +void create_activity_heavy_cnst(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + raft::device_span tmp_cnst_act, + const rmm::device_uvector& heavy_cnst_vertex_ids, + const rmm::device_uvector& heavy_cnst_pseudo_block_ids, + const rmm::device_uvector& heavy_cnst_block_segments, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_cnst, + bool erase_inf_cnst, + bool dry_run = false) +{ + if (num_blocks_heavy_cnst != 0) { + cudaGraphNode_t act_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto heavy_cnst_vertex_ids_span = make_span(heavy_cnst_vertex_ids); + auto heavy_cnst_pseudo_block_ids_span = make_span(heavy_cnst_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_cnst_beg_id, + &heavy_cnst_vertex_ids_span, + &heavy_cnst_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_cnst_act}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_calc_act_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&act_heavy_node, act_graph, NULL, 0, &kernelNodeParams); } + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; + auto heavy_cnst_block_segments_span = make_span(heavy_cnst_block_segments); + + void* kernelArgs[] = { + &heavy_cnst_beg_id, &heavy_cnst_block_segments_span, &tmp_cnst_act, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(num_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } else { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } + + cudaGraphAddKernelNode(&finalize_heavy_node, act_graph, NULL, 0, &kernelNodeParams); + } + + cudaGraphAddDependencies(act_graph, &act_heavy_node, &finalize_heavy_node, 1); + cudaGraphAddDependencies(act_graph, &finalize_heavy_node, &set_bounds_changed_node, 1); } } +/// BOUNDS UPDATE + template void upd_bounds_heavy_vars(managed_stream_pool& streams, bounds_update_view_t view, raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, const rmm::device_uvector& heavy_vars_block_segments, const std::vector& vars_bin_offsets, i_t heavy_degree_cutoff, @@ -410,7 +687,8 @@ void upd_bounds_heavy_vars(managed_stream_pool& streams, lb_upd_bnd_heavy_kernel <<>>( heavy_vars_beg_id, - make_span(heavy_vars_block_segments, 1, heavy_vars_block_segments.size()), + make_span(heavy_vars_vertex_ids), + make_span(heavy_vars_pseudo_block_ids), heavy_degree_cutoff, view, tmp_vars_bnd); @@ -555,4 +833,275 @@ void upd_bounds_sub_warp(managed_stream_pool& streams, } } } + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& vars_bin_offsets) +{ + constexpr i_t block_dim = 32; + auto vars_per_block = block_dim / threads_per_variable; + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = raft::ceildiv(vars_id_end - vars_id_beg, vars_per_block); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + void* kernelArgs[] = {&vars_id_beg, &vars_id_end, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree, + const std::vector& vars_bin_offsets) +{ + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, degree, degree, vars_bin_offsets); +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(vars_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + auto warp_vars_offsets_span = make_span(warp_vars_offsets); + auto warp_vars_id_offsets_span = make_span(warp_vars_id_offsets); + + void* kernelArgs[] = {&view, &warp_vars_offsets_span, &warp_vars_id_offsets_span}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + bool is_vars_sub_warp_single_bin, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets, + const std::vector& vars_bin_offsets) +{ + if (view.nnz < 10000) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 4, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 2, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, vars_bin_offsets); + } else { + if (is_vars_sub_warp_single_bin) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 64, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 32, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, 4, vars_bin_offsets); + } else { + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + view, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets); + } + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t degree_beg, + i_t degree_end) +{ + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = vars_id_end - vars_id_beg; + if (block_count > 0) { + cudaGraphNode_t upd_bnd_block_node; + + void* kernelArgs[] = {&vars_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_block_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_block_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_block_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff) +{ + if (view.nnz < 10000) { + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 32, 32); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 64, 64); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 128); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 256, 256); + } else { + //[1024, heavy_degree_cutoff/2] -> 128 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 1024, heavy_degree_cutoff / 2); + //[64, 512] -> 32 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 512); + } +} + +template +void create_update_bounds_heavy_vars(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, + const rmm::device_uvector& heavy_vars_block_segments, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_vars) +{ + if (num_blocks_heavy_vars != 0) { + cudaGraphNode_t upd_bnd_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto heavy_vars_vertex_ids_span = make_span(heavy_vars_vertex_ids); + auto heavy_vars_pseudo_block_ids_span = make_span(heavy_vars_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_vars_beg_id, + &heavy_vars_vertex_ids_span, + &heavy_vars_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_vars_bnd}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + // Add finalize + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id; + auto heavy_vars_block_segments_span = make_span(heavy_vars_block_segments); + + void* kernelArgs[] = { + &heavy_vars_beg_id, &heavy_vars_block_segments_span, &tmp_vars_bnd, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)finalize_upd_bnd_kernel; + kernelNodeParams.gridDim = dim3(num_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&finalize_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + cudaGraphAddDependencies(upd_graph, &upd_bnd_heavy_node, &finalize_heavy_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphAddDependencies(upd_graph, &finalize_heavy_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh index 328fa25b9..10089664a 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh @@ -98,40 +98,6 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; } } -template -__global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, - raft::device_span item_block_segments, - i_t work_per_block, - activity_view_t view, - raft::device_span tmp_cnst_act) -{ - __shared__ i_t id_map; - __shared__ i_t pseudo_block_id; - if (threadIdx.x == 0) { - id_map = thrust::upper_bound( - thrust::seq, item_block_segments.begin(), item_block_segments.end(), blockIdx.x) - - item_block_segments.begin(); - pseudo_block_id = blockIdx.x - item_block_segments[id_map - 1]; - } - __syncthreads(); - auto idx = id_range_beg + id_map; - i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; - i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - auto act = calc_act(view, threadIdx.x, item_off_beg, item_off_end); - - act.x = BlockReduce(temp_storage).Sum(act.x); - __syncthreads(); - act.y = BlockReduce(temp_storage).Sum(act.y); - - // don't subtract constraint bounds yet - // to be done in post processing in finalize_calc_act_kernel - if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; } -} - template inline __device__ void write_cnst_slack( activity_view_t view, i_t cnst_idx, f_t2 cnst_lb_ub, f_t2 act, f_t eps) diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt index b9fd249a5..020c537f6 100644 --- a/cpp/tests/mip/CMakeLists.txt +++ b/cpp/tests/mip/CMakeLists.txt @@ -27,6 +27,9 @@ ConfigureTest(ELIM_VAR_REMAP_TEST ConfigureTest(STANDARDIZATION_TEST ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu ) +ConfigureTest(LB_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/load_balancing_test.cu +) ConfigureTest(MULTI_PROBE_TEST ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu ) diff --git a/cpp/tests/mip/load_balancing_test.cu b/cpp/tests/mip/load_balancing_test.cu new file mode 100644 index 000000000..deed9ea85 --- /dev/null +++ b/cpp/tests/mip/load_balancing_test.cu @@ -0,0 +1,188 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +inline auto make_async() { return std::make_shared(); } + +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +std::tuple, std::vector, std::vector> select_k_random( + detail::problem_t& problem, int sample_size) +{ + auto seed = std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + problem.compute_n_integer_vars(); + auto v_lb = host_copy(problem.variable_lower_bounds); + auto v_ub = host_copy(problem.variable_upper_bounds); + auto int_var_id = host_copy(problem.integer_indices); + int_var_id.erase(std::remove_if(int_var_id.begin(), + int_var_id.end(), + [v_lb, v_ub](auto id) { + return !(std::isfinite(v_lb[id]) && std::isfinite(v_ub[id])); + }), + int_var_id.end()); + sample_size = std::min(sample_size, static_cast(int_var_id.size())); + std::vector random_int_vars; + std::mt19937 m{seed}; + std::sample( + int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m); + std::vector probe_0(sample_size); + std::vector probe_1(sample_size); + for (int i = 0; i < static_cast(random_int_vars.size()); ++i) { + if (i % 2) { + probe_0[i] = v_lb[random_int_vars[i]]; + probe_1[i] = v_ub[random_int_vars[i]]; + } else { + probe_1[i] = v_lb[random_int_vars[i]]; + probe_0[i] = v_ub[random_int_vars[i]]; + } + } + return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1)); +} + +std::pair>, std::vector>> +convert_probe_tuple(std::tuple, std::vector, std::vector>& probe) +{ + std::vector> probe_first; + std::vector> probe_second; + for (size_t i = 0; i < std::get<0>(probe).size(); ++i) { + probe_first.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<1>(probe)[i])); + probe_second.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<2>(probe)[i])); + } + return std::make_pair(std::move(probe_first), std::move(probe_second)); +} + +std::tuple, std::vector, std::vector, std::vector> +bounds_probe_results(detail::bound_presolve_t& bnd_prb_0, + detail::bound_presolve_t& bnd_prb_1, + detail::problem_t& problem, + const std::pair>, + std::vector>>& probe) +{ + auto& probe_first = std::get<0>(probe); + auto& probe_second = std::get<1>(probe); + rmm::device_uvector b_lb_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_lb_1(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_1(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb_0.solve(problem, probe_first); + bnd_prb_0.set_updated_bounds(problem.handle_ptr, make_span(b_lb_0), make_span(b_ub_0)); + bnd_prb_1.solve(problem, probe_second); + bnd_prb_1.set_updated_bounds(problem.handle_ptr, make_span(b_lb_1), make_span(b_ub_1)); + + auto h_lb_0 = host_copy(b_lb_0); + auto h_ub_0 = host_copy(b_ub_0); + auto h_lb_1 = host_copy(b_lb_1); + auto h_ub_1 = host_copy(b_ub_1); + return std::make_tuple( + std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1)); +} + +void test_multi_probe(std::string path) +{ + auto memory_resource = make_async(); + rmm::mr::set_current_device_resource(memory_resource.get()); + const raft::handle_t handle_{}; + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + detail::problem_t problem(op_problem); + mip_solver_settings_t default_settings{}; + detail::pdhg_solver_t pdhg_solver(problem.handle_ptr, problem); + detail::pdlp_initial_scaling_strategy_t scaling(&handle_, + problem, + 10, + 1.0, + pdhg_solver, + problem.reverse_coefficients, + problem.reverse_offsets, + problem.reverse_constraints, + true); + detail::mip_solver_t solver(problem, default_settings, scaling, cuopt::timer_t(0)); + detail::load_balanced_problem_t lb_problem(problem); + detail::load_balanced_bounds_presolve_t lb_prs(lb_problem, solver.context); + + detail::bound_presolve_t bnd_prb(solver.context); + + auto probe_tuple = select_k_random(problem, 100); + auto bounds_probe_vals = convert_probe_tuple(probe_tuple); + { + auto& probe_first = std::get<0>(bounds_probe_vals); + bnd_prb.solve(problem, probe_first); + rmm::device_uvector b_lb(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb.set_updated_bounds(problem.handle_ptr, make_span(b_lb), make_span(b_ub)); + + auto h_lb = host_copy(b_lb); + auto h_ub = host_copy(b_ub); + + lb_prs.solve(probe_first); + + auto bnds = host_copy(lb_prs.vars_bnd); + for (int i = 0; i < (int)h_lb.size(); ++i) { + EXPECT_DOUBLE_EQ(bnds[2 * i], h_lb[i]); + EXPECT_DOUBLE_EQ(bnds[2 * i + 1], h_ub[i]); + } + } +} + +TEST(presolve, multi_probe) +{ + std::vector test_instances = { + "mip/50v-10-free-bound.mps", "mip/neos5-free-bound.mps", "mip/neos5.mps"}; + for (const auto& test_instance : test_instances) { + auto path = make_path_absolute(test_instance); + test_multi_probe(path); + } +} + +} // namespace cuopt::linear_programming::test