diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index e3e8641b4..fab2eea90 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -28,7 +28,10 @@
 #include
 #include
+#include
+#include
 #include
+#include
 #include
@@ -256,7 +259,9 @@ void run_single_file_mp(std::string file_path,
 {
   std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
   auto memory_resource = make_async();
-  rmm::mr::set_current_device_resource(memory_resource.get());
+  auto limiting_adaptor =
+    rmm::mr::limiting_resource_adaptor(memory_resource.get(), 6ULL * 1024ULL * 1024ULL * 1024ULL);
+  rmm::mr::set_current_device_resource(&limiting_adaptor);
   int sol_found = run_single_file(file_path,
                                   device,
                                   batch_id,
@@ -340,6 +345,15 @@ int main(int argc, char* argv[])
     .scan<'g', double>()
     .default_value(std::numeric_limits<double>::max());
 
+  program.add_argument("--memory-limit")
+    .help("memory limit in MB")
+    .scan<'g', double>()
+    .default_value(0.0);
+
+  program.add_argument("--track-allocations")
+    .help("track allocations (t/f)")
+    .default_value(std::string("f"));
+
   // Parse arguments
   try {
     program.parse_args(argc, argv);
@@ -362,10 +376,12 @@ int main(int argc, char* argv[])
   std::string result_file;
   int batch_num = -1;
 
-  bool heuristics_only = program.get("--heuristics-only")[0] == 't';
-  int num_cpu_threads  = program.get<int>("--num-cpu-threads");
-  bool write_log_file  = program.get("--write-log-file")[0] == 't';
-  bool log_to_console  = program.get("--log-to-console")[0] == 't';
+  bool heuristics_only   = program.get("--heuristics-only")[0] == 't';
+  int num_cpu_threads    = program.get<int>("--num-cpu-threads");
+  bool write_log_file    = program.get("--write-log-file")[0] == 't';
+  bool log_to_console    = program.get("--log-to-console")[0] == 't';
+  double memory_limit    = program.get<double>("--memory-limit");
+  bool track_allocations = program.get("--track-allocations")[0] == 't';
 
   if (program.is_used("--out-dir")) {
     out_dir = program.get("--out-dir");
@@ -469,7 +485,17 @@ int main(int argc, char* argv[])
     merge_result_files(out_dir, result_file, n_gpus, batch_num);
   } else {
     auto memory_resource = make_async();
-    rmm::mr::set_current_device_resource(memory_resource.get());
+    if (memory_limit > 0) {
+      auto limiting_adaptor =
+        rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL);
+      rmm::mr::set_current_device_resource(&limiting_adaptor);
+    } else if (track_allocations) {
+      rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(),
+                                                          /*capture_stacks=*/true);
+      rmm::mr::set_current_device_resource(&tracking_adaptor);
+    } else {
+      rmm::mr::set_current_device_resource(memory_resource.get());
+    }
     run_single_file(path,
                     0,
                     0,
diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu
index 1a5f76b03..fa1312a8e 100644
--- a/cpp/src/mip/problem/problem.cu
+++ b/cpp/src/mip/problem/problem.cu
@@ -21,6 +21,7 @@
 #include "problem_kernels.cuh"
 
 #include
+#include
 #include
 #include
@@ -810,16 +811,21 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
   handle_ptr->sync_stream();
 
+  // Previously used constants were based on 40GB of memory. Scale accordingly on smaller GPUs.
+  // We can't rely on querying free memory or on allocation try/catch,
+  // since that would break determinism guarantees (the GPU may be shared by other processes).
+  f_t size_factor = std::min(1.0, cuopt::get_device_memory_size() / 1e9 / 40.0);
+
   // TODO: determine optimal number of slices based on available GPU memory? This used to be 2e9 /
   // n_variables
-  i_t max_slice_size = 6e8 / n_variables;
+  i_t max_slice_size = 6e8 * size_factor / n_variables;
   rmm::device_uvector<i_t> varmap(max_slice_size * n_variables, handle_ptr->get_stream());
   rmm::device_uvector<i_t> offsets(max_slice_size * n_variables, handle_ptr->get_stream());
 
   related_variables.resize(0, handle_ptr->get_stream());
   // TODO: this used to be 1e8
-  related_variables.reserve(1e8, handle_ptr->get_stream());  // reserve space
+  related_variables.reserve(1e8 * size_factor, handle_ptr->get_stream());  // reserve space
   related_variables_offsets.resize(n_variables + 1, handle_ptr->get_stream());
   related_variables_offsets.set_element_to_zero_async(0, handle_ptr->get_stream());
@@ -863,7 +869,7 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
     auto current_time = std::chrono::high_resolution_clock::now();
     // if the related variable array would wind up being too large for available memory, abort
     // TODO this used to be 1e9
-    if (related_variables.size() > 1e9 ||
+    if (related_variables.size() > 1e9 * size_factor ||
         std::chrono::duration_cast(current_time - start_time).count() > time_limit) {
       CUOPT_LOG_DEBUG(
diff --git a/cpp/src/utilities/cuda_helpers.cuh b/cpp/src/utilities/cuda_helpers.cuh
index 3de820699..d70eb2d52 100644
--- a/cpp/src/utilities/cuda_helpers.cuh
+++ b/cpp/src/utilities/cuda_helpers.cuh
@@ -24,6 +24,8 @@
 #include
 #include
 #include
+#include
+#include
 
 namespace cuopt {
 
@@ -208,4 +210,27 @@ DI void sorted_insert(T* array, T item, int curr_size, int max_size)
   array[0] = item;
 }
 
+inline size_t get_device_memory_size()
+{
+  // Query the physical memory size of the device
+  size_t free_mem, total_mem;
+  cudaMemGetInfo(&free_mem, &total_mem);
+
+  auto res = rmm::mr::get_current_device_resource();
+  auto limiting_adaptor =
+    dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(res);
+  // Did we specify an explicit memory limit?
+  if (limiting_adaptor) {
+    printf("limiting_adaptor->get_allocation_limit(): %fMB\n",
+           limiting_adaptor->get_allocation_limit() / (double)1e6);
+    printf("used_mem: %fMB\n", limiting_adaptor->get_allocated_bytes() / (double)1e6);
+    printf("free_mem: %fMB\n",
+           (limiting_adaptor->get_allocation_limit() - limiting_adaptor->get_allocated_bytes()) /
+             (double)1e6);
+    return std::min(total_mem, limiting_adaptor->get_allocation_limit());
+  } else {
+    return total_mem;
+  }
+}
+
 }  // namespace cuopt
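
For context, below is a minimal, self-contained sketch of the pattern the patch relies on: install an rmm::mr::limiting_resource_adaptor as the current device resource, then recover the configured budget from generic code via dynamic_cast. The helper name query_memory_budget and the main() driver are illustrative only and are not part of the patch; the sketch also assumes the adaptor is instantiated over rmm::mr::device_memory_resource so that the cast in get_device_memory_size can find it regardless of the concrete upstream resource.

#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime_api.h>

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Mirrors the query in cuda_helpers.cuh: prefer the configured allocation limit
// (if a limiting_resource_adaptor is installed) over the physical device total.
inline std::size_t query_memory_budget()
{
  std::size_t free_mem = 0, total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);

  auto* res = rmm::mr::get_current_device_resource();
  auto* limiter =
    dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(res);
  return limiter ? std::min(total_mem, limiter->get_allocation_limit()) : total_mem;
}

int main()
{
  rmm::mr::cuda_async_memory_resource upstream;
  // Cap allocations at 6 GiB, mirroring the hard-coded limit in run_single_file_mp.
  rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> limiter(&upstream,
                                                                              6ULL << 30);
  rmm::mr::set_current_device_resource(&limiter);

  // Constants in compute_related_variables were tuned for a 40 GB GPU;
  // scale them down to whatever budget is visible.
  double size_factor = std::min(1.0, query_memory_budget() / 1e9 / 40.0);
  std::printf("budget: %zu bytes, size_factor: %.3f\n", query_memory_budget(), size_factor);
  return 0;
}

If the benchmark driver instead let CTAD deduce the concrete upstream type (for example limiting_resource_adaptor<cuda_async_memory_resource>), the dynamic_cast above would not match; instantiating the adaptor over device_memory_resource in both places keeps the lookup independent of the upstream resource type.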