From 3fa8b943b6af10f242b462739e4304d6a0906983 Mon Sep 17 00:00:00 2001 From: YuanTingHsieh Date: Wed, 4 Dec 2024 17:38:34 -0800 Subject: [PATCH] Allow max_num_of_gh_pair_per_launch to be customized --- .../cuda_plugin/src/cuda_plugin.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h index 82f26d1a9b..eeee9892a7 100755 --- a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h +++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h @@ -101,12 +101,16 @@ class CUDAPlugin: public LocalPlugin { double total_agg_time_ = 0; double total_prepare_bin_time_ = 0; size_t gh_pair_array_size = 0; - int threads_per_block_ = 512; + int max_num_of_gh_pair_per_launch_ = 1 << 23; public: explicit CUDAPlugin(std::vector > const &args): LocalPlugin(args) { - bool fix_seed = get_bool(args, "fix_seed"); - threads_per_block_ = get_int(args, "threads_per_block"); + bool fix_seed = get_bool(args, "fix_seed", false); + // the maximum number of instances (big number) is limited by + // (1) CPU memory, as the gh_pair_array need to be hold in CPU memory + // (2) GPU memory, when the active gh_pair_array is copied into GPU for calculation + + max_num_of_gh_pair_per_launch_ = get_int(args, "max_num_of_gh_pair_per_launch", 1 << 23); paillier_cipher_ptr_ = new PaillierCipher(bits/2, fix_seed, debug_); encrypted_gh_pairs_ = nullptr; gh_pair_array = nullptr; @@ -568,11 +572,7 @@ class CUDAPlugin: public LocalPlugin { int tuple_length = 2; size_t IPB = threads_per_block_ / TPI; - // the maximum number of instances (big number) is limited by - // (1) CPU memory, as the gh_pair_array need to be hold in CPU memory - // (2) GPU memory, when the active gh_pair_array is copied into GPU for calculation - size_t max_num_of_instances_per_launch = 1 << 23; // maximum numbers that can fit into GPU memory - unsigned int max_blocks = max_num_of_instances_per_launch / IPB; + unsigned int max_blocks = max_num_of_gh_pair_per_launch_ / IPB; if (debug_) std::cout << overall_timer_.now() << ": Preparing bin_xxx" << std::endl; @@ -589,7 +589,7 @@ class CUDAPlugin: public LocalPlugin { if (debug_) std::cout << overall_timer_.now() << ": before prepareBinArray" << std::endl; size_t total_sample_ids = prepareBinArray(ridx, size); - if (debug_) std::cout << overall_timer_.now() << ": after prepareBinArray, max_num_of_instances_per_launch: " << max_num_of_instances_per_launch << " , total_sample_ids: " << total_sample_ids << std::endl; + if (debug_) std::cout << overall_timer_.now() << ": after prepareBinArray, max_num_of_gh_pair_per_launch_: " << max_num_of_gh_pair_per_launch_ << " , total_sample_ids: " << total_sample_ids << std::endl; // weird situation where all things are empty if (total_sample_ids == 0) { @@ -597,7 +597,7 @@ class CUDAPlugin: public LocalPlugin { return; } - int num_tuples_per_array = std::min(total_sample_ids, max_num_of_instances_per_launch) / tuple_length; + int num_tuples_per_array = std::min(total_sample_ids, max_num_of_gh_pair_per_launch_) / tuple_length; int num_gh_pairs = num_tuples_per_array * tuple_length; // needs to be a multiple of tuple size_t array_size = sizeof(GHPair) * num_gh_pairs; if (debug_) std::cout << "gh pair array size is " << array_size << std::endl; @@ -616,7 +616,7 @@ class CUDAPlugin: public LocalPlugin { total_prepare_bin_time_ += elapsed; #endif - if (debug_) std::cout << overall_timer_.now() << ": max_num_of_instances_per_launch: " << max_num_of_instances_per_launch << " num_gh_pairs: " << num_gh_pairs << std::endl; + if (debug_) std::cout << overall_timer_.now() << ": max_num_of_gh_pair_per_launch_: " << max_num_of_gh_pair_per_launch_ << " num_gh_pairs: " << num_gh_pairs << std::endl; int* rbt = (int*)malloc(sizeof(int) * num_tuples_per_array);