From 3fa8b943b6af10f242b462739e4304d6a0906983 Mon Sep 17 00:00:00 2001
From: YuanTingHsieh <yuantingh@nvidia.com>
Date: Wed, 4 Dec 2024 17:38:34 -0800
Subject: [PATCH] Allow max_num_of_gh_pair_per_launch to be customized

---
 .../cuda_plugin/src/cuda_plugin.h             | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
index 82f26d1a9b..eeee9892a7 100755
--- a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
+++ b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
@@ -101,12 +101,16 @@ class CUDAPlugin: public LocalPlugin {
     double total_agg_time_ = 0;
     double total_prepare_bin_time_ = 0;
     size_t gh_pair_array_size = 0;
-    int threads_per_block_ = 512;
+    int max_num_of_gh_pair_per_launch_ = 1 << 23;
 
   public:
     explicit CUDAPlugin(std::vector<std::pair<std::string_view, std::string_view> > const &args): LocalPlugin(args) {
-      bool fix_seed = get_bool(args, "fix_seed");
-      threads_per_block_ = get_int(args, "threads_per_block");
+      bool fix_seed = get_bool(args, "fix_seed", false);
+      // the maximum number of instances (big number) is limited by
+      // (1) CPU memory, as the gh_pair_array needs to be held in CPU memory
+      // (2) GPU memory, as the active gh_pair_array is copied to the GPU for calculation
+
+      max_num_of_gh_pair_per_launch_ = get_int(args, "max_num_of_gh_pair_per_launch", 1 << 23);
       paillier_cipher_ptr_ = new PaillierCipher<bits>(bits/2, fix_seed, debug_);
       encrypted_gh_pairs_ = nullptr;
       gh_pair_array = nullptr;
@@ -568,11 +572,7 @@ class CUDAPlugin: public LocalPlugin {
       int tuple_length = 2;
       size_t IPB = threads_per_block_ / TPI;
 
-      // the maximum number of instances (big number) is limited by
-      // (1) CPU memory, as the gh_pair_array need to be hold in CPU memory
-      // (2) GPU memory, when the active gh_pair_array is copied into GPU for calculation
-      size_t max_num_of_instances_per_launch = 1 << 23; // maximum numbers that can fit into GPU memory
-      unsigned int max_blocks = max_num_of_instances_per_launch / IPB;
+      unsigned int max_blocks = max_num_of_gh_pair_per_launch_ / IPB;
 
       if (debug_) std::cout << overall_timer_.now() << ": Preparing bin_xxx" << std::endl;
 
@@ -589,7 +589,7 @@ class CUDAPlugin: public LocalPlugin {
       
       if (debug_) std::cout << overall_timer_.now() << ": before prepareBinArray" << std::endl;
       size_t total_sample_ids = prepareBinArray(ridx, size);
-      if (debug_) std::cout << overall_timer_.now() << ": after prepareBinArray, max_num_of_instances_per_launch: " << max_num_of_instances_per_launch << " , total_sample_ids: " << total_sample_ids << std::endl;
+      if (debug_) std::cout << overall_timer_.now() << ": after prepareBinArray, max_num_of_gh_pair_per_launch_: " << max_num_of_gh_pair_per_launch_ << " , total_sample_ids: " << total_sample_ids << std::endl;
 
       // weird situation where all things are empty
       if (total_sample_ids == 0) {
@@ -597,7 +597,7 @@ class CUDAPlugin: public LocalPlugin {
         return;
       }
 
-      int num_tuples_per_array = std::min(total_sample_ids, max_num_of_instances_per_launch) / tuple_length; 
+      int num_tuples_per_array = std::min(total_sample_ids, max_num_of_gh_pair_per_launch_) / tuple_length;
       int num_gh_pairs = num_tuples_per_array * tuple_length; // needs to be a multiple of tuple
       size_t array_size = sizeof(GHPair) * num_gh_pairs;
       if (debug_) std::cout << "gh pair array size is " << array_size << std::endl;
@@ -616,7 +616,7 @@ class CUDAPlugin: public LocalPlugin {
       total_prepare_bin_time_ += elapsed;
 #endif
 
-      if (debug_) std::cout << overall_timer_.now() << ": max_num_of_instances_per_launch: " << max_num_of_instances_per_launch << " num_gh_pairs: " << num_gh_pairs << std::endl;
+      if (debug_) std::cout << overall_timer_.now() << ": max_num_of_gh_pair_per_launch_: " << max_num_of_gh_pair_per_launch_ << " num_gh_pairs: " << num_gh_pairs << std::endl;
 
       int* rbt = (int*)malloc(sizeof(int) * num_tuples_per_array);