Merge pull request #7 from Tixxx/tix/vhddwithlocalreductiongpu
Tixxx authored Aug 16, 2019
2 parents 4245b57 + 75363ef commit eabaa57
Showing 11 changed files with 676 additions and 192 deletions.
7 changes: 5 additions & 2 deletions horovod/common/global_state.h
@@ -71,7 +71,10 @@ struct HorovodGlobalState {

// Mutex to be used when accessing the queue of temp buffers
std::mutex buffer_lock;


// threads to be used for msallreduce operations
int num_msallreduce_threads;

HorovodGlobalState() {
auto horovod_number_of_threads = std::getenv(HOROVOD_NUMBER_OF_MPI_THREADS);
auto msallreduce = std::getenv(HOROVOD_MSALLREDUCE_ENABLE);
@@ -95,12 +98,12 @@ struct HorovodGlobalState {
// Making this static so that this pool is preserved throughout the lifetime of the program
LOG(INFO)<<"Starting "<<num_threads<<" MPI threads for threadpool.";
static boost::asio::thread_pool pool(num_threads);
num_msallreduce_threads = num_threads;
// Create a buffer manager for temp buffers for each thread
for (int i = 0; i < num_threads; ++i) {
temp_buffers.emplace();
}
background_thread_pool = &pool;
finished_parallel_reductions = 0;
}
}
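
For readers following the global_state.h change: num_msallreduce_threads records the size of the static boost::asio::thread_pool built from HOROVOD_NUMBER_OF_MPI_THREADS, and finished_parallel_reductions is reset so later code can post that many reduction tasks and wait for them. A minimal standalone sketch of that pattern (not Horovod code; the default thread count and the task body are assumptions):

// Sketch only: a static boost::asio::thread_pool sized from an environment
// variable, with an atomic counter used to wait for posted reduction tasks.
#include <boost/asio/post.hpp>
#include <boost/asio/thread_pool.hpp>
#include <atomic>
#include <cstdlib>
#include <iostream>
#include <thread>

int main() {
  // HOROVOD_NUMBER_OF_MPI_THREADS is the variable read in the hunk above;
  // falling back to hardware_concurrency() is an assumption for this sketch.
  int num_threads = static_cast<int>(std::thread::hardware_concurrency());
  if (const char* env = std::getenv("HOROVOD_NUMBER_OF_MPI_THREADS")) {
    num_threads = std::atoi(env);
  }

  // Static so the pool outlives any single caller, as the comment in the hunk notes.
  static boost::asio::thread_pool pool(num_threads);

  std::atomic<int> finished_parallel_reductions{0};
  for (int i = 0; i < num_threads; ++i) {
    boost::asio::post(pool, [&finished_parallel_reductions, i] {
      // A real task would reduce one tensor slice here.
      std::cout << "reduction task " << i << " done\n";
      ++finished_parallel_reductions;
    });
  }

  // Wait until every posted task has reported completion, then drain the pool.
  while (finished_parallel_reductions.load() < num_threads) {
    std::this_thread::yield();
  }
  pool.join();
  return 0;
}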

34 changes: 20 additions & 14 deletions horovod/common/operations.cc
@@ -48,6 +48,7 @@
#include "timeline.h"

#if HAVE_CUDA
#include "ops/msallreduce_cuda_operations.h"
#include "ops/cuda_operations.h"
#include "ops/mpi_cuda_operations.h"
#endif
@@ -152,13 +153,12 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
std::vector<std::shared_ptr<BroadcastOp>> broadcast_ops;
std::vector<std::shared_ptr<AllreduceOp>> msallreduce_ops;

// TODO: Do we still want to check for enabling this here? Probably want to add regardless
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallreduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsAllreduceOp(&mpi_context, &state)));
}
#if HAVE_CUDA
#if HOROVOD_GPU_ALLREDUCE == 'M'
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallGpureduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsCudaAllreduceOp(&mpi_context, &cuda_context, &state)));
}
allreduce_ops.push_back(std::shared_ptr<AllreduceOp>(
new MPI_CUDAAllreduce(&mpi_context, &cuda_context, &state)));

@@ -182,6 +182,12 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
#endif
#endif

//TODO remove this check once fully tested
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallreduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsAllreduceOp(&mpi_context, &state)));
}

#if HAVE_GLOO
if (strcasecmp(state.cpu_operation.c_str(), "gloo") == 0) {
LOG(INFO) << "Gloo enabled.";
@@ -1020,19 +1026,19 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {

// parasail new algo begin
// TODO make this a condition and merge with horovod's hierarchical allreduce
if(state.msallreduce_enabled == true) {
//MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &state.local_comm);
//int ms_local_rank, ms_local_size;
//MPI_Comm_size(state.local_comm, &ms_local_size);
//MPI_Comm_rank(state.local_comm, &ms_local_rank);
if (true) //ms_local_rank == 0)
if(state.msallreduce_enabled == true) {
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &state.local_comm);
int ms_local_rank, ms_local_size;
MPI_Comm_size(state.local_comm, &ms_local_size);
MPI_Comm_rank(state.local_comm, &ms_local_rank);
if (ms_local_rank == 0)
{
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// converting to node-based rank and size
///rank /= ms_local_size;
//size /= ms_local_size;
rank /= ms_local_size;
size /= ms_local_size;

MPI_Group world_group;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
Expand All @@ -1053,7 +1059,7 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {
for (int i = 0; i < (level << 1); i++)
{
// converting back to world rank
node_rank[i] = (base_rank + i);// * ms_local_size;
node_rank[i] = (base_rank + i) * ms_local_size;
}
MPI_Group red_group;
MPI_Group_incl(world_group, (level << 1), node_rank, &red_group);
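
The BackgroundThreadLoop hunk switches from the commented-out placeholders to real MPI calls: the world communicator is split by shared-memory node, only each node's local rank 0 takes part in the cross-node reduction groups, and ranks and sizes are rescaled to node units before the group ranks are mapped back to world ranks. A standalone, simplified sketch of that splitting-and-grouping pattern (it builds a single leader communicator rather than the level-based groups in the hunk, and assumes every node runs the same number of ranks):

// Sketch: split MPI_COMM_WORLD by shared-memory node, then let each node's
// local rank 0 join a cross-node communicator built from an MPI group.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int world_rank, world_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // One communicator per physical node (ranks that share memory).
  MPI_Comm local_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
                      &local_comm);
  int local_rank, local_size;
  MPI_Comm_rank(local_comm, &local_rank);
  MPI_Comm_size(local_comm, &local_size);

  if (local_rank == 0) {
    // Node-based rank and size, as in the rank /= ms_local_size lines above.
    int node_rank = world_rank / local_size;
    int num_nodes = world_size / local_size;

    // World rank of each node's local rank 0 (converting back to world ranks).
    std::vector<int> node_leaders(num_nodes);
    for (int i = 0; i < num_nodes; ++i) {
      node_leaders[i] = i * local_size;
    }

    MPI_Group world_group, leader_group;
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Group_incl(world_group, num_nodes, node_leaders.data(), &leader_group);

    MPI_Comm leader_comm;
    MPI_Comm_create_group(MPI_COMM_WORLD, leader_group, 0, &leader_comm);
    if (leader_comm != MPI_COMM_NULL) {
      std::printf("node %d of %d joined the cross-node communicator\n",
                  node_rank, num_nodes);
      MPI_Comm_free(&leader_comm);
    }
    MPI_Group_free(&leader_group);
    MPI_Group_free(&world_group);
  }

  MPI_Comm_free(&local_comm);
  MPI_Finalize();
  return 0;
}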
2 changes: 1 addition & 1 deletion horovod/common/ops/cuda_operations.h
@@ -53,7 +53,7 @@ struct CUDAContext {
std::unordered_map<int, std::queue<cudaEvent_t>> cuda_events;
std::mutex cuda_events_mutex;

void ErrorCheck(std::string op_name, cudaError_t cuda_result);
void static ErrorCheck(std::string op_name, cudaError_t cuda_result);

void RecordEvent(std::queue<std::pair<std::string, cudaEvent_t>>& event_queue, std::string name,
cudaStream_t& stream);
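
Making ErrorCheck static lets CUDA results be validated without holding a CUDAContext instance, presumably so the new MS-allreduce CUDA ops can check errors from worker threads. A minimal sketch of that kind of static checker (the exception type and message format here are assumptions; only the signature change comes from the diff):

// Sketch: a static CUDA error-check helper callable as CUDAContext::ErrorCheck(...)
// without constructing a CUDAContext.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

struct CUDAContext {
  static void ErrorCheck(std::string op_name, cudaError_t cuda_result) {
    if (cuda_result != cudaSuccess) {
      throw std::logic_error(op_name + " failed: " +
                             cudaGetErrorString(cuda_result));
    }
  }
};

void Example() {
  void* ptr = nullptr;
  // No CUDAContext object is needed at the call site.
  CUDAContext::ErrorCheck("cudaMalloc", cudaMalloc(&ptr, 1024));
  CUDAContext::ErrorCheck("cudaFree", cudaFree(ptr));
}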

0 comments on commit eabaa57
