Squashed commit of the following:

commit 44fd7f8 Merge: a3d5910 17e8d9c Author: Tixxx <tix@microsoft.com> Date: Thu Sep 5 14:34:51 2019 -0700 Merge pull request #11 from Tixxx/saemal/msallreducecudakernels Saemal/msallreducecudakernels commit 17e8d9c Merge: 03e225d a3d5910 Author: Saeed Maleki <30272783+saeedmaleki@users.noreply.github.com> Date: Wed Sep 4 15:55:17 2019 -0700 Merge branch 'tix/vhddwithlocalreduction' into saemal/msallreducecudakernels commit 03e225d Author: Ubuntu <ubuntu@ip-172-31-4-98.us-west-2.compute.internal> Date: Wed Sep 4 22:35:16 2019 +0000 tested ring allreduce for msallreduce commit 66305fa Author: Ubuntu <ubuntu@ip-172-31-4-98.us-west-2.compute.internal> Date: Wed Sep 4 01:36:39 2019 +0000 fixed the ring order commit 9331635 Author: Saeed Maleki <saemal@microsoft.com> Date: Fri Aug 30 20:40:28 2019 +0000 fixed most bugs commit a15ec1d Author: Saeed Maleki <saemal@microsoft.com> Date: Tue Aug 27 19:58:29 2019 +0000 checking before the nd40 goes away commit a3d5910 Author: Tix <tix@microsoft.com> Date: Tue Aug 27 11:19:12 2019 -0700 changed init and finalize logic in ms_cuda_msallreduce commit cd4aaed Author: Saeed Maleki <saemal@microsoft.com> Date: Mon Aug 26 22:53:07 2019 +0000 testing the ring allreduce commit 254cd7f Merge: d485099 e74f098 Author: Tixxx <tix@microsoft.com> Date: Mon Aug 26 12:30:22 2019 -0700 Merge pull request #10 from Tixxx/saemal/kernelcallsformsallreduce Saemal/kernelcallsformsallreduce commit e74f098 Author: Tix <tix@microsoft.com> Date: Mon Aug 26 12:04:29 2019 -0700 fixed copying from device to host commit fc4c733 Merge: d485099 4491b32 Author: Tix <tix@microsoft.com> Date: Mon Aug 26 11:00:27 2019 -0700 Merge branch 'saemal/kernelcallsformsallreduce' of https://github.com/Tixxx/horovod into saemal/kernelcallsformsallreduce commit f518e95 Author: Saeed Maleki <saemal@microsoft.com> Date: Fri Aug 23 22:52:34 2019 +0000 merged with ring allreducew commit e8bcec9 Merge: 4491b32 45b3488 Author: Saeed Maleki <saemal@microsoft.com> Date: Fri Aug 23 
21:38:06 2019 +0000 Merge branch 'olsaarik/ringplusvhdd' into saemal/msallreducecudakernels commit 4491b32 Author: Saeed Maleki <saemal@microsoft.com> Date: Fri Aug 23 21:32:20 2019 +0000 fixed bug in setup.py commit 45b3488 Author: Olli Saarikivi <olsaarik@microsoft.com> Date: Fri Aug 23 21:28:38 2019 +0000 Fix variable declarations commit a1093e2 Author: Olli Saarikivi <olsaarik@microsoft.com> Date: Fri Aug 23 21:11:50 2019 +0000 Set ring cuda msallreduce as default commit eda4e4e Author: Saeed Maleki <saemal@microsoft.com> Date: Fri Aug 23 18:20:20 2019 +0000 cuda kernels compiles now -- need to fix for -arch=sm_ <60 commit 84288ad Author: Olli Saarikivi <olsaarik@microsoft.com> Date: Fri Aug 23 17:54:01 2019 +0000 Add hierarchical ring vhdd msallreduce commit d485099 Author: Tix <tix@microsoft.com> Date: Fri Aug 23 06:33:40 2019 -0700 fixed a type error in msallreduce commit 6604900 Merge: 71a82d9 2595113 Author: Saeed Maleki <saemal@microsoft.com> Date: Thu Aug 22 18:44:20 2019 +0000 Merge branch 'saemal/msallreducecudakernels' of https://github.com/Tixxx/horovod into saemal/msallreducecudakernels commit 71a82d9 Author: Saeed Maleki <saemal@microsoft.com> Date: Thu Aug 22 18:44:19 2019 +0000 fixing bugs with setup.py commit 2595113 Author: Saeed Maleki <saemal@microsoft.com> Date: Thu Aug 22 18:42:44 2019 +0000 added the CMakeList file for cuda kernel commit 799fc47 Author: Saeed Maleki <saemal@microsoft.com> Date: Thu Aug 22 07:36:32 2019 +0000 cuda kernel compiles now commit 925d3e4 Author: Saeed Maleki <saemal@microsoft.com> Date: Tue Aug 20 17:29:53 2019 -0700 added kernel calls and the hooks for calling them commit e69452a Author: Saeed Maleki <saemal@microsoft.com> Date: Tue Aug 20 17:29:21 2019 -0700 added kernel calls and the hooks for calling them commit d6408c9 Author: Tix <tix@microsoft.com> Date: Tue Aug 20 14:56:46 2019 -0700 fixed correctness bug commit eabaa57 Merge: 4245b57 75363ef Author: Tixxx <tix@microsoft.com> Date: Fri Aug 16 09:39:46 
2019 -0700 Merge pull request #7 from Tixxx/tix/vhddwithlocalreductiongpu tixTix/vhddwithlocalreductiongpu commit 75363ef Author: Tix <tix@microsoft.com> Date: Fri Aug 16 09:26:29 2019 -0700 PR comments assign streams based on layerid and number of threads. Name change for cublas initilization method commit e3c75f7 Author: Tix <tix@microsoft.com> Date: Thu Aug 15 17:18:43 2019 -0700 fixed mem leak. fixed seg fault. improved stream usage. commit da32b1f Author: Tix <tix@microsoft.com> Date: Thu Aug 15 01:27:02 2019 -0700 fixed multithreading issue with tensorflow give each thread a cuda stream fixed communicator bug caused by merge commit 30056aa Merge: 756b4fa 4245b57 Author: Tix <tix@microsoft.com> Date: Wed Aug 14 23:48:56 2019 -0700 Merge branch 'tix/vhddwithlocalreduction' of https://github.com/Tixxx/horovod into tix/vhddwithlocalreductiongpu commit 756b4fa Author: Tix <tix@microsoft.com> Date: Wed Aug 14 22:48:00 2019 -0700 added fp16 support for gpu commit 4245b57 Merge: 2a1eedf 04fa0e4 Author: klipto <todd.mytkowicz@gmail.com> Date: Wed Aug 14 17:17:11 2019 -0700 Merge pull request #9 from Tixxx/tree_local_reduce tree local reduce commit 04fa0e4 Author: Saeed Maleki <saemal@microsoft.com> Date: Thu Aug 15 00:15:39 2019 +0000 simple fix commit 1f5c22f Author: Saeed Maleki <saemal@microsoft.com> Date: Wed Aug 14 23:58:15 2019 +0000 tree local reduce commit 33dbe83 Author: Tix <tix@microsoft.com> Date: Tue Aug 13 15:56:53 2019 -0700 fixed cuda init to make gpu reduction work commit 93d7b37 Author: Tix <tix@microsoft.com> Date: Mon Aug 12 15:37:14 2019 -0700 addressed some comments in pr commit bc889f3 Author: Tix <tix@microsoft.com> Date: Mon Aug 12 14:19:46 2019 -0700 integration branch commit 68de8a1 Author: Tix <tix@microsoft.com> Date: Mon Aug 12 14:18:09 2019 -0700 changed to cublasxxxEx call and only with float32 commit 8312976 Author: Tix <tix@microsoft.com> Date: Mon Aug 12 13:29:42 2019 -0700 compile pass. 
divide by zero exception in float to double casting commit 505aed1 Author: Tix <tix@microsoft.com> Date: Mon Aug 12 10:42:26 2019 -0700 adding gpu support for ms allreduce logic in progress commit 2a1eedf Merge: a1913e8 d33fa92 Author: Vadim Eksarevskiy <42353187+vaeksare@users.noreply.github.com> Date: Fri Aug 9 15:57:29 2019 -0700 Merge pull request #5 from vaeksare/vaeksare/separate_average Vaeksare/separate average commit d33fa92 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Fri Aug 9 14:54:15 2019 -0700 deleted accidental binary files commit 2e63692 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Fri Aug 9 14:51:00 2019 -0700 refactored msallreduce to be a separate op in horovod commit a1913e8 Merge: 3a8cdd2 9accd83 Author: klipto <toddm@microsoft.com> Date: Fri Aug 9 14:15:47 2019 -0700 Merge branch 'tix/vhddwithlocalreduction' of https://github.com/Tixxx/horovod into tix/vhddwithlocalreduction commit 3a8cdd2 Author: klipto <toddm@microsoft.com> Date: Fri Aug 9 14:06:02 2019 -0700 workaround for # of elements/size issue commit 55e6ce1 Author: root <root@GCRHYPCBJ016.redmond.corp.microsoft.com> Date: Fri Aug 9 13:29:42 2019 -0700 fixed load and added guard for potential bug commit 9accd83 Author: Tix <tix@microsoft.com> Date: Fri Aug 9 11:28:48 2019 -0700 simplified average logic commit e364f14 Merge: 278e86c 3dde0e4 Author: Tix <tix@microsoft.com> Date: Thu Aug 8 10:09:14 2019 -0700 Merge branch 'tix/vhddwithallreduce' into tix/vhddwithlocalreduction commit 278e86c Author: Tix <tix@microsoft.com> Date: Wed Aug 7 17:02:52 2019 -0700 merge with tf fixes commit 3dde0e4 Merge: 83e68e1 a0b9469 Author: klipto <todd.mytkowicz@gmail.com> Date: Wed Aug 7 16:32:43 2019 -0700 Merge pull request #4 from Tixxx/adding_test_functionality Added a test for fp16,32,64 tensor allreduce correctness commit a0b9469 Author: Todd Mytkowicz <toddm@microsoft.com> Date: Wed Aug 7 13:52:44 2019 -0700 Added a test for fp16,32,64 tensor allreduce correctness commit 
83e68e1 Author: Tix <tix@microsoft.com> Date: Wed Aug 7 13:33:47 2019 -0700 replaced local reduction with mpi allreduce commit c1e5f9c Author: Tix <tix@microsoft.com> Date: Tue Aug 6 14:34:56 2019 -0700 added more optimization flags for compiler commit 5509baf Author: Tix <tix@microsoft.com> Date: Tue Aug 6 09:29:21 2019 -0700 integrated with the vhdd bug fix commit dfda595 Merge: c3c0257 efe1886 Author: Vadim Eksarevskiy <42353187+vaeksare@users.noreply.github.com> Date: Mon Aug 5 18:20:30 2019 -0700 Merge pull request #2 from vaeksare/vaeksare/hvdd pytorch workaround commit efe1886 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Mon Aug 5 18:18:19 2019 -0700 pytorch workaround commit c3c0257 Author: Tix <tix@microsoft.com> Date: Mon Aug 5 17:50:39 2019 -0700 merged with vhdd. merged with fix in TF averaging logic. commit b02994a Author: Tix <tix@microsoft.com> Date: Mon Aug 5 11:37:23 2019 -0700 added float16 data type commit 6116e7e Author: Tix <tix@microsoft.com> Date: Fri Aug 2 18:44:20 2019 -0700 fixed averaging bug in tensorflow commit b8cab29 Author: Tix <tix@microsoft.com> Date: Thu Aug 1 14:29:56 2019 -0700 added new parasail algo commit fa658eb Author: Tix <tix@microsoft.com> Date: Thu Aug 1 09:37:34 2019 -0700 integrated new parasail algorithm commit 4402dac Author: Tix <tix@microsoft.com> Date: Tue Jul 30 10:43:29 2019 -0700 added single and multiple large tensor test commit f6e6c89 Author: Tix <tix@microsoft.com> Date: Fri Jul 26 17:22:47 2019 -0700 merged with local change commit 6d5fd6c Author: Tix <tix@microsoft.com> Date: Fri Jul 26 17:21:04 2019 -0700 merged with temp_buffer commit 46e6ab4 Merge: 9c0a7ac cb29e32 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Fri Jul 26 14:34:02 2019 -0700 fix merge conflict in global state commit 9c0a7ac Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Fri Jul 26 13:44:36 2019 -0700 added basic pytorch tests for msallreduce commit c5b1a7f Author: Vadim Eksarevskiy 
<vaeksare@microsoft.com> Date: Thu Jul 25 17:27:22 2019 -0700 added temp buffer for msallreduce op commit a7c14a5 Author: Tix <tix@microsoft.com> Date: Fri Jul 26 13:52:16 2019 -0700 fixed some issues with broadcast when fusing respones. Added more logging. commit cb29e32 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Fri Jul 26 13:44:36 2019 -0700 added basic pytorch tests for msallreduce commit bc40e87 Author: Vadim Eksarevskiy <vaeksare@microsoft.com> Date: Thu Jul 25 17:27:22 2019 -0700 added temp buffer for msallreduce op commit b644b1b Author: Tix <tix@microsoft.com> Date: Thu Jul 25 14:01:43 2019 -0700 fixed seg fault. added multi-tensor test commit 7babc10 Author: Tix <tix@microsoft.com> Date: Wed Jul 24 22:45:52 2019 -0700 fixed seg fault for 1 tensor case, still happens for multipl tensors commit 81f4de3 Author: Tix <tix@microsoft.com> Date: Wed Jul 24 13:40:29 2019 -0700 committing rest of the parallel code. debugging seg fault.. commit 5fadb9d Author: Tix <tix@microsoft.com> Date: Tue Jul 23 21:50:23 2019 -0700 incorporated threadpool and changed global state class. Added test. commit 4bf49e6 Author: Tix <tix@microsoft.com> Date: Tue Jul 23 14:22:51 2019 -0700 added more logging and data types for ms allreduce commit e4e3bb6 Author: Tix <tix@microsoft.com> Date: Tue Jul 16 15:15:47 2019 -0700 moved p2p comm implementations to header file commit 730e9fb Author: Tix <tix@microsoft.com> Date: Tue Jul 16 13:00:36 2019 -0700 first commit of p2p comm together with parasail op
Tixxx · Sep 13, 2019 · 686e7c4 · 686e7c4
1 parent e28e62b
commit 686e7c4
Show file tree

Hide file tree

Showing 41 changed files with 2,691 additions and 66 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -49,3 +49,12 @@
 [submodule "third_party/gloo"]
 	path = third_party/gloo
 	url = https://github.com/facebookincubator/gloo.git
+[submodule "third_party/boost/system"]
+	path = third_party/boost/system
+	url = https://github.com/boostorg/system.git
+[submodule "third_party/boost/throw_exception"]
+	path = third_party/boost/throw_exception
+	url = https://github.com/boostorg/throw_exception.git
+[submodule "third_party/boost/asio"]
+	path = third_party/boost/asio
+	url = https://github.com/boostorg/asio.git
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,4 @@
-recursive-include * *.h *.hpp *.cc *.md
+recursive-include * *.h *.hpp *.cc *.md *.ipp
 
 include LICENSE horovod.lds horovod.exp
 prune .eggs
@@ -19,3 +19,6 @@ exclude third_party/eigen/Eigen/src/SparseCholesky/*
 graft third_party/gloo/cmake
 recursive-include third_party/gloo CMakeLists.txt
 recursive-include third_party/gloo *.in
+
+# include cmake related files for msallreduce kernel
+graft horovod/common/ops/cuda/
diff --git a/bin/horovodrun b/bin/horovodrun
@@ -19,3 +19,4 @@ from horovod.run import run
 
 if __name__ == '__main__':
     run.run()
+
diff --git a/horovod/common/common.h b/horovod/common/common.h
@@ -56,13 +56,18 @@ namespace common {
 #define GLOO_ALLREDUCE "GLOO_ALLREDUCE"
 #define GLOO_ALLGATHER "GLOO_ALLGATHER"
 #define GLOO_BCAST "GLOO_BCAST"
+#define POINT_TO_POINT_SEND "POINT_TO_POINT_SEND"
+#define POINT_TO_POINT_RECEIVE "POINT_TO_POINT_RECEIVE"
 
 // String constant for gloo interface.
 #define GLOO_DEFAULT_IFACE "eth0"
 
 // Device ID used for CPU.
 #define CPU_DEVICE_ID (-1)
 
+// Point-to-Point communication message chunk size for RDMA
+#define P2P_MESSAGE_CHUNK_SIZE (1 << 15)
+
 // List of supported frameworks.
 enum Framework { TENSORFLOW, PYTORCH, MXNET };
 

diff --git a/horovod/common/fusion_buffer_manager.cc b/horovod/common/fusion_buffer_manager.cc
@@ -21,11 +21,12 @@ namespace common {
 Status FusionBufferManager::InitializeBuffer(int64_t threshold, int device, std::shared_ptr<OpContext> context,
                                              int stream_id,
                                              std::function<void()> on_start_init,
-                                             std::function<void()> on_end_init) {
+                                             std::function<void()> on_end_init,
+                                             std::function<bool(int64_t&, int64_t&)> validity_check) {
   auto& elem = tensor_fusion_buffers_[std::make_tuple(device, context->framework(), stream_id)];
   auto& buffer = elem.first;
   int64_t& size = elem.second;
-  if (size != threshold) {
+  if (!validity_check(size, threshold)) {
     buffer.reset();
     size = 0;
   }
@@ -45,6 +46,10 @@ Status FusionBufferManager::InitializeBuffer(int64_t threshold, int device, std:
   return Status::OK();
 }
 
+Status IntializeTempBuffer(int64_t size,
+                              int device, std::shared_ptr<OpContext> context,
+                              int stream_id);
+
 std::shared_ptr<PersistentBuffer>& FusionBufferManager::GetBuffer(int device, Framework framework, int stream_id) {
   return tensor_fusion_buffers_[std::make_tuple(device, framework, stream_id)].first;
 }

diff --git a/horovod/common/fusion_buffer_manager.h b/horovod/common/fusion_buffer_manager.h
@@ -38,11 +38,14 @@ class FusionBufferManager {
   //  context: Framework used to create the buffer and associate it.
   //  on_start_init: Callback on starting buffer initialization.
   //  on_end_init: Callback on completing buffer initialization.
+  //  validity_check: function to use to compare existing and requested buffer sizes
+  //    to see if the existing buffer can be reused.
   Status InitializeBuffer(int64_t threshold,
                           int device, std::shared_ptr<OpContext> context,
                           int stream_id,
                           std::function<void()> on_start_init,
-                          std::function<void()> on_end_init);
+                          std::function<void()> on_end_init,
+                          std::function<bool(int64_t&, int64_t&)> validity_check);
 
   // Returns the buffer associated with the given device and framework, or null.
   std::shared_ptr<PersistentBuffer>& GetBuffer(int device, Framework framework, int stream_id);

diff --git a/horovod/common/global_state.h b/horovod/common/global_state.h
@@ -19,11 +19,15 @@
 
 #include <queue>
 #include <thread>
+#include <boost/asio/thread_pool.hpp>
+#include <boost/asio/post.hpp>
 
 #include "fusion_buffer_manager.h"
 #include "parameter_manager.h"
 #include "response_cache.h"
 #include "timeline.h"
+#include "logging.h"
+#include "mpi.h"
 
 namespace horovod {
 namespace common {
@@ -50,11 +54,72 @@ struct HorovodGlobalState {
   std::mutex mutex;
 
   // Tensors waiting to be allreduced or allgathered.
-  TensorTable tensor_table;
-
+  TensorTable tensor_table;  
+
+  // Thread pool
+  boost::asio::thread_pool* background_thread_pool;
+
+  // Flag indicating whether the msallreduce algorithm is enabled
+  bool msallreduce_enabled = false;
+
+  // Counter used to keep track of how many of the parallel reductions finished
+  // TODO do we need this?
+  std::atomic_int finished_parallel_reductions;
+
+  // Encapsulates the temp buffers used for msallreduce.
+  std::queue<FusionBufferManager> temp_buffers;
+
+  // Mutex to be used when accessing the queue of temp buffers
+  std::mutex buffer_lock;
+
+  // threads to be used for msallreduce operations
+  int num_msallreduce_threads;
+
+  HorovodGlobalState() {
+    auto horovod_number_of_threads = std::getenv(HOROVOD_NUMBER_OF_MPI_THREADS);
+    auto msallreduce = std::getenv(HOROVOD_MSALLREDUCE_ENABLE);
+    if (msallreduce != nullptr) {
+      int msallreduce_value = std::strtol(msallreduce, nullptr, 10);
+      msallreduce_enabled = msallreduce_value == 1;
+    }
+    if (msallreduce_enabled == true) {
+      int num_threads;
+      if (horovod_number_of_threads != nullptr){
+        num_threads = std::strtol(horovod_number_of_threads, nullptr, 10);
+        LOG(INFO)<<"HOROVOD_NUMBER_OF_MPI_THREADS is set to "<<num_threads;
+        if (num_threads <= 0){
+          throw std::logic_error("Number of threads must be greater or equal to 1 when msallreduce is used.");
+        }
+      }
+      else {
+        LOG(INFO)<<"HOROVOD_NUMBER_OF_MPI_THREADS is not set. Creating threadpool with 1 thread by default. ";
+        num_threads = 1;
+      }
+      //Making this static so that this pool is preserved throughout the lifetime of the program
+      LOG(INFO)<<"Starting "<<num_threads<<" MPI threads for threadpool.";
+      static boost::asio::thread_pool pool(num_threads);
+      num_msallreduce_threads = num_threads;
+      // Create a buffer manager for temp buffers for each thread
+      for (int i = 0; i < num_threads; ++i) {
+        temp_buffers.emplace();
+      }
+      background_thread_pool = &pool;
+    }
+  }
+
   // Background thread running MPI communication.
   std::thread background_thread;
 
+  // MPI communicators used to do msallreduction
+  // TODO put this in a better place
+  MPI_Comm* reduction_comms;
+
+  //TODO find a better place
+  int rank_log_size = 0;
+
+  // TODO find a better place
+  MPI_Comm local_comm;
+
   // Whether the background thread should shutdown.
   std::atomic_bool shut_down {false};
 
@@ -88,15 +153,16 @@ struct HorovodGlobalState {
 
   // Encapsulates the fusion buffers, handles resizing and auto-tuning of buffer size.
   FusionBufferManager fusion_buffer;
-
+  
   // Time point when last cycle started.
   std::chrono::steady_clock::time_point last_cycle_start;
 
   // Whether MPI_Init has been completed on the background thread.
   std::atomic_bool initialization_done {false};
 
   // The MPI rank, local rank, size, local size, flag indicating whether MPI
-  // multi-threading is supported, ranks from which the MPI communicator will
+  // multi-threading is supported, flag indicating whether mpi point-to-point
+  // message chunking is enabled, ranks from which the MPI communicator will
   // be made and the communicator itself.
   int rank = 0;
   int local_rank = 0;
@@ -106,6 +172,7 @@ struct HorovodGlobalState {
   int cross_size = 1;
   bool mpi_threads_supported = false;
   bool is_homogeneous = false;
+  bool msg_chunk_enabled = false;
   std::vector<int> ranks;
 
   // COMM_WORLD ranks of processes running on this node.
@@ -154,6 +221,12 @@ struct HorovodGlobalState {
       shut_down = true;
       background_thread.join();
     }
+    //TODO merge this with background thread
+    if(background_thread_pool != nullptr){
+      background_thread_pool->stop();
+    }
+
+    delete reduction_comms;
   }
 };
 

diff --git a/horovod/common/message.cc b/horovod/common/message.cc
@@ -70,6 +70,9 @@ const std::string& Request::RequestType_Name(RequestType value) {
     case RequestType::BROADCAST:
       static const std::string broadcast("BROADCAST");
       return broadcast;
+    case RequestType::MSALLREDUCE:
+      static const std::string msallreduce("MSALLREDUCE");
+      return msallreduce;
     default:
       static const std::string unknown("<unknown>");
       return unknown;
@@ -234,6 +237,9 @@ const std::string& Response::ResponseType_Name(ResponseType value) {
     case ResponseType::BROADCAST:
       static const std::string broadcast("BROADCAST");
       return broadcast;
+    case ResponseType::MSALLREDUCE:
+      static const std::string msallreduce("MSALLREDUCE");
+      return msallreduce;
     case ResponseType::ERROR:
       static const std::string error("ERROR");
       return error;

diff --git a/horovod/common/message.h b/horovod/common/message.h
@@ -45,7 +45,7 @@ const std::string& DataType_Name(DataType value);
 class Request {
 public:
   enum RequestType {
-    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2
+    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2, MSALLREDUCE = 3
   };
 
   static const std::string& RequestType_Name(RequestType value);
@@ -130,7 +130,7 @@ class RequestList {
 class Response {
 public:
   enum ResponseType {
-    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2, ERROR = 3
+    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2, ERROR = 3, MSALLREDUCE = 4
   };
 
   static const std::string& ResponseType_Name(ResponseType value);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -19,3 +19,4 @@ from horovod.run import run

		if __name__ == '__main__':
		run.run()