Merge pull request #7 from Tixxx/tix/vhddwithlocalreductiongpu
Tixxx authored Aug 16, 2019
2 parents 4245b57 + 75363ef commit eabaa57
Showing 11 changed files with 676 additions and 192 deletions.
7 changes: 5 additions & 2 deletions horovod/common/global_state.h
@@ -71,7 +71,10 @@ struct HorovodGlobalState {

// Mutex to be used when accessing the queue of temp buffers
std::mutex buffer_lock;


// threads to be used for msallreduce operations
int num_msallreduce_threads;

HorovodGlobalState() {
auto horovod_number_of_threads = std::getenv(HOROVOD_NUMBER_OF_MPI_THREADS);
auto msallreduce = std::getenv(HOROVOD_MSALLREDUCE_ENABLE);
@@ -95,12 +98,12 @@ struct HorovodGlobalState {
// Making this static so that this pool is preserved throughout the lifetime of the program
LOG(INFO)<<"Starting "<<num_threads<<" MPI threads for threadpool.";
static boost::asio::thread_pool pool(num_threads);
num_msallreduce_threads = num_threads;
// Create a buffer manager for temp buffers for each thread
for (int i = 0; i < num_threads; ++i) {
temp_buffers.emplace();
}
background_thread_pool = &pool;
finished_parallel_reductions = 0;
}
}
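
For readers following the global_state.h change: num_msallreduce_threads records the size of the static boost::asio::thread_pool built from HOROVOD_NUMBER_OF_MPI_THREADS, and finished_parallel_reductions is reset so later code can post that many reduction tasks and wait for them. A minimal standalone sketch of that pattern (not Horovod code; the default thread count and the task body are assumptions):

// Sketch only: a static boost::asio::thread_pool sized from an environment
// variable, with an atomic counter used to wait for posted reduction tasks.
#include <boost/asio/post.hpp>
#include <boost/asio/thread_pool.hpp>
#include <atomic>
#include <cstdlib>
#include <iostream>
#include <thread>

int main() {
  // HOROVOD_NUMBER_OF_MPI_THREADS is the variable read in the hunk above;
  // falling back to hardware_concurrency() is an assumption for this sketch.
  int num_threads = static_cast<int>(std::thread::hardware_concurrency());
  if (const char* env = std::getenv("HOROVOD_NUMBER_OF_MPI_THREADS")) {
    num_threads = std::atoi(env);
  }

  // Static so the pool outlives any single caller, as the comment in the hunk notes.
  static boost::asio::thread_pool pool(num_threads);

  std::atomic<int> finished_parallel_reductions{0};
  for (int i = 0; i < num_threads; ++i) {
    boost::asio::post(pool, [&finished_parallel_reductions, i] {
      // A real task would reduce one tensor slice here.
      std::cout << "reduction task " << i << " done\n";
      ++finished_parallel_reductions;
    });
  }

  // Wait until every posted task has reported completion, then drain the pool.
  while (finished_parallel_reductions.load() < num_threads) {
    std::this_thread::yield();
  }
  pool.join();
  return 0;
}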

34 changes: 20 additions & 14 deletions horovod/common/operations.cc
@@ -48,6 +48,7 @@
#include "timeline.h"

#if HAVE_CUDA
#include "ops/msallreduce_cuda_operations.h"
#include "ops/cuda_operations.h"
#include "ops/mpi_cuda_operations.h"
#endif
@@ -152,13 +153,12 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
std::vector<std::shared_ptr<BroadcastOp>> broadcast_ops;
std::vector<std::shared_ptr<AllreduceOp>> msallreduce_ops;

// TODO: Do we still want to check for enabling this here? Probably want to add regardless
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallreduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsAllreduceOp(&mpi_context, &state)));
}
#if HAVE_CUDA
#if HOROVOD_GPU_ALLREDUCE == 'M'
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallGpureduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsCudaAllreduceOp(&mpi_context, &cuda_context, &state)));
}
allreduce_ops.push_back(std::shared_ptr<AllreduceOp>(
new MPI_CUDAAllreduce(&mpi_context, &cuda_context, &state)));

@@ -182,6 +182,12 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
#endif
#endif

//TODO remove this check once fully tested
if (state.msallreduce_enabled == true){
LOG(INFO) << "msallreduce enabled.";
msallreduce_ops.push_back(std::shared_ptr<AllreduceOp>(new MsAllreduceOp(&mpi_context, &state)));
}

#if HAVE_GLOO
if (strcasecmp(state.cpu_operation.c_str(), "gloo") == 0) {
LOG(INFO) << "Gloo enabled.";
@@ -1020,19 +1026,19 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {

// parasail new algo begin
// TODO make this a condition and merge with horovod's hierarchical allreduce
if(state.msallreduce_enabled == true) {
//MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &state.local_comm);
//int ms_local_rank, ms_local_size;
//MPI_Comm_size(state.local_comm, &ms_local_size);
//MPI_Comm_rank(state.local_comm, &ms_local_rank);
if (true) //ms_local_rank == 0)
if(state.msallreduce_enabled == true) {
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &state.local_comm);
int ms_local_rank, ms_local_size;
MPI_Comm_size(state.local_comm, &ms_local_size);
MPI_Comm_rank(state.local_comm, &ms_local_rank);
if (ms_local_rank == 0)
{
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// converting to node-based rank and size
///rank /= ms_local_size;
//size /= ms_local_size;
rank /= ms_local_size;
size /= ms_local_size;

MPI_Group world_group;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
Expand All @@ -1053,7 +1059,7 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {
for (int i = 0; i < (level << 1); i++)
{
// converting back to world rank
node_rank[i] = (base_rank + i);// * ms_local_size;
node_rank[i] = (base_rank + i) * ms_local_size;
}
MPI_Group red_group;
MPI_Group_incl(world_group, (level << 1), node_rank, &red_group);
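
The BackgroundThreadLoop hunk switches from the commented-out placeholders to real MPI calls: the world communicator is split by shared-memory node, only each node's local rank 0 takes part in the cross-node reduction groups, and ranks and sizes are rescaled to node units before the group ranks are mapped back to world ranks. A standalone, simplified sketch of that splitting-and-grouping pattern (it builds a single leader communicator rather than the level-based groups in the hunk, and assumes every node runs the same number of ranks):

// Sketch: split MPI_COMM_WORLD by shared-memory node, then let each node's
// local rank 0 join a cross-node communicator built from an MPI group.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int world_rank, world_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // One communicator per physical node (ranks that share memory).
  MPI_Comm local_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
                      &local_comm);
  int local_rank, local_size;
  MPI_Comm_rank(local_comm, &local_rank);
  MPI_Comm_size(local_comm, &local_size);

  if (local_rank == 0) {
    // Node-based rank and size, as in the rank /= ms_local_size lines above.
    int node_rank = world_rank / local_size;
    int num_nodes = world_size / local_size;

    // World rank of each node's local rank 0 (converting back to world ranks).
    std::vector<int> node_leaders(num_nodes);
    for (int i = 0; i < num_nodes; ++i) {
      node_leaders[i] = i * local_size;
    }

    MPI_Group world_group, leader_group;
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Group_incl(world_group, num_nodes, node_leaders.data(), &leader_group);

    MPI_Comm leader_comm;
    MPI_Comm_create_group(MPI_COMM_WORLD, leader_group, 0, &leader_comm);
    if (leader_comm != MPI_COMM_NULL) {
      std::printf("node %d of %d joined the cross-node communicator\n",
                  node_rank, num_nodes);
      MPI_Comm_free(&leader_comm);
    }
    MPI_Group_free(&leader_group);
    MPI_Group_free(&world_group);
  }

  MPI_Comm_free(&local_comm);
  MPI_Finalize();
  return 0;
}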
2 changes: 1 addition & 1 deletion horovod/common/ops/cuda_operations.h
@@ -53,7 +53,7 @@ struct CUDAContext {
std::unordered_map<int, std::queue<cudaEvent_t>> cuda_events;
std::mutex cuda_events_mutex;

void ErrorCheck(std::string op_name, cudaError_t cuda_result);
void static ErrorCheck(std::string op_name, cudaError_t cuda_result);

void RecordEvent(std::queue<std::pair<std::string, cudaEvent_t>>& event_queue, std::string name,
cudaStream_t& stream);
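
Making ErrorCheck static lets CUDA results be validated without holding a CUDAContext instance, presumably so the new MS-allreduce CUDA ops can check errors from worker threads. A minimal sketch of that kind of static checker (the exception type and message format here are assumptions; only the signature change comes from the diff):

// Sketch: a static CUDA error-check helper callable as CUDAContext::ErrorCheck(...)
// without constructing a CUDAContext.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

struct CUDAContext {
  static void ErrorCheck(std::string op_name, cudaError_t cuda_result) {
    if (cuda_result != cudaSuccess) {
      throw std::logic_error(op_name + " failed: " +
                             cudaGetErrorString(cuda_result));
    }
  }
};

void Example() {
  void* ptr = nullptr;
  // No CUDAContext object is needed at the call site.
  CUDAContext::ErrorCheck("cudaMalloc", cudaMalloc(&ptr, 1024));
  CUDAContext::ErrorCheck("cudaFree", cudaFree(ptr));
}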

0 comments on commit eabaa57
