Skip to content

Commit

Permalink
Remove sync at shutdown.
Browse files: browse the repository at this point in the history
  • Loading branch information
trivialfis committed Oct 19, 2020
1 parent e6df1c7 commit 3708601
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 8 deletions.
3 changes: 0 additions & 3 deletions rabit/src/allreduce_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,6 @@ bool AllreduceBase::Init(int argc, char* argv[]) {

bool AllreduceBase::Shutdown() {
try {
int32_t pseudo_sync = 0;
std::cout << "timeout_sec: " << timeout_sec.count() << std::endl;
this->TryAllreduce(&pseudo_sync, sizeof pseudo_sync, 1, op::Reducer<op::Max, int32_t>);
for (auto & all_link : all_links) {
all_link.sock.Close();
}
Expand Down
8 changes: 3 additions & 5 deletions tests/distributed/distributed_gpu.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Distributed GPU tests."""
import sys
import time
import xgboost as xgb
import os
import numpy as np


def run_test(name, params_fun):
Expand All @@ -28,7 +28,7 @@ def run_test(name, params_fun):
# Have each worker save its model
model_name = "test.model.%s.%d" % (name, rank)
bst.dump_model(model_name, with_stats=True)
time.sleep(2)
xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX) # sync
xgb.rabit.tracker_print("Finished training\n")

if (rank == 0):
Expand All @@ -43,15 +43,13 @@ def run_test(name, params_fun):
with open(model_name_rank, 'r') as model_rank:
contents_rank = model_rank.read()
if contents_root != contents_rank:
print(contents_root, contents_rank)
raise Exception(
('Worker models diverged: test.model.%s.%d '
'differs from test.model.%s.%d') % (name, i, name, j))

xgb.rabit.finalize()

if os.path.exists(model_name):
os.remove(model_name)


base_params = {
'tree_method': 'gpu_hist',
Expand Down
2 changes: 2 additions & 0 deletions tests/distributed/runtests-gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"

echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
rm test.model.*

echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
rm test.model.*

0 comments on commit 3708601

Please sign in to comment.