From ea455ad4cb57901057b4c9d0deb059c16b7a1935 Mon Sep 17 00:00:00 2001
From: fis
Date: Mon, 19 Oct 2020 16:51:45 +0800
Subject: [PATCH] Remove sync at shutdown.

---
 rabit/src/allreduce_base.cc          | 3 ---
 tests/distributed/distributed_gpu.py | 8 +++-----
 tests/distributed/runtests-gpu.sh    | 2 ++
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc
index a3e0151b58ee..3936d5d565ac 100644
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -122,9 +122,6 @@ bool AllreduceBase::Init(int argc, char* argv[]) {
 
 bool AllreduceBase::Shutdown() {
   try {
-    int32_t pseudo_sync = 0;
-    std::cout << "timeout_sec: " << timeout_sec.count() << std::endl;
-    this->TryAllreduce(&pseudo_sync, sizeof pseudo_sync, 1, op::Reducer<op::Max, int32_t>);
     for (auto & all_link : all_links) {
       all_link.sock.Close();
     }
diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py
index f30e39b1b4be..99c89f552dee 100644
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -1,8 +1,8 @@
 """Distributed GPU tests."""
 import sys
-import time
 import xgboost as xgb
 import os
+import numpy as np
 
 
 def run_test(name, params_fun):
@@ -28,7 +28,7 @@ def run_test(name, params_fun):
     # Have each worker save its model
     model_name = "test.model.%s.%d" % (name, rank)
     bst.dump_model(model_name, with_stats=True)
-    time.sleep(2)
+    xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)  # sync
     xgb.rabit.tracker_print("Finished training\n")
 
     if (rank == 0):
@@ -43,15 +43,13 @@ def run_test(name, params_fun):
                 with open(model_name_rank, 'r') as model_rank:
                     contents_rank = model_rank.read()
                     if contents_root != contents_rank:
+                        print(contents_root, contents_rank)
                         raise Exception(
                             ('Worker models diverged: test.model.%s.%d '
                              'differs from test.model.%s.%d') % (name, i, name, j))
 
     xgb.rabit.finalize()
 
-    if os.path.exists(model_name):
-        os.remove(model_name)
-
 
 base_params = {
     'tree_method': 'gpu_hist',
diff --git a/tests/distributed/runtests-gpu.sh b/tests/distributed/runtests-gpu.sh
index cc2d23cec8d2..17e472482caa 100755
--- a/tests/distributed/runtests-gpu.sh
+++ b/tests/distributed/runtests-gpu.sh
@@ -7,6 +7,8 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
 echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
+rm test.model.*
 
 echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
+rm test.model.*
 
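
A note on the synchronization pattern used in the test change above (not part of the patch): time.sleep(2) only guessed at when the other workers had finished bst.dump_model(), whereas an Allreduce cannot return on any worker until every worker has entered the call, so it doubles as a barrier guaranteeing that all model dumps are on disk before rank 0 reads them. Below is a minimal sketch of that pattern, assuming the pre-2.0 xgb.rabit Python API used in this patch; the file names and launch command are hypothetical.

"""Sketch: Allreduce as a barrier (hypothetical file barrier_sketch.py).

Assumed launch via the dmlc tracker, as in runtests-gpu.sh:
    python ../../dmlc-core/tracker/dmlc-submit --num-workers=4 \
        python barrier_sketch.py
"""
import numpy as np
import xgboost as xgb

xgb.rabit.init()
rank = xgb.rabit.get_rank()

# Each worker writes its own artifact (stand-in for bst.dump_model above).
with open("artifact.%d" % rank, "w") as fd:
    fd.write("payload from rank %d\n" % rank)

# The allreduce blocks until all workers reach it, so once it returns every
# artifact is guaranteed to exist; the reduced value itself is discarded.
xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)

if rank == 0:
    # Safe now: no worker can still be writing its artifact.
    for i in range(xgb.rabit.get_world_size()):
        with open("artifact.%d" % i, "r") as fd:
            print(fd.read().strip())

xgb.rabit.finalize()

Unlike a fixed sleep, the barrier waits exactly as long as the slowest worker and, if a worker dies, fails via rabit's timeout instead of silently reading a missing or half-written file.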