
Commit

add ut
ronny1996 committed Nov 16, 2021
1 parent 4098423 commit c858052
Showing 5 changed files with 142 additions and 7 deletions.
4 changes: 4 additions & 0 deletions python/paddle/distributed/collective.py
@@ -264,6 +264,10 @@ def new_group(ranks=None, backend=None):
place = core.CUDAPlace(genv.device_id)
core.NCCLParallelContext(strategy,
place).init_with_ring_id(ring_id)
elif core.is_compiled_with_npu():
place = core.NPUPlace(genv.device_id)
core.HCCLParallelContext(strategy,
place).init_with_ring_id(ring_id)
else:
assert False, ("no cuda device found")
else:
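For orientation, here is a minimal usage sketch of the path this hunk enables. It assumes a two-card Ascend NPU machine, a Paddle build with NPU support, and that each rank is started by a distributed launcher; it is illustrative, not part of the commit.

import paddle.distributed as dist

# On an NPU build, new_group() now initializes an HCCLParallelContext for the
# sub-group's ring instead of failing the "no cuda device found" assert.
dist.init_parallel_env()
group = dist.new_group(ranks=[0, 1])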
4 changes: 2 additions & 2 deletions python/paddle/fluid/dygraph/nn.py
@@ -207,9 +207,9 @@ def __init__(self,
if core.is_compiled_with_npu():
if (self._num_channels == self._groups and
self._num_channels == self._num_filters):
l_type = 'depthwise_conv2d'
self._l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
self._l_type = 'conv2d'

self._num_channels = num_channels
if self._groups is None:
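The point of this change is that the selected op type is stored on the instance (self._l_type) instead of a throwaway local l_type, so the layer's forward pass actually dispatches to the depthwise kernel on NPU builds. A standalone sketch of the same selection rule follows; the function name is hypothetical, not Paddle API.

def select_conv_op_type(num_channels, groups, num_filters):
    # Depthwise convolution applies when every input channel forms its own group
    # and produces exactly one output channel.
    if num_channels == groups and num_channels == num_filters:
        return 'depthwise_conv2d'
    return 'conv2d'

assert select_conv_op_type(32, 32, 32) == 'depthwise_conv2d'
assert select_conv_op_type(32, 1, 64) == 'conv2d'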
5 changes: 4 additions & 1 deletion python/paddle/fluid/dygraph/parallel.py
@@ -62,9 +62,12 @@ def prepare_context(strategy=None):
elif isinstance(place, core.XPUPlace):
parallel_helper._set_parallel_ctx(
core.BKCLParallelContext(strategy, place))
elif isinstance(place, core.NPUPlace):
parallel_helper._set_parallel_ctx(
core.HCCLParallelContext(strategy, place))
else:
# TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
assert ("Only support CUDAPlace or XPUPlace for now.")
assert ("Only support CUDAPlace or XPUPlace or NPUPlace for now.")
parallel_helper._init_parallel_ctx()
return strategy

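A minimal sketch of how the new branch is reached (assumes a Paddle NPU build and the per-rank environment, such as FLAGS_selected_npus, exported by the launcher; illustrative only):

import paddle.fluid as fluid
from paddle.fluid import dygraph

# Under an NPUPlace dygraph guard, prepare_context() now wires up an
# HCCLParallelContext rather than falling through to the assert message.
place = fluid.NPUPlace(0)  # the device id would normally come from FLAGS_selected_npus
with fluid.dygraph.guard(place):
    strategy = dygraph.prepare_context()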
90 changes: 90 additions & 0 deletions (new NPU unit test file)
@@ -0,0 +1,90 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import unittest
sys.path.append("..")

from test_dist_base import TestDistBase
import paddle.fluid as fluid

flag_name = os.path.splitext(__file__)[0]
rank_table_file = b"""{
    "status": "completed",
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        }
    ]
}"""

need_envs = {
    "ASCEND_AICPU_PATH":
    os.getenv("ASCEND_AICPU_PATH", "/usr/local/Ascend/nnae/latest"),
    "ASCEND_OPP_PATH":
    os.getenv("ASCEND_OPP_PATH", "/usr/local/Ascend/nnae/latest/opp"),
    "HCCL_CONNECT_TIMEOUT": "7200",
    "HCCL_WHITELIST_DISABLE": "1",
    "HCCL_SECURITY_MODE": "1",
    "RANK_TABLE_FILE": "rank_table_file.json",
}


class TestParallelDygraphMnistNPU(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._hccl_mode = True
        self._dygraph = True
        self._enforce_place = "NPU"

    def test_mnist(self):
        with open("rank_table_file.json", "wb") as f:
            f.write(rank_table_file)
        if fluid.core.is_compiled_with_npu():
            self.check_with_place(
                os.path.abspath('../parallel_dygraph_mnist.py'),
                delta=1e-3,
                check_error_log=True,
                need_envs=need_envs,
                log_name=flag_name)


class TestFleetDygraphMnistNPU(TestParallelDygraphMnistNPU):
    def _setup_config(self):
        self._sync_mode = False
        self._hccl_mode = True
        self._dygraph = True
        self._enforce_place = "NPU"
        self._use_fleet_api = True


if __name__ == "__main__":
    unittest.main()
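A quick sanity check of the rank table above (illustrative only): it describes a single server exposing two Ascend devices with ranks 0 and 1, which matches the two-trainer run the test launches.

import json

table = json.loads(rank_table_file)  # the bytes literal defined in the test above
assert table["server_count"] == "1"
devices = table["server_list"][0]["device"]
assert [d["rank_id"] for d in devices] == ["0", "1"]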
46 changes: 42 additions & 4 deletions python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -551,6 +551,9 @@ def run_trainer(self, args):
elif fluid.core.is_compiled_with_xpu():
device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
place = fluid.XPUPlace(device_id)
elif fluid.core.is_compiled_with_npu():
device_id = int(os.getenv("FLAGS_selected_npus", "0"))
place = fluid.NPUPlace(device_id)
else:
assert ("Only support CUDAPlace or XPUPlace or CPU(Gloo) for now.")

@@ -564,7 +567,7 @@ def run_trainer(self, args):
nranks = len(args.endpoints.split(",")) if args.endpoints else 1

#if args.update_method == "nccl2":
if args.update_method == "nccl2" or args.update_method == "bkcl":
if args.update_method == "nccl2" or args.update_method == "bkcl" or args.update_method == "hccl":
strategy = dygraph.parallel.ParallelStrategy()
strategy.nranks = nranks
strategy.local_rank = args.trainer_id
@@ -671,12 +674,12 @@ def run_use_fleet_api_trainer(self, args):
strategy.find_unused_parameters = True

# 3. init parallel env
if args.update_method == "nccl2" or "bkcl":
if args.update_method == "nccl2" or "bkcl" or "hccl":
fleet.init(is_collective=True, strategy=strategy)

# 4. train model
model, train_reader, opt = self.get_model()
if args.update_method == "nccl2" or "bkcl":
if args.update_method == "nccl2" or "bkcl" or "hccl":
opt = fleet.distributed_optimizer(opt)
model = fleet.distributed_model(model)
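One caveat: args.update_method == "nccl2" or "bkcl" or "hccl" evaluates the bare string literals as truthy, so these two branches are taken for every update method. A stricter membership test would look like this sketch (illustrative, not part of the commit):

if args.update_method in ("nccl2", "bkcl", "hccl"):
    fleet.init(is_collective=True, strategy=strategy)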

@@ -706,7 +709,8 @@ def runtime_main(test_class):
type=str,
default="local",
choices=[
"pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo"
"pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo",
"hccl"
])
parser.add_argument('--trainer_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
@@ -728,6 +732,7 @@ def runtime_main(test_class):
parser.add_argument('--use_cpu', action='store_true')
parser.add_argument('--use_xpu', action='store_true')
parser.add_argument('--use_dgc', action='store_true')
parser.add_argument('--use_npu', action='store_true')
parser.add_argument('--accumulate_gradient', action='store_true')
parser.add_argument('--find_unused_parameters', action='store_true')
parser.add_argument('--use_reduce', action='store_true')
@@ -784,13 +789,21 @@ def _after_setup_config(self):
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "GPU":
self.__use_cuda = True
self.__use_xpu = False
self.__use_npu = False
elif self._enforce_place == "XPU":
self.__use_cuda = False
self.__use_xpu = True
self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "NPU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = True
else:
if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True
@@ -815,6 +828,7 @@ def setUp(self):
self._nccl2_mode = False
self._bkcl_mode = False
self._gloo_mode = False # now, support gloo backend
self._hccl_mode = False
self._pipeline_mode = False
self._mp_mode = False
self._diff_batch = False
@@ -953,6 +967,13 @@ def _run_local(self,
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0"
}
elif self.__use_npu:
cmd += " --use_npu"
env_local = {
"FLAGS_selected_npus": devices,
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0"
}
else:
env_local = {'CPU_NUM': '1'}

@@ -1199,6 +1220,16 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "2",
})
elif self.__use_npu:
tr_cmd += " --use_npu"
env.update({
"FLAGS_selected_npus": "{}".format(trainer_id),
"PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
"PADDLE_TRAINER_ID": "{}".format(trainer_id),
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "2",
})
else:
env.update({'CPU_NUM': '1'})

@@ -1471,6 +1502,13 @@ def check_with_place(self,
update_method='gloo',
check_error_log=check_error_log,
log_name=log_name)
elif self._hccl_mode:
tr0_losses, tr1_losses = self._run_cluster_nccl2(
model_file,
required_envs,
update_method='hccl',
check_error_log=check_error_log,
log_name=log_name)

elif self._pipeline_mode:
tr0_losses, tr1_losses = self._run_pipeline(
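Taken together, a test opts into the new code path by setting _hccl_mode; check_with_place() then reuses _run_cluster_nccl2 with update_method='hccl', and each trainer process selects fluid.NPUPlace from FLAGS_selected_npus. A minimal subclass sketch (the class name is hypothetical):

class TestMyDygraphModelNPU(TestDistBase):
    def _setup_config(self):
        self._dygraph = True
        self._hccl_mode = True       # route check_with_place() through _run_cluster_nccl2(update_method='hccl')
        self._enforce_place = "NPU"  # pick FLAGS_selected_npus / fluid.NPUPlace in the trainer process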

1 comment on commit c858052

paddle-bot-old (bot) commented on c858052 on Nov 16, 2021


🕵️ CI failures summary

🔍 PR: #36285 Commit ID: c858052 contains failed CI.

🔹 Failed: PR-CI-Coverage

Unknown Failed
2021-11-16 22:12:09   File "/paddle/tools/coverage/gcda_clean.py", line 108, in <module>
2021-11-16 22:12:09 clean(pull_id)
2021-11-16 22:12:09 File "/paddle/tools/coverage/gcda_clean.py", line 80, in clean
2021-11-16 22:12:09 for file in get_files(pull_id):
2021-11-16 22:12:09 File "/paddle/tools/coverage/gcda_clean.py", line 63, in get_files
2021-11-16 22:12:09 pull = get_pull(pull_id)
2021-11-16 22:12:09 File "/paddle/tools/coverage/gcda_clean.py", line 49, in get_pull
2021-11-16 22:12:09 pull = repo.get_pull(pull_id)
2021-11-16 22:12:09 UnboundLocalError: local variable 'repo' referenced before assignment
2021-11-16 22:12:09 + exit 101
2021-11-16 22:12:09 + EXCODE=101
2021-11-16 22:12:09 + echo 101
2021-11-16 22:12:09 101
2021-11-16 22:12:09 + echo 'ipipe_log_param_EXCODE: 101'
2021-11-16 22:12:09 ipipe_log_param_EXCODE: 101
2021-11-16 22:12:09 + '[' 101 -ne 0 ']'
2021-11-16 22:12:09 + '[' 101 -ne 9 ']'
2021-11-16 22:12:09 + exit 101
2021-11-16 22:12:09 {build code state=101}
