add ascend unittest (#31249)
xymyeah authored Feb 26, 2021
1 parent d45f5d7 commit 821c2f4
Showing 4 changed files with 71 additions and 6 deletions.
3 changes: 1 addition & 2 deletions python/paddle/distributed/fleet/launch_utils.py
@@ -476,8 +476,7 @@ def start_local_trainers(cluster,
         if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU:
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
-
-        if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
+        elif len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
             proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])

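The hunk above replaces the standalone `if` with an `elif` (and drops the blank line between the branches), so GPU and Ascend NPU device selection become mutually exclusive branches on the pod's device mode. For orientation, here is a minimal, hypothetical sketch of the launcher pattern this code lives in; it is a simplification, not Paddle's actual `start_local_trainers`, and `launch_trainer`, `accelerators`, and `device_mode` are illustrative stand-ins for the real names.

import os
import subprocess
import sys

# Hypothetical, simplified sketch (not Paddle's real launcher): build a
# per-trainer environment, then spawn the training script with it.
def launch_trainer(script, accelerators, device_mode, trainer_id, endpoint):
    proc_env = {
        "PADDLE_TRAINER_ID": str(trainer_id),
        "PADDLE_CURRENT_ENDPOINT": endpoint,
    }
    if len(accelerators) > 0 and device_mode == "gpu":
        proc_env["FLAGS_selected_gpus"] = ",".join(str(g) for g in accelerators)
    elif len(accelerators) > 0 and device_mode == "ascend_npu":
        # The elif mirrors the fix: a pod has exactly one device mode, so only
        # one of FLAGS_selected_gpus / FLAGS_selected_npus is ever exported.
        proc_env["FLAGS_selected_npus"] = ",".join(str(g) for g in accelerators)
    # The child inherits the current environment plus the per-trainer variables.
    return subprocess.Popen([sys.executable, script], env={**os.environ, **proc_env})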
5 changes: 3 additions & 2 deletions python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
@@ -18,6 +18,7 @@

 def train(prefix):
     selected_accelerators = os.getenv("FLAGS_selected_accelerators")
+    selected_npus = os.getenv("FLAGS_selected_npus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
@@ -26,8 +27,8 @@ def train(prefix):
     device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS")
     current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS")
 
-    details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
-        .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
+    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
+        .format(selected_accelerators, selected_npus, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
 
     print(details)
     with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f:
65 changes: 65 additions & 0 deletions python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
@@ -0,0 +1,65 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import sys
import os
import time
import six
import copy
import json
import unittest
import paddle.fluid as fluid

import paddle.distributed.fleet.ascend_utils as ascend_utils

RANK_TABLE_JSON = {
    "status": "completed",
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        }
    ]
}

class TestAscendUtil(unittest.TestCase):
    def test_get_cloud_cluster(self):
        cluster, pod = ascend_utils.get_cloud_cluster()
        self.assertTrue(cluster)
        self.assertTrue(pod)

        with open('rank_table_file.json', 'w') as f:
            json.dump(RANK_TABLE_JSON, f)
        rank_table_file = "./rank_table_file.json"
        cluster, pod = ascend_utils.get_cloud_cluster(rank_table_file=rank_table_file)
        self.assertTrue(cluster)
        self.assertTrue(pod)


if __name__ == '__main__':
    unittest.main()
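The test calls `ascend_utils.get_cloud_cluster()` twice: once with no arguments (auto-detection) and once with an explicit `rank_table_file`. To make the rank-table structure concrete, here is a minimal sketch that flattens a table like `RANK_TABLE_JSON` into per-device records a cluster/pod topology could be built from; this is an assumption for illustration, and `parse_rank_table` is a hypothetical helper, not Paddle's actual `ascend_utils` parsing code.

import json

# Hypothetical helper: flatten an Ascend rank table into device records.
def parse_rank_table(path):
    with open(path) as f:
        table = json.load(f)
    devices = []
    for server in table["server_list"]:
        for dev in server["device"]:
            devices.append({
                "server_id": server["server_id"],
                "device_id": int(dev["device_id"]),
                "rank_id": int(dev["rank_id"]),
            })
    return devices

# For the RANK_TABLE_JSON above this returns:
# [{'server_id': '127.0.0.1', 'device_id': 0, 'rank_id': 0},
#  {'server_id': '127.0.0.1', 'device_id': 1, 'rank_id': 1}]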
@@ -51,8 +51,8 @@ echo "begin test use ascend npu"
 distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
 
-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
+str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
+str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"

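The assertion step of this script sits below the fold, but given `str1`, `str2`, `file_0`, and `file_1`, it presumably checks that each trainer's log contains its expected details line. A hedged Python equivalent of that check follows; it is an assumption about the folded shell logic, with the expected strings and file names copied from the diff above.

# Assumed verification logic (not shown in this hunk): each trainer's log
# must contain the details line built by ascend_multi_process_collective.py.
checks = {
    "multi_process_fleetlaunchascend.check_0.log":
        "selected_accelerators:0 selected_npus:0 "
        "worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 "
        "current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0",
    "multi_process_fleetlaunchascend.check_1.log":
        "selected_accelerators:1 selected_npus:1 "
        "worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 "
        "current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1",
}
for path, expected in checks.items():
    with open(path) as f:
        assert expected in f.read(), "missing details line in " + path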
