-
Notifications
You must be signed in to change notification settings - Fork 84
/
Copy pathrun_and_time.py
116 lines (102 loc) · 3.27 KB
/
run_and_time.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import argparse
import os
import shlex
import subprocess
import sys
"""
usage:
python3 run_and_time.py \
--num-replicas 16 --ipus-per-replica 1 --num-instances 8 \
--hosts pod_name1,pod_name2 --partition partition_name --vipu-host host_name \
--app-args='--config resnet50_mlperf_pod16_bs20 ...'
"""
def _positive_int_string(value):
    """Validate a command-line value as a positive decimal integer.

    Intended for use as an argparse ``type`` callable. The value is returned
    unchanged (still a ``str``) because the launcher in this file places these
    options directly into a subprocess argument list.

    Args:
        value: Raw option string supplied on the command line.

    Returns:
        ``value`` itself when it consists only of ASCII digits and is >= 1.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a positive integer.
    """
    is_decimal = bool(value) and all(ch in "0123456789" for ch in value)
    if not is_decimal or int(value) < 1:
        raise argparse.ArgumentTypeError("expected a positive integer, got {!r}".format(value))
    return value


def add_poprun_arguments(parser):
    """Register the poprun, V-IPU and application options on ``parser``.

    The numeric options are checked to be positive integers at parse time but
    are stored as strings, so existing callers that forward them verbatim to a
    command line keep working.

    Args:
        parser: An ``argparse.ArgumentParser`` (or compatible object exposing
            ``add_argument``) to extend in place.

    Returns:
        The same ``parser`` object, to allow call chaining.
    """
    # poprun parallelism
    parser.add_argument(
        "--num-replicas",
        type=_positive_int_string,
        required=True,
        help="Number of replicas for model parallelism, aka replication factor.",
    )
    parser.add_argument(
        "--ipus-per-replica",
        type=_positive_int_string,
        required=True,
        help="Number of ipus for a replica, aka replica size.",
    )
    parser.add_argument(
        "--num-instances",
        type=_positive_int_string,
        required=True,
        help="Number of instances for data parallelism.",
    )

    # vipu args
    parser.add_argument(
        "--hosts",
        type=str,
        required=True,
        help="Host addresses separated by commas for multi-host data parallelism.",
    )
    parser.add_argument("--partition", type=str, required=True, help="The name of the vipu partition.")
    parser.add_argument("--vipu-host", type=str, required=True, help="The address of the vipu host machine.")

    # application arguments
    parser.add_argument(
        "--app-args",
        default=None,
        type=str,
        required=False,
        help="A string with arguments for the application.",
    )
    return parser
def _split_app_args(app_args):
    """Turn the raw ``--app-args`` string into a list of argv tokens.

    Shell-style tokenisation is used so that repeated whitespace does not
    produce empty arguments and quoted values containing spaces stay intact.

    Args:
        app_args: The string passed to ``--app-args``, or ``None``.

    Returns:
        A list of argument strings; empty when nothing was supplied.
    """
    if not app_args:
        return []
    return shlex.split(app_args)


def _resolve_exec_cache(environ):
    """Pick the executable cache directory handed to poprun.

    Args:
        environ: Mapping of environment variables (normally ``os.environ``).

    Returns:
        The value of ``TF_POPLAR_EXEC_CACHE`` when set and non-empty, otherwise
        ``/home/<USER>/exec_cache``.

    Raises:
        KeyError: If neither ``TF_POPLAR_EXEC_CACHE`` nor ``USER`` is set.
    """
    configured = environ.get("TF_POPLAR_EXEC_CACHE")
    if configured:
        return configured
    # USER is only consulted for the fallback, so an explicit cache path works
    # in environments where USER is not defined.
    return os.path.join("/home", environ["USER"], "exec_cache")


def _build_poprun_command(args, exec_cache):
    """Assemble the poprun prefix shared by the compile and training runs.

    Args:
        args: Parsed namespace providing ``hosts``, ``vipu_host``,
            ``partition``, ``num_instances``, ``num_replicas`` and
            ``ipus_per_replica`` as strings.
        exec_cache: Path passed as ``--executable-cache-path``.

    Returns:
        The poprun command as a list of strings.
    """
    return [
        "poprun",
        "-vv",
        "--host",
        args.hosts,
        "--only-output-from-instance",
        "0",
        "--mpi-global-args",
        "--mca oob_tcp_if_include eno1 --mca btl_tcp_if_include eno1 ",
        "--update-partition",
        "yes",
        "--reset-partition",
        "no",
        "--vipu-server-timeout",
        "600",
        "--vipu-server-host",
        args.vipu_host,
        "--vipu-partition",
        args.partition,
        "--executable-cache-path",
        exec_cache,
        "--num-instances",
        args.num_instances,
        "--num-replicas",
        args.num_replicas,
        "--ipus-per-replica",
        args.ipus_per_replica,
    ]


def _run_stage(command):
    """Echo ``command``, run it to completion and return its exit status."""
    # Flush so the echoed command appears before the child's output when
    # stdout is block-buffered (e.g. redirected to a file).
    print(" ".join(command), flush=True)
    return subprocess.run(command, stderr=sys.stderr, stdout=sys.stdout).returncode


def _main():
    """Parse the command line, then run the compile pass and the training run.

    Returns:
        0 when both stages succeed, otherwise the exit status of the first
        stage that failed (later stages are skipped).
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser = add_poprun_arguments(parser)
    args = parser.parse_args()

    poprun_command = _build_poprun_command(args, _resolve_exec_cache(os.environ))
    train_prefix = [
        *poprun_command,
        "python3",
        "train.py",
        *_split_app_args(args.app_args),
        "--mlperf-logging",
        "True",
    ]

    # compile the training and validation model: a single epoch without
    # checkpoints or wandb (presumably to warm the executable cache before
    # the real run — confirm against train.py).
    compilation_command = [
        *train_prefix,
        "--num-epochs",
        "1",
        "--ckpts-per-epoch",
        "0",
        "--wandb",
        "False",
    ]
    # run training
    training_command = [*train_prefix, "--wandb", "True"]

    for command in (compilation_command, training_command):
        returncode = _run_stage(command)
        if returncode != 0:
            # Do not start training after a failed compile pass, and surface
            # the failure to the caller instead of exiting with status 0.
            return returncode
    return 0


if __name__ == "__main__":
    sys.exit(_main())