-
Notifications
You must be signed in to change notification settings - Fork 3
/
lgm_run
executable file
·155 lines (143 loc) · 5.68 KB
/
lgm_run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python3
import sys
import os
import click
import subprocess as sp
import time
import signal
import atexit
import functools
# Directory holding this script; the manager binary and the preload libraries
# are looked up relative to it.
script_dir = os.path.dirname(os.path.realpath(__file__))

# Path of the "manager_tcp" executable next to this script.
manager_def = os.path.join(script_dir, "manager_tcp")

# Handle of a locally started manager process; clean_up() sends it SIGINT at
# exit when it is set.
manager_proc = None

# ssh destination used by clean_up() for the remote "killall".
login = None

# Server endpoint exported to the client as AVA_IP / AVA_PORT; main()
# overwrites both from --server / --server-port.
ava_ip = 'localhost'
ava_port = 4000

# main() assigns this from --reverse-socket. The default keeps
# ava_env_client() callable (instead of raising NameError) before main() ran.
ava_reverse_socket = False
def clean_up():
    """Best-effort teardown, registered below to run at interpreter exit.

    Sends SIGINT to the local manager process when ``manager_proc`` is set,
    and force-kills ``worker`` / ``manager_tcp`` on the host named by
    ``login`` when that is set. Never raises for a missing ``ssh`` binary.
    """
    if manager_proc:
        manager_proc.send_signal(signal.SIGINT)

    # NOTE(review): the original indentation was lost, so it is unclear whether
    # the remote kill was nested under the manager_proc check. It is treated
    # as independent here -- confirm.
    if not login:
        # Without a destination the command would be "ssh None killall ...".
        return

    try:
        # An argument list with DEVNULL replaces the old shell string: "&>" is
        # a bash-only redirection, while os.system() runs the command via
        # /bin/sh.
        sp.run(
            ['ssh', login, 'killall', '-9', 'worker', 'manager_tcp'],
            stdout=sp.DEVNULL,
            stderr=sp.DEVNULL,
        )
    except OSError:
        # Deliberately ignored: this runs during exit and the kill is only a
        # courtesy cleanup (e.g. ssh may not be installed).
        pass


atexit.register(clean_up)
def run_test(test_cmd):
    """Run ``test_cmd`` in the client environment and return its exit status.

    ``test_cmd`` is the argument sequence passed straight to
    ``subprocess.run``; the environment comes from ``ava_env_client()``.
    Any exception raised while announcing or launching the command is
    reported on stdout and terminates this script with status 1.
    """
    try:
        # Flush so the banner appears before any output of the child process.
        print("Running test", flush=True)
        outcome = sp.run(test_cmd, env=ava_env_client())
    except Exception as err:
        print("ERROR:", err)
        sys.exit(1)
    return outcome.returncode
# Shared objects located next to this script; main() joins them into
# LD_PRELOAD (shim first) for "ava" runs.
guest_lib, shim = (
    os.path.join(script_dir, lib_name)
    for lib_name in ('libguestlib.so', 'guestshim.so')
)
# Environment overrides applied on top of os.environ for the client process
# (see ava_env_client()). main() rewrites individual entries according to the
# command-line options; the values here are the starting defaults.
# The two *_FR_INTERVAL_US entries start at '-1', presumably meaning "not
# fixed-rate" -- confirm against the consumer of these variables.
extra_client_env = dict(
    MARS_NO_WAIT='1',
    MXNET_ENGINE_TYPE='NaiveEngine',
    MXNET_NO_WAIT_MODE='1',
    AVA_LOCAL='1',
    AVA_ENABLE_SSL='0',
    HCC_LAZYINIT='1',
    HIP_ENABLE_COMMAND_SCHEDULER='0',
    HIP_ENABLE_SEP_MEMCPY_COMMAND_SCHEDULER='0',
    LGM_MEMCPY_FIX_SIZE='0',
    LGM_MEMCPY_ENABLE_ENCRYPTION='0',
    HIP_COMMAND_SCHEDULER_FR_INTERVAL_US='-1',
    HIP_COMMAND_SCHEDULER_MEMCPY_FR_INTERVAL_US='-1',
)
def ava_env_client():
    """Return the environment mapping for the client process.

    Starts from a copy of ``os.environ``, adds the connection settings
    (``AVA_REVERSE_SOCKET`` when reverse-socket mode is on, otherwise
    ``AVA_IP`` and ``AVA_PORT``), and finally applies ``extra_client_env``,
    whose entries win over anything set earlier.
    """
    if ava_reverse_socket:
        endpoint = {'AVA_REVERSE_SOCKET': '1'}
    else:
        endpoint = {'AVA_IP': ava_ip, 'AVA_PORT': str(ava_port)}
    # Later mappings override earlier ones, matching successive updates.
    return {**os.environ, **endpoint, **extra_client_env}
# click.option preconfigured so that every option shows its default in --help.
click_option = functools.partial(click.option, show_default=True)


@click.command()
@click_option('--batching', '-2', is_flag=True, help="Bar 2 in our graphs")
@click_option('--exec-xfer-stream', '-3', is_flag=True, help="Bar 3 in our graphs")
@click_option('--data-encryption', '-4', is_flag=True, help="Bar 4 in our graphs")
@click_option('--data-obv-scheduling', '-5', is_flag=True, help="Bar 5 in our graphs, Requires tuning --kernel-fr-interval-us, --memcpy-fr-interval-us, and --gpu-op-batch-size")
@click.argument('run-type', type=click.Choice(['baseline','ava']))
@click.argument('test-cmd', nargs=-1)
@click_option('--server', default="localhost")
@click_option('--server-port', type=int, default=4000)
@click_option('--reverse-socket', is_flag=True)
@click_option('--enable-ssl', is_flag=True)
@click_option('--gpu-op-batch-size', default=32)
@click_option('--gpu-op-batching/--no-gpu-op-batching', default=False)
@click_option('--gpu-op-sep-memcpy', is_flag=True)
@click_option('--hip-memcpy-fixed-size', is_flag=True)
@click_option('--hip-memcpy-encrypted', is_flag=True)
@click_option('--hip-check-complete', is_flag=True)
@click_option('--kernel-fr-interval-us', type=int, help="if not set, kernel launches are not fixed rated")
@click_option('--memcpy-fr-interval-us', type=int, help="if not set, memcpys are not fixed rated")
@click_option('--hip-memcpy-n-staging-buffers', type=int, help="specify number of staging buffers (default 256)")
def main(run_type, test_cmd, server, server_port, reverse_socket, enable_ssl, gpu_op_batching, gpu_op_batch_size,
         hip_memcpy_fixed_size, hip_memcpy_encrypted, gpu_op_sep_memcpy,
         hip_check_complete, kernel_fr_interval_us, memcpy_fr_interval_us,
         hip_memcpy_n_staging_buffers, exec_xfer_stream, data_encryption,
         data_obv_scheduling, batching):
    """Run TEST_CMD with the client environment selected by the options.

    With RUN_TYPE 'ava' the guest shim and guest library are preloaded into
    the command; 'baseline' runs it without them. The exit status is that of
    TEST_CMD, or 1 when the arguments are invalid.
    """
    # NOTE: click uses the docstring above as the command's --help text.
    global ava_ip
    global ava_port
    global ava_reverse_socket

    # The numbered presets cascade: -5 turns on -4, and -4 turns on -3.
    if data_obv_scheduling:
        data_encryption = True
        # Both fixed-rate intervals are mandatory for this preset. Exit with
        # a non-zero status so callers can detect the failure.
        if not kernel_fr_interval_us:
            print("You must set --kernel-fr-interval-us")
            sys.exit(1)
        if not memcpy_fr_interval_us:
            print("You must set --memcpy-fr-interval-us")
            sys.exit(1)
    if data_encryption:
        exec_xfer_stream = True
        hip_memcpy_encrypted = True
    if exec_xfer_stream:
        gpu_op_sep_memcpy = True
        hip_memcpy_fixed_size = True
    if batching:
        gpu_op_batching = True

    if not test_cmd:
        print("no command given exiting!")
        sys.exit(1)

    # Connection settings read later by ava_env_client().
    ava_reverse_socket = bool(reverse_socket)
    if server:
        ava_ip = server
        ava_port = server_port

    # Translate the (possibly preset-adjusted) switches into client
    # environment variables, announcing each enabled feature.
    if enable_ssl:
        print("SSL on")
        extra_client_env["AVA_ENABLE_SSL"] = "1"
    if gpu_op_batching:
        print("BATCHING on")
        extra_client_env["HIP_ENABLE_COMMAND_SCHEDULER"] = "1"
        extra_client_env["HIP_COMMAND_SCHEDULER_BATCH_SIZE"] = str(gpu_op_batch_size)
    if gpu_op_sep_memcpy:
        print("SPLIT stream on")
        extra_client_env["HIP_ENABLE_SEP_MEMCPY_COMMAND_SCHEDULER"] = "1"
        extra_client_env["HIP_COMMAND_SCHEDULER_BATCH_SIZE"] = str(gpu_op_batch_size)
    if hip_memcpy_fixed_size:
        print("FIX size on")
        extra_client_env["LGM_MEMCPY_FIX_SIZE"] = "1"
    if hip_memcpy_encrypted:
        print("Encryption on")
        extra_client_env["LGM_MEMCPY_ENABLE_ENCRYPTION"] = "1"
    if hip_check_complete:
        extra_client_env["HIP_SYNC_CHECK_BATCH_COMPLETE"] = "1"
    if run_type == "ava":
        # The shim is listed before the guest library in the preload order.
        extra_client_env['LD_PRELOAD'] = ':'.join([shim, guest_lib])
    if kernel_fr_interval_us:
        extra_client_env['HIP_COMMAND_SCHEDULER_FR_INTERVAL_US'] = str(kernel_fr_interval_us)
    if memcpy_fr_interval_us:
        extra_client_env['HIP_COMMAND_SCHEDULER_MEMCPY_FR_INTERVAL_US'] = str(memcpy_fr_interval_us)
    if hip_memcpy_n_staging_buffers:
        extra_client_env["HIP_MEMCPY_N_STAGING_BUFFERS"] = str(hip_memcpy_n_staging_buffers)

    # Echo the effective overrides so each run records its configuration.
    for name, value in sorted(extra_client_env.items()):
        print(name, "=", value)
    print("TEST CMD:", " ".join(test_cmd))

    sys.exit(run_test(test_cmd))
# Script entry point: click parses the command line and invokes main().
if __name__ == "__main__":
    main()