-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsolver.py
157 lines (130 loc) · 4.88 KB
/
solver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: utf-8 -*-
from typing import Tuple, Union
from scipy.optimize import minimize
import math
import numpy as np
# Size of one tensor element in bytes; the predictors in this file multiply
# element counts by this value before applying the beta coefficient.
NBYTES_PER_ELEMENT = 4  # FP32

# Each (alpha, beta) pair parameterises a linear model
# ``alpha + beta * nbytes``: alpha is the size-independent term and beta is
# the coefficient applied to the size.
# NOTE(review): the values look like measured fits in seconds -- confirm.

# All-to-all coefficients; the numeric suffix matches the "<P>" part of the
# NETWORK_PARAMS key below.
ALPHA_D_100GBIB_64, BETA_D_100GBIB_64 = 7.83e-4, 3.84e-10
ALPHA_D_100GBIB_32, BETA_D_100GBIB_32 = 2.09e-4, 3.55e-10
ALPHA_D_100GBIB_16, BETA_D_100GBIB_16 = 5.70e-4, 2.96e-10

# GEMM coefficients.
ALPHA_GEMM_RTX2080TI, BETA_GEMM_RTX2080TI = 1.5e-4, 3.73e-14

# GPU name -> (alpha, beta) of the GEMM model.
GPU_PARAMS = dict(
    rtx2080ti=(ALPHA_GEMM_RTX2080TI, BETA_GEMM_RTX2080TI),
)

# "<P>-<network>" -> (alpha, beta) of the all-to-all model.
NETWORK_PARAMS = dict([
    ('64-100GbIB', (ALPHA_D_100GBIB_64, BETA_D_100GBIB_64)),
    ('32-100GbIB', (ALPHA_D_100GBIB_32, BETA_D_100GBIB_32)),
    ('16-100GbIB', (ALPHA_D_100GBIB_16, BETA_D_100GBIB_16)),
])
def get_network_params(network: str,
                       P: Union[int, str]) -> Tuple[float, float]:
    r"""
    Look up the (alpha, beta) all-to-all coefficients for a cluster config.

    The lookup key is ``str(P) + '-' + network`` (e.g. ``'16-100GbIB'``).

    :param network: The config of Network, e.g., '100GbIB'
    :param P: # of GPUs, as an int or its string form
    :return: the (alpha, beta) pair stored in NETWORK_PARAMS for that key
    :raises ValueError: if NETWORK_PARAMS has no entry for the combination
    """
    key = str(P) + '-' + network
    try:
        alpha, beta = NETWORK_PARAMS[key]
    except KeyError:
        # Fail with an actionable message instead of the opaque
        # "cannot unpack non-iterable NoneType" that ``dict.get`` led to.
        raise ValueError(
            'Unsupported network config %r; supported configs: %s'
            % (key, ', '.join(sorted(NETWORK_PARAMS)))
        ) from None
    return alpha, beta
def predict_gemm_time(m: int,
                      n: int,
                      k: int,
                      gpu: str = 'rtx2080ti') -> float:
    r"""
    Predict the time of one GEMM with a linear cost model.

    The estimate is ``alpha + beta * (m * n * k) * NBYTES_PER_ELEMENT``,
    where (alpha, beta) is the GPU_PARAMS entry for ``gpu``.

    :param m: The shape[0] of first matrix
    :param n: The shape[1] of first matrix
    :param k: The shape[1] of second matrix
    :param gpu: The type of GPU, defaults to 'rtx2080ti'
    :return: predicted time
    :raises AssertionError: if ``gpu`` has no entry in GPU_PARAMS
    """
    if gpu not in GPU_PARAMS:
        # Raised explicitly rather than via an ``assert`` statement so the
        # check still runs under ``python -O``; the exception type and the
        # message are kept so existing callers see no difference.
        raise AssertionError('We donnot support this GPU: %s current' % gpu)
    alpha, beta = GPU_PARAMS[gpu]
    return alpha + beta * (m * n * k) * NBYTES_PER_ELEMENT
def predict_alltoall_time(x: int,
                          P: Union[int, str],
                          network: str = '100GbIB') -> float:
    r"""
    Predict the time of one all-to-all with a linear cost model.

    The estimate is ``alpha + beta * x * NBYTES_PER_ELEMENT``, where
    (alpha, beta) comes from ``get_network_params(network, P)``.

    :param x: message size per GPU; it is scaled by NBYTES_PER_ELEMENT, so it
        is treated as an element count (originally described as "# of tokens
        per GPU" -- NOTE(review): confirm which unit callers pass)
    :param P: # of GPUs for expert parallelism
    :param network: The config of Network, defaults to '100GbIB'
    :return: predicted time
    """
    intercept, slope = get_network_params(network, P)
    size_term = slope * x * NBYTES_PER_ELEMENT
    return intercept + size_term
def solve_k(B: int,
            L: int,
            M: int,
            H: int,
            P: int,
            experts_per_token: int,
            capacity_factor: float,
            gpu: str,
            network: str) -> int:
    r"""
    This is a function to get the best pipeline degree.

    Every cost is modelled as ``alpha + beta * nbytes``. The predicted time
    without pipelining (``t1``) is compared with three pipelined candidates:
    ``t2`` uses a closed-form degree, while ``t3`` and ``t4`` come from two
    SLSQP-constrained minimisations whose real-valued optimum is rounded to
    an integer. The degree of the cheapest candidate is returned, or 1 when
    no candidate is strictly faster than ``t1``.

    :param B: local mini-batch size
    :param L: sequence length, i.e., # of tokens per training sample
    :param M: dimension of feature map (i.e., embedding dimension or model_dim)
    :param H: size of hidden layer in expert
    :param P: # of GPUs for expert parallelism
    :param experts_per_token: # of experts for each token
    :param capacity_factor: the capacity factor of model
    :param gpu: GPU model, e.g., rtx2080ti
    :param network: network config, e.g., 100GbIB
    :return: the best pipeline degree
    :raises ValueError: if ``gpu`` has no entry in GPU_PARAMS
    """
    num_tokens_per_gpu = experts_per_token * int(capacity_factor * B * L)
    len_of_alltoall = num_tokens_per_gpu * M
    len_of_expert_gemm = num_tokens_per_gpu * M * H
    n_e = len_of_expert_gemm * NBYTES_PER_ELEMENT
    n_d = len_of_alltoall * NBYTES_PER_ELEMENT

    alpha_a, beta_a = get_network_params(network, P)
    if gpu not in GPU_PARAMS:
        # ``GPU_PARAMS.get`` would hand back None here and the unpack below
        # would fail with an unrelated TypeError.
        raise ValueError(
            'Unsupported GPU %r; supported GPUs: %s'
            % (gpu, ', '.join(sorted(GPU_PARAMS)))
        )
    alpha_gemm, beta_gemm = GPU_PARAMS[gpu]
    # The expert cost uses twice the single-GEMM coefficients.
    alpha_e = 2 * alpha_gemm
    beta_e = 2 * beta_gemm

    t_e = alpha_e + beta_e * n_e
    t_d = alpha_a + beta_a * n_d

    # No pipeline: two all-to-alls plus the expert computation.
    t1 = 2 * t_d + t_e
    r1 = 1

    # Degree r solving beta_e*n_e/r + alpha_e == alpha_a + beta_a*n_d/r,
    # rounded up and clamped to at least 2.
    # NOTE(review): raises ZeroDivisionError when alpha_a == alpha_e; this
    # cannot happen with the coefficient tables currently in this file.
    r2 = max(2, math.ceil((beta_e*n_e - beta_a*n_d)/(alpha_a-alpha_e)))
    t2 = 2 * r2 * alpha_a + 2 * n_d * beta_a

    eps = 1e-10

    def _chunk_gap(x):
        # Modelled time of one of x expert chunks minus one of x all-to-all
        # chunks, required to be at least eps by both sub-problems.
        return beta_e*n_e/x+alpha_e-alpha_a-beta_a*n_d/x-eps

    def _minimize_rounded(objective, extra_constraint, **minimize_kwargs):
        # Minimise ``objective`` over a real degree >= 2 starting from 2.0,
        # round the optimum to the nearest integer and re-evaluate there.
        constraints = (
            {'type': 'ineq', 'fun': _chunk_gap},
            {'type': 'ineq', 'fun': extra_constraint},
            {'type': 'ineq', 'fun': lambda x: x-2},
        )
        # x0 is an explicit 1-D array; ``np.array((2.0))`` was 0-d.
        res = minimize(objective, np.array([2.0]), method='SLSQP',
                       constraints=constraints, tol=1e-11,
                       **minimize_kwargs)
        # ``res.x`` is a 1-D array; index it explicitly because converting a
        # non-0-d array with int() is deprecated in NumPy.
        # NOTE(review): ``res.success`` is not checked, so a failed or
        # infeasible solve still contributes a candidate (as before).
        degree = int(float(res.x[0]) + 0.5)
        return degree, objective(degree)

    def _solve_f3():
        def fun(x): return 2*alpha_a+beta_e*n_e+2*beta_a*n_d/x+alpha_e*x

        def cons(x):
            return (x * alpha_e + beta_e * n_e
                    - (2*(x-1)*alpha_a+2*(x-1)*beta_a*n_d/x)-eps)
        return _minimize_rounded(fun, cons)

    def _solve_f4():
        def fun(x): return 2*alpha_a*x+2*n_d*beta_a

        def cons(x):
            # Opposite regime to the extra constraint of _solve_f3.
            return (2 * (x-1)*alpha_a+2*(x-1)*beta_a*n_d/x
                    - x*alpha_e-beta_e*n_e-eps)
        return _minimize_rounded(fun, cons, options={'maxiter': 100})

    r3, t3 = _solve_f3()
    r4, t4 = _solve_f4()

    times = [t2, t3, t4]
    degrees = [r2, r3, r4]
    if min(times) >= t1:
        # Pipelining is not predicted to help.
        return r1
    return degrees[int(np.argmin(times))]
if __name__ == '__main__':
    # Example run: print the pipeline degree chosen for one configuration.
    best_degree = solve_k(B=8, L=1024, M=4096, H=4096, P=16,
                          experts_per_token=2, capacity_factor=1.0,
                          gpu='rtx2080ti', network='100GbIB')
    print(best_degree)