improve the performance of fftc2rgrad #63137
Conversation
Your PR has been submitted successfully. Thank you for your contribution to the open-source project!
❌ The PR was not created using the PR template. You can refer to this Demo.
Sorry to inform you that 7466413's CIs have passed for more than 7 days. To prevent PR conflicts, you need to re-run all CIs manually.
Sorry to inform you that b545023's CIs have passed for more than 7 days. To prevent PR conflicts, you need to re-run all CIs manually.
Nice optimization idea. How much does the performance improve after optimizing the for-loop here?
const int64_t strides_axis;
const int64_t strides_axis_minus_1;
If I understand correctly, strides_axis and strides_axis_minus_1 hold the elements at the last and second-to-last positions of strides, respectively, but the names strides_axis and strides_axis_minus_1 do not convey that meaning very intuitively. Consider renaming these two variables so that their semantics are more obvious.
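For illustration, a renaming along the lines of this suggestion could look like the sketch below; the identifiers here are hypothetical examples, not necessarily the names adopted in the PR.

// Hypothetical alternative names for the two members quoted above (illustrative only):
const int64_t last_axis_stride;         // last element of strides
const int64_t second_last_axis_stride;  // second-to-last element of strides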
If I understand correctly, strides_axis and strides_axis_minus_1 hold the elements at the last and second-to-last positions of strides, respectively, but the names strides_axis and strides_axis_minus_1 do not convey that meaning very intuitively. Consider renaming these two variables so that their semantics are more obvious.
Done
Test code:
import argparse
import functools
import time
from contextlib import ContextDecorator
import numpy as np
import paddle
from paddle.nn import functional as F
from paddle import profiler
class PaddleProfiler(ContextDecorator):
"""
Profiler that lists how many kinds of C++ APIs are called.
"""
def __init__(self, scheduler=None):
super().__init__()
self.prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler=scheduler,
on_trace_ready=profiler.export_chrome_tracing("./log"),
)
def __enter__(self):
self.prof.start()
return self
def step(self):
print("step")
self.prof.step()
def __exit__(self, type, value, traceback):
self.prof.stop()
self.prof.summary(
sorted_by=profiler.SortedKeys.GPUTotal,
op_detail=False,
thread_sep=False,
time_unit="ms",
views=profiler.SummaryView.OperatorView,
)
print(
"[Warning] This profiler mainly for count how many kinds of C++ API is called. "
"It is recommend to exit after 1 step for it is enough to gather called C++ API information."
)
def test_grad(y_, x_, order):
if order >= 1:
g = paddle.grad(y_, x_, create_graph=order>1)
if order >= 2:
gg = paddle.grad(g, x_, create_graph=order>2)
if order >= 3:
ggg = paddle.grad(gg, x_, create_graph=order>3)
if order >= 4:
gggg = paddle.grad(ggg, x_, create_graph=order>4)
# raise NotImplementedError(
#     "Order 4 is not supported yet"
# )
# def show_ops(order: int, prim: bool):
# paddle.framework.core.set_prim_eager_enabled(prim)
# paddle.framework.core._set_prim_backward_blacklist("divide_grad")
# with PaddleProfiler() as pf:
# for i in range(1):
# x = paddle.randn([4, 3])
# x.stop_gradient = False
# y = paddle.randn([4, 3])
# y.stop_gradient = False
# # add
# # z = x + y
# z = -x # use tanh to control variables: once the whitelist takes effect, first-order decomposition is used regardless of whether composite (prim) mode is enabled
# z = x / y
# z = paddle.tanh(z)
# test_grad(z, x, order)
# pf.step()
def speed_test(op_name, func, inputs, order: int, prim: bool, backward_blacklist: list[str] = None, show_ops: bool = False):
paddle.framework.core.set_prim_eager_enabled(prim)
if backward_blacklist is None:
backward_blacklist = []
if prim:
if op_name == "tanh":
if backward_blacklist:
# for tanh, test the blacklist on its own
paddle.framework.core._set_prim_backward_blacklist(*backward_blacklist)
else:
# for non-tanh operators, add all tanh backward ops to the blacklist by default so they all run as the original (non-decomposed) ops, keeping the variables controlled
paddle.framework.core._set_prim_backward_blacklist(*backward_blacklist, "tanh_grad", "tanh_double_grad", "tanh_triple_grad")
if show_ops:
with PaddleProfiler() as pf:
z = func(*inputs)
pf.step()
test_grad(z, inputs[0] if isinstance(inputs, (tuple, list)) else inputs, order)
return
while True:
costs = []
warmup = 100
for i in range(500):
z = func(*inputs)
paddle.device.synchronize()
t = time.perf_counter()
test_grad(z, inputs[0] if isinstance(inputs, (tuple, list)) else inputs, order)
paddle.device.synchronize()
cost = time.perf_counter() - t
if i >= warmup:
costs.append(cost * 1000)
costs = np.array(costs)
if (np.std(costs) / np.mean(costs)) > 0.1:
continue
print(f"[{op_name}] # order = {order}, prim = {prim}, Avg = {np.mean(costs):.5f} ms, Std = {np.std(costs):.5f} ms, mem = {paddle.device.cuda.max_memory_allocated() / (1 << 30):.3f} GB, backward_blacklist = {backward_blacklist}")
break
"""
The prim::xxx APIs that can be disabled are as follows:
abs_grad
concat_grad
cos_grad
cumsum_grad
erf_grad
exp_grad
expand_grad
floor_grad
gather_grad
gather_nd_grad
gelu_grad
group_norm_grad
instance_norm_grad
layer_norm_grad
leaky_relu_grad
log_grad
relu_grad
roll_grad
scatter_grad
scatter_nd_add_grad
sigmoid_grad
silu_grad
sin_grad
sqrt_grad
tanh_grad
tanh_double_grad
tanh_triple_grad
topk_grad
add_grad
add_double_grad
add_triple_grad
batch_norm_grad
divide_grad
dropout_grad
elementwise_pow_grad
matmul_double_grad
max_grad
maximum_grad
minimum_grad
pad_grad
prod_grad
slice_grad
softmax_grad
subtract_grad
subtract_double_grad
sum_grad
tile_grad
transpose_grad
"""
"""
Among these, the APIs commonly used in PINN models are as follows:
x abs_grad
x add_double_grad
x add_grad
x add_triple_grad
x assign_grad
x concat_grad
x cos_grad
x divide_grad
x elementwise_pow_grad
x exp_grad
x expand_grad
x gather_grad
x log_grad
x matmul_double_grad
x max_grad
x maximum_grad
x minimum_grad
x multiply_double_grad
x multiply_grad
x reshape_grad
x sigmoid_grad
x silu_double_grad
x silu_grad
x sin_grad
x split_grad
x sqrt_grad
x subtract_double_grad
x subtract_grad
x sum_grad
x tanh_double_grad
x tanh_grad
x tanh_triple_grad
x transpose_grad
"""
def func_wrapper(func):
@functools.wraps(func)
def wrapped_func(*args, **kwargs):
out = func(*args, **kwargs)
if isinstance(out, paddle.Tensor):
return paddle.tanh(out)
else:
return [paddle.tanh(out_) for out_ in out]
return wrapped_func
def scale(x, a):
return x + a
def my_add_n(xs):
x = xs[0]
for _x in xs[1:]:
x = paddle.add(x, _x)
return x
def neg(x):
return -x
func_map = {
"abs": func_wrapper(paddle.abs),
"add": func_wrapper(paddle.add),
"add_n": func_wrapper(my_add_n),
"assign": func_wrapper(paddle.assign),
"concat": func_wrapper(paddle.concat),
"cos": paddle.cos,
"divide": func_wrapper(paddle.divide),
"pow": paddle.pow,
"exp": paddle.exp,
"expand": func_wrapper(paddle.expand),
"gather": (paddle.gather),
"hardswish": (F.hardswish),
"log": paddle.log,
"matmul": func_wrapper(paddle.matmul),
"max": func_wrapper(paddle.max),
"min": func_wrapper(paddle.min),
"maximum": func_wrapper(paddle.maximum),
"slice": func_wrapper(paddle.slice),
"minimum": func_wrapper(paddle.minimum),
"multiply": func_wrapper(paddle.multiply),
"neg": func_wrapper(neg),
"relu": func_wrapper(F.relu),
"reshape": func_wrapper(paddle.reshape),
"scale": func_wrapper(scale),
"sigmoid": F.sigmoid,
"silu": F.silu,
"sin": paddle.sin,
"softmax": F.softmax,
"split": func_wrapper(paddle.split),
"sqrt": paddle.sqrt,
"subtract": func_wrapper(paddle.subtract),
"sum": func_wrapper(paddle.sum),
"tanh": paddle.tanh,
"transpose": func_wrapper(paddle.transpose),
"fft_c2r": paddle._C_ops.fft_c2r
}
def gen_data(shape):
return np.random.randn(*shape).astype("float32")
inputs_map = {
"abs": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"add": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"add_n": [
[
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
]
],
"assign": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"concat": [
[
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
1
],
"cos": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"divide": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"pow": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
2
],
"exp": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"expand": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
[4, 1024, 512],
],
"gather": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(np.random.randint(0, 1024, [256], dtype="int64")),
],
"hardswish": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"log": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"matmul": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False).T,
],
"max": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"min": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"maximum": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"minimum": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)
],
"multiply": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)
],
"neg": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"relu": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
],
"reshape": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
[512, 1024],
],
"scale": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
3.14,
],
"slice": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
[-1],
[250],
[500],
],
"sigmoid": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"silu": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"sin": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"softmax": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"split": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
16,
1,
],
"sqrt": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"subtract": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)
],
"sum": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"tanh": [paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False)],
"transpose": [
paddle.to_tensor(gen_data([1024, 512]), stop_gradient=False),
[1, 0],
],
"fft_c2r": [
paddle.to_tensor((
np.random.random((128,128,128)) + 1j * np.random.random((128,128,128))
).astype(np.complex128), stop_gradient=False),
(0, 1),
"backward",
False,
0,
]
}
def parse_args():
parser = argparse.ArgumentParser("Prim benchmark")
parser.add_argument("-o", "--order", type=int, help="grad order", required=True)
parser.add_argument("-op", "--operator", type=str, help="output directory", required=True)
parser.add_argument("-p", "--prim", action="store_true", help="use prim", default=False)
parser.add_argument("-bl", "--backward_blacklist", nargs='+', type=str, help="backward_blacklist")
parser.add_argument("-show", "--show_ops", action="store_true", default=False)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
func = func_map[args.operator]
inputs = inputs_map[args.operator]
speed_test(args.operator, func, inputs, args.order, args.prim, args.backward_blacklist, show_ops=args.show_ops)

After optimization:
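For reference, a minimal sketch of driving the fft_c2r case of the script above directly from Python, bypassing argparse; speed_test, func_map, and inputs_map are the definitions from the test code, and the order/prim settings shown here are only illustrative example values.

# Example invocation of the benchmark above for fft_c2r (equivalent to
# running the script with "-op fft_c2r -o 1"); values are illustrative.
speed_test(
    "fft_c2r",
    func_map["fft_c2r"],
    inputs_map["fft_c2r"],
    order=1,      # first-order gradient, i.e. the fft_c2r_grad kernel
    prim=False,   # time the hand-written backward kernel
)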
PR Category
Performance Optimization
PR Types
Performance
Description
Optimize the performance of fft_c2r_grad by removing the unnecessary for-loop computation. The detailed proof is as follows: