diff --git a/op_tests/test_layernorm2d.py b/op_tests/test_layernorm2d.py index 489a0a82cd..6847747b29 100644 --- a/op_tests/test_layernorm2d.py +++ b/op_tests/test_layernorm2d.py @@ -6,6 +6,7 @@ import aiter from aiter.test_common import checkAllclose, perftest from aiter import dtypes +import argparse @perftest() @@ -105,11 +106,48 @@ def test_layernorm2d_fuseAdd(dtype, m, n): checkAllclose(res_a, res_c, atol=0.01, msg="asm res") +l_dtype = ["bf16"] +parser = argparse.ArgumentParser( + description="Test layernorm2d performance and correctness", +) +parser.add_argument( + "-d", + "--dtype", + type=str, + choices=l_dtype, + nargs="?", + const=None, + default=None, + help="""Data type. + e.g.: -d bf16""", +) +parser.add_argument( + "-m", + type=int, + nargs="?", + default=128, + help="""Number of rows in the input tensor. + e.g.: -m 128""", +) +parser.add_argument( + "-n", + type=int, + nargs="?", + default=8192, + help="""Number of columns in the input tensor. + e.g.: -n 8192""", +) +args = parser.parse_args() +if args.dtype is None: + l_dtype = [dtypes.d_dtypes[key] for key in l_dtype] +else: + l_dtype = [dtypes.d_dtypes[args.dtype]] # for dtype in [dtypes.fp16, dtypes.bf16]: # for m in [1, 2, 4, 8, 16, 32, 64, 128, 256]: # for n in [4096, 8192, 16384, 32768, 65536]: # test_layernorm2d(dtype, m, n) -test_layernorm2d_fuseAdd(dtypes.bf16, 128, 8192) +for dtype in l_dtype: + test_layernorm2d_fuseAdd(dtype, args.m, args.n) # print('\nstart fuse add test') diff --git a/op_tests/test_mha.py b/op_tests/test_mha.py index 1c48a4e65f..c2ae85634f 100644 --- a/op_tests/test_mha.py +++ b/op_tests/test_mha.py @@ -338,12 +338,12 @@ def test_flash_attn_output( e.g.: -k 1024""", ) parser.add_argument( - "-d", - "--d", + "-qk", + "--d_qk", type=int, default=128, help="""Dimension of query and key. Default is 128. - e.g.: -d 256""", + e.g.: -qk 256""", ) parser.add_argument( "-v", @@ -399,7 +399,7 @@ def test_flash_attn_output( e.g.: -m mha""", ) parser.add_argument( - "-dtype", + "-d", "--dtype", type=str, default="bf16", @@ -414,7 +414,7 @@ def test_flash_attn_output( args.nheads, args.seqlen_q, args.seqlen_k, - args.d, + args.d_qk, args.d_v, args.dropout_p, args.causal, diff --git a/op_tests/test_mla.py b/op_tests/test_mla.py index 908e487761..1bf96d0683 100644 --- a/op_tests/test_mla.py +++ b/op_tests/test_mla.py @@ -472,7 +472,7 @@ def test_absorb_prefill(): nargs="?", const=None, default=None, - help="""Number of heads. + help="""Number of nhead and mtp. e.g.: -n 16,1""", ) diff --git a/op_tests/test_moe.py b/op_tests/test_moe.py index ee4bb8a987..fec9ebd257 100755 --- a/op_tests/test_moe.py +++ b/op_tests/test_moe.py @@ -331,7 +331,15 @@ def calculateTensorsSize(*args): choices=l_test, default=None, help="""Select test to run. - e.g.: -t test_fmoe_16_bit""", + e.g.: -t test_fmoe_16_bit + or -t test_fmoe_16_bit + or -t g1u1_no_quant + or -t g1u1_int8quant + or -t g1u1_fp8quant + or -t g1u0_int8smoothquant + or -t g1u1_int8smoothquant + or -t g1u1_fp8smoothquant + or -t g1u1_int4""", ) parser.add_argument( "-d", diff --git a/op_tests/test_moe_ep.py b/op_tests/test_moe_ep.py index 8e383ec6db..26907175b2 100644 --- a/op_tests/test_moe_ep.py +++ b/op_tests/test_moe_ep.py @@ -361,8 +361,79 @@ def calculateTensorsSize(*args): choices=l_test, default=None, help="""Select test to run. - e.g.: -t g1u1_int8quant""", + e.g.: -t g1u1_int8quant + or -t test_fmoe_16_bit + or -t g1u1_no_quant + or -t g1u1_int8quant + or -t g1u1_fp8quant + or -t g1u0_int8smoothquant + or -t g1u1_int8smoothquant + or -t g1u1_fp8smoothquant""", ) +parser.add_argument( + "-d", + "--dtype", + type=str, + nargs="?", + default=None, + help="""Data type. + e.g.: -d bf16""", +) +parser.add_argument( + "-m", + "--token", + type=int, + nargs="*", + default=None, + help="""Token Num. + e.g.: -m 128""", +) +parser.add_argument( + "-hd", + "--hidden_dim", + type=int, + nargs="*", + default=None, + help="""Hidden states dim. + e.g.: -hd 4096""", +) +parser.add_argument( + "-id", + "--inter_dim", + type=int, + nargs="*", + default=None, + help="""Intermediate dim. + e.g.: -id 1024""", +) +parser.add_argument( + "-e", + "--expert", + type=int, + nargs="?", + default=None, + help="""Number of experts. + e.g.: -e 32""", +) +parser.add_argument( + "-k", + "--topk", + type=int, + nargs="?", + default=None, + help="""Top-k value. + e.g.: -k 5""", +) +parser.add_argument( + "-ep", + "--expert_parallelism", + type=int, + nargs="*", + default=None, + help="""Expert Parallelism. + e.g.: -ep 8""", +) + args = parser.parse_args() if args.test is not None: l_test = [args.test] @@ -382,108 +453,174 @@ def calculateTensorsSize(*args): # ) elif test == "g1u1_no_quant": - for dtype in [dtypes.fp16, dtypes.bf16]: - for m in [7, 128, 256]: - for dim in [4096, 8192]: - for hdim in [1024, 1280]: - for ep in [4, 8]: + for dtype in ( + [dtypes.fp16, dtypes.bf16] + if args.dtype is None + else [dtypes.d_dtypes[args.dtype]] + ): + for m in [7, 128, 256] if args.token is None else args.token: + for hdim in ( + [4096, 8192] if args.hidden_dim is None else args.hidden_dim + ): + for idim in ( + [1024, 1280] if args.inter_dim is None else args.inter_dim + ): + for ep in ( + [4, 8] + if args.expert_parallelism is None + else args.expert_parallelism + ): + expert = 128 if args.expert is None else args.expert + topk = 9 if args.topk is None else args.topk test_fmoe_ep( dtype, m, - dim, hdim, - 128, - 9, + idim, + expert, + topk, quant="No", use_g1u1=True, shared_E=2, ep=ep, ) elif test == "g1u1_int8quant": - for dtype in [dtypes.bf16]: - for m in [128, 256]: - for dim in [4096, 8192]: - for hdim in [1024]: - for ep in [4, 8]: + for dtype in ( + [dtypes.bf16] if args.dtype is None else [dtypes.d_dtypes[args.dtype]] + ): + for m in [128, 256] if args.token is None else args.token: + for hdim in ( + [4096, 8192] if args.hidden_dim is None else args.hidden_dim + ): + for idim in [1024] if args.inter_dim is None else args.inter_dim: + expert = 32 if args.expert is None else args.expert + topk = 5 if args.topk is None else args.topk + for ep in ( + [4, 8] + if args.expert_parallelism is None + else args.expert_parallelism + ): test_fmoe_ep( dtype, m, - dim, hdim, - 32, - 5, + idim, + expert, + topk, quant="int8quant", use_g1u1=True, shared_E=2, ep=ep, ) elif test == "g1u1_fp8quant": - for dtype in [dtypes.bf16]: - for m in [128, 256]: - for dim in [4096, 8192]: - for hdim in [1024]: - for ep in [4, 8]: + for dtype in ( + [dtypes.bf16] if args.dtype is None else [dtypes.d_dtypes[args.dtype]] + ): + for m in [128, 256] if args.token is None else args.token: + for hdim in ( + [4096, 8192] if args.hidden_dim is None else args.hidden_dim + ): + for idim in [1024] if args.inter_dim is None else args.inter_dim: + expert = 32 if args.expert is None else args.expert + topk = 5 if args.topk is None else args.topk + for ep in ( + [4, 8] + if args.expert_parallelism is None + else args.expert_parallelism + ): test_fmoe_ep( dtype, m, - dim, hdim, - 32, - 5, + idim, + expert, + topk, quant="fp8quant", use_g1u1=True, shared_E=2, ep=ep, ) elif test == "g1u0_int8smoothquant": - for dtype in [dtypes.bf16]: - for m in [128]: - for dim in [4096, 6144, 8192]: - for hdim in [512, 1024]: - for ep in [4, 8]: + for dtype in ( + [dtypes.bf16] if args.dtype is None else [dtypes.d_dtypes[args.dtype]] + ): + for m in [128] if args.token is None else args.token: + for hdim in ( + [4096, 6144, 8192] if args.hidden_dim is None else args.hidden_dim + ): + for idim in ( + [512, 1024] if args.inter_dim is None else args.inter_dim + ): + expert = 32 if args.expert is None else args.expert + topk = 5 if args.topk is None else args.topk + for ep in ( + [4, 8] + if args.expert_parallelism is None + else args.expert_parallelism + ): test_fmoe_ep( dtype, m, - dim, hdim, - 32, - 5, + idim, + expert, + topk, quant="int8smoothquant", use_g1u1=False, shared_E=2, ep=ep, ) elif test == "g1u1_int8smoothquant": - for dtype in [dtypes.bf16]: - for m in [128]: - for dim in [4096]: - for hdim in [1280]: - for ep in [8]: + for dtype in ( + [dtypes.bf16] if args.dtype is None else [dtypes.d_dtypes[args.dtype]] + ): + for m in [128] if args.token is None else args.token: + for hdim in [4096] if args.hidden_dim is None else args.hidden_dim: + for idim in [1280] if args.inter_dim is None else args.inter_dim: + expert = 128 if args.expert is None else args.expert + topk = 6 if args.topk is None else args.topk + for ep in ( + [8] + if args.expert_parallelism is None + else args.expert_parallelism + ): test_fmoe_ep( dtype, m, - dim, hdim, - 128, - 6, + idim, + expert, + topk, quant="int8smoothquant", use_g1u1=True, shared_E=2, ep=ep, ) elif test == "g1u1_fp8smoothquant": - for dtype in [dtypes.bf16]: - for m in [128]: - for dim in [4096, 6144, 8192]: - for hdim in [512, 1024, 1280]: - for ep in [4, 8]: + for dtype in ( + [dtypes.bf16] if args.dtype is None else [dtypes.d_dtypes[args.dtype]] + ): + for m in [128] if args.token is None else args.token: + for hdim in ( + [4096, 6144, 8192] if args.hidden_dim is None else args.hidden_dim + ): + for idim in ( + [512, 1024, 1280] if args.inter_dim is None else args.inter_dim + ): + expert = 32 if args.expert is None else args.expert + topk = 5 if args.topk is None else args.topk + for ep in ( + [4, 8] + if args.expert_parallelism is None + else args.expert_parallelism + ): test_fmoe_ep( dtype, m, - dim, hdim, - 32, - 5, + idim, + expert, + topk, quant="fp8smoothquant", use_g1u1=True, shared_E=2, diff --git a/op_tests/test_moe_sorting.py b/op_tests/test_moe_sorting.py index c0a171d80e..b81ba345dd 100644 --- a/op_tests/test_moe_sorting.py +++ b/op_tests/test_moe_sorting.py @@ -227,8 +227,8 @@ def test_moe_sorting( "--padding", type=int, default=None, - help="""Number of padding token. - e.g.: -t 0""", + help="""Padding token. + e.g.: -p 0""", ) parser.add_argument( "-dp", @@ -238,8 +238,8 @@ def test_moe_sorting( nargs="?", const=None, default=None, - help="""Number of padding token. - e.g.: -t 0""", + help="""Dispatch policy. + e.g.: -dp 0""", ) parser.add_argument( "-em", diff --git a/op_tests/test_rope.py b/op_tests/test_rope.py index 989926be80..d51878e4a7 100644 --- a/op_tests/test_rope.py +++ b/op_tests/test_rope.py @@ -1273,17 +1273,20 @@ def test_rope_2d(input, height, width, freqs_h, freqs_w, grad): parser.add_argument( "--no_check", action="store_true", - help="Do not check correctness of ops. Default: False.", + help="""Do not check correctness of ops. Default: False. + --no_check # True""", ) parser.add_argument( "--compare", action="store_true", - help="Compare with legacy implementation. Default: False", + help="""Compare with legacy implementation. Default: False + --compare # True""", ) parser.add_argument( "--compare_check", action="store_true", - help="Check correctness when compare with legacy implementation. Default: False", + help="""Check correctness when compare with legacy implementation. Default: False + --compare_check # True""", ) parser.add_argument( "-d", @@ -1304,7 +1307,7 @@ def test_rope_2d(input, height, width, freqs_h, freqs_w, grad): type=dtypes.str2bool, help="""Transpose output. Default: (False, True). e.g.: -t f # for False - e.g.: -t t # for True""", + or -t t # for True""", ) parser.add_argument( "-b", @@ -1378,7 +1381,8 @@ def test_rope_2d(input, height, width, freqs_h, freqs_w, grad): choices=list(d_rs.keys()), nargs="*", help="""Rotate style. Default is all combinations of neox and gptj. - e.g.: -rs neox""", + e.g.: -rs neox + or -rs gptj""", ) d_rr = { # [0]: rotary percentage, [1]: reuse front part, [2]: nope first @@ -1397,8 +1401,12 @@ def test_rope_2d(input, height, width, freqs_h, freqs_w, grad): nargs="*", choices=list(d_rr.keys()), help="""Rotary percentage and reuse front part. Default is all combinations of: -(1.0, True, False), (1.0, False, False), (0.5, False, False), (0.5, True, False), (0.5, True, True), (0.5, False, True). - e.g.: -rr 0 # for (1.0, True, False)""", + e.g.: -rr 0 # for (1.0, True, False) + or -rr 1 # for (1.0, False, False) + or -rr 2 # for (0.5, False, False) + or -rr 3 # for (0.5, True, False) + or -rr 4 # for (0.5, True, True) + or -rr 5 # for (0.5, False, True)""", ) args = parser.parse_args()