Skip to content

Commit 2e80f3a

Browse files
shivamprilmarkov
authored andcommitted
[Kernel][Model] Tune fused_moe Triton configs for Qwen3-30B A3/A3B on H100 (FP8/BF16) (vllm-project#26268)
Signed-off-by: Shivam <shivampr.dev@gmail.com>
1 parent 0df5be6 commit 2e80f3a

4 files changed

+328
-0
lines changed
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 64,
4+
"BLOCK_SIZE_N": 256,
5+
"BLOCK_SIZE_K": 64,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 4,
8+
"num_stages": 3
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 32,
12+
"BLOCK_SIZE_N": 256,
13+
"BLOCK_SIZE_K": 256,
14+
"GROUP_SIZE_M": 16,
15+
"num_warps": 4,
16+
"num_stages": 4
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 32,
20+
"BLOCK_SIZE_N": 256,
21+
"BLOCK_SIZE_K": 64,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 4,
24+
"num_stages": 4
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 128,
28+
"BLOCK_SIZE_N": 128,
29+
"BLOCK_SIZE_K": 256,
30+
"GROUP_SIZE_M": 8,
31+
"num_warps": 4,
32+
"num_stages": 3
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 128,
36+
"BLOCK_SIZE_N": 64,
37+
"BLOCK_SIZE_K": 64,
38+
"GROUP_SIZE_M": 8,
39+
"num_warps": 8,
40+
"num_stages": 3
41+
},
42+
"32": {
43+
"BLOCK_SIZE_M": 64,
44+
"BLOCK_SIZE_N": 256,
45+
"BLOCK_SIZE_K": 256,
46+
"GROUP_SIZE_M": 1,
47+
"num_warps": 8,
48+
"num_stages": 3
49+
},
50+
"64": {
51+
"BLOCK_SIZE_M": 128,
52+
"BLOCK_SIZE_N": 128,
53+
"BLOCK_SIZE_K": 256,
54+
"GROUP_SIZE_M": 16,
55+
"num_warps": 8,
56+
"num_stages": 4
57+
},
58+
"128": {
59+
"BLOCK_SIZE_M": 16,
60+
"BLOCK_SIZE_N": 128,
61+
"BLOCK_SIZE_K": 128,
62+
"GROUP_SIZE_M": 16,
63+
"num_warps": 8,
64+
"num_stages": 2
65+
},
66+
"256": {
67+
"BLOCK_SIZE_M": 32,
68+
"BLOCK_SIZE_N": 64,
69+
"BLOCK_SIZE_K": 128,
70+
"GROUP_SIZE_M": 8,
71+
"num_warps": 8,
72+
"num_stages": 2
73+
},
74+
"512": {
75+
"BLOCK_SIZE_M": 128,
76+
"BLOCK_SIZE_N": 64,
77+
"BLOCK_SIZE_K": 64,
78+
"GROUP_SIZE_M": 16,
79+
"num_warps": 4,
80+
"num_stages": 2
81+
}
82+
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 64,
4+
"BLOCK_SIZE_N": 256,
5+
"BLOCK_SIZE_K": 64,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 8,
8+
"num_stages": 3
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 16,
12+
"BLOCK_SIZE_N": 64,
13+
"BLOCK_SIZE_K": 64,
14+
"GROUP_SIZE_M": 16,
15+
"num_warps": 8,
16+
"num_stages": 3
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 32,
20+
"BLOCK_SIZE_N": 64,
21+
"BLOCK_SIZE_K": 64,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 8,
24+
"num_stages": 4
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 16,
28+
"BLOCK_SIZE_N": 256,
29+
"BLOCK_SIZE_K": 128,
30+
"GROUP_SIZE_M": 1,
31+
"num_warps": 8,
32+
"num_stages": 4
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 16,
36+
"BLOCK_SIZE_N": 128,
37+
"BLOCK_SIZE_K": 128,
38+
"GROUP_SIZE_M": 8,
39+
"num_warps": 8,
40+
"num_stages": 2
41+
},
42+
"32": {
43+
"BLOCK_SIZE_M": 16,
44+
"BLOCK_SIZE_N": 64,
45+
"BLOCK_SIZE_K": 128,
46+
"GROUP_SIZE_M": 16,
47+
"num_warps": 8,
48+
"num_stages": 2
49+
},
50+
"64": {
51+
"BLOCK_SIZE_M": 32,
52+
"BLOCK_SIZE_N": 256,
53+
"BLOCK_SIZE_K": 128,
54+
"GROUP_SIZE_M": 16,
55+
"num_warps": 8,
56+
"num_stages": 4
57+
},
58+
"128": {
59+
"BLOCK_SIZE_M": 16,
60+
"BLOCK_SIZE_N": 256,
61+
"BLOCK_SIZE_K": 64,
62+
"GROUP_SIZE_M": 8,
63+
"num_warps": 8,
64+
"num_stages": 3
65+
},
66+
"256": {
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 64,
69+
"BLOCK_SIZE_K": 128,
70+
"GROUP_SIZE_M": 8,
71+
"num_warps": 4,
72+
"num_stages": 2
73+
},
74+
"512": {
75+
"BLOCK_SIZE_M": 32,
76+
"BLOCK_SIZE_N": 256,
77+
"BLOCK_SIZE_K": 64,
78+
"GROUP_SIZE_M": 16,
79+
"num_warps": 8,
80+
"num_stages": 2
81+
}
82+
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 64,
4+
"BLOCK_SIZE_N": 128,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 8,
7+
"num_warps": 8,
8+
"num_stages": 4
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 32,
12+
"BLOCK_SIZE_N": 128,
13+
"BLOCK_SIZE_K": 64,
14+
"GROUP_SIZE_M": 16,
15+
"num_warps": 4,
16+
"num_stages": 2
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 32,
20+
"BLOCK_SIZE_N": 256,
21+
"BLOCK_SIZE_K": 128,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 4,
24+
"num_stages": 3
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 128,
28+
"BLOCK_SIZE_N": 256,
29+
"BLOCK_SIZE_K": 64,
30+
"GROUP_SIZE_M": 16,
31+
"num_warps": 8,
32+
"num_stages": 2
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 32,
36+
"BLOCK_SIZE_N": 64,
37+
"BLOCK_SIZE_K": 128,
38+
"GROUP_SIZE_M": 1,
39+
"num_warps": 4,
40+
"num_stages": 2
41+
},
42+
"32": {
43+
"BLOCK_SIZE_M": 128,
44+
"BLOCK_SIZE_N": 256,
45+
"BLOCK_SIZE_K": 64,
46+
"GROUP_SIZE_M": 8,
47+
"num_warps": 8,
48+
"num_stages": 4
49+
},
50+
"64": {
51+
"BLOCK_SIZE_M": 64,
52+
"BLOCK_SIZE_N": 128,
53+
"BLOCK_SIZE_K": 128,
54+
"GROUP_SIZE_M": 16,
55+
"num_warps": 8,
56+
"num_stages": 2
57+
},
58+
"128": {
59+
"BLOCK_SIZE_M": 32,
60+
"BLOCK_SIZE_N": 64,
61+
"BLOCK_SIZE_K": 256,
62+
"GROUP_SIZE_M": 16,
63+
"num_warps": 8,
64+
"num_stages": 3
65+
},
66+
"256": {
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 128,
69+
"BLOCK_SIZE_K": 256,
70+
"GROUP_SIZE_M": 16,
71+
"num_warps": 8,
72+
"num_stages": 3
73+
},
74+
"512": {
75+
"BLOCK_SIZE_M": 16,
76+
"BLOCK_SIZE_N": 64,
77+
"BLOCK_SIZE_K": 128,
78+
"GROUP_SIZE_M": 1,
79+
"num_warps": 8,
80+
"num_stages": 2
81+
}
82+
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 256,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 8,
8+
"num_stages": 2
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 16,
12+
"BLOCK_SIZE_N": 256,
13+
"BLOCK_SIZE_K": 64,
14+
"GROUP_SIZE_M": 8,
15+
"num_warps": 8,
16+
"num_stages": 2
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 16,
20+
"BLOCK_SIZE_N": 128,
21+
"BLOCK_SIZE_K": 128,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 4,
24+
"num_stages": 3
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 64,
28+
"BLOCK_SIZE_N": 64,
29+
"BLOCK_SIZE_K": 64,
30+
"GROUP_SIZE_M": 16,
31+
"num_warps": 4,
32+
"num_stages": 3
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 64,
36+
"BLOCK_SIZE_N": 256,
37+
"BLOCK_SIZE_K": 128,
38+
"GROUP_SIZE_M": 8,
39+
"num_warps": 8,
40+
"num_stages": 3
41+
},
42+
"32": {
43+
"BLOCK_SIZE_M": 32,
44+
"BLOCK_SIZE_N": 128,
45+
"BLOCK_SIZE_K": 64,
46+
"GROUP_SIZE_M": 1,
47+
"num_warps": 4,
48+
"num_stages": 3
49+
},
50+
"64": {
51+
"BLOCK_SIZE_M": 16,
52+
"BLOCK_SIZE_N": 256,
53+
"BLOCK_SIZE_K": 64,
54+
"GROUP_SIZE_M": 8,
55+
"num_warps": 8,
56+
"num_stages": 2
57+
},
58+
"128": {
59+
"BLOCK_SIZE_M": 16,
60+
"BLOCK_SIZE_N": 128,
61+
"BLOCK_SIZE_K": 64,
62+
"GROUP_SIZE_M": 8,
63+
"num_warps": 8,
64+
"num_stages": 4
65+
},
66+
"256": {
67+
"BLOCK_SIZE_M": 64,
68+
"BLOCK_SIZE_N": 128,
69+
"BLOCK_SIZE_K": 128,
70+
"GROUP_SIZE_M": 1,
71+
"num_warps": 4,
72+
"num_stages": 2
73+
},
74+
"512": {
75+
"BLOCK_SIZE_M": 64,
76+
"BLOCK_SIZE_N": 128,
77+
"BLOCK_SIZE_K": 64,
78+
"GROUP_SIZE_M": 8,
79+
"num_warps": 4,
80+
"num_stages": 2
81+
}
82+
}

0 commit comments

Comments
 (0)