Skip to content

Commit aba48f7

Browse files
authored
[Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818)
1 parent 04b5f98 commit aba48f7

File tree

3 files changed

+441
-0
lines changed

3 files changed

+441
-0
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
{
2+
"triton_version": "3.4.0",
3+
"1": {
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 32,
6+
"BLOCK_SIZE_K": 128,
7+
"GROUP_SIZE_M": 16,
8+
"num_warps": 4,
9+
"num_stages": 3
10+
},
11+
"2": {
12+
"BLOCK_SIZE_M": 16,
13+
"BLOCK_SIZE_N": 128,
14+
"BLOCK_SIZE_K": 128,
15+
"GROUP_SIZE_M": 1,
16+
"num_warps": 8,
17+
"num_stages": 3
18+
},
19+
"4": {
20+
"BLOCK_SIZE_M": 16,
21+
"BLOCK_SIZE_N": 128,
22+
"BLOCK_SIZE_K": 128,
23+
"GROUP_SIZE_M": 1,
24+
"num_warps": 8,
25+
"num_stages": 4
26+
},
27+
"8": {
28+
"BLOCK_SIZE_M": 16,
29+
"BLOCK_SIZE_N": 64,
30+
"BLOCK_SIZE_K": 128,
31+
"GROUP_SIZE_M": 16,
32+
"num_warps": 4,
33+
"num_stages": 3
34+
},
35+
"16": {
36+
"BLOCK_SIZE_M": 16,
37+
"BLOCK_SIZE_N": 64,
38+
"BLOCK_SIZE_K": 128,
39+
"GROUP_SIZE_M": 1,
40+
"num_warps": 4,
41+
"num_stages": 3
42+
},
43+
"24": {
44+
"BLOCK_SIZE_M": 16,
45+
"BLOCK_SIZE_N": 64,
46+
"BLOCK_SIZE_K": 128,
47+
"GROUP_SIZE_M": 1,
48+
"num_warps": 4,
49+
"num_stages": 3
50+
},
51+
"32": {
52+
"BLOCK_SIZE_M": 16,
53+
"BLOCK_SIZE_N": 64,
54+
"BLOCK_SIZE_K": 128,
55+
"GROUP_SIZE_M": 1,
56+
"num_warps": 4,
57+
"num_stages": 3
58+
},
59+
"48": {
60+
"BLOCK_SIZE_M": 16,
61+
"BLOCK_SIZE_N": 128,
62+
"BLOCK_SIZE_K": 128,
63+
"GROUP_SIZE_M": 1,
64+
"num_warps": 8,
65+
"num_stages": 3
66+
},
67+
"64": {
68+
"BLOCK_SIZE_M": 16,
69+
"BLOCK_SIZE_N": 64,
70+
"BLOCK_SIZE_K": 128,
71+
"GROUP_SIZE_M": 1,
72+
"num_warps": 4,
73+
"num_stages": 3
74+
},
75+
"96": {
76+
"BLOCK_SIZE_M": 32,
77+
"BLOCK_SIZE_N": 256,
78+
"BLOCK_SIZE_K": 128,
79+
"GROUP_SIZE_M": 1,
80+
"num_warps": 8,
81+
"num_stages": 4
82+
},
83+
"128": {
84+
"BLOCK_SIZE_M": 64,
85+
"BLOCK_SIZE_N": 128,
86+
"BLOCK_SIZE_K": 64,
87+
"GROUP_SIZE_M": 1,
88+
"num_warps": 8,
89+
"num_stages": 3
90+
},
91+
"256": {
92+
"BLOCK_SIZE_M": 128,
93+
"BLOCK_SIZE_N": 256,
94+
"BLOCK_SIZE_K": 64,
95+
"GROUP_SIZE_M": 1,
96+
"num_warps": 8,
97+
"num_stages": 4
98+
},
99+
"512": {
100+
"BLOCK_SIZE_M": 128,
101+
"BLOCK_SIZE_N": 256,
102+
"BLOCK_SIZE_K": 64,
103+
"GROUP_SIZE_M": 1,
104+
"num_warps": 8,
105+
"num_stages": 4
106+
},
107+
"1024": {
108+
"BLOCK_SIZE_M": 128,
109+
"BLOCK_SIZE_N": 256,
110+
"BLOCK_SIZE_K": 64,
111+
"GROUP_SIZE_M": 1,
112+
"num_warps": 8,
113+
"num_stages": 4
114+
},
115+
"1536": {
116+
"BLOCK_SIZE_M": 256,
117+
"BLOCK_SIZE_N": 256,
118+
"BLOCK_SIZE_K": 64,
119+
"GROUP_SIZE_M": 1,
120+
"num_warps": 8,
121+
"num_stages": 3
122+
},
123+
"2048": {
124+
"BLOCK_SIZE_M": 256,
125+
"BLOCK_SIZE_N": 256,
126+
"BLOCK_SIZE_K": 64,
127+
"GROUP_SIZE_M": 1,
128+
"num_warps": 8,
129+
"num_stages": 3
130+
},
131+
"3072": {
132+
"BLOCK_SIZE_M": 256,
133+
"BLOCK_SIZE_N": 256,
134+
"BLOCK_SIZE_K": 64,
135+
"GROUP_SIZE_M": 1,
136+
"num_warps": 8,
137+
"num_stages": 3
138+
},
139+
"4096": {
140+
"BLOCK_SIZE_M": 256,
141+
"BLOCK_SIZE_N": 256,
142+
"BLOCK_SIZE_K": 64,
143+
"GROUP_SIZE_M": 1,
144+
"num_warps": 8,
145+
"num_stages": 3
146+
}
147+
}
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
{
2+
"triton_version": "3.4.0",
3+
"1": {
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 64,
6+
"BLOCK_SIZE_K": 128,
7+
"GROUP_SIZE_M": 1,
8+
"num_warps": 4,
9+
"num_stages": 5
10+
},
11+
"2": {
12+
"BLOCK_SIZE_M": 16,
13+
"BLOCK_SIZE_N": 128,
14+
"BLOCK_SIZE_K": 256,
15+
"GROUP_SIZE_M": 1,
16+
"num_warps": 8,
17+
"num_stages": 3
18+
},
19+
"4": {
20+
"BLOCK_SIZE_M": 16,
21+
"BLOCK_SIZE_N": 128,
22+
"BLOCK_SIZE_K": 256,
23+
"GROUP_SIZE_M": 1,
24+
"num_warps": 8,
25+
"num_stages": 4
26+
},
27+
"8": {
28+
"BLOCK_SIZE_M": 16,
29+
"BLOCK_SIZE_N": 128,
30+
"BLOCK_SIZE_K": 256,
31+
"GROUP_SIZE_M": 1,
32+
"num_warps": 8,
33+
"num_stages": 3
34+
},
35+
"16": {
36+
"BLOCK_SIZE_M": 16,
37+
"BLOCK_SIZE_N": 128,
38+
"BLOCK_SIZE_K": 256,
39+
"GROUP_SIZE_M": 1,
40+
"num_warps": 8,
41+
"num_stages": 3
42+
},
43+
"24": {
44+
"BLOCK_SIZE_M": 16,
45+
"BLOCK_SIZE_N": 64,
46+
"BLOCK_SIZE_K": 128,
47+
"GROUP_SIZE_M": 1,
48+
"num_warps": 4,
49+
"num_stages": 5
50+
},
51+
"32": {
52+
"BLOCK_SIZE_M": 16,
53+
"BLOCK_SIZE_N": 64,
54+
"BLOCK_SIZE_K": 128,
55+
"GROUP_SIZE_M": 1,
56+
"num_warps": 4,
57+
"num_stages": 5
58+
},
59+
"48": {
60+
"BLOCK_SIZE_M": 16,
61+
"BLOCK_SIZE_N": 64,
62+
"BLOCK_SIZE_K": 256,
63+
"GROUP_SIZE_M": 1,
64+
"num_warps": 4,
65+
"num_stages": 2
66+
},
67+
"64": {
68+
"BLOCK_SIZE_M": 16,
69+
"BLOCK_SIZE_N": 128,
70+
"BLOCK_SIZE_K": 256,
71+
"GROUP_SIZE_M": 1,
72+
"num_warps": 8,
73+
"num_stages": 3
74+
},
75+
"96": {
76+
"BLOCK_SIZE_M": 64,
77+
"BLOCK_SIZE_N": 128,
78+
"BLOCK_SIZE_K": 128,
79+
"GROUP_SIZE_M": 1,
80+
"num_warps": 8,
81+
"num_stages": 3
82+
},
83+
"128": {
84+
"BLOCK_SIZE_M": 64,
85+
"BLOCK_SIZE_N": 128,
86+
"BLOCK_SIZE_K": 128,
87+
"GROUP_SIZE_M": 1,
88+
"num_warps": 8,
89+
"num_stages": 3
90+
},
91+
"256": {
92+
"BLOCK_SIZE_M": 64,
93+
"BLOCK_SIZE_N": 128,
94+
"BLOCK_SIZE_K": 128,
95+
"GROUP_SIZE_M": 1,
96+
"num_warps": 8,
97+
"num_stages": 3
98+
},
99+
"512": {
100+
"BLOCK_SIZE_M": 128,
101+
"BLOCK_SIZE_N": 256,
102+
"BLOCK_SIZE_K": 128,
103+
"GROUP_SIZE_M": 1,
104+
"num_warps": 8,
105+
"num_stages": 4
106+
},
107+
"1024": {
108+
"BLOCK_SIZE_M": 256,
109+
"BLOCK_SIZE_N": 256,
110+
"BLOCK_SIZE_K": 64,
111+
"GROUP_SIZE_M": 1,
112+
"num_warps": 8,
113+
"num_stages": 5
114+
},
115+
"1536": {
116+
"BLOCK_SIZE_M": 128,
117+
"BLOCK_SIZE_N": 256,
118+
"BLOCK_SIZE_K": 128,
119+
"GROUP_SIZE_M": 1,
120+
"num_warps": 8,
121+
"num_stages": 4
122+
},
123+
"2048": {
124+
"BLOCK_SIZE_M": 256,
125+
"BLOCK_SIZE_N": 256,
126+
"BLOCK_SIZE_K": 128,
127+
"GROUP_SIZE_M": 16,
128+
"num_warps": 8,
129+
"num_stages": 3
130+
},
131+
"3072": {
132+
"BLOCK_SIZE_M": 128,
133+
"BLOCK_SIZE_N": 256,
134+
"BLOCK_SIZE_K": 128,
135+
"GROUP_SIZE_M": 1,
136+
"num_warps": 8,
137+
"num_stages": 4
138+
},
139+
"4096": {
140+
"BLOCK_SIZE_M": 256,
141+
"BLOCK_SIZE_N": 256,
142+
"BLOCK_SIZE_K": 128,
143+
"GROUP_SIZE_M": 1,
144+
"num_warps": 8,
145+
"num_stages": 3
146+
}
147+
}

0 commit comments

Comments
 (0)