import sys
import tempfile
import math
import triton
import torch
from typing import Union, Optional
from dataclasses import dataclass


@dataclass
class Layout:
    layout_line: str
    name: str

    def __post_init__(self):
        self.name = self.layout_line.split('=')[0].split('#')[1].strip()


@dataclass
class BlockedLayout(Layout):
    size_per_thread: Optional[list[int]] = None
    threads_per_warp: Optional[list[int]] = None
    warps_per_cta: Optional[list[int]] = None
    order: Optional[list[int]] = None

    def __post_init__(self):
        super().__post_init__()
        # e.g., #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
        self.size_per_thread = list(map(int, self.layout_line.split(
            '=')[2].split('[')[1].split(']')[0].split(',')))
        self.threads_per_warp = list(map(int, self.layout_line.split(
            '=')[3].split('[')[1].split(']')[0].split(',')))
        self.warps_per_cta = list(map(int, self.layout_line.split(
            '=')[4].split('[')[1].split(']')[0].split(',')))
        self.order = list(map(int, self.layout_line.split(
            '=')[5].split('[')[1].split(']')[0].split(',')))
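# A minimal sketch (not executed) of how a blocked layout line decomposes,
# using the example line from the comment above; values are what the
# split-based parsing yields:
#   >>> layout = BlockedLayout(name="", layout_line="#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>")
#   >>> layout.name, layout.size_per_thread, layout.threads_per_warp, layout.warps_per_cta, layout.order
#   ('blocked', [1], [64], [1], [0])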
@dataclass
class NvidiaMmaLayout(Layout):
    version_major: Optional[int] = None
    version_minor: Optional[int] = None
    warps_per_cta: Optional[list[int]] = None
    instr_shape: Optional[list[int]] = None

    def __post_init__(self):
        super().__post_init__()
        # e.g., #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>
        self.version_major = int(self.layout_line.split('=')[2].split(',')[0])
        self.version_minor = int(self.layout_line.split('=')[3].split(',')[0])
        self.warps_per_cta = list(map(int, self.layout_line.split(
            '=')[4].split('[')[1].split(']')[0].split(',')))
        self.instr_shape = list(map(int, self.layout_line.split(
            '=')[5].split('[')[1].split(']')[0].split(',')))
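# A minimal sketch (not executed) of the fields extracted from the example
# mma line in the comment above:
#   >>> layout = NvidiaMmaLayout(name="", layout_line="#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>")
#   >>> layout.version_major, layout.version_minor, layout.warps_per_cta, layout.instr_shape
#   (3, 0, [8, 1], [16, 256, 16])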
@dataclass
class SliceLayout(Layout):
    dim: Optional[int] = None
    parent: Optional[Layout] = None

    def __post_init__(self):
        # Fields are filled in explicitly by the caller, so no line parsing here.
        pass


@dataclass
class Tensor:
    # e.g., 64x64xbf16
    shape_and_dtype_str: str
    shape: Optional[list] = None
    dtype: Optional[str] = None
    layout: Optional[Union[BlockedLayout, NvidiaMmaLayout, SliceLayout]] = None

    def __post_init__(self):
        # Split on 'x': the last component is the dtype, the rest is the shape.
        shape_str = self.shape_and_dtype_str.split('x')[:-1]
        self.dtype = self.shape_and_dtype_str.split('x')[-1]
        self.shape = list(map(int, shape_str))


@dataclass
class ConvertLayout:
    input_tensor: Tensor
    output_tensor: Tensor
    warps_per_cta: int
    layout_lines: list[str]
def parse_layout(layout_line):
    # e.g., #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
    layout_name = layout_line.split('=')[0].strip().split('#')[1]
    layout_identifier = layout_line.split('=')[1].strip().split('<')[0]
    if layout_identifier == "#triton_gpu.blocked":
        return layout_name, BlockedLayout(name=layout_name, layout_line=layout_line)
    elif layout_identifier == "#triton_gpu.nvidia_mma":
        return layout_name, NvidiaMmaLayout(name=layout_name, layout_line=layout_line)
    elif layout_identifier == "#triton_gpu.slice":
        return layout_name, SliceLayout(name=layout_name, layout_line=layout_line)
    else:
        raise ValueError(f"Unknown layout identifier: {layout_identifier}")
def extract_tensor_info(tensor_str, layout_dict):
    # e.g., tensor<256xf32, #blocked>
    shape_and_dtype_str = tensor_str.split('<')[1].split(',')[0]
    layout_name = tensor_str.split('<')[1].split(
        ',')[1].split('>')[0].split('#')[1]
    # e.g., tensor<256xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
    if "triton_gpu.slice" in tensor_str:
        dim = int(tensor_str.split('<')[2].split(',')[0].split('=')[1])
        parent_str = tensor_str.split('<')[2].split(',')[
            1].split('=')[1].split('}')[0].split('#')[1]
        parent_layout = layout_dict[parent_str]
        layout_name = f"triton_gpu.slice<{{dim = {dim}, parent = #{parent_str}}}>"
        layout = SliceLayout(
            layout_line=tensor_str, name=layout_name, dim=dim, parent=parent_layout)
    else:
        layout = layout_dict[layout_name]
    return shape_and_dtype_str, layout
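# A minimal sketch (not executed) for a slice-layout operand, assuming
# layout_dict already maps "mma" to a parsed NvidiaMmaLayout:
#   >>> shape_and_dtype, layout = extract_tensor_info(
#   ...     "tensor<256xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>", layout_dict)
#   >>> shape_and_dtype, layout.dim, layout.parent.name
#   ('256xf32', 0, 'mma')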
def parse_convert_layout(convert_layout_line, layout_dict, layout_lines):
    # e.g., %149 = triton_gpu.convert_layout %145 : tensor<256xf32, #blocked> -> tensor<256xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
    # Keep only the type signature after the ':', which also drops the leading
    # '%... =' assignment if present.
    convert_layout_line = convert_layout_line.split(':')[1].strip()
    input_tensor_str = convert_layout_line.split('->')[0].strip()
    output_tensor_str = convert_layout_line.split('->')[1].strip()
    input_tensor_shape_and_dtype_str, input_tensor_layout = extract_tensor_info(
        input_tensor_str, layout_dict)
    output_tensor_shape_and_dtype_str, output_tensor_layout = extract_tensor_info(
        output_tensor_str, layout_dict)
    input_tensor = Tensor(
        shape_and_dtype_str=input_tensor_shape_and_dtype_str, layout=input_tensor_layout)
    output_tensor = Tensor(
        shape_and_dtype_str=output_tensor_shape_and_dtype_str, layout=output_tensor_layout)
    # Take warpsPerCTA from the first blocked/mma layout in the section;
    # fall back to 4 if none is found.
    warps_per_cta = 4
    for _, layout in layout_dict.items():
        if isinstance(layout, (NvidiaMmaLayout, BlockedLayout)):
            warps_per_cta = math.prod(layout.warps_per_cta)
            break
    return ConvertLayout(input_tensor, output_tensor, warps_per_cta, layout_lines)
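# A minimal end-to-end sketch (not executed) of parsing one convert_layout
# line, assuming layout_dict holds the "#blocked" and "#mma" layouts of the
# section and layout_lines is the list of their definition lines:
#   >>> cl = parse_convert_layout(
#   ...     "%149 = triton_gpu.convert_layout %145 : tensor<256xf32, #blocked> -> tensor<256xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>",
#   ...     layout_dict, layout_lines)
#   >>> cl.input_tensor.shape, cl.input_tensor.dtype, cl.output_tensor.layout.dim
#   ([256], 'f32', 0)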
def parse_file(input_file):
    convert_layouts = []
    with open(input_file, 'r') as f:
        lines = f.readlines()
    sections = []
    cur_section = []
    for i in range(len(lines)):
        if lines[i].startswith("// --"):
            if len(cur_section) > 0:
                sections.append(cur_section)
                cur_section = []
        elif len(lines[i].strip()) > 0:
            cur_section.append(lines[i])
    if len(cur_section) > 0:
        sections.append(cur_section)
    for section in sections:
        layout_dict = {}
        layout_lines = section[:-1]
        convert_layout_line = section[-1]
        for layout_line in layout_lines:
            name, layout = parse_layout(layout_line)
            layout_dict[name] = layout
        convert_layout = parse_convert_layout(
            convert_layout_line, layout_dict, layout_lines)
        convert_layouts.append(convert_layout)
    return convert_layouts
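# Expected input format (a sketch inferred from the parsing above, not a
# spec): each section is a list of layout definitions followed by exactly one
# triton_gpu.convert_layout line, with sections separated by lines that start
# with "// --", e.g.:
#
#   #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
#   #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>
#   %149 = triton_gpu.convert_layout %145 : tensor<256xf32, #blocked> -> tensor<256xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
#   // -----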
def generate_ttgir1d(kernel_name: str, convert_layout: ConvertLayout):
    M = convert_layout.input_tensor.shape[0]
    dtype = convert_layout.input_tensor.dtype
    src_layout = convert_layout.input_tensor.layout.name
    dst_layout = convert_layout.output_tensor.layout.name
    warps_per_cta = convert_layout.warps_per_cta
    layout_lines = "".join(convert_layout.layout_lines)
    ir = layout_lines + f"""module attributes {{"triton_gpu.num-warps" = {warps_per_cta} : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32}} {{
  tt.func public @{kernel_name}(%arg0: !tt.ptr<{dtype}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{dtype}> {{tt.divisibility = 16 : i32}}) {{
    %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #{src_layout}>
    %1 = tt.splat %arg0 : !tt.ptr<{dtype}> -> tensor<{M}x!tt.ptr<{dtype}>, #{src_layout}>
    %2 = tt.splat %arg1 : !tt.ptr<{dtype}> -> tensor<{M}x!tt.ptr<{dtype}>, #{dst_layout}>
    %3 = tt.addptr %1, %0 : tensor<{M}x!tt.ptr<{dtype}>, #{src_layout}>, tensor<{M}xi32, #{src_layout}>
    %4 = tt.load %3 : tensor<{M}x!tt.ptr<{dtype}>, #{src_layout}>
    %tmp = triton_gpu.convert_layout %4 : tensor<{M}x{dtype}, #{src_layout}> -> tensor<{M}x{dtype}, #{dst_layout}>
    %idx = arith.constant 0 : i32
    %ub = arith.constant 1024 : i32
    %step = arith.constant 1 : i32
    %5 = scf.for %i = %idx to %ub step %step iter_args(%arg = %tmp) -> (tensor<{M}x{dtype}, #{dst_layout}>) : i32 {{
      %result = triton_gpu.convert_layout %4 : tensor<{M}x{dtype}, #{src_layout}> -> tensor<{M}x{dtype}, #{dst_layout}>
      scf.yield %result : tensor<{M}x{dtype}, #{dst_layout}>
    }}
    %6 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #{dst_layout}>
    %7 = tt.addptr %2, %6 : tensor<{M}x!tt.ptr<{dtype}>, #{dst_layout}>, tensor<{M}xi32, #{dst_layout}>
    tt.store %7, %5 : tensor<{M}x!tt.ptr<{dtype}>, #{dst_layout}>
    tt.return
  }}
}}
"""
    return ir
def generate_ttgir2d(kernel_name: str, convert_layout: ConvertLayout):
    M, N = convert_layout.input_tensor.shape
    dtype = convert_layout.input_tensor.dtype
    src_layout = convert_layout.input_tensor.layout.name
    dst_layout = convert_layout.output_tensor.layout.name
    warps_per_cta = convert_layout.warps_per_cta
    layout_lines = "".join(convert_layout.layout_lines)
    ir = layout_lines + f"""module attributes {{"triton_gpu.num-warps" = {warps_per_cta} : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32}} {{
  tt.func public @{kernel_name}(%arg0: !tt.ptr<{dtype}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{dtype}> {{tt.divisibility = 16 : i32}}) {{
    %cst = arith.constant dense<{N}> : tensor<{M}x1xi32, #{src_layout}>
    %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim=1, parent=#{src_layout}}}>>
    %1 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim=0, parent=#{src_layout}}}>>
    %2 = tt.splat %arg0 : !tt.ptr<{dtype}> -> tensor<{M}x{N}x!tt.ptr<{dtype}>, #{src_layout}>
    %3 = tt.splat %arg1 : !tt.ptr<{dtype}> -> tensor<{M}x{N}x!tt.ptr<{dtype}>, #{dst_layout}>
    %4 = tt.expand_dims %0 {{axis = 1 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #{src_layout}}}>> -> tensor<{M}x1xi32, #{src_layout}>
    %6 = tt.expand_dims %1 {{axis = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #{src_layout}}}>> -> tensor<1x{N}xi32, #{src_layout}>
    %5 = arith.muli %4, %cst : tensor<{M}x1xi32, #{src_layout}>
    %7 = tt.broadcast %6 : tensor<1x{N}xi32, #{src_layout}> -> tensor<{M}x{N}xi32, #{src_layout}>
    %8 = tt.broadcast %5 : tensor<{M}x1xi32, #{src_layout}> -> tensor<{M}x{N}xi32, #{src_layout}>
    %9 = arith.addi %8, %7 : tensor<{M}x{N}xi32, #{src_layout}>
    %10 = tt.addptr %2, %9 : tensor<{M}x{N}x!tt.ptr<{dtype}>, #{src_layout}>, tensor<{M}x{N}xi32, #{src_layout}>
    %11 = tt.load %10 : tensor<{M}x{N}x!tt.ptr<{dtype}>, #{src_layout}>
    %tmp = triton_gpu.convert_layout %11 : tensor<{M}x{N}x{dtype}, #{src_layout}> -> tensor<{M}x{N}x{dtype}, #{dst_layout}>
    %idx = arith.constant 0 : i32
    %ub = arith.constant 1024 : i32
    %step = arith.constant 1 : i32
    %12 = scf.for %i = %idx to %ub step %step iter_args(%arg = %tmp) -> (tensor<{M}x{N}x{dtype}, #{dst_layout}>) : i32 {{
      %result = triton_gpu.convert_layout %11 : tensor<{M}x{N}x{dtype}, #{src_layout}> -> tensor<{M}x{N}x{dtype}, #{dst_layout}>
      scf.yield %result : tensor<{M}x{N}x{dtype}, #{dst_layout}>
    }}
    %13 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #{src_layout}> -> tensor<{M}x{N}xi32, #{dst_layout}>
    %14 = tt.addptr %3, %13 : tensor<{M}x{N}x!tt.ptr<{dtype}>, #{dst_layout}>, tensor<{M}x{N}xi32, #{dst_layout}>
    tt.store %14, %12 : tensor<{M}x{N}x!tt.ptr<{dtype}>, #{dst_layout}>
    tt.return
  }}
}}
"""
    return ir
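# Note on the generated kernels above: both generators emit a single-CTA copy
# kernel that loads a tensor in the source layout, applies
# triton_gpu.convert_layout inside a 1024-iteration scf.for loop (presumably
# so the layout conversion dominates the measured time rather than the
# load/store), and stores the loop result through pointers in the destination
# layout.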
def generate_ttgir(kernel_name: str, convert_layout: ConvertLayout):
    if len(convert_layout.input_tensor.shape) == 1:
        return generate_ttgir1d(kernel_name, convert_layout)
    elif len(convert_layout.input_tensor.shape) == 2:
        return generate_ttgir2d(kernel_name, convert_layout)
    else:
        raise ValueError("Only 1D and 2D tensors are supported for now")
def compile_ttgir(ttgir):
    # Write the IR to a temporary .ttgir file so triton.compile can pick it up.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f:
        f.write(ttgir)
        f.flush()
        kernel = triton.compile(f.name)
    return kernel
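# A minimal usage sketch (not executed): compile the generated TTGIR for one
# parsed ConvertLayout, then launch it on a single CTA as execute() does below:
#   >>> kernel = compile_ttgir(generate_ttgir("kernel0", convert_layout))
#   >>> kernel[(1, 1, 1)](src.data_ptr(), dst.data_ptr())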
def triton_dtype_to_torch_dtype(dtype: str):
    if dtype == "f16":
        return torch.float16
    elif dtype == "f32":
        return torch.float32
    elif dtype == "f64":
        return torch.float64
    elif dtype == "bf16":
        return torch.bfloat16
    elif dtype == "i64":
        return torch.int64
    elif dtype == "i32":
        return torch.int32
    elif dtype == "i16":
        return torch.int16
    elif dtype == "i8":
        return torch.int8
    elif dtype.startswith("f8"):
        # Any fp8 type should work in our test cases
        return torch.float8_e5m2
    else:
        raise ValueError(f"Unknown dtype: {dtype}")
def execute(index, kernel, convert_layout: ConvertLayout):
    torch_dtype = triton_dtype_to_torch_dtype(
        convert_layout.input_tensor.dtype)
    src = torch.randn(convert_layout.input_tensor.shape,
                      device='cuda').to(torch_dtype)
    dst = torch.zeros(convert_layout.output_tensor.shape,
                      device='cuda').to(torch_dtype)
    kernel[(1, 1, 1)](src.data_ptr(), dst.data_ptr())
    torch.testing.assert_close(
        dst, src, msg="Mismatch between src and dst")
    time = triton.testing.do_bench_cudagraph(
        lambda: kernel[(1, 1, 1)](src.data_ptr(), dst.data_ptr()), rep=100)
    print(f"Kernel {index} execution time: {time}")
if __name__ == "__main__":
    input_file = sys.argv[1]
    convert_layouts = parse_file(input_file)
    for i, convert_layout in enumerate(convert_layouts):
        ttgir = generate_ttgir("kernel" + str(i), convert_layout)
        kernel = compile_ttgir(ttgir)
        execute(i, kernel, convert_layout)