Commit 003b49d

Fix Task Group Mixing (#16)
* Fix taskgroup usage in runtime -- mha is a to-do
* Fix MHA BD allocation errors by using taskgroups
* Formatting
1 parent 0999a9a commit 003b49d

File tree: 16 files changed (+131, -47 lines)

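Every file below gets the same treatment: rather than creating a task group only for the drain tasks (tg_out), the runtime sequence now opens one task group up front, tags both the fills and the drains with it, and finishes that group after the waited-on drains, so the fill resources are freed when the drains complete. A minimal sketch of the pattern, using only Runtime calls that appear in these diffs (the object FIFOs, taps, workers, and tensor types are assumed to be set up as in the examples):

    rt = Runtime()
    with rt.sequence(tensor_ty, tensor_ty) as (A, C):
        rt.start(*my_workers)

        # One group shared by fills and drains.
        tg = rt.task_group()

        for i in range(num_columns):
            # Queue the input and output transfers in the same group.
            rt.fill(of_ins[i].prod(), A, taps[i], task_group=tg)
            rt.drain(of_outs[i].cons(), C, taps[i], wait=True, task_group=tg)

        # Close the group; per the comment added in each example, the fill
        # resources are freed when the drains complete.
        rt.finish_task_group(tg)

Per the commit title and message, mixing grouped drains with ungrouped fills appears to be what caused the MHA BD allocation errors; routing both through one group per sequence (or, in mha.py, per loop iteration) is the fix applied here.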

example/axpy/axpy.py

Lines changed: 8 additions & 3 deletions
@@ -87,29 +87,34 @@ def core_body(of_in1, of_in2, of_out, axpy):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/dequant/dequant.py

Lines changed: 7 additions & 3 deletions
@@ -122,26 +122,30 @@ def core_body(of_in1, of_out, dequant_kernel):
         if enable_trace:
             rt.enable_trace(trace_size)
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_in1s[i * num_channels + j].prod(),
                     A,
                     taps_in[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     C,
                     taps_out[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/elementwise_add/eltwise_add.py

Lines changed: 8 additions & 3 deletions
@@ -84,29 +84,34 @@ def core_body(of_in1, of_in2, of_out, eltwise_add):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/elementwise_mul/eltwise_mul.py

Lines changed: 8 additions & 3 deletions
@@ -83,29 +83,34 @@ def core_body(of_in1, of_in2, of_out, eltwise_mul):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/gelu/gelu.py

Lines changed: 7 additions & 3 deletions
@@ -92,26 +92,30 @@ def core_fn(of_in, of_out, geluLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/layer_norm/layer_norm.py

Lines changed: 7 additions & 3 deletions
@@ -93,26 +93,30 @@ def core_body(of_in1, of_out, layer_norm_kernel):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty) as (A, C):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_in1s[i * num_channels + j].prod(),
                     A,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     C,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/leaky_relu/leaky_relu.py

Lines changed: 7 additions & 3 deletions
@@ -91,26 +91,30 @@ def core_fn(of_in, of_out, leaky_relu_line):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/mem_copy/mem_copy.py

Lines changed: 7 additions & 3 deletions
@@ -92,26 +92,30 @@ def core_fn(of_in, of_out, mem_copyLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())

example/mha/mha.py

Lines changed: 21 additions & 2 deletions
@@ -753,6 +753,9 @@ def set_loop_idx_rtp():
 
     for q_block_idx in range(num_q_block_per_pipeline):
 
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         if number_of_pipelines > 6:
             rt.fill(
                 inQ.prod(),
@@ -761,6 +764,7 @@ def set_loop_idx_rtp():
                     2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2
                 ],
                 placement=Tile(col=4, row=0),
+                task_group=tg,
             )
             rt.fill(
                 inQ2.prod(),
@@ -771,21 +775,31 @@ def set_loop_idx_rtp():
                     + 1
                 ],
                 placement=Tile(col=4, row=0),
+                task_group=tg,
            )
         else:
             rt.fill(
                 inQ.prod(),
                 Q,
                 tap=Q_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
                 placement=Tile(col=4, row=0),
+                task_group=tg,
             )
 
         # Thow on bd containing the full K and V in the object fifo, then does it transfer cunks of inKV size at the time?
         rt.fill(
-            inK.prod(), K, tap=K_tiles[head_idx], placement=Tile(col=5, row=0)
+            inK.prod(),
+            K,
+            tap=K_tiles[head_idx],
+            placement=Tile(col=5, row=0),
+            task_group=tg,
         )
         rt.fill(
-            inV.prod(), V, tap=V_tiles[head_idx], placement=Tile(col=6, row=0)
+            inV.prod(),
+            V,
+            tap=V_tiles[head_idx],
+            placement=Tile(col=6, row=0),
+            task_group=tg,
         )
 
         if number_of_pipelines > 6:
@@ -797,6 +811,7 @@ def set_loop_idx_rtp():
                 ],
                 wait=True,
                 placement=Tile(col=7, row=0),
+                task_group=tg,
             )
             rt.drain(
                 memO2.cons(),
@@ -808,6 +823,7 @@ def set_loop_idx_rtp():
                 ],
                 wait=True,
                 placement=Tile(col=7, row=0),
+                task_group=tg,
             )
         else:
             rt.drain(
@@ -816,8 +832,11 @@ def set_loop_idx_rtp():
                 tap=O_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
                 wait=True,
                 placement=Tile(col=7, row=0),
+                task_group=tg,
             )
 
+        rt.finish_task_group(tg)
+
     # Create the program from the device type and runtime
     if dev == "npu":
         dev_ty = NPU1Col1()
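In mha.py the group is created and finished inside the per-Q-block loop, so each iteration's fills and drains are retired before the next iteration queues more transfers, which is presumably how the BD allocation errors get fixed. A condensed sketch of the single-pipeline (else) branch of that loop, pieced together from the hunks above; the drain's first two arguments are not visible in the diff, so memO and O here are assumed names (the diff shows only memO2 and O_tiles):

    for q_block_idx in range(num_q_block_per_pipeline):
        # Fresh group for this Q block's transfers.
        tg = rt.task_group()

        rt.fill(
            inQ.prod(),
            Q,
            tap=Q_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
            placement=Tile(col=4, row=0),
            task_group=tg,
        )
        rt.fill(
            inK.prod(),
            K,
            tap=K_tiles[head_idx],
            placement=Tile(col=5, row=0),
            task_group=tg,
        )
        rt.fill(
            inV.prod(),
            V,
            tap=V_tiles[head_idx],
            placement=Tile(col=6, row=0),
            task_group=tg,
        )
        rt.drain(
            memO.cons(),  # assumed handle name, see note above
            O,            # assumed output tensor name
            tap=O_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
            wait=True,
            placement=Tile(col=7, row=0),
            task_group=tg,
        )

        # Finishing the group each iteration frees that iteration's resources
        # before the next Q block queues more transfers.
        rt.finish_task_group(tg)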

example/relu/relu.py

Lines changed: 7 additions & 3 deletions
@@ -90,26 +90,30 @@ def core_fn(of_in, of_out, reluLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
            for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
