amd
diff --git a/‎example/axpy/axpy.py‎
Lines changed: 8 additions & 3 deletions b/‎example/axpy/axpy.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎example/dequant/dequant.py‎
Lines changed: 7 additions & 3 deletions b/‎example/dequant/dequant.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎example/elementwise_add/eltwise_add.py‎
Lines changed: 8 additions & 3 deletions b/‎example/elementwise_add/eltwise_add.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎example/elementwise_mul/eltwise_mul.py‎
Lines changed: 8 additions & 3 deletions b/‎example/elementwise_mul/eltwise_mul.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎example/gelu/gelu.py‎
Lines changed: 7 additions & 3 deletions b/‎example/gelu/gelu.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎example/layer_norm/layer_norm.py‎
Lines changed: 7 additions & 3 deletions b/‎example/layer_norm/layer_norm.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎example/leaky_relu/leaky_relu.py‎
Lines changed: 7 additions & 3 deletions b/‎example/leaky_relu/leaky_relu.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎example/mem_copy/mem_copy.py‎
Lines changed: 7 additions & 3 deletions b/‎example/mem_copy/mem_copy.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎example/mha/mha.py‎
Lines changed: 21 additions & 2 deletions b/‎example/mha/mha.py‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎example/relu/relu.py‎
Lines changed: 7 additions & 3 deletions b/‎example/relu/relu.py‎
Lines changed: 7 additions & 3 deletions
@@ -87,29 +87,34 @@ def core_body(of_in1, of_in2, of_out, axpy):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -122,26 +122,30 @@ def core_body(of_in1, of_out, dequant_kernel):
         if enable_trace:
             rt.enable_trace(trace_size)
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_in1s[i * num_channels + j].prod(),
                     A,
                     taps_in[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     C,
                     taps_out[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -84,29 +84,34 @@ def core_body(of_in1, of_in2, of_out, eltwise_add):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -83,29 +83,34 @@ def core_body(of_in1, of_in2, of_out, eltwise_mul):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             rt.fill(
                 of_in1s[i].prod(),
                 A,
                 taps[i],
+                task_group=tg,
             )
             rt.fill(
                 of_in2s[i].prod(),
                 B,
                 taps[i],
+                task_group=tg,
             )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             rt.drain(
                 of_outs[i].cons(),
                 C,
                 taps[i],
                 wait=True,  # wait for the transfer to complete and data to be available
-                task_group=tg_out,
+                task_group=tg,
             )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -92,26 +92,30 @@ def core_fn(of_in, of_out, geluLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -93,26 +93,30 @@ def core_body(of_in1, of_out, layer_norm_kernel):
     rt = Runtime()
     with rt.sequence(tensor_ty, tensor_ty) as (A, C):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_in1s[i * num_channels + j].prod(),
                     A,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     C,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -91,26 +91,30 @@ def core_fn(of_in, of_out, leaky_relu_line):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -92,26 +92,30 @@ def core_fn(of_in, of_out, mem_copyLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
 
@@ -753,6 +753,9 @@ def set_loop_idx_rtp():
 
             for q_block_idx in range(num_q_block_per_pipeline):
 
+                # Initialize a task group shared by this iteration's fill and drain tasks; fill resources are freed when the drains complete.
+                tg = rt.task_group()
+
                 if number_of_pipelines > 6:
                     rt.fill(
                         inQ.prod(),
@@ -761,6 +764,7 @@ def set_loop_idx_rtp():
                             2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2
                         ],
                         placement=Tile(col=4, row=0),
+                        task_group=tg,
                     )
                     rt.fill(
                         inQ2.prod(),
@@ -771,21 +775,31 @@ def set_loop_idx_rtp():
                             + 1
                         ],
                         placement=Tile(col=4, row=0),
+                        task_group=tg,
                     )
                 else:
                     rt.fill(
                         inQ.prod(),
                         Q,
                         tap=Q_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
                         placement=Tile(col=4, row=0),
+                        task_group=tg,
                     )
 
                 # Thow on bd containing the full K and V in the object fifo, then does it transfer cunks of inKV size at the time?
                 rt.fill(
-                    inK.prod(), K, tap=K_tiles[head_idx], placement=Tile(col=5, row=0)
+                    inK.prod(),
+                    K,
+                    tap=K_tiles[head_idx],
+                    placement=Tile(col=5, row=0),
+                    task_group=tg,
                 )
                 rt.fill(
-                    inV.prod(), V, tap=V_tiles[head_idx], placement=Tile(col=6, row=0)
+                    inV.prod(),
+                    V,
+                    tap=V_tiles[head_idx],
+                    placement=Tile(col=6, row=0),
+                    task_group=tg,
                 )
 
                 if number_of_pipelines > 6:
@@ -797,6 +811,7 @@ def set_loop_idx_rtp():
                         ],
                         wait=True,
                         placement=Tile(col=7, row=0),
+                        task_group=tg,
                     )
                     rt.drain(
                         memO2.cons(),
@@ -808,6 +823,7 @@ def set_loop_idx_rtp():
                         ],
                         wait=True,
                         placement=Tile(col=7, row=0),
+                        task_group=tg,
                     )
                 else:
                     rt.drain(
@@ -816,8 +832,11 @@ def set_loop_idx_rtp():
                         tap=O_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
                         wait=True,
                         placement=Tile(col=7, row=0),
+                        task_group=tg,
                     )
 
+                rt.finish_task_group(tg)
+
     # Create the program from the device type and runtime
     if dev == "npu":
         dev_ty = NPU1Col1()
 
@@ -90,26 +90,30 @@ def core_fn(of_in, of_out, reluLine):
     rt = Runtime()
     with rt.sequence(transfer_type, transfer_type) as (a_in, b_out):
         rt.start(*my_workers)
+
+        # Initialize a task group shared by the fill and drain tasks; fill resources are freed when the drains complete.
+        tg = rt.task_group()
+
         # Fill the input objectFIFOs with data
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.fill(
                     of_ins[i * num_channels + j].prod(),
                     a_in,
                     taps[i * num_channels + j],
+                    task_group=tg,
                 )
         # Drain the output objectFIFOs with data
-        tg_out = rt.task_group()  # Initialize a group for parallel drain tasks
         for i in range(num_columns):
             for j in range(num_channels):
                 rt.drain(
                     of_outs[i * num_channels + j].cons(),
                     b_out,
                     taps[i * num_channels + j],
                     wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg_out,
+                    task_group=tg,
                 )
-        rt.finish_task_group(tg_out)
+        rt.finish_task_group(tg)
 
     # Place components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())