From 58dab93af14231852c89e3d4f4bb04b5d6651287 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Mon, 4 Mar 2024 14:02:58 -0700 Subject: [PATCH 1/5] Add gpu wait op --- tests/dialects/test_gpu.py | 22 ++++++++++++-- xdsl/dialects/gpu.py | 60 ++++++++++++++++++++++++++++---------- 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/tests/dialects/test_gpu.py b/tests/dialects/test_gpu.py index 597540cba8..ed4bc8b7b6 100644 --- a/tests/dialects/test_gpu.py +++ b/tests/dialects/test_gpu.py @@ -29,6 +29,7 @@ SubgroupIdOp, SubgroupSizeOp, TerminatorOp, + WaitOp, ThreadIdOp, YieldOp, ) @@ -99,7 +100,8 @@ def test_all_reduce(): @Builder.implicit_region def body(): - sum = Operation.clone(arith.Addi(body_block.args[0], body_block.args[1])) + sum = Operation.clone(arith.Addi( + body_block.args[0], body_block.args[1])) YieldOp([sum]) all_reduce_body = AllReduceOp.from_body(body, init) @@ -142,7 +144,8 @@ def test_dealloc(): assert alloc.asyncToken is not None # For pyright dealloc = DeallocOp( - buffer=alloc.result, async_dependencies=[alloc.asyncToken], is_async=True + buffer=alloc.result, async_dependencies=[ + alloc.asyncToken], is_async=True ) assert dealloc.asyncToken is not None @@ -228,7 +231,8 @@ def test_func(): body = Region(Block([ReturnOp([])])) - func = FuncOp(kernel, (inputs, []), body, True, known_block_size, known_grid_size) + func = FuncOp(kernel, (inputs, []), body, True, + known_block_size, known_grid_size) assert isinstance(func, FuncOp) assert func.kernel == builtin.UnitAttr() @@ -400,6 +404,18 @@ def test_terminator(): assert isinstance(terminator, TerminatorOp) +def test_wait(): + waitOp = WaitOp() + + assert isinstance(waitOp, WaitOp) + assert waitOp.asyncToken is not None + assert isinstance(waitOp.asyncToken.type, AsyncTokenType) + + waitOpWithDep = WaitOp(waitOp) + assert waitOpWithDep.asyncToken is not None + assert waitOpWithDep.asyncDependencies[0] is waitOp.asyncToken + + def test_yield(): operands: list[SSAValue | Operation] = [ o diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 50834ee35c..cc20ae74c0 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -188,7 +188,8 @@ def __init__( [SSAValue.get(e) for e in dynamic_sizes] if dynamic_sizes else [] ) async_dependencies_vals: list[SSAValue] = ( - [SSAValue.get(e) for e in async_dependencies] if async_dependencies else [] + [SSAValue.get(e) + for e in async_dependencies] if async_dependencies else [] ) attributes: dict[str, Attribute] = ( {"hostShared": UnitAttr()} if host_shared else {} @@ -283,7 +284,8 @@ class BlockDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -293,7 +295,8 @@ class BlockIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -355,7 +358,8 @@ def verify_(self) -> None: class ModuleEndOp(IRDLOperation): name = "gpu.module_end" - traits = traits_def(lambda: frozenset([IsTerminator(), HasParent(ModuleOp)])) + traits = traits_def(lambda: frozenset( + [IsTerminator(), HasParent(ModuleOp)])) def __init__(self): super().__init__() @@ -396,7 +400,8 @@ class FuncOp(IRDLOperation): DenseArrayBase, 
attr_name="gpu.known_grid_size" ) - traits = frozenset([IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) + traits = frozenset( + [IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) def __init__( self, @@ -412,7 +417,8 @@ def __init__( function_type = FunctionType.from_lists(inputs, outputs) if not isinstance(region, Region): region = Region(Block(arg_types=function_type.inputs)) - attributes: dict[str, Attribute | None] = {"sym_name": StringAttr(name)} + attributes: dict[str, Attribute | None] = { + "sym_name": StringAttr(name)} properties: dict[str, Attribute | None] = { "function_type": function_type, } @@ -426,7 +432,8 @@ def __init__( ) if kernel: properties["kernel"] = UnitAttr() - super().__init__(properties=properties, attributes=attributes, regions=[region]) + super().__init__(properties=properties, + attributes=attributes, regions=[region]) def verify_(self): entry_block: Block = self.body.blocks[0] @@ -438,7 +445,8 @@ def verify_(self): "function input types" ) if (self.kernel is not None) and (len(self.function_type.outputs) != 0): - raise VerifyException("Expected void return type for kernel function") + raise VerifyException( + "Expected void return type for kernel function") @irdl_op_definition @@ -448,7 +456,8 @@ class GlobalIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -458,7 +467,8 @@ class GridDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -529,9 +539,11 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError( + f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError( + f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") operands = [ ( [] @@ -632,9 +644,11 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError( + f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError( + f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") clusterSizeOperands: Sequence[ SSAValue | Operation | Sequence[SSAValue | Operation] ] @@ -727,7 +741,22 @@ class ThreadIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) + + +@irdl_op_definition +class WaitOp(IRDLOperation): + name = "gpu.wait" + asyncDependencies: VarOperand = var_operand_def(AsyncTokenType) + asyncToken: OptOpResult = opt_result_def(AsyncTokenType) + + def __init__( + self, + async_dependencies: Sequence[SSAValue | Operation] | None = None, + ): + super().__init__(operands=[async_dependencies], + 
result_types=[[AsyncTokenType()]],) @irdl_op_definition @@ -779,6 +808,7 @@ def verify_(self) -> None: SubgroupSizeOp, TerminatorOp, ThreadIdOp, + WaitOp, YieldOp, ], [ From 8e9d6eb49dec3bbf38b972846b066158d07556b8 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Mon, 4 Mar 2024 14:15:22 -0700 Subject: [PATCH 2/5] Fix the formatting error and the test case --- tests/dialects/test_gpu.py | 18 +++++++------- xdsl/dialects/gpu.py | 51 ++++++++++++++------------------------ 2 files changed, 28 insertions(+), 41 deletions(-) diff --git a/tests/dialects/test_gpu.py b/tests/dialects/test_gpu.py index ed4bc8b7b6..8849f1f5f6 100644 --- a/tests/dialects/test_gpu.py +++ b/tests/dialects/test_gpu.py @@ -29,8 +29,8 @@ SubgroupIdOp, SubgroupSizeOp, TerminatorOp, - WaitOp, ThreadIdOp, + WaitOp, YieldOp, ) from xdsl.ir import Block, Operation, Region, SSAValue @@ -100,8 +100,7 @@ def test_all_reduce(): @Builder.implicit_region def body(): - sum = Operation.clone(arith.Addi( - body_block.args[0], body_block.args[1])) + sum = Operation.clone(arith.Addi(body_block.args[0], body_block.args[1])) YieldOp([sum]) all_reduce_body = AllReduceOp.from_body(body, init) @@ -144,8 +143,7 @@ def test_dealloc(): assert alloc.asyncToken is not None # For pyright dealloc = DeallocOp( - buffer=alloc.result, async_dependencies=[ - alloc.asyncToken], is_async=True + buffer=alloc.result, async_dependencies=[alloc.asyncToken], is_async=True ) assert dealloc.asyncToken is not None @@ -231,8 +229,7 @@ def test_func(): body = Region(Block([ReturnOp([])])) - func = FuncOp(kernel, (inputs, []), body, True, - known_block_size, known_grid_size) + func = FuncOp(kernel, (inputs, []), body, True, known_block_size, known_grid_size) assert isinstance(func, FuncOp) assert func.kernel == builtin.UnitAttr() @@ -407,13 +404,16 @@ def test_terminator(): def test_wait(): waitOp = WaitOp() - assert isinstance(waitOp, WaitOp) + assert isinstance(waitOp, WaitOp) assert waitOp.asyncToken is not None assert isinstance(waitOp.asyncToken.type, AsyncTokenType) - waitOpWithDep = WaitOp(waitOp) + waitOp1 = WaitOp() + + waitOpWithDep = WaitOp([waitOp, waitOp1]) assert waitOpWithDep.asyncToken is not None assert waitOpWithDep.asyncDependencies[0] is waitOp.asyncToken + assert waitOpWithDep.asyncDependencies[1] is waitOp1.asyncToken def test_yield(): diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index cc20ae74c0..1ccb6238fa 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -188,8 +188,7 @@ def __init__( [SSAValue.get(e) for e in dynamic_sizes] if dynamic_sizes else [] ) async_dependencies_vals: list[SSAValue] = ( - [SSAValue.get(e) - for e in async_dependencies] if async_dependencies else [] + [SSAValue.get(e) for e in async_dependencies] if async_dependencies else [] ) attributes: dict[str, Attribute] = ( {"hostShared": UnitAttr()} if host_shared else {} @@ -284,8 +283,7 @@ class BlockDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -295,8 +293,7 @@ class BlockIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -358,8 +355,7 @@ def verify_(self) -> None: class 
ModuleEndOp(IRDLOperation): name = "gpu.module_end" - traits = traits_def(lambda: frozenset( - [IsTerminator(), HasParent(ModuleOp)])) + traits = traits_def(lambda: frozenset([IsTerminator(), HasParent(ModuleOp)])) def __init__(self): super().__init__() @@ -400,8 +396,7 @@ class FuncOp(IRDLOperation): DenseArrayBase, attr_name="gpu.known_grid_size" ) - traits = frozenset( - [IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) + traits = frozenset([IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) def __init__( self, @@ -417,8 +412,7 @@ def __init__( function_type = FunctionType.from_lists(inputs, outputs) if not isinstance(region, Region): region = Region(Block(arg_types=function_type.inputs)) - attributes: dict[str, Attribute | None] = { - "sym_name": StringAttr(name)} + attributes: dict[str, Attribute | None] = {"sym_name": StringAttr(name)} properties: dict[str, Attribute | None] = { "function_type": function_type, } @@ -432,8 +426,7 @@ def __init__( ) if kernel: properties["kernel"] = UnitAttr() - super().__init__(properties=properties, - attributes=attributes, regions=[region]) + super().__init__(properties=properties, attributes=attributes, regions=[region]) def verify_(self): entry_block: Block = self.body.blocks[0] @@ -445,8 +438,7 @@ def verify_(self): "function input types" ) if (self.kernel is not None) and (len(self.function_type.outputs) != 0): - raise VerifyException( - "Expected void return type for kernel function") + raise VerifyException("Expected void return type for kernel function") @irdl_op_definition @@ -456,8 +448,7 @@ class GlobalIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -467,8 +458,7 @@ class GridDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -539,11 +529,9 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError( - f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError( - f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") operands = [ ( [] @@ -644,11 +632,9 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError( - f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError( - f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") clusterSizeOperands: Sequence[ SSAValue | Operation | Sequence[SSAValue | Operation] ] @@ -741,8 +727,7 @@ class ThreadIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -755,8 +740,10 @@ def 
__init__( self, async_dependencies: Sequence[SSAValue | Operation] | None = None, ): - super().__init__(operands=[async_dependencies], - result_types=[[AsyncTokenType()]],) + super().__init__( + operands=[async_dependencies], + result_types=[[AsyncTokenType()]], + ) @irdl_op_definition From a00392323a62fa9b6d93d4f58fc0ff63101b958a Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Wed, 6 Mar 2024 10:56:05 -0700 Subject: [PATCH 3/5] Add roundtrip for gpu.wait --- tests/filecheck/dialects/gpu/ops.mlir | 4 ++++ .../filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/filecheck/dialects/gpu/ops.mlir b/tests/filecheck/dialects/gpu/ops.mlir index 3f940f5702..e563f6bf03 100644 --- a/tests/filecheck/dialects/gpu/ops.mlir +++ b/tests/filecheck/dialects/gpu/ops.mlir @@ -11,6 +11,8 @@ builtin.module attributes {"gpu.container_module"} { "gpu.host_register"(%unranked) : (memref<*xi32>) -> () "gpu.host_unregister"(%unranked) : (memref<*xi32>) -> () + %wait_token = "gpu.wait"() : () -> !gpu.async.token + %threadidx = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidy = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidz = "gpu.thread_id"() {"dimension" = #gpu} : () -> index @@ -89,6 +91,8 @@ builtin.module attributes {"gpu.container_module"} { // CHECK-NEXT: "gpu.host_register"(%{{.*}}) : (memref<*xi32>) -> () // CHECK-NEXT: "gpu.host_unregister"(%{{.*}}) : (memref<*xi32>) -> () + // CHECK-NEXT: %{{.*}} = "gpu.wait"() : () -> !gpu.async.token + // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index diff --git a/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir b/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir index c47f12fbe5..02f788f3cc 100644 --- a/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir +++ b/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir @@ -11,6 +11,8 @@ "gpu.host_register"(%unranked) : (memref<*xi32>) -> () "gpu.host_unregister"(%unranked) : (memref<*xi32>) -> () + %wait_token = "gpu.wait"() : () -> !gpu.async.token + %threadidx = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidy = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidz = "gpu.thread_id"() {"dimension" = #gpu} : () -> index @@ -88,6 +90,8 @@ // CHECK-NEXT: "gpu.host_register"(%{{.*}}) : (memref<*xi32>) -> () // CHECK-NEXT: "gpu.host_unregister"(%{{.*}}) : (memref<*xi32>) -> () + // CHECK-NEXT: %{{.*}} = "gpu.wait"() : () -> !gpu.async.token + // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index From e86f7a4d8be89b202dd832f59f4d7dc0cf50de29 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli <46460249+tavakkoliamirmohammad@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:40:26 -0700 Subject: [PATCH 4/5] Add async import Co-authored-by: Emilien Bauer --- xdsl/dialects/gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 1ccb6238fa..95dbb90fa7 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -799,6 +799,7 @@ def verify_(self) -> None: YieldOp, ], [ + AsyncTokenType AllReduceOpAttr, DimensionAttr, 
ProcessorAttr, From 9d825bc3959185498175ff04ac5d93e28264ef09 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Wed, 6 Mar 2024 11:42:24 -0700 Subject: [PATCH 5/5] Fix comma --- xdsl/dialects/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 95dbb90fa7..67d1519540 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -799,7 +799,7 @@ def verify_(self) -> None: YieldOp, ], [ - AsyncTokenType + AsyncTokenType, AllReduceOpAttr, DimensionAttr, ProcessorAttr,
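
With the whole series applied, the new gpu.wait operation can be constructed from Python roughly as in the sketch below. This is a minimal example distilled from the test_wait test added in patches 1 and 2; the variable names are illustrative only, while WaitOp, AsyncTokenType, asyncToken and asyncDependencies come straight from the diffs, and the import path matches the xdsl/dialects/gpu.py file touched above.

    # Minimal sketch of the gpu.wait construction API introduced by this series.
    from xdsl.dialects.gpu import AsyncTokenType, WaitOp

    # A wait with no dependencies still produces an !gpu.async.token result.
    first_wait = WaitOp()
    assert first_wait.asyncToken is not None
    assert isinstance(first_wait.asyncToken.type, AsyncTokenType)

    # A later wait can take earlier ops (or their tokens) as async dependencies,
    # chaining synchronisation points together.
    second_wait = WaitOp([first_wait])
    assert second_wait.asyncDependencies[0] is first_wait.asyncToken

In the generic MLIR syntax exercised by the round-trip tests in patch 3, the dependency-free form prints as %wait_token = "gpu.wait"() : () -> !gpu.async.token.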