From 58dab93af14231852c89e3d4f4bb04b5d6651287 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Mon, 4 Mar 2024 14:02:58 -0700 Subject: [PATCH 1/5] Add gpu wait op --- tests/dialects/test_gpu.py | 22 ++++++++++++-- xdsl/dialects/gpu.py | 60 ++++++++++++++++++++++++++++---------- 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/tests/dialects/test_gpu.py b/tests/dialects/test_gpu.py index 597540cba8..ed4bc8b7b6 100644 --- a/tests/dialects/test_gpu.py +++ b/tests/dialects/test_gpu.py @@ -29,6 +29,7 @@ SubgroupIdOp, SubgroupSizeOp, TerminatorOp, + WaitOp, ThreadIdOp, YieldOp, ) @@ -99,7 +100,8 @@ def test_all_reduce(): @Builder.implicit_region def body(): - sum = Operation.clone(arith.Addi(body_block.args[0], body_block.args[1])) + sum = Operation.clone(arith.Addi( + body_block.args[0], body_block.args[1])) YieldOp([sum]) all_reduce_body = AllReduceOp.from_body(body, init) @@ -142,7 +144,8 @@ def test_dealloc(): assert alloc.asyncToken is not None # For pyright dealloc = DeallocOp( - buffer=alloc.result, async_dependencies=[alloc.asyncToken], is_async=True + buffer=alloc.result, async_dependencies=[ + alloc.asyncToken], is_async=True ) assert dealloc.asyncToken is not None @@ -228,7 +231,8 @@ def test_func(): body = Region(Block([ReturnOp([])])) - func = FuncOp(kernel, (inputs, []), body, True, known_block_size, known_grid_size) + func = FuncOp(kernel, (inputs, []), body, True, + known_block_size, known_grid_size) assert isinstance(func, FuncOp) assert func.kernel == builtin.UnitAttr() @@ -400,6 +404,18 @@ def test_terminator(): assert isinstance(terminator, TerminatorOp) +def test_wait(): + waitOp = WaitOp() + + assert isinstance(waitOp, WaitOp) + assert waitOp.asyncToken is not None + assert isinstance(waitOp.asyncToken.type, AsyncTokenType) + + waitOpWithDep = WaitOp(waitOp) + assert waitOpWithDep.asyncToken is not None + assert waitOpWithDep.asyncDependencies[0] is waitOp.asyncToken + + def test_yield(): operands: list[SSAValue | Operation] = [ o diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 50834ee35c..cc20ae74c0 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -188,7 +188,8 @@ def __init__( [SSAValue.get(e) for e in dynamic_sizes] if dynamic_sizes else [] ) async_dependencies_vals: list[SSAValue] = ( - [SSAValue.get(e) for e in async_dependencies] if async_dependencies else [] + [SSAValue.get(e) + for e in async_dependencies] if async_dependencies else [] ) attributes: dict[str, Attribute] = ( {"hostShared": UnitAttr()} if host_shared else {} @@ -283,7 +284,8 @@ class BlockDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -293,7 +295,8 @@ class BlockIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -355,7 +358,8 @@ def verify_(self) -> None: class ModuleEndOp(IRDLOperation): name = "gpu.module_end" - traits = traits_def(lambda: frozenset([IsTerminator(), HasParent(ModuleOp)])) + traits = traits_def(lambda: frozenset( + [IsTerminator(), HasParent(ModuleOp)])) def __init__(self): super().__init__() @@ -396,7 +400,8 @@ class FuncOp(IRDLOperation): DenseArrayBase, 
attr_name="gpu.known_grid_size" ) - traits = frozenset([IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) + traits = frozenset( + [IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) def __init__( self, @@ -412,7 +417,8 @@ def __init__( function_type = FunctionType.from_lists(inputs, outputs) if not isinstance(region, Region): region = Region(Block(arg_types=function_type.inputs)) - attributes: dict[str, Attribute | None] = {"sym_name": StringAttr(name)} + attributes: dict[str, Attribute | None] = { + "sym_name": StringAttr(name)} properties: dict[str, Attribute | None] = { "function_type": function_type, } @@ -426,7 +432,8 @@ def __init__( ) if kernel: properties["kernel"] = UnitAttr() - super().__init__(properties=properties, attributes=attributes, regions=[region]) + super().__init__(properties=properties, + attributes=attributes, regions=[region]) def verify_(self): entry_block: Block = self.body.blocks[0] @@ -438,7 +445,8 @@ def verify_(self): "function input types" ) if (self.kernel is not None) and (len(self.function_type.outputs) != 0): - raise VerifyException("Expected void return type for kernel function") + raise VerifyException( + "Expected void return type for kernel function") @irdl_op_definition @@ -448,7 +456,8 @@ class GlobalIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -458,7 +467,8 @@ class GridDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -529,9 +539,11 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError( + f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError( + f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") operands = [ ( [] @@ -632,9 +644,11 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError( + f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError( + f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") clusterSizeOperands: Sequence[ SSAValue | Operation | Sequence[SSAValue | Operation] ] @@ -727,7 +741,22 @@ class ThreadIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[ + IndexType()], properties={"dimension": dim}) + + +@irdl_op_definition +class WaitOp(IRDLOperation): + name = "gpu.wait" + asyncDependencies: VarOperand = var_operand_def(AsyncTokenType) + asyncToken: OptOpResult = opt_result_def(AsyncTokenType) + + def __init__( + self, + async_dependencies: Sequence[SSAValue | Operation] | None = None, + ): + super().__init__(operands=[async_dependencies], + 
result_types=[[AsyncTokenType()]],) @irdl_op_definition @@ -779,6 +808,7 @@ def verify_(self) -> None: SubgroupSizeOp, TerminatorOp, ThreadIdOp, + WaitOp, YieldOp, ], [ From 8e9d6eb49dec3bbf38b972846b066158d07556b8 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Mon, 4 Mar 2024 14:15:22 -0700 Subject: [PATCH 2/5] Fix the formatting error and the test case --- tests/dialects/test_gpu.py | 18 +++++++------- xdsl/dialects/gpu.py | 51 ++++++++++++++------------------------ 2 files changed, 28 insertions(+), 41 deletions(-) diff --git a/tests/dialects/test_gpu.py b/tests/dialects/test_gpu.py index ed4bc8b7b6..8849f1f5f6 100644 --- a/tests/dialects/test_gpu.py +++ b/tests/dialects/test_gpu.py @@ -29,8 +29,8 @@ SubgroupIdOp, SubgroupSizeOp, TerminatorOp, - WaitOp, ThreadIdOp, + WaitOp, YieldOp, ) from xdsl.ir import Block, Operation, Region, SSAValue @@ -100,8 +100,7 @@ def test_all_reduce(): @Builder.implicit_region def body(): - sum = Operation.clone(arith.Addi( - body_block.args[0], body_block.args[1])) + sum = Operation.clone(arith.Addi(body_block.args[0], body_block.args[1])) YieldOp([sum]) all_reduce_body = AllReduceOp.from_body(body, init) @@ -144,8 +143,7 @@ def test_dealloc(): assert alloc.asyncToken is not None # For pyright dealloc = DeallocOp( - buffer=alloc.result, async_dependencies=[ - alloc.asyncToken], is_async=True + buffer=alloc.result, async_dependencies=[alloc.asyncToken], is_async=True ) assert dealloc.asyncToken is not None @@ -231,8 +229,7 @@ def test_func(): body = Region(Block([ReturnOp([])])) - func = FuncOp(kernel, (inputs, []), body, True, - known_block_size, known_grid_size) + func = FuncOp(kernel, (inputs, []), body, True, known_block_size, known_grid_size) assert isinstance(func, FuncOp) assert func.kernel == builtin.UnitAttr() @@ -407,13 +404,16 @@ def test_terminator(): def test_wait(): waitOp = WaitOp() - assert isinstance(waitOp, WaitOp) + assert isinstance(waitOp, WaitOp) assert waitOp.asyncToken is not None assert isinstance(waitOp.asyncToken.type, AsyncTokenType) - waitOpWithDep = WaitOp(waitOp) + waitOp1 = WaitOp() + + waitOpWithDep = WaitOp([waitOp, waitOp1]) assert waitOpWithDep.asyncToken is not None assert waitOpWithDep.asyncDependencies[0] is waitOp.asyncToken + assert waitOpWithDep.asyncDependencies[1] is waitOp1.asyncToken def test_yield(): diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index cc20ae74c0..1ccb6238fa 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -188,8 +188,7 @@ def __init__( [SSAValue.get(e) for e in dynamic_sizes] if dynamic_sizes else [] ) async_dependencies_vals: list[SSAValue] = ( - [SSAValue.get(e) - for e in async_dependencies] if async_dependencies else [] + [SSAValue.get(e) for e in async_dependencies] if async_dependencies else [] ) attributes: dict[str, Attribute] = ( {"hostShared": UnitAttr()} if host_shared else {} @@ -284,8 +283,7 @@ class BlockDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -295,8 +293,7 @@ class BlockIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -358,8 +355,7 @@ def verify_(self) -> None: class 
ModuleEndOp(IRDLOperation): name = "gpu.module_end" - traits = traits_def(lambda: frozenset( - [IsTerminator(), HasParent(ModuleOp)])) + traits = traits_def(lambda: frozenset([IsTerminator(), HasParent(ModuleOp)])) def __init__(self): super().__init__() @@ -400,8 +396,7 @@ class FuncOp(IRDLOperation): DenseArrayBase, attr_name="gpu.known_grid_size" ) - traits = frozenset( - [IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) + traits = frozenset([IsolatedFromAbove(), HasParent(ModuleOp), SymbolOpInterface()]) def __init__( self, @@ -417,8 +412,7 @@ def __init__( function_type = FunctionType.from_lists(inputs, outputs) if not isinstance(region, Region): region = Region(Block(arg_types=function_type.inputs)) - attributes: dict[str, Attribute | None] = { - "sym_name": StringAttr(name)} + attributes: dict[str, Attribute | None] = {"sym_name": StringAttr(name)} properties: dict[str, Attribute | None] = { "function_type": function_type, } @@ -432,8 +426,7 @@ def __init__( ) if kernel: properties["kernel"] = UnitAttr() - super().__init__(properties=properties, - attributes=attributes, regions=[region]) + super().__init__(properties=properties, attributes=attributes, regions=[region]) def verify_(self): entry_block: Block = self.body.blocks[0] @@ -445,8 +438,7 @@ def verify_(self): "function input types" ) if (self.kernel is not None) and (len(self.function_type.outputs) != 0): - raise VerifyException( - "Expected void return type for kernel function") + raise VerifyException("Expected void return type for kernel function") @irdl_op_definition @@ -456,8 +448,7 @@ class GlobalIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -467,8 +458,7 @@ class GridDimOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -539,11 +529,9 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError( - f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError( - f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") operands = [ ( [] @@ -644,11 +632,9 @@ def __init__( dynamicSharedMemorySize: SSAValue | Operation | None = None, ): if len(gridSize) != 3: - raise ValueError( - f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") + raise ValueError(f"LaunchOp must have 3 gridSizes, got {len(gridSize)}") if len(blockSize) != 3: - raise ValueError( - f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") + raise ValueError(f"LaunchOp must have 3 blockSizes, got {len(blockSize)}") clusterSizeOperands: Sequence[ SSAValue | Operation | Sequence[SSAValue | Operation] ] @@ -741,8 +727,7 @@ class ThreadIdOp(IRDLOperation): result: OpResult = result_def(IndexType) def __init__(self, dim: DimensionAttr): - super().__init__(result_types=[ - IndexType()], properties={"dimension": dim}) + super().__init__(result_types=[IndexType()], properties={"dimension": dim}) @irdl_op_definition @@ -755,8 +740,10 @@ def 
__init__( self, async_dependencies: Sequence[SSAValue | Operation] | None = None, ): - super().__init__(operands=[async_dependencies], - result_types=[[AsyncTokenType()]],) + super().__init__( + operands=[async_dependencies], + result_types=[[AsyncTokenType()]], + ) @irdl_op_definition From a00392323a62fa9b6d93d4f58fc0ff63101b958a Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Wed, 6 Mar 2024 10:56:05 -0700 Subject: [PATCH 3/5] Add roundtrip for gpu.wait --- tests/filecheck/dialects/gpu/ops.mlir | 4 ++++ .../filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/filecheck/dialects/gpu/ops.mlir b/tests/filecheck/dialects/gpu/ops.mlir index 3f940f5702..e563f6bf03 100644 --- a/tests/filecheck/dialects/gpu/ops.mlir +++ b/tests/filecheck/dialects/gpu/ops.mlir @@ -11,6 +11,8 @@ builtin.module attributes {"gpu.container_module"} { "gpu.host_register"(%unranked) : (memref<*xi32>) -> () "gpu.host_unregister"(%unranked) : (memref<*xi32>) -> () + %wait_token = "gpu.wait"() : () -> !gpu.async.token + %threadidx = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidy = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidz = "gpu.thread_id"() {"dimension" = #gpu} : () -> index @@ -89,6 +91,8 @@ builtin.module attributes {"gpu.container_module"} { // CHECK-NEXT: "gpu.host_register"(%{{.*}}) : (memref<*xi32>) -> () // CHECK-NEXT: "gpu.host_unregister"(%{{.*}}) : (memref<*xi32>) -> () + // CHECK-NEXT: %{{.*}} = "gpu.wait"() : () -> !gpu.async.token + // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index diff --git a/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir b/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir index c47f12fbe5..02f788f3cc 100644 --- a/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir +++ b/tests/filecheck/mlir-conversion/with-mlir/dialects/gpu/ops.mlir @@ -11,6 +11,8 @@ "gpu.host_register"(%unranked) : (memref<*xi32>) -> () "gpu.host_unregister"(%unranked) : (memref<*xi32>) -> () + %wait_token = "gpu.wait"() : () -> !gpu.async.token + %threadidx = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidy = "gpu.thread_id"() {"dimension" = #gpu} : () -> index %threadidz = "gpu.thread_id"() {"dimension" = #gpu} : () -> index @@ -88,6 +90,8 @@ // CHECK-NEXT: "gpu.host_register"(%{{.*}}) : (memref<*xi32>) -> () // CHECK-NEXT: "gpu.host_unregister"(%{{.*}}) : (memref<*xi32>) -> () + // CHECK-NEXT: %{{.*}} = "gpu.wait"() : () -> !gpu.async.token + // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index // CHECK-NEXT: %{{.*}} = "gpu.thread_id"() <{"dimension" = #gpu}> : () -> index From e86f7a4d8be89b202dd832f59f4d7dc0cf50de29 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli <46460249+tavakkoliamirmohammad@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:40:26 -0700 Subject: [PATCH 4/5] Add async import Co-authored-by: Emilien Bauer --- xdsl/dialects/gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 1ccb6238fa..95dbb90fa7 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -799,6 +799,7 @@ def verify_(self) -> None: YieldOp, ], [ + AsyncTokenType AllReduceOpAttr, DimensionAttr, 
ProcessorAttr, From 9d825bc3959185498175ff04ac5d93e28264ef09 Mon Sep 17 00:00:00 2001 From: Amir Mohammad Tavakkoli Date: Wed, 6 Mar 2024 11:42:24 -0700 Subject: [PATCH 5/5] Fix comma --- xdsl/dialects/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdsl/dialects/gpu.py b/xdsl/dialects/gpu.py index 95dbb90fa7..67d1519540 100644 --- a/xdsl/dialects/gpu.py +++ b/xdsl/dialects/gpu.py @@ -799,7 +799,7 @@ def verify_(self) -> None: YieldOp, ], [ - AsyncTokenType + AsyncTokenType, AllReduceOpAttr, DimensionAttr, ProcessorAttr,
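
With the whole series applied, the new gpu.wait operation can be constructed from Python roughly as in the sketch below. This is a minimal example distilled from the test_wait test added in patches 1 and 2; the variable names are illustrative only, while WaitOp, AsyncTokenType, asyncToken and asyncDependencies come straight from the diffs, and the import path matches the xdsl/dialects/gpu.py file touched above.

    # Minimal sketch of the gpu.wait construction API introduced by this series.
    from xdsl.dialects.gpu import AsyncTokenType, WaitOp

    # A wait with no dependencies still produces an !gpu.async.token result.
    first_wait = WaitOp()
    assert first_wait.asyncToken is not None
    assert isinstance(first_wait.asyncToken.type, AsyncTokenType)

    # A later wait can take earlier ops (or their tokens) as async dependencies,
    # chaining synchronisation points together.
    second_wait = WaitOp([first_wait])
    assert second_wait.asyncDependencies[0] is first_wait.asyncToken

In the generic MLIR syntax exercised by the round-trip tests in patch 3, the dependency-free form prints as %wait_token = "gpu.wait"() : () -> !gpu.async.token.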