From afe887dacfcfb3253f1725aa14d2e3b0228b98d2 Mon Sep 17 00:00:00 2001 From: pradeepku Date: Mon, 26 Aug 2024 18:46:12 +0530 Subject: [PATCH] [MLIR][NVVM] Add support for fence.proxy.{acquire, release} Ops This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. This commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic. --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 79 ++++++++++++++++++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 28 +++++++ .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 34 ++++++++ mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 33 ++++++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 37 +++++++++ 5 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Target/LLVMIR/nvvmir-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4d48b3de7a57e..709dd922b8fa2 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" +def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>; @@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">; def ProxyAsync : I32EnumAttrCase<"async", 1, "async">; def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">; def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">; +def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">; +def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">; def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
-  [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
+  [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
 }
@@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
   let hasVerifier = 1;
 }
 
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+  [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
+      Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with acquire semantics";
+  let description = [{
+    `fence.proxy.acquire` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy.
+
+    The address operand `addr` and the operand `size` together specify the
+    memory range `[addr, addr+size)` on which the ordering guarantees on the
+    memory accesses across the proxies are to be provided. The only supported
+    value for the `size` operand is 128, and it must be an immediate. Generic addressing
+    is used unconditionally, and the address specified by the operand `addr` must
+    fall within the `.global` state space. Otherwise, the behavior is undefined.
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(
+        builder,
+        getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+        {$addr, $size});
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+      Arguments<(ins MemScopeKindAttr:$scope,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with release semantics";
+  let description = [{
+    `fence.proxy.release` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
+    operation can form a release sequence that synchronizes with an acquire
+    sequence that contains the `fence.proxy.acquire` proxy fence operation.
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+                                     $fromProxy, $toProxy, $scope, true));
+  }];
+
+  let hasVerifier = 1;
+}
+
 def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
 def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
 def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4d1896551101e..2c7c3e9d535f7 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
   }
 }
 LogicalResult NVVM::FenceProxyOp::verify() {
+  if (getKind() == NVVM::ProxyKind::TENSORMAP)
+    return emitOpError() << "tensormap proxy is not a supported proxy kind";
+  if (getKind() == NVVM::ProxyKind::GENERIC)
+    return emitOpError() << "generic proxy is not a supported proxy kind";
   if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
     return emitOpError() << "async_shared fence requires space attribute";
   }
@@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
   return success();
 }
 
+LogicalResult NVVM::FenceProxyAcquireOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
+LogicalResult NVVM::FenceProxyReleaseOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
LogicalResult NVVM::SetMaxRegisterOp::verify() { if (getRegCount() % 8) return emitOpError("new register size must be multiple of 8");
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index a09c24dda82af..f93e1cc8780c7 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -120,6 +120,44 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
   }
 }
 
+/// Returns the ID of the uni-directional fence.proxy intrinsic selected by the
+/// proxy pair, the memory scope and the acquire/release flavour (`isRelease`).
+/// Only the generic -> tensormap pair is handled; any other pair is a bug.
+static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
+                                              NVVM::ProxyKind toProxy,
+                                              NVVM::MemScopeKind scope,
+                                              bool isRelease) {
+  if (fromProxy == NVVM::ProxyKind::GENERIC &&
+      toProxy == NVVM::ProxyKind::TENSORMAP) {
+    switch (scope) {
+    case NVVM::MemScopeKind::CTA: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
+    }
+    case NVVM::MemScopeKind::CLUSTER: {
+      if (isRelease)
+        return llvm::Intrinsic::
+            nvvm_fence_proxy_tensormap_generic_release_cluster;
+      return llvm::Intrinsic::
+          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
+    }
+    case NVVM::MemScopeKind::GPU: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    }
+    case NVVM::MemScopeKind::SYS: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  llvm_unreachable("Unsupported proxy kinds");
+}
+
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 0000000000000..0e563808da970
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c9..6e2787d121ae6 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,39 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+  llvm.return
+}