-
Notifications
You must be signed in to change notification settings - Fork 15k
[MLIR][NVVM] Add support for fence.proxy.{acquire, release} Ops #106689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MLIR][NVVM] Add support for fence.proxy.{acquire, release} Ops #106689
Conversation
This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic.
|
@llvm/pr-subscribers-mlir Author: Pradeep Kumar (schwarzschild-radius) Changes: This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic. Full diff: https://github.com/llvm/llvm-project/pull/106689.diff 5 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4d48b3de7a57ed..709dd922b8fa2f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
+def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
@@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">;
+def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">;
+def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">;
def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
- [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
+ [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
@@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
let hasVerifier = 1;
}
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+ [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
+      Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with acquire semantics";
+  let description = [{
+    `fence.proxy.acquire` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy.
+
+    The address operand `addr` and the operand `size` together specify the
+    memory range `[addr, addr+size)` on which the ordering guarantees on the
+    memory accesses across the proxies are to be provided. The `size` operand
+    must be an immediate, and the only supported value is 128. Generic
+    addressing is used unconditionally, and the address specified by the
+    operand `addr` must fall within the `.global` state space. Otherwise, the
+    behavior is undefined.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(
+        builder,
+        getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+        {$addr, $size});
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+      Arguments<(ins MemScopeKindAttr:$scope,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with release semantics";
+  let description = [{
+    `fence.proxy.release` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
+    operation can form a release sequence that synchronizes with an acquire
+    sequence that contains the `fence.proxy.acquire` proxy fence operation.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+                                     $fromProxy, $toProxy, $scope, true));
+  }];
+
+  let hasVerifier = 1;
+}
+
def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4d1896551101ed..2c7c3e9d535f7d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
}
}
LogicalResult NVVM::FenceProxyOp::verify() {
+ if (getKind() == NVVM::ProxyKind::TENSORMAP)
+ return emitOpError() << "tensormap proxy is not a supported proxy kind";
+ if (getKind() == NVVM::ProxyKind::GENERIC)
+ return emitOpError() << "generic proxy not a supported proxy kind";
if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
return emitOpError() << "async_shared fence requires space attribute";
}
@@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
return success();
}
+/// Verifier for `nvvm.fence.proxy.acquire`: the only accepted proxy pair is
+/// from_proxy = generic and to_proxy = tensormap.
+LogicalResult NVVM::FenceProxyAcquireOp::verify() {
+  const bool fromIsGeneric = getFromProxy() == NVVM::ProxyKind::GENERIC;
+  if (!fromIsGeneric)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  const bool toIsTensormap = getToProxy() == NVVM::ProxyKind::TENSORMAP;
+  if (!toIsTensormap)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
+/// Verifier for `nvvm.fence.proxy.release`: the only accepted proxy pair is
+/// from_proxy = generic and to_proxy = tensormap.
+LogicalResult NVVM::FenceProxyReleaseOp::verify() {
+  const bool fromIsGeneric = getFromProxy() == NVVM::ProxyKind::GENERIC;
+  if (!fromIsGeneric)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  const bool toIsTensormap = getToProxy() == NVVM::ProxyKind::TENSORMAP;
+  if (!toIsTensormap)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
LogicalResult NVVM::SetMaxRegisterOp::verify() {
if (getRegCount() % 8)
return emitOpError("new register size must be multiple of 8");
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index a09c24dda82afc..f93e1cc8780c79 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -120,6 +120,40 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
}
}
+/// Returns the ID of the `llvm.nvvm.fence.proxy.tensormap_generic.*` intrinsic
+/// implementing a uni-directional proxy fence at the given memory `scope`.
+/// `isRelease` selects the release variant; otherwise the acquire variant is
+/// returned. Only the generic -> tensormap proxy pair is handled here; the
+/// verifiers of the acquire/release ops reject every other pair.
+static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
+                                              NVVM::ProxyKind toProxy,
+                                              NVVM::MemScopeKind scope,
+                                              bool isRelease) {
+  if (fromProxy == NVVM::ProxyKind::GENERIC &&
+      toProxy == NVVM::ProxyKind::TENSORMAP) {
+    switch (scope) {
+    case NVVM::MemScopeKind::CTA: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
+    }
+    case NVVM::MemScopeKind::CLUSTER: {
+      if (isRelease)
+        return llvm::Intrinsic::
+            nvvm_fence_proxy_tensormap_generic_release_cluster;
+      return llvm::Intrinsic::
+          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
+    }
+    case NVVM::MemScopeKind::GPU: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    }
+    case NVVM::MemScopeKind::SYS: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  // Any other proxy pair would otherwise fall off the end of this non-void
+  // function, which is undefined behavior; make the invariant explicit.
+  llvm_unreachable("Unsupported proxy kinds");
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 00000000000000..0e563808da970b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c90..6e2787d121ae64 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  // Release fences take only a memory scope; one call is checked per scope.
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+ %c128 = llvm.mlir.constant(128) : i32
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+ llvm.return
+}
\ No newline at end of file
|
|
@llvm/pr-subscribers-mlir-llvm Author: Pradeep Kumar (schwarzschild-radius) Changes: This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic. Full diff: https://github.com/llvm/llvm-project/pull/106689.diff 5 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4d48b3de7a57ed..709dd922b8fa2f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
+def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
@@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">;
+def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">;
+def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">;
def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
- [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
+ [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
@@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
let hasVerifier = 1;
}
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+ [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
+      Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with acquire semantics";
+  let description = [{
+    `fence.proxy.acquire` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy.
+
+    The address operand `addr` and the operand `size` together specify the
+    memory range `[addr, addr+size)` on which the ordering guarantees on the
+    memory accesses across the proxies are to be provided. The `size` operand
+    must be an immediate, and the only supported value is 128. Generic
+    addressing is used unconditionally, and the address specified by the
+    operand `addr` must fall within the `.global` state space. Otherwise, the
+    behavior is undefined.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(
+        builder,
+        getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+        {$addr, $size});
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+      Arguments<(ins MemScopeKindAttr:$scope,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr,
+                                       "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with release semantics";
+  let description = [{
+    `fence.proxy.release` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
+    operation can form a release sequence that synchronizes with an acquire
+    sequence that contains the `fence.proxy.acquire` proxy fence operation.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+                                     $fromProxy, $toProxy, $scope, true));
+  }];
+
+  let hasVerifier = 1;
+}
+
def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4d1896551101ed..2c7c3e9d535f7d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
}
}
LogicalResult NVVM::FenceProxyOp::verify() {
+ if (getKind() == NVVM::ProxyKind::TENSORMAP)
+ return emitOpError() << "tensormap proxy is not a supported proxy kind";
+ if (getKind() == NVVM::ProxyKind::GENERIC)
+ return emitOpError() << "generic proxy not a supported proxy kind";
if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
return emitOpError() << "async_shared fence requires space attribute";
}
@@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
return success();
}
+/// Verifier for `nvvm.fence.proxy.acquire`: the only accepted proxy pair is
+/// from_proxy = generic and to_proxy = tensormap.
+LogicalResult NVVM::FenceProxyAcquireOp::verify() {
+  const bool fromIsGeneric = getFromProxy() == NVVM::ProxyKind::GENERIC;
+  if (!fromIsGeneric)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  const bool toIsTensormap = getToProxy() == NVVM::ProxyKind::TENSORMAP;
+  if (!toIsTensormap)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
+/// Verifier for `nvvm.fence.proxy.release`: the only accepted proxy pair is
+/// from_proxy = generic and to_proxy = tensormap.
+LogicalResult NVVM::FenceProxyReleaseOp::verify() {
+  const bool fromIsGeneric = getFromProxy() == NVVM::ProxyKind::GENERIC;
+  if (!fromIsGeneric)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  const bool toIsTensormap = getToProxy() == NVVM::ProxyKind::TENSORMAP;
+  if (!toIsTensormap)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
LogicalResult NVVM::SetMaxRegisterOp::verify() {
if (getRegCount() % 8)
return emitOpError("new register size must be multiple of 8");
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index a09c24dda82afc..f93e1cc8780c79 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -120,6 +120,40 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
}
}
+/// Returns the ID of the `llvm.nvvm.fence.proxy.tensormap_generic.*` intrinsic
+/// implementing a uni-directional proxy fence at the given memory `scope`.
+/// `isRelease` selects the release variant; otherwise the acquire variant is
+/// returned. Only the generic -> tensormap proxy pair is handled here; the
+/// verifiers of the acquire/release ops reject every other pair.
+static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
+                                              NVVM::ProxyKind toProxy,
+                                              NVVM::MemScopeKind scope,
+                                              bool isRelease) {
+  if (fromProxy == NVVM::ProxyKind::GENERIC &&
+      toProxy == NVVM::ProxyKind::TENSORMAP) {
+    switch (scope) {
+    case NVVM::MemScopeKind::CTA: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
+    }
+    case NVVM::MemScopeKind::CLUSTER: {
+      if (isRelease)
+        return llvm::Intrinsic::
+            nvvm_fence_proxy_tensormap_generic_release_cluster;
+      return llvm::Intrinsic::
+          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
+    }
+    case NVVM::MemScopeKind::GPU: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    }
+    case NVVM::MemScopeKind::SYS: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  // Any other proxy pair would otherwise fall off the end of this non-void
+  // function, which is undefined behavior; make the invariant explicit.
+  llvm_unreachable("Unsupported proxy kinds");
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 00000000000000..0e563808da970b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c90..6e2787d121ae64 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  // Release fences take only a memory scope; one call is checked per scope.
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+ %c128 = llvm.mlir.constant(128) : i32
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+ llvm.return
+}
\ No newline at end of file
|
This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic.