Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"

// LLVM pointer type constraints pinned to one address space each. The numeric
// template argument is the address space; going by the def names, 0 is the
// generic space, 1 the global space and 3 the shared space.
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;

Expand Down Expand Up @@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
// Enum cases for the proxy kinds understood by the NVVM proxy fence ops. Each
// case gives the C++ enumerator name, its integer value and the keyword used
// in the textual form.
def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">;
// `tensormap` and `generic` serve as the default `toProxy` / `fromProxy`
// values of the uni-directional `fence.proxy.acquire` and
// `fence.proxy.release` ops; the bi-directional `fence.proxy` verifier
// rejects both of them.
def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">;
def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">;
// I32-backed enum collecting the proxy kind cases; the C++ enum is emitted in
// the `::mlir::NVVM` namespace.
def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
[ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
[ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
Expand Down Expand Up @@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
let hasVerifier = 1;
}

// Enum cases naming the scope of a memory operation. Each case gives the C++
// enumerator name, its integer value and the keyword used in the textual
// form (for example `#nvvm.mem_scope<cta>`).
def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;

// I32-backed enum collecting the four scope cases; the C++ enum is emitted in
// the `::mlir::NVVM` namespace.
def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
[MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
// Dialect attribute wrapping MemScopeKind under the mnemonic `mem_scope`; the
// value is printed and parsed between angle brackets.
def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
let assemblyFormat = "`<` $value `>`";
}

// Uni-directional proxy fence with acquire semantics.
//
// Arguments: a memory scope attribute, a pointer in address space 0 (`addr`),
// an i32 `size`, and the optional `fromProxy` / `toProxy` attributes, which
// default to GENERIC and TENSORMAP respectively.
//
// Translation to LLVM IR emits a call to the intrinsic picked by
// getUnidirectionalFenceProxyID(fromProxy, toProxy, scope, /*isRelease=*/false)
// with `addr` and `size` as the call arguments.
//
// NOTE(review): the description below requires `size` to be the immediate 128,
// but `size` is declared as a plain I32 operand and the verifier for this op
// only checks the proxy kinds -- confirm whether the size constraint is
// enforced anywhere else.
def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
DefaultValuedAttr<ProxyKindAttr,
"ProxyKind::GENERIC">:$fromProxy,
DefaultValuedAttr<ProxyKindAttr,
"ProxyKind::TENSORMAP">:$toProxy)> {
let summary = "Uni-directional proxy fence operation with acquire semantics";
let description = [{
`fence.proxy.acquire` is a uni-directional fence used to establish ordering
between a prior memory access performed via the generic proxy and a
subsequent memory access performed via the tensormap proxy

The address operand `addr` and the operand `size` together specify the
memory range `[addr, addr+size)` on which the ordering guarantees on the
memory accesses across the proxies is to be provided. The only supported
value for the `size` operand is 128 and must be an immediate. Generic Addressing
is used unconditionally, and the address specified by the operand `addr` must
fall within the `.global` state space. Otherwise, the behavior is undefined
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];

// Textual form: scope, then `addr, size`, then the optional
// `from_proxy = ...` and `to_proxy = ...` clauses.
let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
let llvmBuilder = [{
createIntrinsicCall(
builder,
getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
{$addr, $size});
}];

// The verifier restricts the proxy pair to generic -> tensormap.
let hasVerifier = 1;
}

// Uni-directional proxy fence with release semantics.
//
// Arguments: a memory scope attribute plus the optional `fromProxy` /
// `toProxy` attributes, which default to GENERIC and TENSORMAP respectively.
// Unlike the acquire variant, this op takes no SSA operands.
//
// Translation to LLVM IR emits an argument-less call to the intrinsic picked
// by getUnidirectionalFenceProxyID(fromProxy, toProxy, scope,
// /*isRelease=*/true).
def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
Arguments<(ins MemScopeKindAttr:$scope,
DefaultValuedAttr<ProxyKindAttr,
"ProxyKind::GENERIC">:$fromProxy,
DefaultValuedAttr<ProxyKindAttr,
"ProxyKind::TENSORMAP">:$toProxy)> {
let summary = "Uni-directional proxy fence operation with release semantics";
let description = [{
`fence.proxy.release` is a uni-directional fence used to establish ordering
between a prior memory access performed via the generic proxy and a
subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
operation can form a release sequence that synchronizes with an acquire
sequence that contains the fence.proxy.acquire proxy fence operation
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];

// Textual form: scope, then the optional `from_proxy = ...` and
// `to_proxy = ...` clauses.
let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
let llvmBuilder = [{
createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
$fromProxy, $toProxy, $scope, true));
}];

// The verifier restricts the proxy pair to generic -> tensormap.
let hasVerifier = 1;
}

def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
Expand Down
28 changes: 28 additions & 0 deletions mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
}
}
LogicalResult NVVM::FenceProxyOp::verify() {
if (getKind() == NVVM::ProxyKind::TENSORMAP)
return emitOpError() << "tensormap proxy is not a supported proxy kind";
if (getKind() == NVVM::ProxyKind::GENERIC)
return emitOpError() << "generic proxy not a supported proxy kind";
if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
return emitOpError() << "async_shared fence requires space attribute";
}
Expand All @@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
return success();
}

/// Checks that the acquire fence uses the single supported proxy pair:
/// `generic` as the source proxy and `tensormap` as the destination proxy.
/// Emits an op error naming the offending attribute otherwise.
LogicalResult NVVM::FenceProxyAcquireOp::verify() {
  // The source proxy is checked first, so it is the one reported when both
  // attributes are wrong.
  const NVVM::ProxyKind sourceProxy = getFromProxy();
  if (sourceProxy != NVVM::ProxyKind::GENERIC) {
    return emitOpError("uni-directional proxies only support generic for "
                       "from_proxy attribute");
  }

  const NVVM::ProxyKind targetProxy = getToProxy();
  if (targetProxy != NVVM::ProxyKind::TENSORMAP) {
    return emitOpError("uni-directional proxies only support tensormap "
                       "for to_proxy attribute");
  }

  return success();
}

/// Checks that the release fence orders generic-proxy accesses before
/// tensormap-proxy accesses, the only direction this op supports. Emits an op
/// error naming the offending attribute otherwise.
LogicalResult NVVM::FenceProxyReleaseOp::verify() {
  // `from_proxy` is validated before `to_proxy`, so a doubly-invalid op
  // reports the `from_proxy` problem.
  const bool fromIsGeneric = getFromProxy() == NVVM::ProxyKind::GENERIC;
  if (!fromIsGeneric) {
    return emitOpError("uni-directional proxies only support generic for "
                       "from_proxy attribute");
  }

  const bool toIsTensormap = getToProxy() == NVVM::ProxyKind::TENSORMAP;
  if (!toIsTensormap) {
    return emitOpError("uni-directional proxies only support tensormap "
                       "for to_proxy attribute");
  }

  return success();
}

LogicalResult NVVM::SetMaxRegisterOp::verify() {
if (getRegCount() % 8)
return emitOpError("new register size must be multiple of 8");
Expand Down
34 changes: 34 additions & 0 deletions mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,40 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
}
}

/// Returns the ID of the NVVM intrinsic implementing a uni-directional proxy
/// fence.
///
/// \param fromProxy  proxy of the prior memory access; only GENERIC is
///                   supported.
/// \param toProxy    proxy of the subsequent memory access; only TENSORMAP is
///                   supported.
/// \param scope      memory scope selecting the `cta`, `cluster`, `gpu` or
///                   `sys` flavour of the intrinsic.
/// \param isRelease  true selects the `release` intrinsic, false the `acquire`
///                   one.
///
/// The op verifiers reject every proxy pair other than generic -> tensormap,
/// so reaching this function with any other pair is a programming error and is
/// reported through llvm_unreachable rather than by falling off the end of the
/// function.
static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
                                              NVVM::ProxyKind toProxy,
                                              NVVM::MemScopeKind scope,
                                              bool isRelease) {
  if (fromProxy == NVVM::ProxyKind::GENERIC &&
      toProxy == NVVM::ProxyKind::TENSORMAP) {
    switch (scope) {
    case NVVM::MemScopeKind::CTA:
      if (isRelease)
        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
    case NVVM::MemScopeKind::CLUSTER:
      if (isRelease)
        return llvm::Intrinsic::
            nvvm_fence_proxy_tensormap_generic_release_cluster;
      return llvm::Intrinsic::
          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
    case NVVM::MemScopeKind::GPU:
      if (isRelease)
        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
    case NVVM::MemScopeKind::SYS:
      if (isRelease)
        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
    }
    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
  }
  // Without this, an unsupported proxy pair would leave a non-void function
  // without a return value, which is undefined behaviour.
  llvm_unreachable(
      "Unsupported proxy kinds for uni-directional fence.proxy operation");
}

namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
Expand Down
33 changes: 33 additions & 0 deletions mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s

// Negative tests for the uni-directional proxy fences: each case below passes
// an unsupported from_proxy/to_proxy combination and expects the matching
// verifier diagnostic.

// -----

// Acquire fence with a non-generic from_proxy is rejected.
llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
// expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
llvm.return
}

// -----

// Release fence with a non-generic from_proxy is rejected.
llvm.func @nvvm_fence_proxy_release() {
// expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
llvm.return
}

// -----

// Acquire fence with a valid from_proxy but a non-tensormap to_proxy is
// rejected.
llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
// expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
llvm.return
}

// -----

// Release fence with a valid from_proxy but a non-tensormap to_proxy is
// rejected.
llvm.func @nvvm_fence_proxy_release() {
// expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
llvm.return
}
37 changes: 37 additions & 0 deletions mlir/test/Target/LLVMIR/nvvmir.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}


// -----
// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
  // Release fences take no operands; the scope attribute alone selects the
  // intrinsic, and the proxy attributes are left at their defaults.
  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
  nvvm.fence.proxy.release #nvvm.mem_scope<cta>

  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>

  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>

  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
  llvm.return
}

// -----
// Acquire fences are translated once per memory scope. Every call passes the
// function's pointer argument and the constant 128 as the size, with the
// proxy attributes left at their defaults.
// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
%c128 = llvm.mlir.constant(128) : i32
// CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128

// CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128

// CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128

// CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
llvm.return
}