129 changes: 0 additions & 129 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
: (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
[&]() -> auto { \
if (is_mc && is_ch) \
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
if (is_ch) \
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
if (is_mc) \
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
}()

static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
bool IsShared32,
bool IsCacheHint,
@@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
}
}

static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
bool IsMultiCast,
bool IsCacheHint, bool IsIm2Col) {
if (IsIm2Col) {
switch (Dim) {
case 3:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
IsCacheHint, IsShared32);
case 4:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
IsCacheHint, IsShared32);
case 5:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
IsCacheHint, IsShared32);
default:
llvm_unreachable("Invalid Dimension in im2col mode for "
"GetCpAsyncBulkTensorG2SOpcode.");
}
} else {
switch (Dim) {
case 1:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
IsCacheHint, IsShared32);
case 2:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
IsCacheHint, IsShared32);
case 3:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
IsCacheHint, IsShared32);
case 4:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
IsCacheHint, IsShared32);
case 5:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
IsCacheHint, IsShared32);
default:
llvm_unreachable(
"Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
}
}
}

static size_t GetDimsFromIntrinsic(unsigned IID) {
switch (IID) {
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
return 3;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
return 4;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
return 5;
default:
llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
}
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
bool IsIm2Col) {
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
// {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2},
// multicast, cache_hint,
// multicast_flag, cache_hint_flag, cta_group_flag}
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {8 + dims + im2col_offsets}
size_t NumOps = N->getNumOperands();
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
: (NumOps - 10);
// The offset count is always 'NumDims - 2', and offsets are used only in im2col mode
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID

unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
report_fatal_error(
formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
Subtarget->getSmVersion()));

SDLoc DL(N);
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));

// Push MultiCast operand, if available
if (IsMultiCast)
Ops.push_back(N->getOperand(MultiCastIdx));

// Push CacheHint operand, if available
if (IsCacheHint)
Ops.push_back(N->getOperand(MultiCastIdx + 1));

// Flag for CTA Group
Ops.push_back(getI32Imm(CTAGroupVal, DL));

// Finally, the chain operand
Ops.push_back(N->getOperand(0));

bool IsShared32 =
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
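For reference, the operand accounting described in the removed comment can be checked with a small standalone sketch (illustrative only; `expectedNumOps` is not part of this patch or of the LLVM API):

```cpp
#include <cassert>
#include <cstddef>

// Standalone model of the removed operand math: a G2S intrinsic node carries
// {Chain, IID} + {dst, mbar, src} + dims + im2col_offsets + {mc, ch}
// + {mc_flag, ch_flag, cta_group_flag}, i.e. 2 + 8 + dims + offsets operands.
static size_t expectedNumOps(size_t NumDims, bool IsIm2Col) {
  size_t NumOffsets = IsIm2Col ? NumDims - 2 : 0;
  return 2 + 8 + NumDims + NumOffsets;
}

int main() {
  assert(expectedNumOps(3, /*IsIm2Col=*/false) == 13); // tile.3d
  assert(expectedNumOps(3, /*IsIm2Col=*/true) == 14);  // im2col.3d: 1 offset
  assert(expectedNumOps(5, /*IsIm2Col=*/true) == 18);  // im2col.5d: 3 offsets
  // In tile mode the formula inverts to NumDims = NumOps - 10, matching the
  // "(NumOps - 10)" computation in the removed routine above.
  return 0;
}
```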

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
unsigned RedOp,
bool IsIm2Col) {
@@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
switch (IID) {
default:
return false;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
SelectCpAsyncBulkTensorG2SCommon(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
1 change: 0 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
bool IsIm2Col = false);
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
1 change: 0 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">;
def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;

class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
105 changes: 31 additions & 74 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> {
string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", ");
}

// From Global to Shared memory (G2S)
class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
string prefix = "cp.async.bulk.tensor";
string dir = "shared::cluster.global";
string completion = "mbarrier::complete_tx::bytes";
string inst_name = prefix
# "." # dim # "d"
# "." # dir
# "." # mode
# "." # completion
# !if(mc, ".multicast::cluster", "")
# !if(ch, ".L2::cache_hint", "");
string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_"
# dim # "D"
# !if(is_shared32, "_SHARED32", "")
# !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
}

def CTAGroupFlags : Operand<i32> {
let PrintMethod = "printCTAGroup";
}

multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
defvar rc = !if(is_shared32, B32, B64);

defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
defvar im2col_dag = !if(!eq(mode, "im2col"),
!dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
(ins));
defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
defvar im2col_asm_str = ", {{" # im2col_str # "}}";

defvar asm_str = !if(!eq(mode, "im2col"),
!strconcat(asm_str_default, im2col_asm_str), asm_str_default);
def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>;
def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>;

def "" : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
!strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _MC : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B16:$mc, CTAGroupFlags:$cg)),
!strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _CH : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B64:$ch, CTAGroupFlags:$cg)),
!strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _MC_CH : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)),
!strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
}

foreach dim = [1, 2, 3, 4, 5] in {
foreach shared32 = [true, false] in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>;
}
}
}

multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred,
TImmLeaf cta_group_type = tma_cta_group_imm_any> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
Expand Down Expand Up @@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
!setdagop(dims_dag, intr),
!setdagop(im2col_dag, intr),
(intr B16:$mc, B64:$ch));
defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg));
defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg));
defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg));
defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg));
defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg));
defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, cta_group_type:$cg));
defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg));

def "" : NVPTXInst<(outs), ins_dag,
inst_name # asm_str # ";",
@@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
[intr_dag_with_mc_ch]>,
Requires<pred>;
}
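A note on the four variants defined above (a sketch, not patch code): the patterns are distinguished by the two i1 flag operands of the intrinsic, which TableGen spells as -1 for true and 0 for false in the intr_dag_* defvars:

```cpp
// Hypothetical classifier mirroring the intr_dag_* variants above; the real
// selection is done by the TableGen pattern matcher, not by code like this.
enum G2SForm { Plain, MC, CH, MC_CH };

static G2SForm classify(bool McFlag, bool ChFlag) {
  if (McFlag && ChFlag) return MC_CH; // intr_dag_with_mc_ch: "..., $mc, $ch;"
  if (McFlag)           return MC;    // intr_dag_with_mc:    "..., $mc;"
  if (ChFlag)           return CH;    // intr_dag_with_ch:    "..., $ch;"
  return Plain;                       // intr_dag_no_hints
}

int main() { return classify(true, false) == MC ? 0 : 1; }
```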

foreach dim = 1...5 in {
defm TMA_G2S_TILE_CG0_ # dim # "D"
: TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>],
tma_cta_group_imm0>;
defm TMA_G2S_TILE_ # dim # "D"
: TMA_TENSOR_G2S_INTR<dim, "tile",
[callSubtarget<"hasTMABlackwellSupport">]>;
}
foreach dim = 3...5 in {
defm TMA_G2S_IM2COL_CG0_ # dim # "D"
: TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>],
tma_cta_group_imm0>;
defm TMA_G2S_IM2COL_ # dim # "D"
: TMA_TENSOR_G2S_INTR<dim, "im2col",
[callSubtarget<"hasTMABlackwellSupport">]>;
foreach mode = ["im2col_w", "im2col_w_128"] in {
defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
: TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
: TMA_TENSOR_G2S_INTR<dim, mode,
[callSubtarget<"hasTMABlackwellSupport">]>;
}
}
defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
[hasTMACTAGroupSupport]>;
[callSubtarget<"hasTMABlackwellSupport">]>;
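The paired instantiations above split each shape into two pattern sets; a minimal sketch of the resulting legality (`selectVariant` is hypothetical, and overlap resolution between the two sets is left to TableGen pattern priority):

```cpp
#include <cstdint>

// Per shape: a *_CG0_* defm (tma_cta_group_imm0, Imm == 0) gated on
// PTX 8.0 / sm_90, plus an unrestricted defm (tma_cta_group_imm_any)
// gated on the Blackwell-level TMA predicate.
enum class G2SVariant { CG0_Sm90, Any_Blackwell, None };

static G2SVariant selectVariant(int64_t CtaGroupImm, bool HasSm90Ptx80,
                                bool HasTMABlackwell) {
  if (CtaGroupImm == 0 && HasSm90Ptx80)
    return G2SVariant::CG0_Sm90;      // no cta_group: legal since sm_90
  if (CtaGroupImm >= 0 && HasTMABlackwell)
    return G2SVariant::Any_Blackwell; // cta_group::1/2 needs Blackwell TMA
  return G2SVariant::None;            // no matching pattern
}

int main() {
  return selectVariant(0, true, false) == G2SVariant::CG0_Sm90 ? 0 : 1;
}
```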

multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
Expand Down Expand Up @@ -784,7 +740,8 @@ foreach dim = 3...5 in {
: TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;

defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
: TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
: TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128",
[callSubtarget<"hasTMABlackwellSupport">]>;
}
defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
[hasPTX<86>, hasSM<100>]>;
Expand Down Expand Up @@ -835,7 +792,7 @@ foreach dim = 1...5 in {
}
}
defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
[hasTMACTAGroupSupport]>;
[callSubtarget<"hasTMABlackwellSupport">]>;

def TMAReductionFlags : Operand<i32> {
let PrintMethod = "printTmaReductionMode";
Expand Down Expand Up @@ -930,11 +887,11 @@ foreach dim = 3...5 in {
foreach mode = ["im2col_w", "im2col_w_128"] in {
defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
[hasTMACTAGroupSupport]>;
[callSubtarget<"hasTMABlackwellSupport">]>;
}
}
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
[hasTMACTAGroupSupport]>;
[callSubtarget<"hasTMABlackwellSupport">]>;

// Prefetchu and Prefetch

21 changes: 9 additions & 12 deletions llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// f32x2 instructions in Blackwell family
bool hasF32x2Instructions() const;

// TMA G2S copy with cta_group::1/2 support
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
// TODO: Update/tidy-up after the family-conditional support arrives
switch (FullSmVersion) {
case 1003:
case 1013:
return PTXVersion >= 86;
case 1033:
return PTXVersion >= 88;
default:
return false;
}
// Checks support for the following TMA features:
// - cta_group::1/2 support
// - im2col_w/w_128 mode support
// - tile_gather4 mode support
// - tile_scatter4 mode support
bool hasTMABlackwellSupport() const {
return hasPTXWithFamilySMs(90, {100, 110}) ||
hasPTXWithFamilySMs(88, {100, 101}) ||
hasPTXWithAccelSMs(86, {100, 101});
}
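For intuition, a self-contained model of the new predicate (the configs mirror the RUN lines in the updated tests below; the string-keyed `Cfg` is an illustration, not NVPTXSubtarget's FullSmVersion encoding, and the real hasPTXWithFamilySMs/hasPTXWithAccelSMs helpers may accept additional family members):

```cpp
#include <cstdio>
#include <cstring>

struct Cfg { const char *Sm; unsigned Ptx; };

// Mirrors the three clauses of hasTMABlackwellSupport() above.
static bool hasTMABlackwellSupport(const Cfg &C) {
  auto is = [&](const char *S) { return std::strcmp(C.Sm, S) == 0; };
  if (C.Ptx >= 90 && (is("sm_100f") || is("sm_110f"))) return true; // family
  if (C.Ptx >= 88 && (is("sm_100f") || is("sm_101f"))) return true; // family
  if (C.Ptx >= 86 && (is("sm_100a") || is("sm_101a"))) return true; // accel
  return false;
}

int main() {
  printf("%d\n", hasTMABlackwellSupport({"sm_100a", 86})); // 1
  printf("%d\n", hasTMABlackwellSupport({"sm_100f", 88})); // 1
  printf("%d\n", hasTMABlackwellSupport({"sm_110f", 90})); // 1
  printf("%d\n", hasTMABlackwellSupport({"sm_90", 88}));   // 0
}
```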

// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}

target triple = "nvptx64-nvidia-cuda"

4 changes: 4 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}

target triple = "nvptx64-nvidia-cuda"

4 changes: 4 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}

target triple = "nvptx64-nvidia-cuda"
