Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVPTX] Convert vector function nvvm.annotations to attributes #127736

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
}
if (F)
F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
}

// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
Expand Down
59 changes: 23 additions & 36 deletions clang/test/CodeGenCUDA/launch-bounds.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,30 @@
#endif

// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]

// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}

// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]

// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]

// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}

// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]

// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}

// Test both max threads per block and Min cta per sm.
extern "C" {
Expand All @@ -37,8 +44,6 @@ Kernel1()
}
}

// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}

#ifdef USE_MAX_BLOCKS
// Test max threads per block and min/max cta per sm.
extern "C" {
Expand All @@ -48,8 +53,6 @@ Kernel1_sm_90()
{
}
}

// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
#endif // USE_MAX_BLOCKS

// Test only max threads per block. Min cta per sm defaults to 0, and
Expand All @@ -62,8 +65,6 @@ Kernel2()
}
}

// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}

template <int max_threads_per_block>
__global__ void
__launch_bounds__(max_threads_per_block)
Expand All @@ -72,7 +73,6 @@ Kernel3()
}

template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}

template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
Expand All @@ -82,7 +82,6 @@ Kernel4()
}
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();

// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}

#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
Expand All @@ -93,7 +92,6 @@ Kernel4_sm_90()
}
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();

// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
#endif //USE_MAX_BLOCKS

const int constint = 100;
Expand All @@ -106,8 +104,6 @@ Kernel5()
}
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();

// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}

#ifdef USE_MAX_BLOCKS

template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
Expand All @@ -120,7 +116,6 @@ Kernel5_sm_90()
}
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();

// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
#endif //USE_MAX_BLOCKS

// Make sure we don't emit negative launch bounds values.
Expand All @@ -129,33 +124,25 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
Kernel6()
{
}
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",

__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
Kernel7()
{
}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",

#ifdef USE_MAX_BLOCKS
__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
Kernel7_sm_90()
{
}
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
#endif // USE_MAX_BLOCKS

const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100

#ifdef USE_MAX_BLOCKS
const char constchar_2 = 14;
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
#endif // USE_MAX_BLOCKS
25 changes: 16 additions & 9 deletions clang/test/OpenMP/ompx_attributes_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@

// Check that the target attributes are set on the generated kernel
void func() {
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4

// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]

#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
Expand All @@ -34,9 +38,12 @@ void func() {
// AMD-SAME: "omp_target_thread_limit"="17"

// It is unclear if we should use the AMD annotations for other targets; we do for now.
// NVIDIA: "omp_target_thread_limit"="20"
// NVIDIA: "omp_target_thread_limit"="45"
// NVIDIA: "omp_target_thread_limit"="17"
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
// NVIDIA: attributes #[[ATTR0]]
// NVIDIA-SAME: "nvvm.maxntid"="20"
// NVIDIA-SAME: "omp_target_thread_limit"="20"
// NVIDIA: attributes #[[ATTR1]]
// NVIDIA-SAME: "nvvm.maxntid"="45"
// NVIDIA-SAME: "omp_target_thread_limit"="45"
// NVIDIA: attributes #[[ATTR2]]
// NVIDIA-SAME: "nvvm.maxntid"="17"
// NVIDIA-SAME: "omp_target_thread_limit"="17"
18 changes: 10 additions & 8 deletions clang/test/OpenMP/thread_limit_nvptx.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,29 @@
#define HEADER

void foo(int N) {
// CHECK: l11, !"maxntidx", i32 128}
// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N; ++i)
;
// CHECK: l15, !"maxntidx", i32 4}
// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
#pragma omp target teams distribute parallel for simd thread_limit(4)
for (int i = 0; i < N; ++i)
;
// CHECK-NOT: l21, !"maxntidx", i32 128}
// CHECK: l21, !"maxntidx", i32 42}
// CHECK-NOT: l21, !"maxntidx", i32 128}

// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
for (int i = 0; i < N; ++i)
;
// CHECK-NOT: l27, !"maxntidx", i32 42}
// CHECK: l27, !"maxntidx", i32 22}
// CHECK-NOT: l27, !"maxntidx", i32 42}

// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
for (int i = 0; i < N; ++i)
;
}

#endif

// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
17 changes: 17 additions & 0 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ Function Attributes
This attribute indicates the maximum number of registers to be used for the
kernel function.

``"nvvm.maxntid"="<x>[,<y>[,<z>]]"``
This attribute declares the maximum number of threads in the thread block
(CTA). The maximum number of threads is the product of the maximum extent in
each dimension. Exceeding the maximum number of threads results in a runtime
error or kernel launch failure.

``"nvvm.reqntid"="<x>[,<y>[,<z>]]"``
This attribute declares the exact number of threads in the thread block
(CTA). The number of threads is the product of the value in each dimension.
Specifying a different CTA dimension at launch will result in a runtime
error or kernel launch failure.

``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"``
This attribute declares the number of thread blocks (CTAs) in the cluster.
The total number of CTAs is the product of the number of CTAs in each
dimension. Specifying a different cluster dimension at launch will result in
a runtime error or kernel launch failure. Only supported for Hopper+.

.. _address_spaces:

Expand Down
51 changes: 9 additions & 42 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6406,45 +6406,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
KernelEnvironmentGV->setInitializer(NewInitializer);
}

// Look up the "nvvm.annotations" entry of the form
// !{ptr @Kernel, !"<Name>", i32 <value>} for the given kernel, if one exists.
// Returns the matching metadata node, or nullptr when the kernel has no
// annotation with that name.
static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
  NamedMDNode *Annotations =
      Kernel.getParent()->getOrInsertNamedMetadata("nvvm.annotations");
  for (MDNode *Node : Annotations->operands()) {
    // Each annotation is a triple: (function, property name, value).
    if (Node->getNumOperands() != 3)
      continue;
    auto *Func = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
    auto *Key = dyn_cast<MDString>(Node->getOperand(1));
    if (Func && Func->getValue() == &Kernel && Key && Key->getString() == Name)
      return Node;
  }
  return nullptr;
}

static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
bool Min) {
// Update the "maxntidx" metadata for NVIDIA, or add it.
MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
if (ExistingOp) {
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
ExistingOp->replaceOperandWith(
2, ConstantAsMetadata::get(ConstantInt::get(
OldVal->getValue()->getType(),
Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
} else {
LLVMContext &Ctx = Kernel.getContext();
Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
MDString::get(Ctx, Name),
ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
// Append metadata to nvvm.annotations
Module &M = *Kernel.getParent();
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set the string function attribute `Name` on `Kernel` to `Value`. If the
// attribute is already present, merge the two values: keep the smaller one
// when `Min` is true, otherwise the larger one.
// NOTE(review): `Value` is formatted with llvm::utostr, so a negative value
// would print as a large unsigned number — callers are presumed to pass
// non-negative bounds; confirm at call sites.
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
                            bool Min) {
  if (Kernel.hasFnAttribute(Name)) {
    const int32_t Existing = Kernel.getFnAttributeAsParsedInteger(Name);
    if (Min)
      Value = std::min(Existing, Value);
    else
      Value = std::max(Existing, Value);
  }
  Kernel.addFnAttr(Name, llvm::utostr(Value));
}

std::pair<int32_t, int32_t>
Expand All @@ -6466,9 +6434,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
return {LB, UB};
}

if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
if (Kernel.hasFnAttribute("nvvm.maxntid")) {
int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
}
return {0, ThreadLimit};
Expand All @@ -6485,7 +6452,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
return;
}

updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
}

std::pair<int32_t, int32_t>
Expand Down
Loading