clang/AMDGPU: Use atomicrmw for ds fmin/fmax builtins #96738
Merged
arsenm merged 2 commits into main from users/arsenm/amdgpu-clang-use-atomicrmw-ds-fmin-fmax-builtins on Jun 27, 2024
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/96738.diff

4 Files Affected:
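In short, these builtins now lower through clang's generic atomic codegen path and emit a plain atomicrmw instruction instead of a call to the llvm.amdgcn.ds.fmin/llvm.amdgcn.ds.fmax intrinsics. A minimal CUDA sketch in the style of the updated tests below (the kernel and variable names are illustrative, not taken from this PR; the IR in the comments mirrors the before/after CHECK lines):

// Hypothetical kernel showing the new lowering. The builtin's trailing
// arguments are (ordering, scope, isVolatile); 0, 0, false requests a
// relaxed, system-scope, non-volatile operation.
__global__ void demo_ds_fmin(float src) {
  __shared__ float shared;
  // Before this PR (target intrinsic):
  //   %r = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) @shared, float %src, i32 0, i32 0, i1 false)
  // After this PR (generic atomic instruction):
  //   %r = atomicrmw fmin ptr addrspace(3) @shared, float %src monotonic, align 4
  volatile float x = __builtin_amdgcn_ds_fminf(&shared, src, 0, 0, false);
}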
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4edd8283aa03c..5ade65ac112a6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18622,28 +18622,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
- case AMDGPU::BI__builtin_amdgcn_ds_fminf:
- case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
- Intrinsic::ID Intrin;
- switch (BuiltinID) {
- case AMDGPU::BI__builtin_amdgcn_ds_fminf:
- Intrin = Intrinsic::amdgcn_ds_fmin;
- break;
- case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
- Intrin = Intrinsic::amdgcn_ds_fmax;
- break;
- }
- llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
- llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
- llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
- llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
- llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
- llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
- llvm::FunctionType *FTy = F->getFunctionType();
- llvm::Type *PTy = FTy->getParamType(0);
- Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
- return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
- }
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
@@ -19077,11 +19055,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
- case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+ case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+ case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19099,6 +19079,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
+ case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+ BinOp = llvm::AtomicRMWInst::FMin;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+ BinOp = llvm::AtomicRMWInst::FMax;
+ break;
}
Address Ptr = CheckAtomicAlignment(*this, E);
@@ -19108,8 +19094,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
bool Volatile;
- if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
- // __builtin_amdgcn_ds_faddf has an explicit volatile argument
+ if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
+ BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
+ BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
+ // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
Volatile =
cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
} else {
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 132cbd27b08fc..2e88afac813f4 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -98,7 +98,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
@@ -142,7 +142,7 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
@@ -245,10 +245,10 @@ __device__ void func(float *x);
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 7bb756a4a2731..32851805298f1 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -95,7 +95,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
@@ -139,7 +139,7 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
@@ -236,10 +236,10 @@ __device__ void func(float *x);
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 46af87f5e1d68..4e3b1e14ead62 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -158,23 +158,85 @@ void test_ds_faddf(local float *out, float src) {
}
// CHECK-LABEL: @test_ds_fmin
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
#if !defined(__SPIRV__)
void test_ds_fminf(local float *out, float src) {
#else
void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
#endif
*out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true);
+
+ // Test all orders.
+ *out = __builtin_amdgcn_ds_fminf(out, src, 1, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 2, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 3, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 4, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 5, 0, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 5, 0, false); // invalid
+
+ // Test all syncscopes.
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 1, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 2, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 3, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 4, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, 0, 5, false); // invalid
}
// CHECK-LABEL: @test_ds_fmax
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
#if !defined(__SPIRV__)
void test_ds_fmaxf(local float *out, float src) {
#else
void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
#endif
*out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true);
+
+ // Test all orders.
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 1, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 2, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 3, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 4, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 5, 0, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 5, 0, false); // invalid
+
+ // Test all syncscopes.
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 1, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 2, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 3, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 4, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 5, false); // invalid
}
// CHECK-LABEL: @test_s_memtime
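The new OpenCL test coverage above also spells out how the builtins' integer ordering and scope arguments map onto the emitted atomicrmw. Summarized as a hedged CUDA sketch (the kernel name is illustrative; the mappings in the comments are copied from the CHECK lines above):

// Illustrative only: the (order, scope, isVolatile) arguments follow the
// numbering exercised by the tests above.
__global__ void demo_order_scope(float src) {
  __shared__ float shared;
  float x;
  // Volatile argument -> volatile atomicrmw:
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, true);  // atomicrmw volatile fmax ... monotonic
  // Ordering argument -> atomicrmw ordering:
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 1, 0, false); // acquire (consume lowers to acquire)
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 2, 0, false); // acquire
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 3, 0, false); // release
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 4, 0, false); // acq_rel
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 5, 0, false); // seq_cst
  // Scope argument -> atomicrmw syncscope:
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 1, false); // syncscope("agent")
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 2, false); // syncscope("workgroup")
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 3, false); // syncscope("wavefront")
  x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 4, false); // syncscope("singlethread")
  (void)x;
}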
llvmbot added the clang (Clang issues not falling into any other category) and clang:codegen labels on Jun 26, 2024
yxsamliu reviewed on Jun 26, 2024
arsenm force-pushed the users/arsenm/amdgpu-clang-use-atomicrmw-ds-fmin-fmax-builtins branch from 5c2be85 to 7c9636a on June 27, 2024 at 08:55
shiltian approved these changes on Jun 27, 2024
arsenm force-pushed the users/arsenm/amdgpu-clang-use-atomicrmw-ds-fmin-fmax-builtins branch from 7c9636a to ff880d5 on June 27, 2024 at 13:29
arsenm deleted the users/arsenm/amdgpu-clang-use-atomicrmw-ds-fmin-fmax-builtins branch on June 27, 2024 at 13:32
lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request on Jul 3, 2024
AlexisPerry pushed a commit to llvm-project-tlp/llvm-project that referenced this pull request on Jul 9, 2024