Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clang/AMDGPU: Use atomicrmw for ds fmin/fmax builtins #96738

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 14 additions & 26 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18632,28 +18632,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
Intrinsic::ID Intrin;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
Intrin = Intrinsic::amdgcn_ds_fmin;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
Intrin = Intrinsic::amdgcn_ds_fmax;
break;
}
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
llvm::FunctionType *FTy = F->getFunctionType();
llvm::Type *PTy = FTy->getParamType(0);
Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
}
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
Expand Down Expand Up @@ -19087,11 +19065,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
Expand All @@ -19109,6 +19089,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
BinOp = llvm::AtomicRMWInst::FMin;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
BinOp = llvm::AtomicRMWInst::FMax;
break;
}

Address Ptr = CheckAtomicAlignment(*this, E);
Expand All @@ -19118,8 +19104,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,

bool Volatile;

if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
// __builtin_amdgcn_ds_faddf has an explicit volatile argument
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
// __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
Volatile =
cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
} else {
Expand Down
8 changes: 4 additions & 4 deletions clang/test/CodeGenCUDA/builtins-amdgcn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down Expand Up @@ -142,7 +142,7 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down Expand Up @@ -245,10 +245,10 @@ __device__ void func(float *x);
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR8:[0-9]+]]
// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
Expand Down
8 changes: 4 additions & 4 deletions clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down Expand Up @@ -139,7 +139,7 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down Expand Up @@ -236,10 +236,10 @@ __device__ void func(float *x);
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
Expand Down
66 changes: 64 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -158,23 +158,85 @@ void test_ds_faddf(local float *out, float src) {
}

// CHECK-LABEL: @test_ds_fmin
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

#if !defined(__SPIRV__)
void test_ds_fminf(local float *out, float src) {
#else
void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
#endif
*out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
*out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true);

// Test all orders.
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid

// Test all syncscopes.
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}

// CHECK-LABEL: @test_ds_fmax
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

#if !defined(__SPIRV__)
void test_ds_fmaxf(local float *out, float src) {
#else
void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
#endif
*out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true);

// Test all orders.
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid

// Test all syncscopes.
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}

// CHECK-LABEL: @test_s_memtime
Expand Down
Loading