diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f86..98c2f70664ec7a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18632,28 +18632,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
-    Intrinsic::ID Intrin;
-    switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-      Intrin = Intrinsic::amdgcn_ds_fmin;
-      break;
-    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
-      Intrin = Intrinsic::amdgcn_ds_fmax;
-      break;
-    }
-    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
-    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
-    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
-    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
-    llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
-    llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
-    llvm::FunctionType *FTy = F->getFunctionType();
-    llvm::Type *PTy = FTy->getParamType(0);
-    Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
-    return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
-  }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
@@ -19087,11 +19065,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
-  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19109,6 +19089,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+      BinOp = llvm::AtomicRMWInst::FMin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+      BinOp = llvm::AtomicRMWInst::FMax;
+      break;
     }
 
     Address Ptr = CheckAtomicAlignment(*this, E);
@@ -19118,8 +19104,10 @@
 
     bool Volatile;
 
-    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
-      // __builtin_amdgcn_ds_faddf has an explicit volatile argument
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
+      // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile argument
       Volatile =
           cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
     } else {
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 132cbd27b08fc3..2e88afac813f43 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -98,7 +98,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -142,7 +142,7 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -245,10 +245,10 @@ __device__ void func(float *x);
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
 // CHECK-NEXT: ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 7bb756a4a27314..32851805298f18 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -95,7 +95,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -139,7 +139,7 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -236,10 +236,10 @@ __device__ void func(float *x);
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
 // CHECK-NEXT: ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 46af87f5e1d683..5bd8f77a5930c4 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -158,23 +158,85 @@ void test_ds_faddf(local float *out, float src) {
 }
 
 // CHECK-LABEL: @test_ds_fmin
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
 #if !defined(__SPIRV__)
 void test_ds_fminf(local float *out, float src) {
 #else
 void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
 #endif
   *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true);
+
+  // Test all orders.
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
+
+  // Test all syncscopes.
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
 }
 
 // CHECK-LABEL: @test_ds_fmax
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
 #if !defined(__SPIRV__)
 void test_ds_fmaxf(local float *out, float src) {
 #else
 void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
 #endif
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true);
+
+  // Test all orders.
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
+
+  // Test all syncscopes.
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
 }
 
 // CHECK-LABEL: @test_s_memtime
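
The net effect of the patch: `__builtin_amdgcn_ds_fminf(out, src, order, scope, volatile)` no longer lowers to a call to `@llvm.amdgcn.ds.fmin.f32`; it now produces a generic `atomicrmw fmin` on the `addrspace(3)` pointer, with the order/scope arguments mapped to an LLVM atomic ordering and syncscope. Below is a minimal, self-contained C++ sketch of that IR shape, built directly with `IRBuilder` rather than through clang's `EmitAMDGPUBuiltinExpr`; the function name `ds_fmin_example` and the fixed monotonic/system ordering are illustrative choices for the `(..., 0, 0, false)` case above, not part of the patch.

```cpp
// Standalone sketch: emit the instruction the builtin now lowers to,
//   %r = atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4
// Build against LLVM and link its Core/Support libraries.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("ds_fmin_example", Ctx);

  llvm::Type *FloatTy = llvm::Type::getFloatTy(Ctx);
  // Address space 3 is LDS (local/shared memory) on AMDGPU.
  llvm::PointerType *LDSPtrTy = llvm::PointerType::get(Ctx, 3);

  llvm::FunctionType *FTy =
      llvm::FunctionType::get(FloatTy, {LDSPtrTy, FloatTy}, false);
  llvm::Function *F = llvm::Function::Create(
      FTy, llvm::Function::ExternalLinkage, "ds_fmin_example", M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", F));

  // BinOp FMin mirrors the new case added to EmitAMDGPUBuiltinExpr;
  // __ATOMIC_RELAXED maps to Monotonic, and __MEMORY_SCOPE_SYSTEM to the
  // default (system) sync scope, so neither appears in the printed IR.
  llvm::AtomicRMWInst *RMW = B.CreateAtomicRMW(
      llvm::AtomicRMWInst::FMin, F->getArg(0), F->getArg(1),
      llvm::MaybeAlign(4), llvm::AtomicOrdering::Monotonic);
  B.CreateRet(RMW); // atomicrmw yields the value previously held at *out

  M.print(llvm::outs(), nullptr);
  return 0;
}
```

A non-system scope would be passed as the trailing `SyncScope::ID` operand of `CreateAtomicRMW`, obtained from `Ctx.getOrInsertSyncScopeID("workgroup")` and printed as `syncscope("workgroup")`, matching the CHECK lines in the OpenCL test above.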