
Commit a063f63

remove Intrinsic::vector_reduce_fadd and Intrinsic::vector_reduce_fmul; replace with a sequential reduction
1 parent f935ecd commit a063f63

2 files changed (+53, -25 lines)

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 25 additions & 21 deletions
@@ -4315,33 +4315,37 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_add: {
-    // Note: vector_reduce_fadd takes two arguments a
-    // scalar start value and a vector. That would mean to
-    // correctly call it we would need emitBuiltinWithOneOverloadedType<2>
-    // To keep the builtin sema behavior the same despite type we will
-    // popululate vector_reduce_fadd scalar value with a 0.
-    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
-      Value *X = EmitScalarExpr(E->getArg(0));
-      auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, -0.0);
-      return RValue::get(Builder.CreateIntrinsic(
-          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
-          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
+    QualType QT = E->getArg(0)->getType();
+    if (QT->hasFloatingRepresentation()) {
+      Value *Op0 = EmitScalarExpr(E->getArg(0));
+      assert(Op0->getType()->isVectorTy());
+      unsigned VecSize = QT->getAs<VectorType>()->getNumElements();
+      Value *Sum = Builder.CreateExtractElement(Op0, static_cast<uint64_t>(0));
+      for (unsigned I = 1; I < VecSize; I++) {
+        Value *Elt = Builder.CreateExtractElement(Op0, I);
+        Sum = Builder.CreateFAdd(Sum, Elt);
+      }
+      return RValue::get(Sum);
     }
-    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
+    assert(QT->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
   }
   case Builtin::BI__builtin_reduce_mul: {
-    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
-      Value *X = EmitScalarExpr(E->getArg(0));
-      auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, 1.0);
-      return RValue::get(Builder.CreateIntrinsic(
-          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
-          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
+    QualType QT = E->getArg(0)->getType();
+    if (QT->hasFloatingRepresentation()) {
+      Value *Op0 = EmitScalarExpr(E->getArg(0));
+      assert(Op0->getType()->isVectorTy());
+      unsigned VecSize = QT->getAs<VectorType>()->getNumElements();
+      Value *Product =
+          Builder.CreateExtractElement(Op0, static_cast<uint64_t>(0));
+      for (unsigned I = 1; I < VecSize; I++) {
+        Value *Elt = Builder.CreateExtractElement(Op0, I);
+        Product = Builder.CreateFMul(Product, Elt);
+      }
+      return RValue::get(Product);
     }
-    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
+    assert(QT->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
   }
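
With this change, the floating-point paths no longer call llvm.vector.reduce.fadd/fmul; the reduction is built directly with IRBuilder as a chain of extractelement plus fadd (or fmul) instructions, seeded with element 0 instead of a -0.0 or 1.0 start value, so the result is a strict left-to-right fold over the vector's own elements. Below is a minimal standalone sketch (not Clang's code) of the same emission pattern, assuming an LLVM development setup; the module and function names (reduce_demo, demo) are illustrative only.

// Standalone sketch of the sequential-reduction emission pattern above.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("reduce_demo", Ctx);
  IRBuilder<> Builder(Ctx);

  // Build: float demo(<4 x float> %v)
  auto *VecTy = FixedVectorType::get(Builder.getFloatTy(), 4);
  auto *FnTy = FunctionType::get(Builder.getFloatTy(), {VecTy}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  Builder.SetInsertPoint(BB);

  Value *Op0 = F->getArg(0);
  unsigned VecSize = VecTy->getNumElements();

  // Same shape as the new lowering: start from element 0, then fold the
  // remaining lanes in with strictly ordered fadds.
  Value *Sum = Builder.CreateExtractElement(Op0, static_cast<uint64_t>(0));
  for (unsigned I = 1; I < VecSize; ++I) {
    Value *Elt = Builder.CreateExtractElement(Op0, I);
    Sum = Builder.CreateFAdd(Sum, Elt);
  }
  Builder.CreateRet(Sum);

  verifyFunction(*F, &errs());
  M.print(outs(), nullptr);   // emits extractelement/fadd chain, no intrinsic
  return 0;
}
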

clang/test/CodeGen/builtins-reduction-math.c

Lines changed: 28 additions & 4 deletions
@@ -64,11 +64,23 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_addf(float4 vf4, double4 vd4) {
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: [[ARRF1:%.+]] = extractelement <4 x float> [[VF4]], i64 0
+  // CHECK-NEXT: [[ARRF2:%.+]] = extractelement <4 x float> [[VF4]], i64 1
+  // CHECK-NEXT: [[ADDF1:%.+]] = fadd float [[ARRF1]], [[ARRF2]]
+  // CHECK-NEXT: [[ARRF3:%.+]] = extractelement <4 x float> [[VF4]], i64 2
+  // CHECK-NEXT: [[ADDF2:%.+]] = fadd float [[ADDF1]], [[ARRF3]]
+  // CHECK-NEXT: [[ARRF4:%.+]] = extractelement <4 x float> [[VF4]], i64 3
+  // CHECK-NEXT: [[ADDF3:%.+]] = fadd float [[ADDF2]], [[ARRF4]]
   float r2 = __builtin_reduce_add(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: [[ARR1:%.+]] = extractelement <4 x double> [[VD4]], i64 0
+  // CHECK-NEXT: [[ARR2:%.+]] = extractelement <4 x double> [[VD4]], i64 1
+  // CHECK-NEXT: [[ADD1:%.+]] = fadd double [[ARR1]], [[ARR2]]
+  // CHECK-NEXT: [[ARR3:%.+]] = extractelement <4 x double> [[VD4]], i64 2
+  // CHECK-NEXT: [[ADD2:%.+]] = fadd double [[ADD1]], [[ARR3]]
+  // CHECK-NEXT: [[ARR4:%.+]] = extractelement <4 x double> [[VD4]], i64 3
+  // CHECK-NEXT: [[ADD3:%.+]] = fadd double [[ADD2]], [[ARR4]]
   double r3 = __builtin_reduce_add(vd4);
 }
 
@@ -96,11 +108,23 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: [[ARRF1:%.+]] = extractelement <4 x float> [[VF4]], i64 0
+  // CHECK-NEXT: [[ARRF2:%.+]] = extractelement <4 x float> [[VF4]], i64 1
+  // CHECK-NEXT: [[MULF1:%.+]] = fmul float [[ARRF1]], [[ARRF2]]
+  // CHECK-NEXT: [[ARRF3:%.+]] = extractelement <4 x float> [[VF4]], i64 2
+  // CHECK-NEXT: [[MULF2:%.+]] = fmul float [[MULF1]], [[ARRF3]]
+  // CHECK-NEXT: [[ARRF4:%.+]] = extractelement <4 x float> [[VF4]], i64 3
+  // CHECK-NEXT: [[MULF3:%.+]] = fmul float [[MULF2]], [[ARRF4]]
   float r2 = __builtin_reduce_mul(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: [[ARR1:%.+]] = extractelement <4 x double> [[VD4]], i64 0
+  // CHECK-NEXT: [[ARR2:%.+]] = extractelement <4 x double> [[VD4]], i64 1
+  // CHECK-NEXT: [[MUL1:%.+]] = fmul double [[ARR1]], [[ARR2]]
+  // CHECK-NEXT: [[ARR3:%.+]] = extractelement <4 x double> [[VD4]], i64 2
+  // CHECK-NEXT: [[MUL2:%.+]] = fmul double [[MUL1]], [[ARR3]]
+  // CHECK-NEXT: [[ARR4:%.+]] = extractelement <4 x double> [[VD4]], i64 3
+  // CHECK-NEXT: [[MUL3:%.+]] = fmul double [[MUL2]], [[ARR4]]
   double r3 = __builtin_reduce_mul(vd4);
 }
 
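
The updated CHECK lines encode a strict left-to-right fold over the vector lanes. As a rough reference, the builtins now behave like the hand-written chains below. This is a sketch only: the float4/double4 typedefs are assumed to match the ones used by the test file, and the *_ref helpers are hypothetical names, not part of the commit.

// Assumed to mirror the vector typedefs used by the test file.
typedef float  float4  __attribute__((ext_vector_type(4)));
typedef double double4 __attribute__((ext_vector_type(4)));

// Hypothetical reference helpers: the same left-to-right association the
// emitted extractelement/fadd and extractelement/fmul chains perform.
static float reduce_add_ref(float4 v) {
  return ((v[0] + v[1]) + v[2]) + v[3];
}
static double reduce_mul_ref(double4 v) {
  return ((v[0] * v[1]) * v[2]) * v[3];
}

// The builtins under test; with the new lowering they use the same ordering.
float  use_add(float4 v)  { return __builtin_reduce_add(v); }
double use_mul(double4 v) { return __builtin_reduce_mul(v); }
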
