Skip to content

Commit 1d9de10

Browse files
committed
[ARM NEON] Define vfms_f32 on ARM, and all vfms using vfma.
r259537 added vfma/vfms to armv7, but the builtin was only lowered on the AArch64 side. Instead of supporting it on ARM, get rid of it. The vfms builtin lowered to: %nb = fsub float -0.0, %b %r = @llvm.fma.f32(%a, %nb, %c) Instead, define the operation in terms of vfma, and swap the multiplicands. It now lowers to: %na = fsub float -0.0, %a %r = @llvm.fma.f32(%na, %b, %c) This matches the instruction more closely, and lets current LLVM generate the "natural" operand ordering: fmls.2s v0, v1, v2 instead of the crooked (but equivalent): fmls.2s v0, v2, v1 Except for theses changes, assembly is identical. LLVM accepts both commutations, and the LLVM tests in: test/CodeGen/AArch64/arm64-fmadd.ll test/CodeGen/AArch64/fp-dp3.ll test/CodeGen/AArch64/neon-fma.ll test/CodeGen/ARM/fusedMAC.ll already check either the new one only, or both. Also verified against the test-suite unittests. llvm-svn: 266807
1 parent e885d5e commit 1d9de10

File tree

7 files changed

+134
-121
lines changed

7 files changed

+134
-121
lines changed

Diff for: clang/include/clang/Basic/arm_neon.td

+6-5
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ def OP_MLALHi : Op<(call "vmlal", $p0, (call "vget_high", $p1),
339339
(call "vget_high", $p2))>;
340340
def OP_MLALHi_N : Op<(call "vmlal_n", $p0, (call "vget_high", $p1), $p2)>;
341341
def OP_MLS : Op<(op "-", $p0, (op "*", $p1, $p2))>;
342+
def OP_FMLS : Op<(call "vfma", $p0, (op "-", $p1), $p2)>;
342343
def OP_MLSL : Op<(op "-", $p0, (call "vmull", $p1, $p2))>;
343344
def OP_MLSLHi : Op<(call "vmlsl", $p0, (call "vget_high", $p1),
344345
(call "vget_high", $p2))>;
@@ -347,7 +348,7 @@ def OP_MUL_N : Op<(op "*", $p0, (dup $p1))>;
347348
def OP_MLA_N : Op<(op "+", $p0, (op "*", $p1, (dup $p2)))>;
348349
def OP_MLS_N : Op<(op "-", $p0, (op "*", $p1, (dup $p2)))>;
349350
def OP_FMLA_N : Op<(call "vfma", $p0, $p1, (dup $p2))>;
350-
def OP_FMLS_N : Op<(call "vfms", $p0, $p1, (dup $p2))>;
351+
def OP_FMLS_N : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>;
351352
def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
352353
def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
353354
def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>;
@@ -377,8 +378,8 @@ def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>;
377378
def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
378379
def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
379380
def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
380-
def OP_FMS_LN : Op<(call "vfma_lane", $p0, $p1, (op "-", $p2), $p3)>;
381-
def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, $p1, (op "-", $p2), $p3)>;
381+
def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>;
382+
def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>;
382383
def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
383384
(decimate mask1, 2)))>;
384385
def OP_ZIP1 : Op<(shuffle $p0, $p1, (lowhalf (interleave mask0, mask1)))>;
@@ -826,7 +827,7 @@ def VREINTERPRET
826827

827828
let ArchGuard = "defined(__ARM_FEATURE_FMA)" in {
828829
def VFMA : SInst<"vfma", "dddd", "fQf">;
829-
def VFMS : SInst<"vfms", "dddd", "fQf">;
830+
def VFMS : SOpInst<"vfms", "dddd", "fQf", OP_FMLS>;
830831
}
831832

832833
////////////////////////////////////////////////////////////////////////////////
@@ -911,7 +912,7 @@ def FDIV : IOpInst<"vdiv", "ddd", "fdQfQd", OP_DIV>;
911912
////////////////////////////////////////////////////////////////////////////////
912913
// Vector fused multiply-add operations
913914
def FMLA : SInst<"vfma", "dddd", "dQd">;
914-
def FMLS : SInst<"vfms", "dddd", "dQd">;
915+
def FMLS : SOpInst<"vfms", "dddd", "dQd", OP_FMLS>;
915916

916917
////////////////////////////////////////////////////////////////////////////////
917918
// MUL, MLA, MLS, FMA, FMS definitions with scalar argument

Diff for: clang/lib/CodeGen/CGBuiltin.cpp

-16
Original file line numberDiff line numberDiff line change
@@ -5319,22 +5319,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
53195319
Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
53205320
return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
53215321
}
5322-
case NEON::BI__builtin_neon_vfms_v:
5323-
case NEON::BI__builtin_neon_vfmsq_v: { // Only used for FP types
5324-
// FIXME: probably remove when we no longer support aarch64_simd.h
5325-
// (arm_neon.h delegates to vfma).
5326-
5327-
// The ARM builtins (and instructions) have the addend as the first
5328-
// operand, but the 'fma' intrinsics have it last. Swap it around here.
5329-
Value *Subtrahend = Ops[0];
5330-
Value *Multiplicand = Ops[2];
5331-
Ops[0] = Multiplicand;
5332-
Ops[2] = Subtrahend;
5333-
Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
5334-
Ops[1] = Builder.CreateFNeg(Ops[1]);
5335-
Int = Intrinsic::fma;
5336-
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls");
5337-
}
53385322
case NEON::BI__builtin_neon_vmull_v:
53395323
// FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
53405324
Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;

0 commit comments

Comments
 (0)