-
Notifications
You must be signed in to change notification settings - Fork 15k
[DAG] Constant fold FMAD #69324
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAG] Constant fold FMAD #69324
Conversation
This has very little effect on codegen in practice, but it is a nice-to-have, I think. See llvm#68315
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag
Author: Pierre van Houtryve (Pierre-vh)
Changes: This has very little effect on codegen in practice, but is a nice to have I think. See #68315
Full diff: https://github.com/llvm/llvm-project/pull/69324.diff
3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 20ad4c766a1a3fc..eac0a14d8303fa4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -495,6 +495,7 @@ namespace {
SDValue visitFSUB(SDNode *N);
SDValue visitFMUL(SDNode *N);
template <class MatchContextClass> SDValue visitFMA(SDNode *N);
+ SDValue visitFMAD(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
@@ -2000,6 +2001,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FSUB: return visitFSUB(N);
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
+ case ISD::FMAD:
+ return visitFMAD(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
@@ -16752,6 +16755,22 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFMAD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Constant fold FMAD.
+ if (isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1) &&
+ isa<ConstantFPSDNode>(N2)) {
+ return DAG.getNode(ISD::FMAD, DL, VT, N0, N1, N2);
+ }
+
+ return SDValue();
+}
+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3f06d0bd4eaa1d5..b028c483718107e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7069,7 +7069,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"Operand is DELETED_NODE!");
// Perform various simplifications.
switch (Opcode) {
- case ISD::FMA: {
+ case ISD::FMA:
+ case ISD::FMAD: {
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(N1.getValueType() == VT && N2.getValueType() == VT &&
N3.getValueType() == VT && "FMA types must match!");
@@ -7080,7 +7081,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
APFloat V1 = N1CFP->getValueAPF();
const APFloat &V2 = N2CFP->getValueAPF();
const APFloat &V3 = N3CFP->getValueAPF();
- V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
+ if (Opcode == ISD::FMAD) {
+ V1.multiply(V2, APFloat::rmNearestTiesToEven);
+ V1.add(V3, APFloat::rmNearestTiesToEven);
+ } else
+ V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
return getConstantFP(V1, DL, VT);
}
break;
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 012b3f976734dec..e554f912ff64886 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2619,39 +2619,20 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; VI-LABEL: v_test_udiv64_mulhi_fold:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; VI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000
-; VI-NEXT: v_rcp_f32_e32 v2, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0xa7c5
+; VI-NEXT: v_mul_u32_u24_e32 v3, 0x500, v4
+; VI-NEXT: v_mul_hi_u32_u24_e32 v2, 0x500, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x4237, v3
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0xa9000000, v3
; VI-NEXT: s_mov_b32 s6, 0xfffe7960
-; VI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; VI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; VI-NEXT: v_trunc_f32_e32 v3, v3
-; VI-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2
-; VI-NEXT: v_cvt_u32_f32_e32 v6, v2
-; VI-NEXT: v_cvt_u32_f32_e32 v7, v3
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
-; VI-NEXT: v_mul_lo_u32 v4, v7, s6
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4
-; VI-NEXT: v_mul_hi_u32 v5, v6, v2
-; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
-; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v9, v2
-; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, v5, v4, vcc
; VI-NEXT: v_mul_lo_u32 v4, v7, s6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT: v_mul_hi_u32 v8, v6, v2
; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v3
; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
-; VI-NEXT: v_mul_hi_u32 v8, v6, v2
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc
|
| SDLoc DL(N); | ||
|
|
||
| // Constant fold FMAD. | ||
| if (isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should maybe use DAG.isConstantFPBuildVectorOrConstantFP? But I see the FMA case does not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've been hoping to convert SelectionDAG::foldConstantFPMath to take an ArrayRef<SDValue> Ops arg instead of assuming a binop, but have never gotten around to it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
You can test this locally with the following command:
git-clang-format --diff 90576084c1d797f845055e8d95c2d9f455268841 c015d9e2e37a56a189f4f70d951eb8c61c71ad55 -- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cec75b9331d6..dc0ea25f7b93 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2001,7 +2001,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FSUB: return visitFSUB(N);
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
- case ISD::FMAD: return visitFMAD(N);
+ case ISD::FMAD:
+ return visitFMAD(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
|
| ; VI-NEXT: v_mul_u32_u24_e32 v3, 0x500, v4 | ||
| ; VI-NEXT: v_mul_hi_u32_u24_e32 v2, 0x500, v4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we now fold these two guys as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is already computeKnownBits handling for MUL_U24/MUL_I24 - but it might not be handling the entire constant fold, it looks to be mainly setting the leading/trailing known bits.
Local branch amd-gfx d5cba0d Merged main:ed1d29028492 into amd-gfx:ece3beeeb394 Remote branch main c464fea [DAG] Constant fold FMAD (llvm#69324)
This has very little effect on codegen in practice, but it is a nice-to-have, I think.
See #68315