[AArch64] Replace AND with LSL#2 for LDR target (#34101) #89531
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: hanbeom (ParkHanbum)

Changes

Currently, the process of replacing bitwise operations consisting of `(shl (srl x, c1), c2)` with `And` is performed by the `DAGCombiner`. However, in certain cases, such as `(shl (srl x, c1), 2)`, there is no need to transform to `AND` when the result is used as a `Load` address. Consider the following case:

```
lsr x8, x8, #56
and x8, x8, #0xfc
ldr w0, [x2, x8]
ret
```

In this case, we can remove the `AND` by changing the addressing mode of the `LDR` to `[x2, x8, lsl #2]` and changing the right-shift amount from 56 to 58. After the change:

```
lsr x8, x8, #58
ldr w0, [x2, x8, lsl #2]
ret
```

This patch checks whether the `(shl (srl x, c1), 2)` operation feeding a `load` address can be kept from being transformed into an `And`.

Full diff: https://github.com/llvm/llvm-project/pull/89531.diff

2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
- (added) llvm/test/CodeGen/AArch64/peephole-load.mir
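As a quick sanity check of the address arithmetic (a standalone illustration, not part of the patch; the constants `0xfc`, 56, and 58 are taken from the example above), the two sequences compute the same byte offset for every input:

```cpp
#include <cstdint>
#include <cstdio>
#include <random>

int main() {
  std::mt19937_64 Rng(0); // fixed seed, arbitrary
  for (int I = 0; I < 1000000; ++I) {
    uint64_t X = Rng();
    uint64_t WithAnd = (X >> 56) & 0xfc; // lsr x8, x8, #56 ; and x8, x8, #0xfc
    uint64_t WithLsl = (X >> 58) << 2;   // lsr x8, x8, #58 ; lsl #2 from the ldr
    if (WithAnd != WithLsl) {
      std::printf("mismatch at %016llx\n", (unsigned long long)X);
      return 1;
    }
  }
  std::puts("equivalent");
  return 0;
}
```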
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 22da7ddef98a2a..1c331c88042317 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
bool visitINSvi64lane(MachineInstr &MI);
bool visitFMOVDr(MachineInstr &MI);
+ bool visitLOAD(MachineInstr &MI);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -690,6 +691,64 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
return true;
}
+bool AArch64MIPeepholeOpt::visitLOAD(MachineInstr &MI) {
+ Register LdOp2Reg = MI.getOperand(2).getReg();
+ unsigned RegSize = TRI->getRegSizeInBits(LdOp2Reg, *MRI);
+
+  // Consider:
+  //   (ldr w, [x, (and (ubfm x, x, imms, immr), C1)])
+  // If the AND's bitmask C1 covers exactly the bits that remain after the
+  // UBFM's shift, minus the low 2 bits, try to optimize.
+  // Optimize to:
+  //   (ldr w, [x, (ubfm x, x, imms, immr), lsl #2])
+ {
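+    // Operand 4 of LDRWroX is the shift flag: nonzero means the index
+    // register is already scaled by the access size (lsl #2 for a 32-bit
+    // load), so there is nothing to fold.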
+ if (!MI.getOperand(4).isImm() || MI.getOperand(4).getImm() != 0)
+ return false;
+
+ MachineInstr *AndMI = MRI->getUniqueVRegDef(LdOp2Reg);
+ if (!AndMI || AndMI->getOpcode() != AArch64::ANDXri ||
+ !AndMI->getOperand(2).isImm())
+ return false;
+
+ uint64_t AndMask = AArch64_AM::decodeLogicalImmediate(
+ AndMI->getOperand(2).getImm(), RegSize);
+ MachineInstr *ShtMI = MRI->getUniqueVRegDef(AndMI->getOperand(1).getReg());
+ uint64_t Mask = 0;
+ if (!ShtMI || ShtMI->getOpcode() != AArch64::UBFMXri)
+ return false;
+ uint64_t imms = ShtMI->getOperand(2).getImm();
+ uint64_t immr = ShtMI->getOperand(3).getImm();
+ uint64_t new_imms = 0;
+ uint64_t new_immr = 0;
+ if (imms <= immr) {
+ if (immr != RegSize - 1)
+ return false;
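+      // With immr == RegSize-1 this UBFM is a plain LSR #imms, so the
+      // result keeps RegSize-imms significant bits. The foldable AND mask
+      // is all of those bits except the low two: (1 << (RegSize-imms)) - 4.
+      // E.g. imms == 56 gives mask 0xfc, as in the motivating example.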
+ Mask = ((uint64_t)1 << (RegSize - imms)) - 4;
+      new_imms = imms + 2;
+ new_immr = immr;
+ } else {
+      // We only need to handle the LSL #1 case here.
+ if ((imms - immr != 1) || imms != RegSize - 1)
+ return false;
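+      // Here the UBFM is LSL #1 (imms == RegSize-1, imms - immr == 1).
+      // Since (x << 1) & ~3 == (x >> 1) << 2, rewrite it as LSR #1 and let
+      // the load's lsl #2 scaling supply the remaining shift.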
+ Mask = UINT64_MAX - 3;
+ new_imms = 1;
+ new_immr = imms;
+ }
+
+  // If the AND mask matches, the masking can be folded into the load's
+  // scaled register-offset addressing mode.
+ if (AndMask == Mask) {
+ AndMI->eraseFromParent();
+ ShtMI->getOperand(2).setImm(new_imms);
+ ShtMI->getOperand(3).setImm(new_immr);
+ MI.getOperand(2).setReg(ShtMI->getOperand(0).getReg());
+ MI.getOperand(4).setImm(1);
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -771,6 +830,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::FMOVDr:
Changed |= visitFMOVDr(MI);
break;
+ case AArch64::LDRWroX:
+ Changed |= visitLOAD(MI);
+ break;
}
}
}
diff --git a/llvm/test/CodeGen/AArch64/peephole-load.mir b/llvm/test/CodeGen/AArch64/peephole-load.mir
new file mode 100644
index 00000000000000..8c9555a48997b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/peephole-load.mir
@@ -0,0 +1,160 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s
+
+---
+name: transform_lsr_and_ldr_to_lsr_ldr2
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: transform_lsr_and_ldr_to_lsr_ldr2
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[COPY1]], [[COPY2]], $xzr
+ ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri killed [[MADDXrrr]], 58, 63
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[UBFMXri]], 0, 1
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = MADDXrrr %1, %0, $xzr
+ %4:gpr64 = UBFMXri killed %3, 56, 63
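+    ; ANDXri 8069 is the encoded logical immediate for #0xfc, i.e. keep
+    ; bits [7:2] of the shifted value.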
+ %5:gpr64common = ANDXri killed %4, 8069
+ %6:gpr32 = LDRWroX %2, killed %5, 0, 0
+ $w0 = COPY %6
+ RET_ReallyLR implicit $w0
+...
+---
+name: transform_lsl1_and_ldr_to_lsr1_ldr2
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: transform_lsl1_and_ldr_to_lsr1_ldr2
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[COPY1]], [[COPY2]], $xzr
+ ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri killed [[MADDXrrr]], 1, 63
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[UBFMXri]], 0, 1
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = MADDXrrr %1, %0, $xzr
+ %4:gpr64 = UBFMXri killed %3, 63, 62
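+    ; ANDXri 8125 is the encoded logical immediate for ~0x3, i.e. clear
+    ; the low 2 bits of the shifted value.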
+ %5:gpr64common = ANDXri killed %4, 8125
+ %6:gpr32 = LDRWroX %2, killed %5, 0, 0
+ $w0 = COPY %6
+ RET_ReallyLR implicit $w0
+...
+---
+name: donot_transform_and_ldr
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: donot_transform_and_ldr
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[COPY1]], [[COPY2]], $xzr
+ ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri killed [[MADDXrrr]], 8125
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[ANDXri]], 0, 0
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = MADDXrrr %1, %0, $xzr
+ %4:gpr64common = ANDXri killed %3, 8125
+ %5:gpr32 = LDRWroX %2, killed %4, 0, 0
+ $w0 = COPY %5
+ RET_ReallyLR implicit $w0
+...
+---
+name: donot_transform_if_not_lsl
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: donot_transform_if_not_lsl
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[COPY1]], [[COPY2]], $xzr
+ ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri killed [[MADDXrrr]], 64, 62
+ ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri killed [[UBFMXri]], 8125
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[ANDXri]], 0, 0
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = MADDXrrr %1, %0, $xzr
+ %4:gpr64 = UBFMXri killed %3, 64, 62
+ %5:gpr64common = ANDXri killed %4, 8125
+ %6:gpr32 = LDRWroX %2, killed %5, 0, 0
+ $w0 = COPY %6
+ RET_ReallyLR implicit $w0
+...
+---
+name: donot_transform_if_not_lsr
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: donot_transform_if_not_lsr
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[COPY1]], [[COPY2]], $xzr
+ ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri killed [[MADDXrrr]], 62, 62
+ ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri killed [[UBFMXri]], 8069
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[ANDXri]], 0, 0
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = MADDXrrr %1, %0, $xzr
+ %4:gpr64 = UBFMXri killed %3, 62, 62
+ %5:gpr64common = ANDXri killed %4, 8069
+ %6:gpr32 = LDRWroX %2, killed %5, 0, 0
+ $w0 = COPY %6
+ RET_ReallyLR implicit $w0
+...
+---
+name: donot_transform_if_not_exist_and_and_lsl
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x2
+ ; CHECK-LABEL: name: donot_transform_if_not_exist_and_and_lsl
+ ; CHECK: liveins: $x0, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY1]], 61, 60
+ ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], killed [[UBFMXri]], 0, 0
+ ; CHECK-NEXT: $w0 = COPY [[LDRWroX]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %2:gpr64common = COPY $x2
+ %0:gpr64 = COPY $x0
+ %3:gpr64 = UBFMXri %0, 61, 60
+ %4:gpr32 = LDRWroX %2, killed %3, 0, 0
+ $w0 = COPY %4
+ RET_ReallyLR implicit $w0
✅ With the latest revision this PR passed the C/C++ code formatter.
Could this be handled better in DAG combine? Maybe with something similar to `reassociationCanBreakAddressingModePattern`?
OK, I'll find a way.
@davemgreen But in this case, the DAG has already been changed by the time I can catch it. Do you have any advice on where to implement this?
Hi. There is code in the DAG combiner that attempts to stop reassociations that would break the addressing mode.
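For context, a minimal sketch of the kind of target query involved, assuming LLVM's `TargetLowering::isLegalAddressingMode` API; the helper name `shiftFoldsIntoAddressing` and its parameters are illustrative, not taken from the patch:

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Ask the target whether a load could absorb a left-shift of its index as a
// scaled reg+reg addressing mode. If it can, a combine that rewrites the
// shift into an AND would break the addressing-mode match, so keep the shift.
static bool shiftFoldsIntoAddressing(const TargetLowering &TLI,
                                     const DataLayout &DL, Type *MemTy,
                                     unsigned AddrSpace, unsigned ShlAmt) {
  TargetLowering::AddrMode AM;
  AM.HasBaseReg = true;     // base register, e.g. x2 in the example above
  AM.Scale = 1LL << ShlAmt; // index register scaled by the shl amount
  return TLI.isLegalAddressingMode(DL, AM, MemTy, AddrSpace);
}
```

On AArch64 the scaled form is legal when the scale matches the access size, which is exactly the `ldr w0, [x2, x8, lsl #2]` shape this patch wants to preserve.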
@davemgreen Thanks, I think I'm almost there thanks to your advice. Can I ask one more thing? I can check the legality of the addressing mode, but my current implementation also affects the following cases on RISC-V.

before:

after:
@davemgreen I think the work is done. Would you review it again, please?
LGTM, but please wait for @davemgreen's review.
Thanks, this does look like a good place for it.
Thanks. LGTM
Force-pushed from b518518 to b380592.
I tested this with the test-suite on an M1 MacBook, and here are the results: the result for aarch64-acle-fmv-features.test differs between main and this branch. Is this result expected? If not, please let me know how to track down which point is incorrect.
Does the test reliably fail with the new compiler? It sounds like it might be unrelated, as there have been a number of changes around FMV lately. The test looks like it fork()'s, so maybe something was going wrong with it? If it is a real failure caused by this patch then you might need to debug a bit to see what might be going wrong.
Sorry, it was my mistake. I had tested it with my own previously built LLVM version.
Any updates on this?
nope
Sorry - I think I believed you had commit access. I've given this an extra test and it seems OK. If you can update the suggestion to fix a warning, we can get this in.
ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
  if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->use_begin())) {
    unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
    if ((1 << ShlAmt) == ByteVT &&
-> `(1ULL << ShlAmt) == ByteVT`
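A short standalone note on why the suffix matters (my illustration, not from the review): with a plain `1` the shift is performed on a 32-bit `int`, so comparing against the wider `ByteVT` mixes widths and draws a compiler warning, and large shift amounts would be undefined behavior; `1ULL` keeps the whole expression in 64 bits:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  unsigned ShlAmt = 3;  // e.g. the scale for an 8-byte access
  uint64_t ByteVT = 8;
  // int Bad = 1 << 33;                // undefined behavior: shift done in 32 bits
  uint64_t Ok = 1ULL << ShlAmt;        // shift done in 64 bits, well defined
  std::cout << (Ok == ByteVT) << '\n'; // prints 1
  return 0;
}
```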
…x, c3)) when load
Thanks (I didn't get notification for the update before)
@davemgreen Never mind. Thanks, as always, for your reviews.
77fccb3 [AArch64] Replace AND with LSL#2 for LDR target (llvm#34101) (llvm#89531)