-
Notifications
You must be signed in to change notification settings - Fork 13.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SVE] Pair SVE fill/spill into LDP/STP with -msve-vector-bits=128. #134068
[AArch64][SVE] Pair SVE fill/spill into LDP/STP with -msve-vector-bits=128. #134068
Conversation
…s=128. When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when the offsets allow it, we can pair SVE LDR/STR instructions into Neon LDP/STP. For example, given: ```cpp void foo(double const *ldp, double *stp) { svbool_t pg = svptrue_b64(); svfloat64_t ld1 = svld1_f64(pg, ldp); svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd()); svst1_f64(pg, stp, ld1); svst1_f64(pg, stp+svcntd(), ld2); } ``` When compiled with `-msve-vector-bits=128`, we currently generate: ```gas foo: ldr z0, [x0] ldr z1, [x0, llvm#1, mul vl] str z0, [x1] str z1, [x1, llvm#1, mul vl] ret ``` With this patch, we instead generate: ```gas foo: ldp q0, q1, [x0] stp q0, q1, [x1] ret ``` Loading (and to a lesser extent, storing) multiple registers from a common base address is a commonly occurring pattern, but multi-register SVE loads/stores are only supported starting with SVE2.1. This patch offers an alternative for SVE 128-bit implementations.
@llvm/pr-subscribers-backend-aarch64 Author: Ricardo Jesus (rj-jesus) ChangesWhen compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when For example, given (https://godbolt.org/z/srjxYbK9s): #include <arm_sve.h>
void foo(double const *ldp, double *stp) {
svbool_t pg = svptrue_b64();
svfloat64_t ld1 = svld1_f64(pg, ldp);
svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd());
svst1_f64(pg, stp, ld1);
svst1_f64(pg, stp+svcntd(), ld2);
} When compiled with foo:
ldr z0, [x0]
ldr z1, [x0, #<!-- -->1, mul vl]
str z0, [x1]
str z1, [x1, #<!-- -->1, mul vl]
ret With this patch, we instead generate: foo:
ldp q0, q1, [x0]
stp q0, q1, [x1]
ret Loading (and to a lesser extent, storing) multiple registers from a This is an alternative, more targetted approach to #127500. Full diff: https://github.com/llvm/llvm-project/pull/134068.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index cd976790ebb6f..f1f1f66e12216 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -87,6 +87,10 @@ static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
cl::init(true), cl::Hidden);
+// Enable SVE fill/spill pairing for VLS 128.
+static cl::opt<bool> EnableSVEFillSpillPairing("aarch64-sve-fill-spill-pairing",
+ cl::init(true), cl::Hidden);
+
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
@@ -97,6 +101,9 @@ using LdStPairFlags = struct LdStPairFlags {
// a pair-wise insn, and false if the reverse is true.
bool MergeForward = false;
+ // Set to true when pairing SVE fill/spill instructions.
+ bool SVEFillSpillPair = false;
+
// SExtIdx gives the index of the result of the load pair that must be
// extended. The value of SExtIdx assumes that the paired load produces the
// value in this order: (I, returned iterator), i.e., -1 means no value has
@@ -113,6 +120,9 @@ using LdStPairFlags = struct LdStPairFlags {
void setMergeForward(bool V = true) { MergeForward = V; }
bool getMergeForward() const { return MergeForward; }
+ void setSVEFillSpillPair(bool V = true) { SVEFillSpillPair = V; }
+ bool getSVEFillSpillPair() const { return SVEFillSpillPair; }
+
void setSExtIdx(int V) { SExtIdx = V; }
int getSExtIdx() const { return SExtIdx; }
@@ -300,6 +310,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STRXui:
case AArch64::STRXpre:
case AArch64::STURXi:
+ case AArch64::STR_ZXI:
case AArch64::LDRDui:
case AArch64::LDURDi:
case AArch64::LDRDpre:
@@ -318,6 +329,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::LDRSui:
case AArch64::LDURSi:
case AArch64::LDRSpre:
+ case AArch64::LDR_ZXI:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
@@ -363,6 +375,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
return AArch64::STPDpre;
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STR_ZXI:
return AArch64::STPQi;
case AArch64::STRQpre:
return AArch64::STPQpre;
@@ -388,6 +401,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
return AArch64::LDPDpre;
case AArch64::LDRQui:
case AArch64::LDURQi:
+ case AArch64::LDR_ZXI:
return AArch64::LDPQi;
case AArch64::LDRQpre:
return AArch64::LDPQpre;
@@ -833,6 +847,12 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
}
}
+// Return true if MI is an SVE fill/spill instruction.
+static bool isPairableFillSpillInst(const MachineInstr &MI) {
+ auto const Opc = MI.getOpcode();
+ return Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI;
+}
+
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -1227,6 +1247,15 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
(void)MIBSXTW;
LLVM_DEBUG(dbgs() << " Extend operand:\n ");
LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+ } else if (Flags.getSVEFillSpillPair()) {
+ // We are combining SVE fill/spill to LDP/STP, so we need to get the Q
+ // variant of the registers.
+ MachineOperand &MOp0 = MIB->getOperand(0);
+ MachineOperand &MOp1 = MIB->getOperand(1);
+ assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
+ AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
+ MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
+ MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
} else {
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
}
@@ -1829,6 +1858,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
Flags.clearRenameReg();
+ if (isPairableFillSpillInst(FirstMI))
+ Flags.setSVEFillSpillPair();
+
// Track which register units have been modified and used between the first
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
@@ -2661,7 +2693,8 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
// Get the needed alignments to check them if
// ldp-aligned-only/stp-aligned-only features are opted.
uint64_t MemAlignment = MemOp->getAlign().value();
- uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
+ uint64_t TypeAlignment =
+ Align(MemOp->getSize().getValue().getKnownMinValue()).value();
if (MemAlignment < 2 * TypeAlignment) {
NumFailedAlignmentCheck++;
@@ -2782,6 +2815,9 @@ bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
+ bool const CanPairFillSpill = EnableSVEFillSpillPairing &&
+ Subtarget->isSVEorStreamingSVEAvailable() &&
+ Subtarget->getSVEVectorSizeInBits() == 128;
bool Modified = false;
// Four tranformations to do here:
@@ -2822,11 +2858,18 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
}
// 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
+ // When compiling for SVE 128, also try to combine SVE fill/spill
+ // instructions into LDP/STP.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
+ // e.g.,
+ // ldr z0, [x2]
+ // ldr z1, [x2, #1, mul vl]
+ // ; becomes
+ // ldp q0, q1, [x2]
if (MBB.getParent()->getRegInfo().tracksLiveness()) {
DefinedInBB.clear();
@@ -2840,6 +2883,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
Modified = true;
+ else if (CanPairFillSpill && isPairableFillSpillInst(*MBBI) &&
+ tryToPairLdStInst(MBBI))
+ Modified = true;
else
++MBBI;
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
new file mode 100644
index 0000000000000..79120bc5352aa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,ldp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-LDPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,stp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-STPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK-OFF
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 < %s | FileCheck %s --check-prefixes=CHECK-OFF
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 -aarch64-sve-fill-spill-pairing=0 < %s | FileCheck %s --check-prefixes=CHECK-OFF
+
+define void @nxv16i8(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 4
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_max_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_max_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0, #-1024]
+; CHECK-NEXT: stp q0, q1, [x1, #1008]
+; CHECK-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #-63, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1, #1008]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0, #-1024]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1, #63, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_max_range:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT: ldr z1, [x0, #-63, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1, #63, mul vl]
+; CHECK-OFF-NEXT: str z1, [x1, #64, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %ldoff1 = mul i64 %vscale, -1024
+ %ldoff2 = mul i64 %vscale, -1008
+ %stoff1 = mul i64 %vscale, 1008
+ %stoff2 = mul i64 %vscale, 1024
+ %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+ %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_outside_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_outside_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_outside_range:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-OFF-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-OFF-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %ldoff1 = mul i64 %vscale, -1040
+ %ldoff2 = mul i64 %vscale, -1024
+ %stoff1 = mul i64 %vscale, 1024
+ %stoff2 = mul i64 %vscale, 1040
+ %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+ %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_2vl_stride(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_2vl_stride:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_2vl_stride:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 5
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv2f64_32b_aligned(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv2f64_32b_aligned:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv2f64_32b_aligned:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 4
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 2 x double>, ptr %ldptr, align 32
+ %ld2 = load <vscale x 2 x double>, ptr %ldptr2, align 32
+ store <vscale x 2 x double> %ld1, ptr %stptr, align 32
+ store <vscale x 2 x double> %ld2, ptr %stptr2, align 32
+ ret void
+}
|
static cl::opt<bool> EnableSVEFillSpillPairing("aarch64-sve-fill-spill-pairing", | ||
cl::init(true), cl::Hidden); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the transformation is little endian specific and the option's primary use case being for debug, perhaps it's worth reversing the polarity and implementing DisableSVEFillSpillPairing -> aarch64-disable-sve-fill-pairing
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, that makes sense. I've renamed this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just going to throw this out there but when inspecting tryToPairLdStInst
I see there is already provision to disable the use of pair instructions via target features disable-ldp
and disable-stp
so perhaps we don't need a dedicated flag after all? That might make the implementation easier because then the "should we do this" check only requires the subtarget?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds good to me. I thought having the option to disable just SVE pairing could be useful for debugging, but I'm happy to remove it if you don't think so.
else if (CanPairFillSpill && isPairableFillSpillInst(*MBBI) && | ||
tryToPairLdStInst(MBBI)) | ||
Modified = true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is necessary to have a separate path for SVE rather than extending isPairableLdStInst? I can see isPairableLdStInst is also used to group related instructions together, which is something we might want in the future, assuming it doesn't already come for free.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe isPairableLdStInst
is static, but I needed to query the subtarget to perform the checks that CanPairFillSpill
is doing. I thought it would be better to add the new path to reduce the number of changes, but I'm happy to revisit this if you have a better idea.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's worth trying. I wasn't thinking of extending isPairableLdStInst
specifically but wondered if the extra checks can be pushing into the tryTo..
function or perhaps isCandidateToMergeOrPair
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think they can, I think I tried this at first but found myself having to check for the opcodes for the fill/spill instructions in more than one place, and so went for the separate paths in the end. But I'll give this a try so that we can see how it looks proper.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've moved these checks into isPairableLdStInst
and isCandidateToMergeOrPair
- please let me know what you think of the new approach, otherwise I'm happy to revert it. :)
} else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) { | ||
// We are combining SVE fill/spill to LDP/STP, so we need to use the Q | ||
// variant of the registers. | ||
MachineOperand &MOp0 = MIB->getOperand(0); | ||
MachineOperand &MOp1 = MIB->getOperand(1); | ||
assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) && | ||
AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register."); | ||
MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0)); | ||
MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be done earlier, e.g. applied to RegOp0
and RegOp1
just before they are added to MIB
?
If not then this block needs
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
so the result gets dumped.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the reason I didn't place this earlier is that the machine verifier segfaults in verifyUseList
if I do so, although I believe it should work if I create a copy of the operands (say, with MachineOperand::CreateReg) instead of using setReg
.
For now I've just added the missing debug message, but please let me know if you'd like it done differently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @rj-jesus for this and the previous foundation work. To my eye I think we've landed on a clean solution so let's see what the buildbots think of it :)
Thank you very much for your help @paulwalker-arm - fingers crossed for the buildbots :D |
…ctor-bits=128." (#134997) Reverts #134068 Caused a stage 2 build failure: https://lab.llvm.org/buildbot/#/builders/41/builds/6016 ``` FAILED: lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp Opcode has unknown scale! UNREACHABLE executed at ../llvm/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp:4530! PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script. Stack dump: 0. Program arguments: /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp 1. <eof> parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module '/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp'. 4. Running pass 'AArch64 load / store optimization pass' on function '@"_ZNSt17_Function_handlerIFN4llvm8ExpectedISt8functionIFNS1_ISt10unique_ptrINS0_16CachedFileStreamESt14default_deleteIS4_EEEEjRKNS0_5TwineEEEEEjNS0_9StringRefESB_EZNS0_10localCacheESB_SB_SB_S2_IFvjSB_S3_INS0_12MemoryBufferES5_ISH_EEEEE3$_0E9_M_invokeERKSt9_Any_dataOjOSF_SB_"' #0 0x0000b6eae9b67bf0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c7bf0) #1 0x0000b6eae9b65aec llvm::sys::RunSignalHandlers() (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c5aec) #2 0x0000b6eae9acd5f4 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0 #3 0x0000f16c1aff28f8 (linux-vdso.so.1+0x8f8) #4 0x0000f16c1aacf1f0 __pthread_kill_implementation ./nptl/pthread_kill.c:44:76 #5 0x0000f16c1aa8a67c gsignal ./signal/../sysdeps/posix/raise.c:27:6 #6 0x0000f16c1aa77130 abort ./stdlib/abort.c:81:7 #7 0x0000b6eae9ad6628 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x8136628) #8 0x0000b6eae72e95a8 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x59495a8) #9 0x0000b6eae74ca9a8 (anonymous namespace)::AArch64LoadStoreOpt::findMatchingInsn(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>, (anonymous namespace)::LdStPairFlags&, unsigned int, bool) AArch64LoadStoreOptimizer.cpp:0:0 #10 0x0000b6eae74c85a8 (anonymous namespace)::AArch64LoadStoreOpt::tryToPairLdStInst(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>&) AArch64LoadStoreOptimizer.cpp:0:0 #11 0x0000b6eae74c624c (anonymous namespace)::AArch64LoadStoreOpt::optimizeBlock(llvm::MachineBasicBlock&, bool) AArch64LoadStoreOptimizer.cpp:0:0 #12 0x0000b6eae74c429c (anonymous namespace)::AArch64LoadStoreOpt::runOnMachineFunction(llvm::MachineFunction&) AArch64LoadStoreOptimizer.cpp:0:0 ```
Reverted due to https://lab.llvm.org/buildbot/#/builders/41/builds/6016. That build is SVE(1) running on Graviton 3. Vector length specific fails in the same way - https://lab.llvm.org/buildbot/#/builders/4/builds/6128, as does SVE2 VLA https://lab.llvm.org/buildbot/#/builders/199/builds/2674. |
Let me know what you need to debug this. |
…th -msve-vector-bits=128." (#134997) Reverts llvm/llvm-project#134068 Caused a stage 2 build failure: https://lab.llvm.org/buildbot/#/builders/41/builds/6016 ``` FAILED: lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp Opcode has unknown scale! UNREACHABLE executed at ../llvm/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp:4530! PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script. Stack dump: 0. Program arguments: /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp 1. <eof> parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module '/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp'. 4. Running pass 'AArch64 load / store optimization pass' on function '@"_ZNSt17_Function_handlerIFN4llvm8ExpectedISt8functionIFNS1_ISt10unique_ptrINS0_16CachedFileStreamESt14default_deleteIS4_EEEEjRKNS0_5TwineEEEEEjNS0_9StringRefESB_EZNS0_10localCacheESB_SB_SB_S2_IFvjSB_S3_INS0_12MemoryBufferES5_ISH_EEEEE3$_0E9_M_invokeERKSt9_Any_dataOjOSF_SB_"' #0 0x0000b6eae9b67bf0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c7bf0) #1 0x0000b6eae9b65aec llvm::sys::RunSignalHandlers() (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c5aec) #2 0x0000b6eae9acd5f4 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0 #3 0x0000f16c1aff28f8 (linux-vdso.so.1+0x8f8) #4 0x0000f16c1aacf1f0 __pthread_kill_implementation ./nptl/pthread_kill.c:44:76 #5 0x0000f16c1aa8a67c gsignal ./signal/../sysdeps/posix/raise.c:27:6 #6 0x0000f16c1aa77130 abort ./stdlib/abort.c:81:7 #7 0x0000b6eae9ad6628 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x8136628) #8 0x0000b6eae72e95a8 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x59495a8) #9 0x0000b6eae74ca9a8 (anonymous namespace)::AArch64LoadStoreOpt::findMatchingInsn(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>, (anonymous namespace)::LdStPairFlags&, unsigned int, bool) AArch64LoadStoreOptimizer.cpp:0:0 #10 0x0000b6eae74c85a8 (anonymous namespace)::AArch64LoadStoreOpt::tryToPairLdStInst(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>&) AArch64LoadStoreOptimizer.cpp:0:0 #11 0x0000b6eae74c624c (anonymous namespace)::AArch64LoadStoreOpt::optimizeBlock(llvm::MachineBasicBlock&, bool) AArch64LoadStoreOptimizer.cpp:0:0 #12 0x0000b6eae74c429c (anonymous namespace)::AArch64LoadStoreOpt::runOnMachineFunction(llvm::MachineFunction&) AArch64LoadStoreOptimizer.cpp:0:0 ```
Also fails to compile some flang tests when testing the stage 1 flang: https://lab.llvm.org/buildbot/#/builders/17/builds/7157 Due to mixed up parallel builds' logs, these may not be from the same crash but judging by the amount of crashes, it's likely to be the same reason for all of them anyway:
That's the same assert failure as we saw in the 2nd stage errors. So this would give you an easier way to reproduce it. |
Thanks very much for this, I think this is happening due to the compiler inadvertently attempting to pair SVE LDR/STR with Neon loads/stores, which I'll put up a new PR to bail out in these cases. |
…s=128. (llvm#134068) When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when the offsets allow it, we can pair SVE LDR/STR instructions into Neon LDP/STP. For example, given: ```cpp #include <arm_sve.h> void foo(double const *ldp, double *stp) { svbool_t pg = svptrue_b64(); svfloat64_t ld1 = svld1_f64(pg, ldp); svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd()); svst1_f64(pg, stp, ld1); svst1_f64(pg, stp+svcntd(), ld2); } ``` When compiled with `-msve-vector-bits=128`, we currently generate: ```gas foo: ldr z0, [x0] ldr z1, [x0, llvm#1, mul vl] str z0, [x1] str z1, [x1, llvm#1, mul vl] ret ``` With this patch, we instead generate: ```gas foo: ldp q0, q1, [x0] stp q0, q1, [x1] ret ``` This is an alternative, more targetted approach to llvm#127500.
…ctor-bits=128." (llvm#134997) Reverts llvm#134068 Caused a stage 2 build failure: https://lab.llvm.org/buildbot/#/builders/41/builds/6016 ``` FAILED: lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp Opcode has unknown scale! UNREACHABLE executed at ../llvm/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp:4530! PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script. Stack dump: 0. Program arguments: /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage2/include -I/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/include -mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Werror=global-constructors -O3 -DNDEBUG -std=c++17 -UNDEBUG -fno-exceptions -funwind-tables -fno-rtti -MD -MT lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -MF lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o.d -o lib/Support/CMakeFiles/LLVMSupport.dir/Caching.cpp.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp 1. <eof> parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module '/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/llvm/llvm/lib/Support/Caching.cpp'. 4. Running pass 'AArch64 load / store optimization pass' on function '@"_ZNSt17_Function_handlerIFN4llvm8ExpectedISt8functionIFNS1_ISt10unique_ptrINS0_16CachedFileStreamESt14default_deleteIS4_EEEEjRKNS0_5TwineEEEEEjNS0_9StringRefESB_EZNS0_10localCacheESB_SB_SB_S2_IFvjSB_S3_INS0_12MemoryBufferES5_ISH_EEEEE3$_0E9_M_invokeERKSt9_Any_dataOjOSF_SB_"' #0 0x0000b6eae9b67bf0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c7bf0) llvm#1 0x0000b6eae9b65aec llvm::sys::RunSignalHandlers() (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x81c5aec) llvm#2 0x0000b6eae9acd5f4 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0 llvm#3 0x0000f16c1aff28f8 (linux-vdso.so.1+0x8f8) llvm#4 0x0000f16c1aacf1f0 __pthread_kill_implementation ./nptl/pthread_kill.c:44:76 llvm#5 0x0000f16c1aa8a67c gsignal ./signal/../sysdeps/posix/raise.c:27:6 llvm#6 0x0000f16c1aa77130 abort ./stdlib/abort.c:81:7 llvm#7 0x0000b6eae9ad6628 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x8136628) llvm#8 0x0000b6eae72e95a8 (/home/tcwg-buildbot/worker/clang-aarch64-sve-vla-2stage/stage1.install/bin/clang+++0x59495a8) llvm#9 0x0000b6eae74ca9a8 (anonymous namespace)::AArch64LoadStoreOpt::findMatchingInsn(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>, (anonymous namespace)::LdStPairFlags&, unsigned int, bool) AArch64LoadStoreOptimizer.cpp:0:0 llvm#10 0x0000b6eae74c85a8 (anonymous namespace)::AArch64LoadStoreOpt::tryToPairLdStInst(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false>&) AArch64LoadStoreOptimizer.cpp:0:0 llvm#11 0x0000b6eae74c624c (anonymous namespace)::AArch64LoadStoreOpt::optimizeBlock(llvm::MachineBasicBlock&, bool) AArch64LoadStoreOptimizer.cpp:0:0 llvm#12 0x0000b6eae74c429c (anonymous namespace)::AArch64LoadStoreOpt::runOnMachineFunction(llvm::MachineFunction&) AArch64LoadStoreOptimizer.cpp:0:0 ```
…ector-bits=128." Reapplies llvm#134068 The first patch was missing a check to prevent attempts to pair SVE fill/spill with other Neon load/store instructions, which could happen if the Neon instruction was unscaled.
When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when
the offsets allow it, we can pair SVE LDR/STR instructions into Neon
LDP/STP.
For example, given (https://godbolt.org/z/srjxYbK9s):
When compiled with
-msve-vector-bits=128
, we currently generate:With this patch, we instead generate:
Loading (and to a lesser extent, storing) multiple registers from a
common base address is a commonly occurring pattern, but multi-register
SVE load/store instructions are only supported starting with SVE2.1. This
patch offers an alternative for SVE 128-bit implementations.
This is an alternative, more targetted approach to #127500.