[AArch64][SVE] Lower unpredicated loads/stores as fixed LDR/STR with -msve-vector-bits=128. #127500

Open: rj-jesus wants to merge 3 commits into main from rjj/aarch64-sve-unpred-loads-stores-128

Conversation

rj-jesus (Contributor):

Given the code below:

svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we
currently generate:

foo:
  ptrue   p0.b
  ld1b    { z0.b }, p0/z, [x0]
  ret

Whereas (on little-endian) we could instead be using LDR as follows:

foo:
  ldr     q0, [x0]
  ret

Besides avoiding the predicate dependency, the above form enables
further optimisations such as LDP folds. Likewise for other types and
stores.
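
For illustration, the same lowering would apply to the store-side analogue below (a hypothetical example, not taken from this patch; `bar` is a made-up name):

```cpp
#include <arm_sve.h>

// With -msve-vector-bits=128, the unpredicated store below could likewise be
// lowered to a plain "str q0, [x0]" instead of "ptrue p0.b" + "st1b".
void bar(uint8_t *x, svuint8_t v) {
  svst1(svptrue_b8(), x, v);
}
```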

I have a patch that enables similar folds for SVE LDR/STR, but since that
causes a fair number of test changes I'd rather open a separate PR for it.

@llvmbot (Member)

llvmbot commented Feb 17, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)


Full diff: https://github.com/llvm/llvm-project/pull/127500.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+55)
  • (added) llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll (+483)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4263be1098899..173a875a256e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23550,6 +23550,31 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
   return DAG.getMergeValues({Extract, TokenFactor}, DL);
 }
 
+// Replace scalable loads with fixed loads when vscale_range(1, 1).
+// This enables further optimisations such as LDP folds.
+static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  EVT MemVT = LD->getMemoryVT();
+  if (!MemVT.isScalableVector() ||
+      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // Skip unpacked types given their different layouts between Neon and SVE.
+  if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  SDLoc DL(LD);
+  MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+                               MemVT.getVectorMinNumElements());
+  SDValue NewLoad = DAG.getLoad(
+      NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+      LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
+  SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
+  SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                    {SDValue(cast<SDNode>(NewLoad), 1)});
+  return DAG.getMergeValues({Insert, TokenFactor}, DL);
+}
+
 // Perform TBI simplification if supported by the target and try to break up
 // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
 // load instructions can be selected.
@@ -23587,6 +23612,9 @@ static SDValue performLOADCombine(SDNode *N,
   if (SDValue Res = combineV3I8LoadExt(LD, DAG))
     return Res;
 
+  if (SDValue Res = combineVScale1Load(LD, DAG, Subtarget))
+    return Res;
+
   if (!LD->isNonTemporal())
     return SDValue(N, 0);
 
@@ -23845,6 +23873,30 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }
 
+// Replace scalable stores with fixed stores when vscale_range(1, 1).
+static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      !ValueVT.isScalableVector() ||
+      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // Skip unpacked types given their different layouts between Neon and SVE.
+  if (ValueVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  SDLoc DL(ST);
+  MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
+                               ValueVT.getVectorMinNumElements());
+  SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
+  SDValue NewStore = DAG.getStore(
+      ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
+      ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
+  return NewStore;
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -23879,6 +23931,9 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
     return Res;
 
+  if (SDValue Res = combineVScale1Store(ST, DAG, Subtarget))
+    return Res;
+
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
   // We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
new file mode 100644
index 0000000000000..f2d4933d43259
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -aarch64-sve-vector-bits-max=0   < %s | FileCheck %s --check-prefix=CHECK-VLA
+; RUN: llc -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.b
+; CHECK-VLA-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv16i8:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 16 x i8>, ptr %0, align 16
+  ret <vscale x 16 x i8> %2
+}
+
+define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.b
+; CHECK-VLA-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv16i8:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 16 x i8> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv8i16:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 8 x i16>, ptr %0, align 16
+  ret <vscale x 8 x i16> %2
+}
+
+define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv8i16:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 8 x i16> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv4i32:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 4 x i32>, ptr %0, align 16
+  ret <vscale x 4 x i32> %2
+}
+
+define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv4i32:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 4 x i32> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv2i64:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 2 x i64>, ptr %0, align 16
+  ret <vscale x 2 x i64> %2
+}
+
+define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv2i64:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 2 x i64> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv8f16:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 8 x half>, ptr %0, align 16
+  ret <vscale x 8 x half> %2
+}
+
+define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv8f16:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 8 x half> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv4f32:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 4 x float>, ptr %0, align 16
+  ret <vscale x 4 x float> %2
+}
+
+define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv4f32:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 4 x float> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv2f64:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    ldr q0, [x0]
+; CHECK-128-NEXT:    ret
+  %2 = load <vscale x 2 x double>, ptr %0, align 16
+  ret <vscale x 2 x double> %2
+}
+
+define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv2f64:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    str q0, [x0]
+; CHECK-128-NEXT:    ret
+  store <vscale x 2 x double> %1, ptr %0, align 16
+  ret void
+}
+
+define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.b
+; CHECK-VLA-NEXT:    ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv16i8_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 16 x i8>, ptr %4, align 16
+  ret <vscale x 16 x i8> %5
+}
+
+define void @st_nxv16i8_offset(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.b
+; CHECK-VLA-NEXT:    st1b { z0.b }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv16i8_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 16 x i8> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv8i16_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 8 x i16>, ptr %4, align 16
+  ret <vscale x 8 x i16> %5
+}
+
+define void @st_nxv8i16_offset(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    st1h { z0.h }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv8i16_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 8 x i16> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv4i32_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 4 x i32>, ptr %4, align 16
+  ret <vscale x 4 x i32> %5
+}
+
+define void @st_nxv4i32_offset(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv4i32_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 4 x i32> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv2i64_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 2 x i64>, ptr %4, align 16
+  ret <vscale x 2 x i64> %5
+}
+
+define void @st_nxv2i64_offset(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv2i64_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 2 x i64> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv8f16_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 8 x half>, ptr %4, align 16
+  ret <vscale x 8 x half> %5
+}
+
+define void @st_nxv8f16_offset(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.h
+; CHECK-VLA-NEXT:    st1h { z0.h }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv8f16_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 8 x half> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv4f32_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 4 x float>, ptr %4, align 16
+  ret <vscale x 4 x float> %5
+}
+
+define void @st_nxv4f32_offset(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.s
+; CHECK-VLA-NEXT:    st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv4f32_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 4 x float> %1, ptr %5, align 16
+  ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: ld_nxv2f64_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    ldr q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 4
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+  %5 = load <vscale x 2 x double>, ptr %4, align 16
+  ret <vscale x 2 x double> %5
+}
+
+define void @st_nxv2f64_offset(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64_offset:
+; CHECK-VLA:       // %bb.0:
+; CHECK-VLA-NEXT:    ptrue p0.d
+; CHECK-VLA-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT:    ret
+;
+; CHECK-128-LABEL: st_nxv2f64_offset:
+; CHECK-128:       // %bb.0:
+; CHECK-128-NEXT:    rdvl x8, #1
+; CHECK-128-NEXT:    str q0, [x0, x8]
+; CHECK-128-NEXT:    ret
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 4
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+  store <vscale x 2 x double> %1, ptr %5, align 16
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }

@@ -237,8 +223,8 @@ define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
;
; CHECK-128-LABEL: ld_nxv16i8_offset:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ptrue p0.b
; CHECK-128-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-128-NEXT: rdvl x8, #1
rj-jesus (Contributor, Author) commented Feb 17, 2025:

The lowering for the *_offset versions could be improved, but if the equivalent ACLE code were compiled directly we wouldn't have vscale in the IR and would instead generate:

ldr	q0, [x0, #16]

and

str	q0, [x0, #16]

paulwalker-arm (Collaborator) left a comment:

Whilst not unreasonable, this PR makes me slightly uneasy because we have considerable code within the code generator that goes the other way (i.e. from fixed-length to scalable vectors) for:

  • fixed-length vectors wider than NEON,
  • operations unsupported by NEON,
  • almost all fixed-length operations when in Streaming SVE mode.

I worry how far this could travel: if it's worth removing the predicate for loads and stores, then presumably the same is true for many other instructions, and we could end up in a situation where legalisation and combines conflict.

I have the following thoughts/questions:

  • Where do the scalable vector operations come from?
    • Is this because the cost model has chosen scalable auto-vectorisation when fixed length would have been better?
      • I'd rather not complicate code generation just because LoopVectorize has made the wrong call.
    • Is this scalable vector ACLE code that wants to benefit from knowing the exact vector length?
  • If the predicate is the main concern, does emitting the SVE fill/spill instructions improve performance?
    • Which could be achieved during isel.
    • If this works then perhaps AArch64LoadStoreOpt could be taught to pair SVE spill/fill instructions when the vector length is known to be 128-bit?
  • If this is only the start of taking advantage of knowing the exact vector length then would it be better to have these transformations as a dedicated IR pass? Then the other optimisers can improve things further and the code generator should just work.

As I say, I'm not against the PR but it would be good to understand the direction of travel early to prevent tying selection in knots.

static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  EVT MemVT = LD->getMemoryVT();
  if (!Subtarget->isNeonAvailable() || !MemVT.isScalableVector() ||
paulwalker-arm (Collaborator):

I think hasNEON should be enough here because the NEON load/store instructions remain available in streaming mode.

Should the combine be restricted to little-endian only?

rj-jesus (Contributor, Author):

Thanks, I think you're right, I've changed isNeonAvailable to hasNEON.

It should indeed be restricted to little-endian only, but I believe this is already enforced in performLOADCombine (see https://github.com/llvm/llvm-project/blob/90802ec069a88ebce7de370658093ed919480fb8/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L23609). Would you rather have a check here in any case?

    return SDValue();

  // Skip unpacked types given their different layouts between Neon and SVE.
  if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
paulwalker-arm (Collaborator):

As above, I don't see AArch64::SVEBitsPerBlock being relevant. You really mean 128.

It looks like you're only considering the memory VT and ignoring the load's result type. This could be an extending load, which presumably you'll want to ignore as well?

rj-jesus (Contributor, Author):

Thanks, done as well.

You're right, thank you, I've added a check to ignore extending loads.

@@ -0,0 +1,483 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -aarch64-sve-vector-bits-max=0 < %s | FileCheck %s --check-prefix=CHECK-VLA
paulwalker-arm (Collaborator):

Given that general SVE code generation should already be well tested, does the -aarch64-sve-vector-bits-max=0 RUN line offer any new value?

rj-jesus (Contributor, Author):

Thanks, I was initially planning to use it to check the general SVE folds to LDR/STR, but you're right, there's already enough coverage for those. I've removed the -aarch64-sve-vector-bits-max=0 RUN line and cleaned up the tests a bit.

rj-jesus (Contributor, Author):

Hi @paulwalker-arm, thanks very much for your feedback. I'll address the code changes, but first I thought I could reply to your main comments.

I have the following thoughts/questions:

* Where do the scalable vector operations come from?
  * Is this scalable vector ACLE code that wants to benefit from knowing the exact vector length?

Yes, it's scalable ACLE code compiled with -msve-vector-bits=128 (see example below).

* If the predicate is the main concern, does emitting the SVE fill/spill instructions improve performance?
  * Which could be achieved during isel.
  * If this works then perhaps AArch64LoadStoreOpt could be taught to pair SVE spill/fill instructions when the vector length is known to be 128-bit?

I think this should work too. I have a patch that adds patterns for emitting SVE LDR/STR instead of PTRUE LD1/ST1. I can put it up for review if you'd like to try going this route instead. What do you think?

* If this is only the start of taking advantage of knowing the exact vector length then would it be better to have these transformations as a dedicated IR pass?  Then the other optimisers can improve things further and the code generator should just work.

Whilst having a dedicated pass could be useful, we are not aiming to pursue this "generally" for all instructions. We care about going from scalable to fixed-length vectors for loads and stores mainly to benefit from LDP/STP folds.

For example, given (https://godbolt.org/z/orbeMTon3):

#include <arm_sve.h>

svfloat64_t foo(const double *x) {
  svbool_t pg = svptrue_b64();
  return svld1_f64(pg, x) + svld1_f64(pg, x+svcntd());
}

We currently generate:

foo:
        ptrue   p0.d
        mov     x8, #2
        ld1d    { z0.d }, p0/z, [x0]
        ld1d    { z1.d }, p0/z, [x0, x8, lsl #3]
        fadd    z0.d, z0.d, z1.d
        ret

With this patch, we would instead have:

foo:
	ldp	q0, q1, [x0]
	fadd	v0.2d, v0.2d, v1.2d
	ret

Please let me know what you think. I'm happy to try the route via AArch64LoadStoreOpt you suggested if you think that's a better strategy!

paulwalker-arm (Collaborator):

@rj-jesus - From the output I see a NEON fadd, presumably because an existing combine looks through the insert subvector. If this is what you want to generate then my AArch64LoadStoreOpt suggestion will not work because it'll bypass the necessary combine.

If you're happy to limit the output changes to just the loads and stores, then experimenting with emitting SVE fill/spill instructions and extending AArch64LoadStoreOpt would be nice, because it could also optimise real spill/fill code. This would be especially beneficial across function calls.
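
As a sketch of the spill/fill scenario this refers to (a hypothetical example, not from the patch; `callee` and `across_call` are made-up names), values live across a call must be spilled and refilled, and with the vector length known to be 128 bits adjacent spills or fills could in principle be paired into Q-register LDP/STP:

```cpp
#include <arm_sve.h>

void callee(void);

// 'a' and 'b' are live across the call, so they are spilled before the call
// and filled afterwards; two adjacent 128-bit spills (or fills) are
// candidates for pairing.
svfloat64_t across_call(svfloat64_t a, svfloat64_t b) {
  callee();
  return svadd_f64_x(svptrue_b64(), a, b);
}
```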

Otherwise, given the expected reach as you've described it, I'm happy enough for you to continue down the existing path assuming others agree. However, given this specifically relates to the ACLE, can the combine be restricted to pre type-legalisation? I think that'll cover your use case whilst removing the potential conflicts I'm worried about.

rj-jesus (Contributor, Author):

From the output I see a NEON fadd, presumably because an existing combine looks through the insert subvector. If this is what you want to generate then my AArch64LoadStoreOpt suggestion will not work because it'll bypass the necessary combine.

I think the NEON FADD isn't critical to have - I'd be happy with an SVE FADD too, ideally unpredicated, which I do see emitted with the separate patch for SVE LDR/STR.

If you're happy to limit the output changes to just the loads and stores, then experimenting with emitting SVE fill/spill instructions and extending AArch64LoadStoreOpt would be nice, because it could also optimise real spill/fill code. This would be especially beneficial across function calls.

That sounds good. In that case I'll put up a separate patch for the SVE LDR/STR case and separately experiment with the AArch64LoadStoreOpt alternative. In the meantime I'll also address your comments here; I just haven't had the chance to do so today (I'm sorry).

However, given this specifically relates to the ACLE, can the combine be restricted to pre type-legalisation? I think that'll cover your use case whilst removing the potential conflicts I'm worried about.

I think that should work; in fact, I believe I tried that in an earlier version of the patch and I seem to recall it working.

rj-jesus (Contributor, Author):

Hi @paulwalker-arm, thanks again for your feedback. I should have addressed most of it now.

I'm still keen to try the route you suggested via AArch64LoadStoreOpt once #127837 gets sorted (assuming you still think that's the preferable approach).

Please let me know if you have any other comments or suggestions.

rj-jesus (Contributor, Author):

Hi @paulwalker-arm, it seems that doing this in AArch64LoadStoreOpt won't be a walk in the park as by the time we reach the pass, when compiling with -msve-vector-bits=128, it's likely the loads/stores will consist of a mix of LDRs+LD1s/STRs+ST1s as seen here. We could potentially lower the LD1s/ST1s into LDR/STR before combining them into LDP/STP. Do you think this would be preferable to the current implementation?

paulwalker-arm (Collaborator) commented Feb 28, 2025:

Hi @paulwalker-arm, it seems that doing this in AArch64LoadStoreOpt won't be a walk in the park as by the time we reach the pass, when compiling with -msve-vector-bits=128, it's likely the loads/stores will consist of a mix of LDRs+LD1s/STRs+ST1s as seen here. We could potentially lower the LD1s/ST1s into LDR/STR before combining them into LDP/STP. Do you think this would be preferable to the current implementation?

I see what you mean, the flaw being the instructions don't support a consistent set of addressing modes? Fair enough, I think for today that means we're better off keeping things simple rather than complicating AArch64LoadStoreOpt. Thanks for investigating.

That just leaves the choice over whether to have this in IR or CodeGen. Have you looked into doing this as an InstCombine? I'm hoping doing it in IR will be simpler because there are fewer corner cases to worry about.

rj-jesus (Contributor, Author):

I see what you mean, the flaw being the instructions don't support a consistent set of addressing modes?

Exactly, this seems to be a (likely unintended) side-effect of compiling for VLS SVE. When compiling for VLS SVE, VL-based offsets are folded to constants early on, which then leads to a mismatch with the addressing modes of SVE loads/stores given they generally expect offsets as multiples of VL (example). I think it would be worth "uplifting" these offsets back into multiples of VL, such that they could be folded into immediate-VL-based loads/stores, though I'm not sure where would be the best place to do this. Do you think this is worth trying, probably independently from this PR?
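
As a hypothetical illustration of the mismatch described above (`second_block` is a made-up example; the exact codegen depends on surrounding optimisations):

```cpp
#include <arm_sve.h>

// With -msve-vector-bits=128, svcntb() is known to be 16, so the offset below
// is folded to a plain constant early on and no longer matches the
// "[x0, #1, mul vl]" form expected by the SVE LD1/ST1 addressing modes.
svuint8_t second_block(const uint8_t *x) {
  return svld1(svptrue_b8(), x + svcntb());
}
```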

That just leaves the choice over whether to have this in IR or CodeGen. Have you looked into doing this as an InstCombine?

I haven't tried this yet as I wanted to limit this to the cases we knew we could fold to Neon LDR/STR to promote LDP/STP merging, though I think we could accomplish the same thing with an extra hook that would inspect the vector type and decide accordingly. Is that what you had in mind?

paulwalker-arm (Collaborator):

I haven't tried this yet as I wanted to limit this to the cases we knew we could fold to Neon LDR/STR to promote LDP/STP merging, though I think we could accomplish the same thing with an extra hook that would inspect the vector type and decide accordingly. Is that what you had in mind?

Oh sorry, for some reason I had it in my head that the source of the loads/stores would all be SVE intrinsics and thus you could extend instCombineSVELD1, but that's not always going to be the case. Please ignore my suggestion.

paulwalker-arm (Collaborator):

I think it would be worth "uplifting" these offsets back into multiples of VL, such that they could be folded into immediate-VL-based loads/stores, though I'm not sure where would be the best place to do this. Do you think this is worth trying, probably independently from this PR?

I've thought the same but it always felt tricky to pull off properly. The last time I looked at it I wondered whether it could be handled entirely during isel via complex patterns. I hesitated because I worried it could cause significant noise and make it harder for machine passes that attempt to analyse/merge offsets. This could be ill-founded, but with no performance-based motivation I figured I'd just ignore it for now. Feel free to investigate this yourself, but for my money I think you'll be better off just pushing this PR through and moving on until analysis shows a genuine need to look deeper.

I'll caveat this by saying that, despite implementing the feature, the whole -msve-vector-bits approach has never sat well with me, although I do appreciate that when maximum performance is required the ideals of scalable vectors become secondary :)

rj-jesus (Contributor, Author):

I've thought the same but it always felt tricky to pull off properly. The last time I looked at it I wondered whether it could be handled entirely during isel via complex patterns. I hesitated because I worried it could cause significant noise and make it harder for machine passes that attempt to analyse/merge offsets. This could be ill-founded, but with no performance-based motivation I figured I'd just ignore it for now. Feel free to investigate this yourself, but for my money I think you'll be better off just pushing this PR through and moving on until analysis shows a genuine need to look deeper.

I'll caveat this by saying that, despite implementing the feature, the whole -msve-vector-bits approach has never sat well with me, although I do appreciate that when maximum performance is required the ideals of scalable vectors become secondary :)

Thanks for the suggestion, I'll ask around to see if there are known cases where missing out on the addressing modes is causing problems. In any case, although that wasn't the main goal, the current patch should also help fix the addressing-mode issues for VLS 128.

For what it's worth, I'm not the biggest fan of -msve-vector-bits/VLS SVE either. I wish the best performance could always be achieved with plain SVE. I suppose -msve-vector-bits is just a stop-gap solution until we get there. :)

rj-jesus force-pushed the rjj/aarch64-sve-unpred-loads-stores-128 branch from dd74540 to 8a8fbaa on March 3, 2025 at 14:52.
rj-jesus (Contributor, Author) commented Mar 3, 2025:

I've rebased this to see if it gets through the Windows test that was seemingly failing.

In the meantime, I think we could also implement this as a few new patterns in AArch64SVEInstrInfo.td, similar to unpred_load, but that lower to LDRQui/LDRQroX instead (and likewise for stores). We would probably need a new predicate to check if sve-vector-bits=128 for this. Do you think that would be any better @paulwalker-arm?

paulwalker-arm (Collaborator):

I've rebased this to see if it gets through the Windows test that was seemingly failing.

In the meantime, I think we could also implement this as a few new patterns in AArch64SVEInstrInfo.td, similar to unpred_load, but that lower to LDRQui/LDRQroX instead (and likewise for stores). We would probably need a new predicate to check if sve-vector-bits=128 for this. Do you think that would be any better @paulwalker-arm?

Definitely not. Unless absolutely necessary, let's keep SVE VLS separate from isel. Sorry for the sporadic reviews, I will take another look at the PR tomorrow.

rj-jesus (Contributor, Author) commented Mar 3, 2025:

Definitely not. Unless absolutely necessary, let's keep SVE VLS separate from isel. Sorry for the sporadic reviews, I will take another look at the PR tomorrow.

Sounds good, thank you for the quick sanity check!

@@ -0,0 +1,129 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | not grep -e ldr -e str
paulwalker-arm (Collaborator):

I know there's an existing bug with big-endian SVE ldr/str that I'm happy to ignore for this PR, but I do think it's important to check the output for this test, if only to ensure we don't erroneously emit NEON ldr/str instructions.

Given you call out ldp/stp as a core benefit of this transformation, perhaps it's worth adding dedicated tests to verify it happens?

rj-jesus (Contributor, Author):

Thanks, I've changed the BE run line to match the full output.
I've also added a simple test for LDP/STP. Please let me know if you had more complex tests in mind.

                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  EVT MemVT = LD->getMemoryVT();
  if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
paulwalker-arm (Collaborator):

This is missing !Subtarget->isLittleEndian()?

rj-jesus (Contributor, Author) commented Mar 25, 2025:

I believe this was already checked in performLOADCombine(), but in any case I've added the explicit check here.

rj-jesus force-pushed the rjj/aarch64-sve-unpred-loads-stores-128 branch from 8a8fbaa to 70568c9 on March 25, 2025 at 17:22.
rj-jesus (Contributor, Author):

Hi @paulwalker-arm, I'm very sorry for taking so long to update this. Initially, I was waiting on #129732 to get merged to propose the alternative version that combines SVE LDR/STR into LDP/STP directly, but then that PR took a bit longer than expected and I had to put this on hold.

I believe I've addressed most of your feedback. I also have a tentative patch implementing the SVE LDR/STR pairing you initially suggested. Would you prefer I open a separate PR for this, or should I include it in this patch so we can decide on the approach to go for?

Thanks very much!

paulwalker-arm (Collaborator):

I believe I've addressed most of your feedback. I also have a tentative patch implementing the SVE LDR/STR pairing you initially suggested. Would you prefer I open a separate PR for this, or should I include it in this patch so we can decide on the approach to go for?

Please use a separate PR for the SVE LDR/STR pairing approach. Doing this means this PR can remain in reserve if for some reason we decide to reverse the current direction of travel.

rj-jesus (Contributor, Author) commented Apr 2, 2025:

Please use a separate PR for the SVE LDR/STR pairing approach. Doing this means this PR can remain in reserve if for some reason we decide to reverse the current direction of travel.

Thanks very much, I've just opened #134068 to attempt the SVE fill/spill pairing route.

rj-jesus added a commit that referenced this pull request Apr 9, 2025
…s=128. (#134068)

When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when
the offsets allow it, we can pair SVE LDR/STR instructions into Neon
LDP/STP.

For example, given:
```cpp
#include <arm_sve.h>

void foo(double const *ldp, double *stp) {
  svbool_t pg = svptrue_b64();
  svfloat64_t ld1 = svld1_f64(pg, ldp);
  svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd());
  svst1_f64(pg, stp, ld1);
  svst1_f64(pg, stp+svcntd(), ld2);
}
```

When compiled with `-msve-vector-bits=128`, we currently generate:
```gas
foo:
        ldr     z0, [x0]
        ldr     z1, [x0, #1, mul vl]
        str     z0, [x1]
        str     z1, [x1, #1, mul vl]
        ret
```

With this patch, we instead generate:
```gas
foo:
        ldp     q0, q1, [x0]
        stp     q0, q1, [x1]
        ret
```

This is an alternative, more targeted approach to #127500.