[LSR] Fix matching vscale immediates (#100080)

Summary: Somewhat confusingly a `SCEVMulExpr` is a `SCEVNAryExpr`, so can have > 2 operands. Previously, the vscale immediate matching did not check the number of operands of the `SCEVMulExpr`, so would ignore any operands after the first two. This led to incorrect codegen (and results) for ArmSME in IREE (https://github.com/iree-org/iree), which sometimes addresses things that are a `vscale * vscale` multiple away. The test added with this change shows an example reduced from IREE. The second write should be offset from the first `16 * vscale * vscale` (* 4 bytes), however, previously LSR dropped the second vscale and instead offset the write by `#4, mul vl`, which is an offset of `16 * vscale` (* 4 bytes). Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250721
llvm · Jul 25, 2024 · c3e3509 · c3e3509
1 parent 9c03782
commit c3e3509
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 10 deletions.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -946,13 +946,15 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
-  } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+  } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+    if (EnableVScaleImmediates && M->getNumOperands() == 2) {
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);
           return Immediate::getScalable(C->getValue()->getSExtValue());
         }
+    }
+  }
   return Immediate::getZero();
 }
 

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,27 +384,31 @@ for.exit:
   ret void
 }
 
-;; This test demonstrates an incorrect MUL VL address calculation. Here there
-;; are two writes that should be `16 * vscale * vscale` apart, however,
-;; loop-strength-reduce has ignored the second `vscale` and offset the second
-;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+;; Here are two writes that should be `16 * vscale * vscale` apart, so MUL VL
+;; addressing cannot be used to offset the second write, as for example,
+;; `#4, mul vl` would only be an offset of `16 * vscale` (dropping a vscale).
 define void @vscale_squared_offset(ptr %alloc) #0 {
 ; COMMON-LABEL: vscale_squared_offset:
 ; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    rdvl x9, #1
 ; COMMON-NEXT:    fmov z0.s, #4.00000000
 ; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    cntw x9
+; COMMON-NEXT:    lsr x9, x9, #4
 ; COMMON-NEXT:    fmov z1.s, #8.00000000
+; COMMON-NEXT:    cntw x10
 ; COMMON-NEXT:    ptrue p0.s, vl1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    umull x9, w9, w9
+; COMMON-NEXT:    lsl x9, x9, #6
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.ge .LBB6_2
 ; COMMON-NEXT:  .LBB6_1: // %for.body
 ; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    add x11, x0, x9
 ; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
 ; COMMON-NEXT:    add x8, x8, #1
-; COMMON-NEXT:    st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT:    st1w { z1.s }, p0, [x11]
 ; COMMON-NEXT:    addvl x0, x0, #1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.lt .LBB6_1
 ; COMMON-NEXT:  .LBB6_2: // %for.exit
 ; COMMON-NEXT:    ret