[AArch64] Improve the codegen for sdiv 2 #98324

vfdff · 2024-07-10T14:06:15Z

Following PR97879: if X's size is BitWidth, then X sdiv 2 can be expressed as

  X += X >> (BitWidth - 1)   (logical shift: adds 1 only when X is negative)
  X = X >> 1                 (arithmetic shift)

Fix #97884

llvmbot · 2024-07-10T14:06:54Z

@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-selectiondag

Author: Allen (vfdff)

Changes

Following PR97879: if X's size is BitWidth, then X sdiv 2 can be expressed as

  X += X >> (BitWidth - 1)
  X = X >> 1

Fix #97884

Full diff: https://github.com/llvm/llvm-project/pull/98324.diff

4 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+23-10)
(modified) llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll (+1-2)
(modified) llvm/test/CodeGen/AArch64/sdivpow2.ll (+3-6)
(modified) llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll (+54-136)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 458f962802b4c..2b2d70976eb02 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6182,18 +6182,31 @@ SDValue TargetLowering::buildSDIVPow2WithCMov(
   SDLoc DL(N);
   SDValue N0 = N->getOperand(0);
   SDValue Zero = DAG.getConstant(0, DL, VT);
-  APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
-  SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+  SDValue CMov;
 
-  // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
-  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
-  SDValue Cmp = DAG.getSetCC(DL, CCVT, N0, Zero, ISD::SETLT);
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
-  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+  if (Lg2 == 1) {
+    // If Divisor is 2, add 1 << (BitWidth -1) to it before shifting right.
+    unsigned BitWidth = VT.getSizeInBits();
+    SDValue SignVal = DAG.getNode(ISD::SRL, DL, VT, N0,
+                                  DAG.getConstant(BitWidth - 1, DL, VT));
+    CMov = DAG.getNode(ISD::ADD, DL, VT, N0, SignVal);
+
+    Created.push_back(SignVal.getNode());
+    Created.push_back(CMov.getNode());
+  } else {
+    APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+    SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
 
-  Created.push_back(Cmp.getNode());
-  Created.push_back(Add.getNode());
-  Created.push_back(CMov.getNode());
+    // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+    EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue Cmp = DAG.getSetCC(DL, CCVT, N0, Zero, ISD::SETLT);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+    CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+    Created.push_back(Cmp.getNode());
+    Created.push_back(Add.getNode());
+    Created.push_back(CMov.getNode());
+  }
 
   // Divide by pow2.
   SDValue SRA =
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 3a17a95ed71da..6431cfc58a54d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -202,9 +202,8 @@ define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sub sp, sp, #32
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    add w8, w0, w0, lsr #31
 ; CHECK-SD-NEXT:    mov w9, wzr
-; CHECK-SD-NEXT:    cinc w8, w0, lt
 ; CHECK-SD-NEXT:    asr w8, w8, #1
 ; CHECK-SD-NEXT:  .LBB11_1: // %do.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AArch64/sdivpow2.ll b/llvm/test/CodeGen/AArch64/sdivpow2.ll
index 4619534151814..2551be8555ce6 100644
--- a/llvm/test/CodeGen/AArch64/sdivpow2.ll
+++ b/llvm/test/CodeGen/AArch64/sdivpow2.ll
@@ -90,8 +90,7 @@ define i64 @test7(i64 %x) {
 define i64 @test8(i64 %x) {
 ; ISEL-LABEL: test8:
 ; ISEL:       // %bb.0:
-; ISEL-NEXT:    cmp x0, #0
-; ISEL-NEXT:    cinc x8, x0, lt
+; ISEL-NEXT:    add x8, x0, x0, lsr #63
 ; ISEL-NEXT:    asr x0, x8, #1
 ; ISEL-NEXT:    ret
 ;
@@ -110,10 +109,8 @@ define i32 @sdiv_int(i32 %begin, i32 %first) #0 {
 ; ISEL-LABEL: sdiv_int:
 ; ISEL:       // %bb.0:
 ; ISEL-NEXT:    sub w8, w0, w1
-; ISEL-NEXT:    add w9, w8, #1
-; ISEL-NEXT:    add w10, w8, #2
-; ISEL-NEXT:    cmp w9, #0
-; ISEL-NEXT:    csinc w8, w10, w8, lt
+; ISEL-NEXT:    add w8, w8, #1
+; ISEL-NEXT:    add w8, w8, w8, lsr #31
 ; ISEL-NEXT:    sub w0, w0, w8, asr #1
 ; ISEL-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
index f7dda82885678..55742e12db6c0 100644
--- a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
+++ b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
@@ -5,177 +5,95 @@
 ; RUN:   | FileCheck -check-prefixes=CHECK,SFB %s
 
 define signext i32 @sdiv2_32(i32 signext %0) {
-; NOSFB-LABEL: sdiv2_32:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srliw a1, a0, 31
-; NOSFB-NEXT:    add a0, a0, a1
-; NOSFB-NEXT:    sraiw a0, a0, 1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sdiv2_32:
-; SFB:       # %bb.0:
-; SFB-NEXT:    bgez a0, .LBB0_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a0, a0, 1
-; SFB-NEXT:  .LBB0_2:
-; SFB-NEXT:    sraiw a0, a0, 1
-; SFB-NEXT:    ret
+; CHECK-LABEL: sdiv2_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a0, 31
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    sraiw a0, a0, 1
+; CHECK-NEXT:    ret
   %res = sdiv i32 %0, 2
   ret i32 %res
 }
 
 define signext i32 @sdivneg2_32(i32 signext %0) {
-; NOSFB-LABEL: sdivneg2_32:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srliw a1, a0, 31
-; NOSFB-NEXT:    add a0, a0, a1
-; NOSFB-NEXT:    sraiw a0, a0, 1
-; NOSFB-NEXT:    neg a0, a0
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sdivneg2_32:
-; SFB:       # %bb.0:
-; SFB-NEXT:    bgez a0, .LBB1_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a0, a0, 1
-; SFB-NEXT:  .LBB1_2:
-; SFB-NEXT:    sraiw a0, a0, 1
-; SFB-NEXT:    neg a0, a0
-; SFB-NEXT:    ret
+; CHECK-LABEL: sdivneg2_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a0, 31
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    sraiw a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
   %res = sdiv i32 %0, -2
   ret i32 %res
 }
 
 define i64 @sdiv2_64(i64 %0) {
-; NOSFB-LABEL: sdiv2_64:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srli a1, a0, 63
-; NOSFB-NEXT:    add a0, a0, a1
-; NOSFB-NEXT:    srai a0, a0, 1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sdiv2_64:
-; SFB:       # %bb.0:
-; SFB-NEXT:    bgez a0, .LBB2_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a0, a0, 1
-; SFB-NEXT:  .LBB2_2:
-; SFB-NEXT:    srai a0, a0, 1
-; SFB-NEXT:    ret
+; CHECK-LABEL: sdiv2_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli a1, a0, 63
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    srai a0, a0, 1
+; CHECK-NEXT:    ret
   %res = sdiv i64 %0, 2
   ret i64 %res
 }
 
 define i64 @sdivneg2_64(i64 %0) {
-; NOSFB-LABEL: sdivneg2_64:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srli a1, a0, 63
-; NOSFB-NEXT:    add a0, a0, a1
-; NOSFB-NEXT:    srai a0, a0, 1
-; NOSFB-NEXT:    neg a0, a0
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sdivneg2_64:
-; SFB:       # %bb.0:
-; SFB-NEXT:    bgez a0, .LBB3_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a0, a0, 1
-; SFB-NEXT:  .LBB3_2:
-; SFB-NEXT:    srai a0, a0, 1
-; SFB-NEXT:    neg a0, a0
-; SFB-NEXT:    ret
+; CHECK-LABEL: sdivneg2_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli a1, a0, 63
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    srai a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
   %res = sdiv i64 %0, -2
   ret i64 %res
 }
 
 define signext i32 @srem2_32(i32 signext %0) {
-; NOSFB-LABEL: srem2_32:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srliw a1, a0, 31
-; NOSFB-NEXT:    add a1, a1, a0
-; NOSFB-NEXT:    andi a1, a1, -2
-; NOSFB-NEXT:    subw a0, a0, a1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: srem2_32:
-; SFB:       # %bb.0:
-; SFB-NEXT:    mv a1, a0
-; SFB-NEXT:    bgez a0, .LBB4_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a1, a0, 1
-; SFB-NEXT:  .LBB4_2:
-; SFB-NEXT:    andi a1, a1, -2
-; SFB-NEXT:    subw a0, a0, a1
-; SFB-NEXT:    ret
+; CHECK-LABEL: srem2_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a0, 31
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    andi a1, a1, -2
+; CHECK-NEXT:    subw a0, a0, a1
+; CHECK-NEXT:    ret
   %res = srem i32 %0, 2
   ret i32 %res
 }
 
 define signext i32 @sremneg2_32(i32 signext %0) {
-; NOSFB-LABEL: sremneg2_32:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srliw a1, a0, 31
-; NOSFB-NEXT:    add a1, a1, a0
-; NOSFB-NEXT:    andi a1, a1, -2
-; NOSFB-NEXT:    subw a0, a0, a1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sremneg2_32:
-; SFB:       # %bb.0:
-; SFB-NEXT:    mv a1, a0
-; SFB-NEXT:    bgez a0, .LBB5_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a1, a0, 1
-; SFB-NEXT:  .LBB5_2:
-; SFB-NEXT:    andi a1, a1, -2
-; SFB-NEXT:    subw a0, a0, a1
-; SFB-NEXT:    ret
+; CHECK-LABEL: sremneg2_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a0, 31
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    andi a1, a1, -2
+; CHECK-NEXT:    subw a0, a0, a1
+; CHECK-NEXT:    ret
   %res = srem i32 %0, -2
   ret i32 %res
 }
 
 define i64 @srem2_64(i64 %0) {
-; NOSFB-LABEL: srem2_64:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srli a1, a0, 63
-; NOSFB-NEXT:    add a1, a1, a0
-; NOSFB-NEXT:    andi a1, a1, -2
-; NOSFB-NEXT:    sub a0, a0, a1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: srem2_64:
-; SFB:       # %bb.0:
-; SFB-NEXT:    mv a1, a0
-; SFB-NEXT:    bgez a0, .LBB6_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a1, a0, 1
-; SFB-NEXT:  .LBB6_2:
-; SFB-NEXT:    andi a1, a1, -2
-; SFB-NEXT:    sub a0, a0, a1
-; SFB-NEXT:    ret
+; CHECK-LABEL: srem2_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli a1, a0, 63
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    andi a1, a1, -2
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
   %res = srem i64 %0, 2
   ret i64 %res
 }
 
 define i64 @sremneg2_64(i64 %0) {
-; NOSFB-LABEL: sremneg2_64:
-; NOSFB:       # %bb.0:
-; NOSFB-NEXT:    srli a1, a0, 63
-; NOSFB-NEXT:    add a1, a1, a0
-; NOSFB-NEXT:    andi a1, a1, -2
-; NOSFB-NEXT:    sub a0, a0, a1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sremneg2_64:
-; SFB:       # %bb.0:
-; SFB-NEXT:    mv a1, a0
-; SFB-NEXT:    bgez a0, .LBB7_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    addi a1, a0, 1
-; SFB-NEXT:  .LBB7_2:
-; SFB-NEXT:    andi a1, a1, -2
-; SFB-NEXT:    sub a0, a0, a1
-; SFB-NEXT:    ret
+; CHECK-LABEL: sremneg2_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli a1, a0, 63
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    andi a1, a1, -2
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
   %res = srem i64 %0, -2
   ret i64 %res
 }

dtcxzyw · 2024-07-10T14:09:18Z

llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll

-; NOSFB-NEXT:    sraiw a0, a0, 1
-; NOSFB-NEXT:    ret
-;
-; SFB-LABEL: sdiv2_32:


On SiFive's cores with SFB (Short forward branch), bgez + addi only takes one cycle.

llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll

topperc · 2024-07-10T18:00:02Z

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
-  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+  if (Lg2 == 1) {
+    // If Divisor is 2, add 1 << (BitWidth -1) to it before shifting right.


This comment isn't accurate. It's adding (N0 >> (BitWidth - 1)).

Similar to X86, if X's size is BitWidth, then X sdiv 2 can be expressed as

  X += X >> (BitWidth - 1)
  X = X >> 1

dtcxzyw

LGTM.

Same as X86: if X's size is BitWidth, then X sdiv 2 can be expressed as

  X += X >> (BitWidth - 1)
  X = X >> 1

Fix llvm#97884

vfdff requested review from davemgreen, jasonmolenda, dtcxzyw, topperc, efriedma-quic and david-arm July 10, 2024 14:06

llvmbot added the backend:AArch64 and llvm:SelectionDAG (SelectionDAGISel as well) labels Jul 10, 2024

dtcxzyw requested changes Jul 10, 2024

View reviewed changes

topperc reviewed Jul 10, 2024

View reviewed changes

[AArch64] Improve the codegen for sdiv 2

5f26a9e

Similar to X86, if X's size is BitWidth, then X sdiv 2 can be expressed as

  X += X >> (BitWidth - 1)
  X = X >> 1

vfdff force-pushed the PR97884 branch from 1143e6f to 5f26a9e Compare July 11, 2024 10:56

dtcxzyw approved these changes Jul 11, 2024

View reviewed changes

vfdff merged commit 1bafe77 into llvm:main Jul 12, 2024
7 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64] Improve the codegen for sdiv 2 #98324

[AArch64] Improve the codegen for sdiv 2 #98324

vfdff commented Jul 10, 2024

Uh oh!

llvmbot commented Jul 10, 2024 •

edited

Loading

Uh oh!

dtcxzyw Jul 10, 2024

Uh oh!

Uh oh!

topperc Jul 10, 2024 •

edited

Loading

Uh oh!

dtcxzyw left a comment

Uh oh!

Uh oh!

Uh oh!

[AArch64] Improve the codegen for sdiv 2 #98324

[AArch64] Improve the codegen for sdiv 2 #98324

Conversation

vfdff commented Jul 10, 2024

Uh oh!

llvmbot commented Jul 10, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

dtcxzyw Jul 10, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

topperc Jul 10, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

dtcxzyw left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Jul 10, 2024 •

edited

Loading

topperc Jul 10, 2024 •

edited

Loading