[GlobalISel] Handle div-by-pow2 #83155

shiltian · 2024-02-27T17:07:44Z

This patch adds similar handling of div-by-pow2 as in SelectionDAG.

shiltian · 2024-02-27T17:10:12Z

I'm still working on it, but I noticed that the code generated by GlobalISel is much larger than SelectionDAG's. There is definitely some overlapping code, but the GlobalISel version generates a lot of extra code.

define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
  %div = sdiv i128 %lhs, 8589934592
  ret i128 %div
}

GlobalISel:

	.text
	.section	.AMDGPU.config,"",@progbits
	.long	47176
	.long	0
	.long	47180
	.long	0
	.long	47200
	.long	0
	.long	4
	.long	0
	.long	8
	.long	0
	.text
	.globl	v_sdiv_i128_v_pow2k             ; -- Begin function v_sdiv_i128_v_pow2k
	.p2align	2
	.type	v_sdiv_i128_v_pow2k,@function
v_sdiv_i128_v_pow2k:                    ; @v_sdiv_i128_v_pow2k
; %bb.0:
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	v_ashrrev_i32_e32 v4, 31, v3
	s_mov_b32 s4, 0
	s_mov_b32 s5, 2
	s_ff1_i32_b64 s5, s[4:5]
	v_mov_b32_e32 v5, v4
	s_sub_i32 s4, 0x80, s5
	s_sub_i32 s7, 64, s4
	s_sub_i32 s6, s4, 64
	s_cmp_lt_u32 s4, 64
	v_lshlrev_b64 v[6:7], s4, v[4:5]
	v_lshrrev_b64 v[8:9], s7, v[4:5]
	s_cselect_b32 s8, 1, 0
	s_cmp_eq_u32 s4, 0
	v_lshlrev_b64 v[10:11], s6, v[4:5]
	s_cselect_b32 s4, 1, 0
	s_and_b32 s7, 1, s8
	v_or_b32_e32 v5, v8, v6
	v_cmp_ne_u32_e64 vcc_lo, 0, s7
	v_or_b32_e32 v8, v9, v7
	s_and_b32 s4, 1, s4
	s_sub_i32 s6, 64, s5
	v_cmp_ne_u32_e64 s4, 0, s4
	v_cndmask_b32_e32 v5, v10, v5, vcc_lo
	v_cndmask_b32_e32 v6, 0, v6, vcc_lo
	v_cndmask_b32_e32 v8, v11, v8, vcc_lo
	v_cndmask_b32_e32 v7, 0, v7, vcc_lo
	v_cndmask_b32_e64 v5, v5, v4, s4
	v_add_co_u32 v0, vcc_lo, v0, v6
	v_cndmask_b32_e64 v4, v8, v4, s4
	v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
	v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
	v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
	s_sub_i32 s4, s5, 64
	v_lshrrev_b64 v[4:5], s5, v[0:1]
	s_cmp_lt_u32 s5, 64
	v_lshlrev_b64 v[6:7], s6, v[2:3]
	s_cselect_b32 s6, 1, 0
	s_cmp_eq_u32 s5, 0
	v_ashrrev_i64 v[8:9], s4, v[2:3]
	s_cselect_b32 s7, 1, 0
	s_and_b32 s4, 1, s6
	v_or_b32_e32 v6, v4, v6
	v_or_b32_e32 v7, v5, v7
	v_cmp_ne_u32_e64 vcc_lo, 0, s4
	s_and_b32 s4, 1, s7
	v_ashrrev_i64 v[4:5], s5, v[2:3]
	v_cmp_ne_u32_e64 s4, 0, s4
	v_ashrrev_i32_e32 v3, 31, v3
	v_cndmask_b32_e32 v2, v8, v6, vcc_lo
	v_cndmask_b32_e32 v6, v9, v7, vcc_lo
	v_cndmask_b32_e64 v0, v2, v0, s4
	v_cndmask_b32_e64 v1, v6, v1, s4
	v_cndmask_b32_e32 v2, v3, v4, vcc_lo
	v_cndmask_b32_e32 v3, v3, v5, vcc_lo
	s_setpc_b64 s[30:31]
.Lfunc_end0:
	.size	v_sdiv_i128_v_pow2k, .Lfunc_end0-v_sdiv_i128_v_pow2k
                                        ; -- End function
	.section	.AMDGPU.csdata,"",@progbits
; Function info:
; codeLenInByte = 292
; NumSgprs: 34
; NumVgprs: 12
; ScratchSize: 0
; MemoryBound: 0
	.section	".note.GNU-stack","",@progbits
	.amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx1030"

SelectionDAG:

	.text
	.section	.AMDGPU.config,"",@progbits
	.long	47176
	.long	0
	.long	47180
	.long	0
	.long	47200
	.long	0
	.long	4
	.long	0
	.long	8
	.long	0
	.text
	.globl	v_sdiv_i128_v_pow2k             ; -- Begin function v_sdiv_i128_v_pow2k
	.p2align	2
	.type	v_sdiv_i128_v_pow2k,@function
v_sdiv_i128_v_pow2k:                    ; @v_sdiv_i128_v_pow2k
; %bb.0:
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	v_ashrrev_i32_e32 v4, 31, v3
	v_mov_b32_e32 v5, v4
	v_lshrrev_b64 v[4:5], 31, v[4:5]
	v_add_co_u32 v0, vcc_lo, v0, v4
	v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo
	v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
	v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
	v_lshrrev_b32_e32 v4, 1, v4
	v_lshlrev_b64 v[0:1], 31, v[2:3]
	v_ashrrev_i64 v[2:3], 33, v[2:3]
	v_or_b32_e32 v0, v4, v0
	s_setpc_b64 s[30:31]
.Lfunc_end0:
	.size	v_sdiv_i128_v_pow2k, .Lfunc_end0-v_sdiv_i128_v_pow2k
                                        ; -- End function
	.section	.AMDGPU.csdata,"",@progbits
; Function info:
; codeLenInByte = 68
; NumSgprs: 34
; NumVgprs: 6
; ScratchSize: 0
; MemoryBound: 0
	.section	".note.GNU-stack","",@progbits
	.amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx1030"

What could cause this? @arsenm

github-actions · 2024-02-27T17:10:13Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

redstar · 2024-02-27T21:08:05Z

llvm/include/llvm/Target/GlobalISel/Combine.td

+   [{ return Helper.matchSDivByPow2(*${root}); }]),
+  (apply [{ Helper.applySDivByPow2(*${root}); }])>;
+
+def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const, sdiv_by_pow2]>;


sdiv_by_pow2 matches a strict subset of what sdiv_by_const matches. Is there some guarantee as to which of the rules matches?
The SelectionDAG pattern matcher uses a lexical order, which can be influenced with a priority. I did not see anything about it in the docs, but I may have overlooked it.

Yeah, that's a good point. I may need to put the logic into sdiv_by_const.

I would prefer to have it separate, since my backend has no support for multiplying 2 numbers with the result being twice as large. Then sdiv_by_const results in code which is more expensive than using the sdiv instruction. However, sdiv_by_pow2 is very useful (I considered implementing it myself, so thank you for working on that).

The matching order is a separate problem, I wonder if there are already other dependencies between rules.

shiltian · 2024-02-27T21:38:28Z

After taking the advice, the code generated from GlobalISel was not changed for the case above. I need to figure out where the extra instructions come from, especially the compare instruction.

Update: It turns out that the extra code is generated by the legalizer.

redstar · 2024-02-28T15:40:27Z

After taking the advice, the code generated from GlobalISel was not changed for the case above. I need to figure out where the extra instructions come from, especially the compare instruction.

You are still generating too many instructions. For example,

 auto C1 = Builder.buildCTTZ(Ty, RHS);

RHS is a constant, and you have the value in RHSC. So instead of generating a G_CTTZ instruction, take the constant value (APInt Cst = RHSC->Value;), calculate the result, and use the resulting constant in the next step. Same with the G_SUB for the inexact value.

tschuett · 2024-02-28T15:53:25Z

AArch64 runs the combiner before and after legalization. Most combiner tests are mir files before legalization. I was hit by this file recently:
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
It runs the combiner prelegalization and selects exactly one combine.

shiltian · 2024-02-28T16:30:57Z

After taking the advice, the code generated from GlobalISel was not changed for the case above. I need to figure out where the extra instructions come from, especially the compare instruction.

You are still generating too many instructions. For example,
 auto C1 = Builder.buildCTTZ(Ty, RHS);
RHS is a constant, and you have the value in RHSC. So instead of generating a G_CTTZ instruction, take the constant value (APInt Cst = RHSC->Value;), calculate the result, and use the resulting constant in the next step. Same with the G_SUB for the inexact value.

This does it. Thanks! It looks like there is no sort of "constant folding" after this pass.

tschuett · 2024-02-28T16:43:24Z

There is constant folding while the combiner runs. I briefly scanned Utils.cpp, but CTTZ did not show up.

llvm/test/CodeGen/AMDGPU/div_i128.ll

llvmbot · 2024-02-29T00:00:02Z

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

This patch adds similar handling of div-by-pow2 as in SelectionDAG.

Fix #80671.

Patch is 138.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83155.diff

4 Files Affected:

(modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+10)
(modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+17-4)
(modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+101-1)
(modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+2281-9)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba0..c19efba984d0d9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -673,6 +673,16 @@ class CombinerHelper {
   bool matchSDivByConst(MachineInstr &MI);
   void applySDivByConst(MachineInstr &MI);
 
+  /// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant,
+  /// return expressions that implements it by shifting.
+  bool matchSDivByPow2(MachineInstr &MI);
+  void applySDivByPow2(MachineInstr &MI);
+
+  /// Given an G_UDIV \p MI expressing an unsigned divided by a pow2 constant,
+  /// return expressions that implements it by shifting.
+  bool matchUDivByPow2(MachineInstr &MI);
+  void applyUDivByPow2(MachineInstr &MI);
+
   // G_UMULH x, (1 << c)) -> x >> (bitwidth - c)
   bool matchUMulHToLShr(MachineInstr &MI);
   void applyUMulHToLShr(MachineInstr &MI);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 17757ca3e41111..1d9a60bd27e7ac 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -264,7 +264,7 @@ def combine_extracted_vector_load : GICombineRule<
   (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
         [{ return Helper.matchCombineExtractedVectorLoad(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-  
+
 def combine_indexed_load_store : GICombineRule<
   (defs root:$root, indexed_load_store_matchdata:$matchinfo),
   (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1005,7 +1005,20 @@ def sdiv_by_const : GICombineRule<
    [{ return Helper.matchSDivByConst(*${root}); }]),
   (apply [{ Helper.applySDivByConst(*${root}); }])>;
 
-def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const]>;
+def sdiv_by_pow2 : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_SDIV):$root,
+   [{ return Helper.matchSDivByPow2(*${root}); }]),
+  (apply [{ Helper.applySDivByPow2(*${root}); }])>;
+
+def udiv_by_pow2 : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UDIV):$root,
+   [{ return Helper.matchUDivByPow2(*${root}); }]),
+  (apply [{ Helper.applyUDivByPow2(*${root}); }])>;
+
+def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
+                                      sdiv_by_pow2, udiv_by_pow2]>;
 
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
@@ -1325,7 +1338,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 
 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
-    undef_combines, identity_combines, phi_combines, 
+    undef_combines, identity_combines, phi_combines,
     simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
     reassocs, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
@@ -1342,7 +1355,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
-    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, 
+    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     combine_concat_vector]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2f18a64ca285bd..d094fcd0ec3af8 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1490,7 +1490,7 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
   Observer.changedInstr(*BrCond);
 }
 
- 
+
 bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
   MachineIRBuilder HelperBuilder(MI);
   GISelObserverWrapper DummyObserver;
@@ -5286,6 +5286,106 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
   return MIB.buildMul(Ty, Res, Factor);
 }
 
+bool CombinerHelper::matchSDivByPow2(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact))
+    return false;
+  auto &SDiv = cast<GenericMachineInstr>(MI);
+  Register RHS = SDiv.getReg(2);
+  auto MatchPow2 = [&](const Constant *C) {
+    if (auto *CI = dyn_cast<ConstantInt>(C))
+      return CI->getValue().isPowerOf2() || CI->getValue().isNegatedPowerOf2();
+    return false;
+  };
+  return matchUnaryPredicate(MRI, RHS, MatchPow2, /* AllowUndefs */ false);
+}
+
+void CombinerHelper::applySDivByPow2(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+  auto &SDiv = cast<GenericMachineInstr>(MI);
+  Register Dst = SDiv.getReg(0);
+  Register LHS = SDiv.getReg(1);
+  Register RHS = SDiv.getReg(2);
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+
+  Builder.setInstrAndDebugLoc(MI);
+
+  auto RHSC = getIConstantVRegValWithLookThrough(RHS, MRI);
+  assert(RHSC.has_value() && "RHS must be a constant");
+  auto RHSCV = RHSC->Value;
+  auto Zero = Builder.buildConstant(Ty, 0);
+
+  // Special case: (sdiv X, 1) -> X
+  if (RHSCV.isOne()) {
+    replaceSingleDefInstWithReg(MI, LHS);
+    return;
+  }
+  // Special Case: (sdiv X, -1) -> 0-X
+  if (RHSCV.isAllOnes()) {
+    auto Sub = Builder.buildSub(Ty, Zero, LHS);
+    replaceSingleDefInstWithReg(MI, Sub->getOperand(0).getReg());
+    return;
+  }
+
+  unsigned Bitwidth = Ty.getScalarSizeInBits();
+  unsigned TrailingZeros = RHSCV.countTrailingZeros();
+  auto C1 = Builder.buildConstant(ShiftAmtTy, TrailingZeros);
+  auto Inexact = Builder.buildConstant(ShiftAmtTy, Bitwidth - TrailingZeros);
+  auto Sign = Builder.buildAShr(
+      Ty, LHS, Builder.buildConstant(ShiftAmtTy, Bitwidth - 1));
+  // Add (LHS < 0) ? abs2 - 1 : 0;
+  auto Srl = Builder.buildShl(Ty, Sign, Inexact);
+  auto Add = Builder.buildAdd(Ty, LHS, Srl);
+  auto Sra = Builder.buildAShr(Ty, Add, C1);
+
+  // If dividing by a positive value, we're done. Otherwise, the result must
+  // be negated.
+  auto Res = RHSCV.isNegative() ? Builder.buildSub(Ty, Zero, Sra) : Sra;
+  replaceSingleDefInstWithReg(MI, Res->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchUDivByPow2(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UDIV && "Expected UDIV");
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact))
+    return false;
+  auto &UDiv = cast<GenericMachineInstr>(MI);
+  Register RHS = UDiv.getReg(2);
+  auto MatchPow2 = [&](const Constant *C) {
+    if (auto *CI = dyn_cast<ConstantInt>(C))
+      return CI->getValue().isPowerOf2();
+    return false;
+  };
+  return matchUnaryPredicate(MRI, RHS, MatchPow2, /* AllowUndefs */ false);
+}
+
+void CombinerHelper::applyUDivByPow2(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UDIV && "Expected SDIV");
+  auto &UDiv = cast<GenericMachineInstr>(MI);
+  Register Dst = UDiv.getReg(0);
+  Register LHS = UDiv.getReg(1);
+  Register RHS = UDiv.getReg(2);
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+
+  Builder.setInstrAndDebugLoc(MI);
+
+  auto RHSC = getIConstantVRegValWithLookThrough(RHS, MRI);
+  assert(RHSC.has_value() && "RHS must be a constant");
+  auto RHSCV = RHSC->Value;
+
+  // Special case: (udiv X, 1) -> X
+  if (RHSCV.isOne()) {
+    replaceSingleDefInstWithReg(MI, LHS);
+    return;
+  }
+
+  unsigned TrailingZeros = RHSCV.countTrailingZeros();
+  auto C1 = Builder.buildConstant(ShiftAmtTy, TrailingZeros);
+  auto Res = Builder.buildLShr(Ty, LHS, C1);
+  replaceSingleDefInstWithReg(MI, Res->getOperand(0).getReg());
+}
+
 bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UMULH);
   Register RHS = MI.getOperand(2).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 2f3d5d9d140c2c..8073d06dab5970 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1,10 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s
 
-; FIXME: GlobalISel missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671
-; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s
-; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s
+; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s
 
 define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_sdiv_i128_vv:
@@ -1223,6 +1222,1158 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_sdiv_i128_vv:
+; GFX9-G:       ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
+; GFX9-G-NEXT:    v_xor_b32_e32 v0, v16, v0
+; GFX9-G-NEXT:    v_xor_b32_e32 v1, v16, v1
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v10, vcc, v0, v16
+; GFX9-G-NEXT:    v_xor_b32_e32 v2, v16, v2
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v11, vcc, v1, v16, vcc
+; GFX9-G-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-G-NEXT:    v_xor_b32_e32 v3, v16, v3
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v12, vcc, v2, v16, vcc
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v13, vcc, v3, v16, vcc
+; GFX9-G-NEXT:    v_xor_b32_e32 v0, v17, v4
+; GFX9-G-NEXT:    v_xor_b32_e32 v1, v17, v5
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v18, vcc, v0, v17
+; GFX9-G-NEXT:    v_xor_b32_e32 v2, v17, v6
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v19, vcc, v1, v17, vcc
+; GFX9-G-NEXT:    v_xor_b32_e32 v3, v17, v7
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v17, vcc
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v17, vcc
+; GFX9-G-NEXT:    v_or_b32_e32 v0, v18, v4
+; GFX9-G-NEXT:    v_or_b32_e32 v1, v19, v5
+; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT:    v_or_b32_e32 v0, v10, v12
+; GFX9-G-NEXT:    v_or_b32_e32 v1, v11, v13
+; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v1, v18
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v0, v19
+; GFX9-G-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v2, v4
+; GFX9-G-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v1, v5
+; GFX9-G-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
+; GFX9-G-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX9-G-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v2, v10
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[6:7]
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v1, v11
+; GFX9-G-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v3, v12
+; GFX9-G-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT:    v_ffbh_u32_e32 v2, v13
+; GFX9-G-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[12:13]
+; GFX9-G-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX9-G-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[6:7]
+; GFX9-G-NEXT:    v_sub_co_u32_e64 v0, s[6:7], v0, v1
+; GFX9-G-NEXT:    v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT:    v_mov_b32_e32 v6, 0x7f
+; GFX9-G-NEXT:    v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-G-NEXT:    v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
+; GFX9-G-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX9-G-NEXT:    v_cmp_lt_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT:    v_or_b32_e32 v15, v1, v3
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[6:7]
+; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[6:7]
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9-G-NEXT:    v_or_b32_e32 v20, v7, v6
+; GFX9-G-NEXT:    v_xor_b32_e32 v6, 0x7f, v0
+; GFX9-G-NEXT:    v_or_b32_e32 v14, v6, v2
+; GFX9-G-NEXT:    v_and_b32_e32 v6, 1, v20
+; GFX9-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v6, v10, 0, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v7, v11, 0, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v8, v12, 0, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v9, v13, 0, vcc
+; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-G-NEXT:    v_or_b32_e32 v14, v20, v14
+; GFX9-G-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX9-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX9-G-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-G-NEXT:    s_cbranch_execz .LBB0_6
+; GFX9-G-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX9-G-NEXT:    v_add_co_u32_e32 v20, vcc, 1, v0
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v1, vcc
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v22, vcc, 0, v2, vcc
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v23, vcc, 0, v3, vcc
+; GFX9-G-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v8, vcc, 0x7f, v0
+; GFX9-G-NEXT:    v_sub_u32_e32 v0, 64, v8
+; GFX9-G-NEXT:    v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], v8, v[12:13]
+; GFX9-G-NEXT:    v_subrev_u32_e32 v9, 64, v8
+; GFX9-G-NEXT:    v_lshlrev_b64 v[6:7], v8, v[10:11]
+; GFX9-G-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-G-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-G-NEXT:    v_lshlrev_b64 v[0:1], v9, v[10:11]
+; GFX9-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-G-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v8, v0, v12, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v9, v1, v13, vcc
+; GFX9-G-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-G-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-G-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-G-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-G-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-G-NEXT:    s_xor_b64 s[12:13], exec, s[8:9]
+; GFX9-G-NEXT:    s_cbranch_execz .LBB0_5
+; GFX9-G-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX9-G-NEXT:    v_sub_u32_e32 v2, 64, v20
+; GFX9-G-NEXT:    v_lshrrev_b64 v[0:1], v20, v[10:11]
+; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], v2, v[12:13]
+; GFX9-G-NEXT:    v_subrev_u32_e32 v24, 64, v20
+; GFX9-G-NEXT:    v_lshrrev_b64 v[14:15], v20, v[12:13]
+; GFX9-G-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-G-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-G-NEXT:    v_lshrrev_b64 v[0:1], v24, v[12:13]
+; GFX9-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v20
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e32 v15, 0, v15, vcc
+; GFX9-G-NEXT:    v_add_co_u32_e32 v24, vcc, -1, v18
+; GFX9-G-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v20
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v19, vcc
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v12, v0, v10, s[4:5]
+; GFX9-G-NEXT:    v_cndmask_b32_e64 v13, v1, v11, s[4:5]
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v26, vcc, -1, v4, vcc
+; GFX9-G-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v5, vcc
+; GFX9-G-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-G-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-G-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-G-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-G-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX9-G-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], 1, v[6:7]
+; GFX9-G-NEXT:    v_lshrrev_b32_e32 v10, 31, v7
+; GFX9-G-NEXT:    v_or_b32_e32 v6, v0, v2
+; GFX9-G-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], 1, v[12:13]
+; GFX9-G-NEXT:    v_lshrrev_b32_e32 v12, 31, v9
+; GFX9-G-NEXT:    v_lshlrev_b64 v[0:1], 1, v[14:15]
+; GFX9-G-NEXT:    v_or_b32_e32 v2, v2, v12
+; GFX9-G-NEXT:    v_lshrrev_b32_e32 v14, 31, v13
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v12, vcc, v24, v2
+; GFX9-G-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v12, vcc, v25, v3, vcc
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v12, vcc, v26, v0, vcc
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v12, vcc, v27, v1, vcc
+; GFX9-G-NEXT:    v_ashrrev_i32_e32 v28, 31, v12
+; GFX9-G-NEXT:    v_and_b32_e32 v12, v28, v18
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v12, vcc, v2, v12
+; GFX9-G-NEXT:    v_and_b32_e32 v2, v28, v19
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v13, vcc, v3, v2, vcc
+; GFX9-G-NEXT:    v_and_b32_e32 v2, v28, v4
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v14, vcc, v0, v2, vcc
+; GFX9-G-NEXT:    v_and_b32_e32 v0, v28, v5
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v15, vcc, v1, v0, vcc
+; GFX9-G-NEXT:    v_add_co_u32_e32 v20, vcc, -1, v20
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v22, vcc
+; GFX9-G-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
+; GFX9-G-NEXT:    v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT:    v_or_b32_e32 v0, v20, v22
+; GFX9-G-NEXT:    v_or_b32_e32 v1, v21, v23
+; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-G-NEXT:    v_and_b32_e32 v10, 1, v28
+; GFX9-G-NEXT:    v_mov_b32_e32 v0, v10
+; GFX9-G-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-G-NEXT:    v_mov_b32_e32 v1, v11
+; GFX9-G-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX9-G-NEXT:  ; %bb.4: ; %Flow
+; GFX9-G-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT:  .LBB0_5: ; %Flow2
+; GFX9-G-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GFX9-G-NEXT:    v_lshlrev_b64 v[2:3], 1, v[6:7]
+; GFX9-G-NEXT:    v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
+; GFX9-G-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX9-G-NEXT:    v_or_b32_e32 v6, v0, v2
+; GFX9-G-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX9-G-NEXT:  .LBB0_6: ; %Flow3
+; GFX9-G-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-G-NEXT:    v_xor_b32_e32 v3, v17, v16
+; GFX9-G-NEXT:    v_xor_b32_e32 v0, v6, v3
+; GFX9-G-NEXT:    v_xor_b32_e32 v1, v7, v3
+; GFX9-G-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-G-NEXT:    v_xor_b32_e32 v2, v8, v3
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-G-NEXT:    v_xor_b32_e32 v4, v9, v3
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GFX9-G-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-G-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_sdiv_i128_vv:
+; GFX9-G-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-G-O0-NEX...
[truncated]

shiltian · 2024-02-29T04:42:12Z

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

+
+  Builder.setInstrAndDebugLoc(MI);
+
+  auto RHSC = getIConstantVRegValWithLookThrough(RHS, MRI);


It looks like this will not work when RHS is a vector.

define <2 x i128> @v_sdiv_v2i128(<2 x i128> %num) { %result = sdiv <2 x i128> %num, <i128 8589934592, i128 4096> ret <2 x i128> %result }

We might still need to build the instructions and then fold them accordingly.

We're lacking good vector handling, and we don't really handle non-splat vectors anywhere in GlobalISel. We need some new helpers for this.

For simple cases, you can try:

std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);

std::optional<SmallVector<APInt>> = getConstantOrConstantVector(RHS, **UpperBound**);

X86 has vectors up to 512 bits. There are publicly known Arm cores with 512 bit vectors. This will seriously need an upper bound.

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

arsenm · 2024-02-29T08:32:43Z

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

+  Builder.setInstrAndDebugLoc(MI);
+
+  auto RHSC = getIConstantVRegValWithLookThrough(RHS, MRI);
+  assert(RHSC.has_value() && "RHS must be a constant");


Can bring this in as MatchData instead of re-querying

I prefer the BuildFnTy: https://reviews.llvm.org/D109357.

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/include/llvm/Target/GlobalISel/Combine.td

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

shiltian · 2024-03-18T13:12:51Z

gentle ping

jayfoad

Some of the comments from my previous review are not resolved.

llvm/include/llvm/Target/GlobalISel/Combine.td

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/lib/CodeGen/GlobalISel/Utils.cpp

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

This patch adds similar handling of div-by-pow2 as in `SelectionDAG`.

shiltian requested review from aemerson and arsenm February 27, 2024 17:07

shiltian commented Feb 27, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Show resolved Hide resolved

redstar reviewed Feb 27, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

tschuett reviewed Feb 27, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

redstar reviewed Feb 27, 2024

View reviewed changes

shiltian force-pushed the globalisel-divide-by-pow2 branch from f413999 to 4c6292a Compare February 27, 2024 21:33

shiltian force-pushed the globalisel-divide-by-pow2 branch from 4c6292a to 47372fa Compare February 27, 2024 22:52

shiltian commented Feb 28, 2024

View reviewed changes

llvm/test/CodeGen/AMDGPU/div_i128.ll Outdated Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch 2 times, most recently from e279053 to 03cfa65 Compare February 28, 2024 23:59

shiltian marked this pull request as ready for review February 28, 2024 23:59

llvmbot added backend:AMDGPU llvm:globalisel labels Feb 28, 2024

shiltian commented Feb 29, 2024

View reviewed changes

arsenm reviewed Feb 29, 2024

View reviewed changes

arsenm requested review from tschuett and Pierre-vh February 29, 2024 08:33

shiltian force-pushed the globalisel-divide-by-pow2 branch from 03cfa65 to f35e8b4 Compare February 29, 2024 15:55

shiltian commented Feb 29, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

Pierre-vh requested changes Mar 4, 2024

View reviewed changes

shiltian force-pushed the globalisel-divide-by-pow2 branch from f35e8b4 to d6c5329 Compare March 7, 2024 00:09

arsenm reviewed Mar 14, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch from f2a4618 to b02ca81 Compare March 14, 2024 12:50

arsenm reviewed Mar 15, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch from b02ca81 to f1f09f6 Compare March 15, 2024 14:08

arsenm reviewed Mar 15, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Show resolved Hide resolved

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch from f1f09f6 to 83f00c1 Compare March 15, 2024 18:02

jayfoad reviewed Mar 18, 2024

View reviewed changes

shiltian force-pushed the globalisel-divide-by-pow2 branch 3 times, most recently from dbf9e7c to 0943946 Compare March 21, 2024 04:49

arsenm reviewed Mar 21, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch from 0943946 to dfe9416 Compare March 22, 2024 18:16

shiltian commented Mar 22, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/Utils.cpp Outdated Show resolved Hide resolved

jayfoad reviewed Mar 23, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

shiltian force-pushed the globalisel-divide-by-pow2 branch 4 times, most recently from cb0c058 to f4dac86 Compare March 26, 2024 18:27

shiltian commented Mar 26, 2024

View reviewed changes

llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll Show resolved Hide resolved

shiltian mentioned this pull request Mar 26, 2024

[AMDGPU] Fix missing IsExact flag when expanding vector binary operator #86712

Merged

shiltian force-pushed the globalisel-divide-by-pow2 branch from f4dac86 to 7b23f2b Compare March 28, 2024 01:30

arsenm reviewed Mar 28, 2024

View reviewed changes

shiltian force-pushed the globalisel-divide-by-pow2 branch from 7b23f2b to 0a36edc Compare March 28, 2024 14:52

arsenm approved these changes Mar 29, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

[GlobalISel] Handle div-by-pow2

91deb92

This patch adds similar handling of div-by-pow2 as in `SelectionDAG`.

shiltian force-pushed the globalisel-divide-by-pow2 branch from 0a36edc to 91deb92 Compare March 29, 2024 14:31

shiltian merged commit 661bb9d into llvm:main Mar 29, 2024

shiltian deleted the globalisel-divide-by-pow2 branch March 29, 2024 16:41

shiltian mentioned this pull request Mar 29, 2024

GlobalISel missing divide-by-pow2 special case expansions #80671

Closed


		Builder.setInstrAndDebugLoc(MI);

		auto RHSC = getIConstantVRegValWithLookThrough(RHS, MRI);

[GlobalISel] Handle div-by-pow2 #83155

[GlobalISel] Handle div-by-pow2 #83155

Uh oh!

Conversation

shiltian commented Feb 27, 2024 • edited

Uh oh!

Uh oh!

shiltian commented Feb 27, 2024 • edited

Uh oh!

Uh oh!

github-actions bot commented Feb 27, 2024 • edited

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

redstar Feb 27, 2024 • edited

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

shiltian commented Feb 27, 2024 • edited

Uh oh!

Uh oh!

redstar commented Feb 28, 2024

Uh oh!

tschuett commented Feb 28, 2024

Uh oh!

shiltian commented Feb 28, 2024

Uh oh!

tschuett commented Feb 28, 2024

Uh oh!

Uh oh!

llvmbot commented Feb 29, 2024 • edited

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

shiltian commented Mar 18, 2024

Uh oh!

jayfoad left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

shiltian commented Feb 27, 2024 •

edited

Loading

shiltian commented Feb 27, 2024 •

edited

Loading

github-actions bot commented Feb 27, 2024 •

edited

Loading

redstar Feb 27, 2024 •

edited

Loading

shiltian commented Feb 27, 2024 •

edited

Loading

llvmbot commented Feb 29, 2024 •

edited

Loading