[AMDGPU] Optimize rotate instruction selection patterns #143551

Open · wants to merge 2 commits into main

4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -95,6 +95,10 @@ def gi_vinterpmods_hi :
GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
GIComplexPatternEquiv<VINTERPModsHi>;

def gi_immsub :
GIComplexOperandMatcher<s32, "selectImmSub">,
GIComplexPatternEquiv<ImmSub>;

// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3610,6 +3610,26 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}

bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src,
                                      SDValue &InvSrc) const {
  Src = In;

  // For a constant shift amount, fold (32 - amount) into an immediate.
  if (ConstantSDNode *ImmVal = dyn_cast<ConstantSDNode>(In)) {
    InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In),
                                       MVT::i32);
  } else {
    // Otherwise materialize (32 - amount) with an explicit S_SUB_U32.
    SDNode *Sub = CurDAG->getMachineNode(
        AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32,
        {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In});
    InvSrc = SDValue(Sub, 0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
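Note on the ImmSub helper above: selecting a rotate by a uniform amount needs both the amount and 32 minus the amount. A minimal C++ sketch of the identity being exploited (illustrative only; `rotr32` is a hypothetical helper, not part of the patch):

```cpp
#include <cstdint>

// rotr(x, s) == (x >> s) | (x << (32 - s)) for s in [1, 31]. SelectImmSub
// produces the (32 - s) operand (folded when s is a constant, otherwise via
// S_SUB_U32) so the SALU pattern can emit S_LSHR_B32 + S_LSHL_B32 + S_OR_B32.
uint32_t rotr32(uint32_t X, uint32_t S) {
  S &= 31;
  return S == 0 ? X : (X >> S) | (X << (32 - S));
}
```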
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -246,6 +246,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;

bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const;

SDValue getHi16Elt(SDValue In) const;

SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5831,6 +5831,31 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // For a constant shift amount, materialize (32 - amount) directly with
  // S_MOV_B32; otherwise compute it with an explicit S_SUB_U32.
  std::optional<uint64_t> Val = getConstantZext32Val(Root.getReg(), *MRI);
  if (!Val) {
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv)
        .addImm(32)
        .add(Root);
  } else {
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv)
        .addImm(32 - *Val);
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); },
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -224,6 +224,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectImmSub(MachineOperand &Root) const;

bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2046,9 +2046,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();

getActionDefinitionsBuilder({G_ROTR, G_ROTL})
.scalarize(0)
.lower();
getActionDefinitionsBuilder(G_ROTR).legalFor({S32}).scalarize(0).lower();

getActionDefinitionsBuilder(G_ROTL).scalarize(0).lower();

// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
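With this change G_ROTR becomes legal for s32 while G_ROTL is still lowered; the legalized MIR test below negates the amount and rotates right instead. A small C++ sketch of that rotl-to-rotr identity (illustrative, assumes 32-bit operands; `rotl32ViaRotr` is a hypothetical name):

```cpp
#include <cstdint>

// rotl(x, s) == rotr(x, (0 - s) & 31): negating the amount modulo 32 turns a
// left rotate into a right rotate, matching the G_SUB + G_ROTR sequence the
// lowering produces in the updated MIR test.
uint32_t rotl32ViaRotr(uint32_t X, uint32_t S) {
  uint32_t Neg = (0u - S) & 31u;
  return Neg == 0 ? X : (X >> Neg) | (X << (32 - Neg));
}
```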
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4073,6 +4073,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_ROTR:
case AMDGPU::G_ROTL: {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1630,6 +1630,8 @@ def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;

def ImmSub : ComplexPattern<untyped, 2, "SelectImmSub">;

//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2388,8 +2388,19 @@ def : AMDGPUPat <
$src1), sub1)
>;

// Uniform rotr: expand to SALU shifts plus OR, with ImmSub supplying (32 - shift).
def : AMDGPUPat <
(UniformBinFrag<rotr> i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))),
(S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv))
>;

let True16Predicate = NotHasTrue16BitInsts in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;

// Divergent rotr: select V_ALIGNBIT_B32 directly.
def : AMDGPUPat <
(DivergentBinFrag<rotr> i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_e64 $src0, $src0, $src1)
>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
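For a rough end-to-end view of what these patterns target, a hedged source-level example (the helper name and the use of the clang builtin are assumptions, not taken from the patch): a rotate whose amount is uniform across the wave should now select to the SALU S_LSHR_B32 + S_LSHL_B32 + S_OR_B32 sequence, while divergent rotates keep using V_ALIGNBIT_B32_e64.

```cpp
#include <cstdint>

// With clang, __builtin_rotateright32 lowers to llvm.fshr(x, x, s), which the
// backend treats as a 32-bit right rotate; under this patch a uniform amount
// is expected to take the SALU path and a divergent one the V_ALIGNBIT path.
uint32_t rotateRight(uint32_t Data, uint32_t Amt) {
  return __builtin_rotateright32(Data, Amt);
}
```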
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
@@ -181,8 +181,8 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32)
; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[SUB]](s32)
; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -301,14 +301,14 @@
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[SUB]](s32)
; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[SUB3]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
@@ -391,8 +391,8 @@
; GFX-NEXT: {{ $}}
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32)
; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTR %0, %1(s32)
@@ -452,11 +452,11 @@
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32)
; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32)
; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32)
; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[UV4]](s32)
; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[UV5]](s32)
; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[UV6]](s32)
; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[UV7]](s32)
; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7