Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 38 additions & 15 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3203,11 +3203,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
switch (DefMI.getOpcode()) {
default:
return false;
case AMDGPU::V_MOV_B64_e32:
case AMDGPU::S_MOV_B64:
// TODO: We could fold 64-bit immediates, but this gets complicated
// when there are sub-registers.
return false;

case AMDGPU::V_MOV_B64_PSEUDO:
case AMDGPU::S_MOV_B64_IMM_PSEUDO:
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
Expand All @@ -3220,19 +3219,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!ImmOp->isImm())
return false;

auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
int64_t Imm = ImmOp->getImm();
switch (UseOp.getSubReg()) {
default:
return Imm;
case AMDGPU::sub0:
return Lo_32(Imm);
case AMDGPU::sub1:
return Hi_32(Imm);
case AMDGPU::lo16:
return APInt(16, Imm).getSExtValue();
case AMDGPU::hi16:
return APInt(32, Imm).ashr(16).getSExtValue();
case AMDGPU::sub1_lo16:
return APInt(16, Hi_32(Imm)).getSExtValue();
case AMDGPU::sub1_hi16:
return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: seems over-engineered to use APInt. Simpler and faster to return (int16_t)(Imm >> 48).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not the same value though? The cast to int16_t will truncate it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int16_t is signed so the implicit conversion to the return type will sign extend it.

}
};

assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");

unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");

Register DstReg = UseMI.getOperand(0).getReg();
bool Is16Bit = getOpSize(UseMI, 0) == 2;
unsigned OpSize = getOpSize(UseMI, 0);
bool Is16Bit = OpSize == 2;
bool Is64Bit = OpSize == 8;
bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
APInt Imm(32, ImmOp->getImm());

if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
Imm = Imm.ashr(16);
unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
: AMDGPU::V_MOV_B32_e32
: Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
: AMDGPU::S_MOV_B32;
APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));

if (RI.isAGPR(*MRI, DstReg)) {
if (!isInlineConstant(Imm))
if (Is64Bit || !isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
}
Expand Down Expand Up @@ -3317,7 +3342,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
return false;

const int64_t Imm = ImmOp->getImm();
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);

// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
Expand Down Expand Up @@ -3401,8 +3426,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
return false;

const int64_t Imm = ImmOp->getImm();

// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.

Expand All @@ -3413,7 +3436,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

// ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
Src2->ChangeToImmediate(getImmFor(*Src2));

// These come before src2.
removeModOperands(UseMI);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
let SchedRW = [WriteSALU, Write64Bit];
let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
let Uses = [];
let UseNamedOperandTable = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this for?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was missing, and we are querying the named operand src0 while folding.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, thanks. I did not even know there was a flag for this. I thought it happened automatically.

}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
; Function Attrs: norecurse
define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %stack, <4 x i32> %node, ptr %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, ptr addrspace(1) %arrayidx.i.i2202, ptr addrspace(1) %retval.0.i.i22089, ptr addrspace(1) %retval.1.i221310, i1 %cmp575, ptr addrspace(1) %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
; GCN-LABEL: {{^}}svm_node_closure_bsdf:
; GCN-DAG: v_writelane_b32 [[CSR_VGPR:v[0-9]+]], s30,
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
; GCN: s_movk_i32 s30, 0x60
; GCN-NOT: v_writelane_b32
; GCN: s_movk_i32 s28, 0x60
; GCN-NOT: s31
; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]],
; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]],
; GCN-NOT: v_readlane_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64 s[30:31]
entry:
Expand Down
227 changes: 226 additions & 1 deletion llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s

---
name: fold_simm_virtual
Expand Down Expand Up @@ -120,3 +120,228 @@ body: |
SI_RETURN_TO_EPILOG $vgpr0_lo16

...

---
name: fold_sreg_64_sub0_to_vgpr_32
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_sub0_to_vgpr_32
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%1:vgpr_32 = COPY killed %0.sub0
SI_RETURN_TO_EPILOG %1

...

---
name: fold_sreg_64_sub1_to_vgpr_32
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_sub1_to_vgpr_32
; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
%0:sreg_64 = S_MOV_B64 1311768467750121200
%1:vgpr_32 = COPY killed %0.sub1
SI_RETURN_TO_EPILOG %1

...

---
name: fold_vreg_64_sub1_to_vgpr_32
body: |
bb.0:

; GCN-LABEL: name: fold_vreg_64_sub1_to_vgpr_32
; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
%1:vgpr_32 = COPY killed %0.sub1
SI_RETURN_TO_EPILOG %1

...

---
name: fold_sreg_64_to_vreg_64
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_to_vreg_64
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B]]
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%1:vreg_64_align2 = COPY killed %0
SI_RETURN_TO_EPILOG %1

...

---
name: fold_sreg_64_to_sreg_64
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_to_sreg_64
; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
%0:sreg_64 = S_MOV_B64 1311768467750121200
%1:sreg_64 = COPY killed %0
SI_RETURN_TO_EPILOG %1

...

---
name: fold_sreg_64_lo16_to_sgpr_lo16
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_lo16_to_sgpr_lo16
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
; GCN-NEXT: $sgpr0 = S_MOV_B32 1
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
$sgpr0_lo16 = COPY killed %0.lo16
SI_RETURN_TO_EPILOG $sgpr0_lo16

...

---
name: fold_sreg_64_hi16_to_sgpr_lo16
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_hi16_to_sgpr_lo16
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
; GCN-NEXT: $sgpr0 = S_MOV_B32 2
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
$sgpr0_lo16 = COPY killed %0.hi16
SI_RETURN_TO_EPILOG $sgpr0_lo16

...

---
name: fold_sreg_64_sub1_lo16_to_sgpr_lo16
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_sub1_lo16_to_sgpr_lo16
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
; GCN-NEXT: $sgpr0 = S_MOV_B32 3
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
$sgpr0_lo16 = COPY killed %0.sub1_lo16
SI_RETURN_TO_EPILOG $sgpr0_lo16

...

---
name: fold_sreg_64_sub1_hi16_to_sgpr_lo16
body: |
bb.0:

; GCN-LABEL: name: fold_sreg_64_sub1_hi16_to_sgpr_lo16
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
; GCN-NEXT: $sgpr0 = S_MOV_B32 4
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1125912791875585
$sgpr0_lo16 = COPY killed %0.sub1_hi16
SI_RETURN_TO_EPILOG $sgpr0_lo16

...

---
name: fmac_sreg_64_sub0_src0_to_fmamk
tracksRegLiveness: true
body: |
bb.0:

; GCN-LABEL: name: fmac_sreg_64_sub0_src0_to_fmamk
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 2882399984, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%3:vgpr_32 = V_FMAC_F32_e64 0, %2.sub0, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
SI_RETURN_TO_EPILOG %3
...

---
name: fmac_sreg_64_sub1_src0_to_fmamk
tracksRegLiveness: true
body: |
bb.0:

; GCN-LABEL: name: fmac_sreg_64_sub1_src0_to_fmamk
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 305419896, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%3:vgpr_32 = V_FMAC_F32_e64 0, %2.sub1, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
SI_RETURN_TO_EPILOG %3
...

---
name: fmac_sreg_64_sub1_src1_to_fmaak
tracksRegLiveness: true
body: |
bb.0:

; GCN-LABEL: name: fmac_sreg_64_sub1_src1_to_fmaak
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 305419896, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, %2.sub1, 0, %1, 0, 0, implicit $mode, implicit $exec
SI_RETURN_TO_EPILOG %3
...

---
name: fma_sreg_64_sub0_to_fmaak
tracksRegLiveness: true
body: |
bb.0:

; GCN-LABEL: name: fma_sreg_64_sub0_to_fmaak
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 2882399984, implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub0, 0, 0, implicit $mode, implicit $exec
SI_RETURN_TO_EPILOG %3
...

---
name: fma_sreg_64_sub1_to_fmaak
tracksRegLiveness: true
body: |
bb.0:

; GCN-LABEL: name: fma_sreg_64_sub1_to_fmaak
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 305419896, implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub1, 0, 0, implicit $mode, implicit $exec
SI_RETURN_TO_EPILOG %3
...
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -573,8 +573,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_mov_b32_e32 v3, s35
; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
; GFX900-NEXT: s_movk_i32 s0, 0x5000
; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, 0x5000, v1
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX900-NEXT: s_movk_i32 s2, 0x7f
Expand Down Expand Up @@ -805,8 +804,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_mov_b32_e32 v2, s35
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v2, vcc
; GFX90A-NEXT: s_movk_i32 s0, 0x5000
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: s_movk_i32 s2, 0x7f
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
Expand Down