[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-llvm-globalisel

Changes

The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes. Since PAL does not share .text pages, we can use absolute relocations when compiling for AMDPAL.

Patch is 418.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67791.diff

16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index db226a302900160..f04f85b7deaa5b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -17,7 +17,10 @@
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -26,6 +29,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -2764,6 +2768,50 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
return true;
}
+
+ // Emit a ABS32_LO / ABS32_HI relocation stub.
+ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
+ Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+ int64_t Offset, MachineRegisterInfo &MRI) const {
+ bool IsDwordTy = PtrTy.getSizeInBits() == 32;
+
+ LLT S32 = LLT::scalar(32);
+
+ Register AddrDst;
+ if (IsDwordTy) {
+ AddrDst = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(AddrDst, &AMDGPU::SReg_32RegClass);
+ } else {
+ assert(PtrTy.getSizeInBits() == 64 &&
+ "Must provide a 64-bit pointer type!");
+ AddrDst = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
+ }
+
+ SmallVector<Register> Operands;
+
+ Register AddrLo = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
+
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(AddrLo)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
+
+ Operands.push_back(AddrLo);
+
+ if (!IsDwordTy) {
+ Register AddrHi = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(AddrHi)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
+
+ Operands.push_back(AddrHi);
+ }
+
+ B.buildMergeValues(DstReg, Operands);
+ }
+
bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -2828,6 +2876,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true;
}
+ if (ST.isAmdPalOS()) {
+ buildAbsGlobalAddress(DstReg, Ty, B, GV, 0, MRI);
+ MI.eraseFromParent();
+ return true;
+ }
+
const SITargetLowering *TLI = ST.getTargetLowering();
if (TLI->shouldEmitFixup(GV)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index ab7fe92d6a7201e..f60c0498cb6040c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -68,6 +68,10 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
const GlobalValue *GV, int64_t Offset,
unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+ void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
+ const GlobalValue *GV, int64_t Offset,
+ MachineRegisterInfo &MRI) const;
+
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 3f188478ca8bc66..58eed81e075560c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -63,6 +63,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32_HI;
case MCSymbolRefExpr::VK_AMDGPU_REL64:
return ELF::R_AMDGPU_REL64;
+ case MCSymbolRefExpr::VK_AMDGPU_ABS32_LO:
+ return ELF::R_AMDGPU_ABS32_LO;
+ case MCSymbolRefExpr::VK_AMDGPU_ABS32_HI:
+ return ELF::R_AMDGPU_ABS32_HI;
}
MCFixupKind Kind = Fixup.getKind();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a5..117a803e2b4970a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5708,7 +5708,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
-bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+ if (Subtarget->isAmdPalOS())
+ return false;
+
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
@@ -6726,9 +6729,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
}
+ if (Subtarget->isAmdPalOS()) {
+ SDValue AddrLo = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
+ AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
+
+ SDValue AddrHi = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
+ AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
- else if (shouldEmitPCReloc(GV))
+
+ if (shouldEmitPCReloc(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
SIInstrInfo::MO_REL32);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
index b60dd6dea7f79d8..fcaafa64bd9e996 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN-PAL %s
@external_constant = external addrspace(4) constant i32, align 4
@external_constant32 = external addrspace(6) constant i32, align 4
@@ -22,6 +22,14 @@ define ptr addrspace(4) @external_constant_got() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: external_constant_got
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_constant
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_constant
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(4) @external_constant
}
@@ -34,6 +42,14 @@ define ptr addrspace(1) @external_global_got() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: external_global_got
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_global
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_global
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(1) @external_global
}
@@ -46,6 +62,14 @@ define ptr addrspace(999) @external_other_got() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: external_other_got
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_other
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_other
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(999) @external_other
}
@@ -57,6 +81,14 @@ define ptr addrspace(4) @internal_constant_pcrel() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: internal_constant_pcrel
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_constant
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_constant
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(4) @internal_constant
}
@@ -68,6 +100,14 @@ define ptr addrspace(1) @internal_global_pcrel() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: internal_global_pcrel
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_global
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_global
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(1) @internal_global
}
@@ -79,6 +119,14 @@ define ptr addrspace(999) @internal_other_pcrel() {
; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GCN-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+ ; GCN-PAL-LABEL: name: internal_other_pcrel
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_other
+ ; GCN-PAL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_other
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
ret ptr addrspace(999) @internal_other
}
@@ -90,6 +138,13 @@ define ptr addrspace(6) @external_constant32_got() {
; GCN-NEXT: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[LOAD]](p4), 0
; GCN-NEXT: $vgpr0 = COPY [[EXTRACT]](p6)
; GCN-NEXT: SI_RETURN implicit $vgpr0
+
+ ; GCN-PAL-LABEL: name: external_constant32_got
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_constant32
+ ; GCN-PAL-NEXT: [[MV:%[0-9]+]]:_(p6) = G_MERGE_VALUES [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[MV]](p6)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0
ret ptr addrspace(6) @external_constant32
}
@@ -100,5 +155,12 @@ define ptr addrspace(6) @internal_constant32_pcrel() {
; GCN-NEXT: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
; GCN-NEXT: $vgpr0 = COPY [[EXTRACT]](p6)
; GCN-NEXT: SI_RETURN implicit $vgpr0
+
+ ; GCN-PAL-LABEL: name: internal_constant32_pcrel
+ ; GCN-PAL: bb.1 (%ir-block.0):
+ ; GCN-PAL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_constant32
+ ; GCN-PAL-NEXT: [[MV:%[0-9]+]]:_(p6) = G_MERGE_VALUES [[S_MOV_B32_]](s32)
+ ; GCN-PAL-NEXT: $vgpr0 = COPY [[MV]](p6)
+ ; GCN-PAL-NEXT: SI_RETURN implicit $vgpr0
ret ptr addrspace(6) @internal_constant32
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
index 2feeb83e6f1467b..c5dbfb0f219bd9b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
@@ -12,7 +12,7 @@
; ELF: Relocations [
; ELF-NEXT: Section (3) .rel.text {
-; ELF-NEXT: 0x{{[0-9]+}} R_AMDGPU_ABS32 doff_0_0_b{{$}}
+; ELF-NEXT: 0x{{[0-9]+}} R_AMDGPU_ABS32_LO doff_0_0_b{{$}}
define amdgpu_ps void @ps_main(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 {
%rc = call i32 @llvm.amdgcn.reloc.constant(metadata !1)
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9ec9414d91171b7..9d94f8e6ca227e8 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1335,9 +1335,9 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_mov_b32 s40, s0
; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
; CI-NEXT: s_mov_b32 s14, s10
-; CI-NEXT: s_mov_b32 s12, s8
-; CI-NEXT: s_mov_b32 s13, s9
; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_mov_b32 s12, s8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s40, s40, s11
; CI-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1345,27 +1345,24 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_load_dword s6, s[4:5], 0x2
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_add_u32 s8, s4, 12
-; CI-NEXT: s_addc_u32 s9, s5, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
+; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; CI-NEXT: s_mov_b32 s13, s9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CI-NEXT: ds_read_b32 v41, v40
-; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; CI-NEXT: s_addc_u32 s9, s5, 0
; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
+; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
; CI-NEXT: v_or_b32_e32 v31, v0, v2
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
; CI-NEXT: s_mov_b32 s38, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CI-NEXT: ds_read_b32 v0, v40 offset:4
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1384,28 +1381,25 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s11
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_add_u32 s8, s4, 12
; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX9-NEXT: ds_read_b32 v42, v41
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
+; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_mov_b32_e32 v40, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: ds_read_b32 v0, v41 offset:4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 7c9d01db9c2c093..9ab8be0485eddbe 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -33,21 +33,18 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: v_writelane_b32 v40, s21, 17
; SDAG-NEXT: v_writelane_b32 v40, s22, 18
; SDAG-NEXT: v_writelane_b32 v40, s23, 19
-; SDAG-NEXT: s_addk_i32 s32, 0x400
; SDAG-NEXT: v_writelane_b32 v40, s24, 20
; SDAG-NEXT: v_writelane_b32 v40, s25, 21
-; SDAG-NEXT: s_getpc_b64 s[34:35]
-; SDAG-NEXT: s_add_u32 s34, s34, extern_c_func@gotpcrel32@lo+4
-; SDAG-NEXT: s_addc_u32 s35, s35, extern_c_func@gotpcrel32@hi+12
; SDAG-NEXT: v_writelane_b32 v40, s26, 22
-; SDAG-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; SDAG-NEXT: v_writelane_b32 v40, s27, 23
; SDAG-NEXT: v_writelane_b32 v40, s28, 24
; SDAG-NEXT: v_writelane_b32 v40, s29, 25
; SDAG-NEXT: v_writelane_b32 v40, s30, 26
+; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
+; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
; SDAG-NEXT: s_mov_b64 s[8:9], 0
+; SDAG-NEXT: s_addk_i32 s32, 0x400
; SDAG-NEXT: v_writelane_b32 v40, s31, 27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35]
; SDAG-NEXT: v_readlane_b32 s31, v40, 27
; SDAG-NEXT: v_readlane_b32 s30, v40, 26
@@ -113,21 +110,18 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: v_writelane_b32 v40, s21, 17
; GISEL-NEXT: v_writelane_b32 v40, s22, 18
; GISEL-NEXT: v_writelane_b32 v40, s23, 19
-; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s24, 20
; GISEL-NEXT: v_writelane_b32 v40, s25, 21
-; GISEL-NEXT: s_getpc_b64 s[34:35]
-; GISEL-NEXT: s_add_u32 s34, s34, extern_c_func@gotpcrel32@lo+4
-; GISEL-NEXT: s_addc_u32 s35, s35, extern_c_func@gotpcrel32@hi+12
; GISEL-NEXT: v_writelane_b32 v40, s26, 22
-; GISEL-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GISEL-NEXT: v_writelane_b32 v40, s27, 23
; GISEL-NEXT: v_writelane_b32 v40, s28, 24
; GISEL-NEXT: v_writelane_b32 v40, s29, 25
; GISEL-NEXT: v_writelane_b32 v40, s30, 26
+; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
+; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
; GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s31, 27
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GISEL-NEXT: v_readlane_b32 s31, v40, 27
; GISEL-NEXT: v_readlane_b32 s30, v40, 26
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index c01634231403241..f827a78125b7785 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -119,10 +119,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 6d54229 to 61d133a (Compare)
Fix MC test.

I am not seeing any diff with git-clang-format. What am I supposed to do? I am using clang-format 16.0.6.
I can reproduce the clang-format diff just by running:
Force-pushed from 61d133a to 44cdc5a (Compare)
It looks like the method above my code in the GISel file had the wrong indentation at its closing bracket, which confused my clang-format run. The indentation in my code was broken after applying clang-format as well. Let's see if it works now.
Force-pushed from eb768d3 to 959bfd7 (Compare)
Mostly LGTM.
Force-pushed from f6e71bc to 8fe5e06 (Compare)
In GISel, try to re-use the destination register only if it has no register class set.
Force-pushed from 8fe5e06 to 18a04d8 (Compare)
The latest changes generate new registers only when the destination does have a register class set (in both the 32-bit and 64-bit cases). I wonder how conservative we need to be in terms of register-class compatibility checks (the PC-relative case does not do any checks like that).
The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes. When not sharing .text pages, we can use absolute relocations instead, which can avoid a bubble introduced by s_getpc_b64.
Force-pushed from 18a04d8 to 3dbf02a (Compare)
My understanding is:
I agree with your understanding. The idea of using buildCast was that it boils down to a COPY when it can, but does something smarter in the other cases. The code should only create new registers when the destination has a register class, and since we sometimes have a register (Si_cc_gfx_ret) where the data is stored in SGPRs but the type is a pointer, this should work as well (which would not work with a plain COPY, as far as I know). At least that's what I experienced during my tests.
Ok, understood. Thanks for explaining about the issue with integers vs. pointers.
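To make the register-reuse idea above concrete, here is a minimal sketch of the scheme being discussed, assuming the includes and helper types of AMDGPULegalizerInfo.cpp. The function name, the getRegClassOrNull check, and the buildCast fallback are illustrative and may differ from what was ultimately committed; they are not the patch itself.

```cpp
// Sketch only: materialize the absolute address into fresh SGPR-classed
// registers, then write DstReg. DstReg is reused for the final
// G_MERGE_VALUES only when no register class has been assigned to it yet;
// otherwise a temporary is merged and buildCast (which degenerates to a
// COPY when it can) moves the value into DstReg.
void buildAbsGlobalAddressSketch(Register DstReg, LLT PtrTy,
                                 MachineIRBuilder &B, const GlobalValue *GV,
                                 MachineRegisterInfo &MRI) {
  const LLT S32 = LLT::scalar(32);
  SmallVector<Register, 2> Parts;

  // Low 32 bits of the absolute address, resolved by R_AMDGPU_ABS32_LO.
  Register AddrLo = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
      .addDef(AddrLo)
      .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  Parts.push_back(AddrLo);

  if (PtrTy.getSizeInBits() == 64) {
    // High 32 bits, resolved by R_AMDGPU_ABS32_HI.
    Register AddrHi = MRI.createGenericVirtualRegister(S32);
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)
        .addDef(AddrHi)
        .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
    Parts.push_back(AddrHi);
  }

  if (MRI.getRegClassOrNull(DstReg) == nullptr) {
    // DstReg is still unconstrained: merge straight into it.
    B.buildMergeValues(DstReg, Parts);
  } else {
    // DstReg already has a class (e.g. from the calling convention): build
    // into a temporary and let buildCast insert a COPY / G_INTTOPTR as needed.
    Register Tmp = MRI.createGenericVirtualRegister(PtrTy);
    B.buildMergeValues(Tmp, Parts);
    B.buildCast(DstReg, Tmp);
  }
}
```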
[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791
Update tests affected by upstream change: [AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791 Tests would fail until they are propagated so they are also disabled in this patch. TODO: re-enable tests once everything has propagated.
Update test affected by upstream change: [AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791 Test would fail until they are propagated so they are also disabled in this patch. TODO: re-enable tests once everything has propagated.
The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes.
When not sharing .text pages, we can use absolute relocations instead, which can avoid a bubble introduced by s_getpc_b64.
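For context on what the new relocations mean at load time: each s_mov_b32 carries a 32-bit literal that the loader fills with one half of the symbol's final absolute address S + A (symbol value plus addend), the low half for R_AMDGPU_ABS32_LO and the high half for R_AMDGPU_ABS32_HI. Below is a minimal sketch of that fix-up step; the struct, enum, and function names are made up for illustration and are not a real PAL or Mesa loader API.

```cpp
#include <cstdint>
#include <cstring>

// Stand-ins for the two relocation types; the real numeric values live in
// llvm/BinaryFormat/ELFRelocs/AMDGPU.def.
enum class AbsRelocKind { Abs32Lo, Abs32Hi };

struct AbsReloc {
  uint64_t Offset;   // offset of the 32-bit literal field inside .text
  AbsRelocKind Kind; // which half of the address this slot receives
  uint64_t SymValue; // S: absolute address the symbol ends up at
  int64_t Addend;    // A: addend from the relocation entry
};

// Patch one s_mov_b32 literal with the low or high half of S + A.
void applyAbs32Reloc(uint8_t *Text, const AbsReloc &R) {
  uint64_t Target = R.SymValue + static_cast<uint64_t>(R.Addend);
  uint32_t Half = (R.Kind == AbsRelocKind::Abs32Lo)
                      ? static_cast<uint32_t>(Target)        // (S + A) & 0xffffffff
                      : static_cast<uint32_t>(Target >> 32); // (S + A) >> 32
  std::memcpy(Text + R.Offset, &Half, sizeof(Half));
}
```

Because the patched literals depend on where the symbol is actually loaded, pages containing them differ between processes and cannot be shared, which is exactly why this lowering is restricted to targets that do not share .text.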