[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791

tsymalla · 2023-09-29T11:47:14Z

The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes.

When not sharing .text pages, we can use absolute relocations instead, which will possibly prevent a bubble introduced by s_getpc_b64.

llvmbot · 2023-09-29T11:48:20Z

@llvm/pr-subscribers-mc
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Changes

The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes.

Since PAL does not share .text pages, we can use absolute relocations when compiling for AMDPAL.

Patch is 418.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67791.diff

16 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+54)
(modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+4)
(modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp (+4)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+18-2)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll (+63-1)
(modified) llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll (+1-1)
(modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+10-16)
(modified) llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll (+6-12)
(modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+966-1377)
(modified) llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll (+83-112)
(modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+68-131)
(modified) llvm/test/CodeGen/AMDGPU/global-constant.ll (+10-1)
(modified) llvm/test/CodeGen/AMDGPU/lds-relocs.ll (+2-2)
(modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll (+4-10)
(modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+30-30)
(modified) llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll (+19-11)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index db226a302900160..f04f85b7deaa5b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -17,7 +17,10 @@
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -26,6 +29,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -2764,6 +2768,50 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
   return true;
  }
 
+ 
+ // Emit a ABS32_LO / ABS32_HI relocation stub.
+ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
+   Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+     int64_t Offset, MachineRegisterInfo &MRI) const {
+  bool IsDwordTy = PtrTy.getSizeInBits() == 32;
+
+  LLT S32 = LLT::scalar(32);
+
+  Register AddrDst;
+  if (IsDwordTy) {
+    AddrDst = MRI.createGenericVirtualRegister(S32);
+    MRI.setRegClass(AddrDst, &AMDGPU::SReg_32RegClass);
+  } else {
+    assert(PtrTy.getSizeInBits() == 64 &&
+           "Must provide a 64-bit pointer type!");
+    AddrDst = MRI.createGenericVirtualRegister(LLT::scalar(64));
+    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
+  }
+
+  SmallVector<Register> Operands;
+
+  Register AddrLo = MRI.createGenericVirtualRegister(S32);
+  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
+
+  B.buildInstr(AMDGPU::S_MOV_B32)
+      .addDef(AddrLo)
+      .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
+
+  Operands.push_back(AddrLo);
+
+  if (!IsDwordTy) {
+    Register AddrHi = MRI.createGenericVirtualRegister(S32);
+    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
+    B.buildInstr(AMDGPU::S_MOV_B32)
+        .addDef(AddrHi)
+        .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
+
+    Operands.push_back(AddrHi);
+  }
+
+  B.buildMergeValues(DstReg, Operands);
+ }
+
 bool AMDGPULegalizerInfo::legalizeGlobalValue(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {
@@ -2828,6 +2876,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
     return true;
   }
 
+  if (ST.isAmdPalOS()) {
+    buildAbsGlobalAddress(DstReg, Ty, B, GV, 0, MRI);
+    MI.eraseFromParent();
+    return true;
+  }
+
   const SITargetLowering *TLI = ST.getTargetLowering();
 
   if (TLI->shouldEmitFixup(GV)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index ab7fe92d6a7201e..f60c0498cb6040c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -68,6 +68,10 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
                                const GlobalValue *GV, int64_t Offset,
                                unsigned GAFlags = SIInstrInfo::MO_NONE) const;
 
+  void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
+                             const GlobalValue *GV, int64_t Offset,
+                             MachineRegisterInfo &MRI) const;
+
   bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B) const;
   bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 3f188478ca8bc66..58eed81e075560c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -63,6 +63,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
     return ELF::R_AMDGPU_REL32_HI;
   case MCSymbolRefExpr::VK_AMDGPU_REL64:
     return ELF::R_AMDGPU_REL64;
+  case MCSymbolRefExpr::VK_AMDGPU_ABS32_LO:
+    return ELF::R_AMDGPU_ABS32_LO;
+  case MCSymbolRefExpr::VK_AMDGPU_ABS32_HI:
+    return ELF::R_AMDGPU_ABS32_HI;
   }
 
   MCFixupKind Kind = Fixup.getKind();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a5..117a803e2b4970a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5708,7 +5708,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
-bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {  
+  if (Subtarget->isAmdPalOS())
+    return false;
+
   // FIXME: Either avoid relying on address space here or change the default
   // address space for functions to avoid the explicit check.
   return (GV->getValueType()->isFunctionTy() ||
@@ -6726,9 +6729,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
     return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
   }
 
+  if (Subtarget->isAmdPalOS()) {
+    SDValue AddrLo = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
+    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
+
+    SDValue AddrHi = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
+    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
+
+    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
+  }
+
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
-  else if (shouldEmitPCReloc(GV))
+  
+  if (shouldEmitPCReloc(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                    SIInstrInfo::MO_REL32);
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
index b60dd6dea7f79d8..fcaafa64bd9e996 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=GCN-PAL %s
 
 @external_constant = external addrspace(4) constant i32, align 4
 @external_constant32 = external addrspace(6) constant i32, align 4
@@ -22,6 +22,14 @@ define ptr addrspace(4) @external_constant_got() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: external_constant_got
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_constant
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_constant
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(4) @external_constant
 }
 
@@ -34,6 +42,14 @@ define ptr addrspace(1) @external_global_got() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: external_global_got
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_global
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_global
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(1) @external_global
 }
 
@@ -46,6 +62,14 @@ define ptr addrspace(999) @external_other_got() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: external_other_got
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_other
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @external_other
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(999) @external_other
 }
 
@@ -57,6 +81,14 @@ define ptr addrspace(4) @internal_constant_pcrel() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: internal_constant_pcrel
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_constant
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_constant
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(4) @internal_constant
 }
 
@@ -68,6 +100,14 @@ define ptr addrspace(1) @internal_global_pcrel() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: internal_global_pcrel
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_global
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_global
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(1) @internal_global
 }
 
@@ -79,6 +119,14 @@ define ptr addrspace(999) @internal_other_pcrel() {
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+  ; GCN-PAL-LABEL: name: internal_other_pcrel
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_other
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-hi) @internal_other
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr1 = COPY [[S_MOV_B32_1]](s32)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   ret ptr addrspace(999) @internal_other
 }
 
@@ -90,6 +138,13 @@ define ptr addrspace(6) @external_constant32_got() {
   ; GCN-NEXT:   [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[LOAD]](p4), 0
   ; GCN-NEXT:   $vgpr0 = COPY [[EXTRACT]](p6)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+
+  ; GCN-PAL-LABEL: name: external_constant32_got
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @external_constant32
+  ; GCN-PAL-NEXT:   [[MV:%[0-9]+]]:_(p6) = G_MERGE_VALUES [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[MV]](p6)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0
   ret ptr addrspace(6) @external_constant32
 }
 
@@ -100,5 +155,12 @@ define ptr addrspace(6) @internal_constant32_pcrel() {
   ; GCN-NEXT:   [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
   ; GCN-NEXT:   $vgpr0 = COPY [[EXTRACT]](p6)
   ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+
+  ; GCN-PAL-LABEL: name: internal_constant32_pcrel
+  ; GCN-PAL: bb.1 (%ir-block.0):
+  ; GCN-PAL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 target-flags(amdgpu-abs32-lo) @internal_constant32
+  ; GCN-PAL-NEXT:   [[MV:%[0-9]+]]:_(p6) = G_MERGE_VALUES [[S_MOV_B32_]](s32)
+  ; GCN-PAL-NEXT:   $vgpr0 = COPY [[MV]](p6)
+  ; GCN-PAL-NEXT:   SI_RETURN implicit $vgpr0
   ret ptr addrspace(6) @internal_constant32
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
index 2feeb83e6f1467b..c5dbfb0f219bd9b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
@@ -12,7 +12,7 @@
 
 ; ELF: Relocations [
 ; ELF-NEXT: Section (3) .rel.text {
-; ELF-NEXT: 0x{{[0-9]+}} R_AMDGPU_ABS32 doff_0_0_b{{$}}
+; ELF-NEXT: 0x{{[0-9]+}} R_AMDGPU_ABS32_LO doff_0_0_b{{$}}
 
 define amdgpu_ps void @ps_main(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 {
   %rc = call i32 @llvm.amdgcn.reloc.constant(metadata !1)
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9ec9414d91171b7..9d94f8e6ca227e8 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1335,9 +1335,9 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_mov_b32 s40, s0
 ; CI-NEXT:    s_load_dwordx4 s[40:43], s[40:41], 0x0
 ; CI-NEXT:    s_mov_b32 s14, s10
-; CI-NEXT:    s_mov_b32 s12, s8
-; CI-NEXT:    s_mov_b32 s13, s9
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_mov_b32 s12, s8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s40, s40, s11
 ; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1345,27 +1345,24 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_load_dword s6, s[4:5], 0x2
 ; CI-NEXT:    s_addc_u32 s41, s41, 0
 ; CI-NEXT:    s_add_u32 s8, s4, 12
-; CI-NEXT:    s_addc_u32 s9, s5, 0
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CI-NEXT:    s_mov_b32 s13, s9
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v40, vcc, s6, v3
-; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CI-NEXT:    ds_read_b32 v41, v40
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CI-NEXT:    s_addc_u32 s9, s5, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT:    s_mov_b32 s17, void_func_void@abs32@hi
+; CI-NEXT:    s_mov_b32 s16, void_func_void@abs32@lo
 ; CI-NEXT:    v_or_b32_e32 v31, v0, v2
 ; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b32 s39, 0xf000
 ; CI-NEXT:    s_mov_b32 s38, -1
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CI-NEXT:    ds_read_b32 v0, v40 offset:4
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1384,28 +1381,25 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_add_u32 s8, s4, 12
 ; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s6
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX9-NEXT:    ds_read_b32 v42, v41
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b32 s17, void_func_void@abs32@hi
+; GFX9-NEXT:    s_mov_b32 s16, void_func_void@abs32@lo
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v40, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-NEXT:    ds_read_b32 v0, v41 offset:4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 7c9d01db9c2c093..9ab8be0485eddbe 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -33,21 +33,18 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    v_writelane_b32 v40, s21, 17
 ; SDAG-NEXT:    v_writelane_b32 v40, s22, 18
 ; SDAG-NEXT:    v_writelane_b32 v40, s23, 19
-; SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; SDAG-NEXT:    v_writelane_b32 v40, s24, 20
 ; SDAG-NEXT:    v_writelane_b32 v40, s25, 21
-; SDAG-NEXT:    s_getpc_b64 s[34:35]
-; SDAG-NEXT:    s_add_u32 s34, s34, extern_c_func@gotpcrel32@lo+4
-; SDAG-NEXT:    s_addc_u32 s35, s35, extern_c_func@gotpcrel32@hi+12
 ; SDAG-NEXT:    v_writelane_b32 v40, s26, 22
-; SDAG-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; SDAG-NEXT:    v_writelane_b32 v40, s27, 23
 ; SDAG-NEXT:    v_writelane_b32 v40, s28, 24
 ; SDAG-NEXT:    v_writelane_b32 v40, s29, 25
 ; SDAG-NEXT:    v_writelane_b32 v40, s30, 26
+; SDAG-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
+; SDAG-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
 ; SDAG-NEXT:    s_mov_b64 s[8:9], 0
+; SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; SDAG-NEXT:    v_readlane_b32 s31, v40, 27
 ; SDAG-NEXT:    v_readlane_b32 s30, v40, 26
@@ -113,21 +110,18 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    v_writelane_b32 v40, s21, 17
 ; GISEL-NEXT:    v_writelane_b32 v40, s22, 18
 ; GISEL-NEXT:    v_writelane_b32 v40, s23, 19
-; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s24, 20
 ; GISEL-NEXT:    v_writelane_b32 v40, s25, 21
-; GISEL-NEXT:    s_getpc_b64 s[34:35]
-; GISEL-NEXT:    s_add_u32 s34, s34, extern_c_func@gotpcrel32@lo+4
-; GISEL-NEXT:    s_addc_u32 s35, s35, extern_c_func@gotpcrel32@hi+12
 ; GISEL-NEXT:    v_writelane_b32 v40, s26, 22
-; GISEL-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GISEL-NEXT:    v_writelane_b32 v40, s27, 23
 ; GISEL-NEXT:    v_writelane_b32 v40, s28, 24
 ; GISEL-NEXT:    v_writelane_b32 v40, s29, 25
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 26
+; GISEL-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
+; GISEL-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
 ; GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 27
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 26
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index c01634231403241..f827a78125b7785 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -119,10 +119,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane...
[truncated]

github-actions · 2023-09-29T11:58:02Z

✅ With the latest revision this PR passed the C/C++ code formatter.

tsymalla · 2023-09-29T13:02:20Z

Fix MC test. I am not seeing any diff with git-clang-format. What am I supposed to do? I am using clang-format 16.0.6.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

jayfoad · 2023-09-29T13:10:07Z

I can reproduce the clang-format diff just by running git clang-format @^. I have tried with two different versions, 14.0.0-1ubuntu1.1 and 18.0.0.

tsymalla · 2023-09-29T13:22:39Z

I can reproduce the clang-format diff just by running git clang-format @^. I have tried with two different versions, 14.0.0-1ubuntu1.1 and 18.0.0.

It looks like the method above my code in the GISel code had wrong intendation at the closing bracket, which caused confusion in my clang-format build. The intendation in my code was broken after applying clang-format as well. Let's see if it works now.

nhaehnle

Mostly LGTM.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

tsymalla · 2023-10-05T10:52:38Z

In GISel, try to re-use the destination register only if it has no register class set.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

tsymalla · 2023-10-06T15:04:08Z

The latest changes try to generally generate new registers iff the target does have a register class being set (in the 32-bit and 64-bit case). I wonder how conservative we need to be in terms of reg-class compatibility checks (the PCRel-Case does not do any checks like that).

The primary ISA-independent justification for using PC-relative addressing is that it makes code position-independent and therefore allows sharing of .text pages between processes. When not sharing .text pages, we can use absolute relocations instead, which will possibly prevent a bubble by using s_getpc_b64.

nhaehnle · 2023-10-06T17:38:02Z

My understanding is:

We should be accepting to already existing registers having a register class for some reason
But MergeValues works with any register class, so doesn't necessarily need a fresh register
We shouldn't need Casts, only COPYs

tsymalla · 2023-10-06T18:38:17Z

I agree with your understanding. The idea of using buildCast was that it boils down to a COPY if it can, but does some smartness in other cases. The code should only create new registers when the dest has a register class, and since sometimes we have a register (Si_cc_gfx_ret) where the data is stored in sgprs but the type is pointer, this should work as well (which will not work with sole COPY afaik). At least that’s what I experienced during my tests.

nhaehnle

Ok, understood. Thanks for explaining about the issue with integers vs. pointers.

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791

Update tests affected by upstream change: [AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791 Tests would fail until they are propagated so they are also disabled in this patch. TODO: re-enable tests once everything has propagated.

Update test affected by upstream change: [AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791 Test would fail until they are propagated so they are also disabled in this patch. TODO: re-enable tests once everything has propagated.

tsymalla requested review from jayfoad, arsenm, nhaehnle and Flakebi September 29, 2023 11:47

llvmbot added backend:AMDGPU llvm:globalisel labels Sep 29, 2023

tsymalla force-pushed the absolute_relocations branch 3 times, most recently from 6d54229 to 61d133a Compare September 29, 2023 13:01

llvmbot added the mc Machine (object) code label Sep 29, 2023

Flakebi reviewed Sep 29, 2023

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Outdated Show resolved Hide resolved

tsymalla force-pushed the absolute_relocations branch from 61d133a to 44cdc5a Compare September 29, 2023 13:18

tsymalla force-pushed the absolute_relocations branch 2 times, most recently from eb768d3 to 959bfd7 Compare October 2, 2023 06:26

tsymalla changed the title ~~[AMDGPU] Use absolute relocations when compiling for AMDPAL~~ [AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D Oct 2, 2023

nhaehnle reviewed Oct 3, 2023

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Show resolved Hide resolved

arsenm reviewed Oct 3, 2023

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Show resolved Hide resolved

tsymalla force-pushed the absolute_relocations branch 2 times, most recently from f6e71bc to 8fe5e06 Compare October 5, 2023 10:51

nhaehnle reviewed Oct 6, 2023

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Show resolved Hide resolved

tsymalla force-pushed the absolute_relocations branch from 8fe5e06 to 18a04d8 Compare October 6, 2023 15:02

tsymalla force-pushed the absolute_relocations branch from 18a04d8 to 3dbf02a Compare October 6, 2023 16:41

nhaehnle approved these changes Oct 9, 2023

View reviewed changes

tsymalla merged commit aa5158c into llvm:main Oct 10, 2023

stepthomas mentioned this pull request Oct 10, 2023

AMDGPU stepthomas atomic csub no rtn forms ver2 stepthomas/llvm-project#1

Closed

petar-avramovic added a commit to petar-avramovic/llpc that referenced this pull request Oct 12, 2023

Update shader tests after LLVM update

71e58f4

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D llvm/llvm-project#67791

petar-avramovic mentioned this pull request Oct 12, 2023

Update shader tests after LLVM update GPUOpen-Drivers/llpc#2760

Merged

petar-avramovic mentioned this pull request Oct 13, 2023

Update lgc test after LLVM update GPUOpen-Drivers/llpc#2761

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791

Uh oh!

tsymalla commented Sep 29, 2023 •

edited

Loading

Uh oh!

llvmbot commented Sep 29, 2023 •

edited

Loading

Uh oh!

github-actions bot commented Sep 29, 2023 •

edited

Loading

Uh oh!

tsymalla commented Sep 29, 2023

Uh oh!

Uh oh!

jayfoad commented Sep 29, 2023 •

edited

Loading

Uh oh!

tsymalla commented Sep 29, 2023

Uh oh!

nhaehnle left a comment

Uh oh!

Uh oh!

Uh oh!

tsymalla commented Oct 5, 2023

Uh oh!

Uh oh!

tsymalla commented Oct 6, 2023

Uh oh!

nhaehnle commented Oct 6, 2023

Uh oh!

tsymalla commented Oct 6, 2023

Uh oh!

nhaehnle left a comment

Uh oh!

Uh oh!

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791

[AMDGPU] Use absolute relocations when compiling for AMDPAL and Mesa3D #67791

Uh oh!

Conversation

tsymalla commented Sep 29, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 29, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Sep 29, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

tsymalla commented Sep 29, 2023

Uh oh!

Uh oh!

jayfoad commented Sep 29, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

tsymalla commented Sep 29, 2023

Uh oh!

nhaehnle left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

tsymalla commented Oct 5, 2023

Uh oh!

Uh oh!

tsymalla commented Oct 6, 2023

Uh oh!

nhaehnle commented Oct 6, 2023

Uh oh!

tsymalla commented Oct 6, 2023

Uh oh!

nhaehnle left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

tsymalla commented Sep 29, 2023 •

edited

Loading

llvmbot commented Sep 29, 2023 •

edited

Loading

github-actions bot commented Sep 29, 2023 •

edited

Loading

jayfoad commented Sep 29, 2023 •

edited

Loading