Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Handle gfx950 global_load_lds_* instructions #116680

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
[],
[LLVMQualPointerType<1>, // Base global pointer to load from
LLVMQualPointerType<3>, // LDS base pointer to store to
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3329,6 +3329,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
case 4:
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return false;
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return false;
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
break;
}

MachineBasicBlock *MBB = MI.getParent();
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;

// dwordx3/dwordx4 variants of the global-to-LDS load pseudos. These are only
// available when the subtarget has GFX950 instructions.
let SubtargetPredicate = HasGFX950Insts in {
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
}

let SubtargetPredicate = isGFX12Plus in {
defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
Expand Down Expand Up @@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;

// Real encodings for the dwordx3/dwordx4 global-to-LDS loads.
// NOTE(review): both opcode arguments are identical for these two entries,
// unlike the GLOBAL_LOAD_LDS_* entries just above -- confirm this is intended.
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: why two blank lines?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whatever was in the merge


defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// hasGFX940Insts and hasGFX90AInsts are also true.
bool hasGFX950Insts() const { return GFX950Insts; }

/// Whether the dwordx3/dwordx4 loads to LDS are available on this target:
/// global_load_lds_dwordx3/global_load_lds_dwordx4, or
/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit set.
/// Currently this simply mirrors hasGFX950Insts().
bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }

bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9894,6 +9894,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case 4:
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
break;
}

auto *M = cast<MemSDNode>(Op);
Expand Down
137 changes: 137 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s

declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

;---------------------------------------------------------------------
; dwordx3
;---------------------------------------------------------------------

; 12-byte load where the global pointer is an ordinary (non-inreg) argument.
; The intrinsic is called with size 12, offset 16 and aux 1; the checks expect
; the v[0:1] address form with "offset:16 sc0".
define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
ret void
}

; 12-byte load where the global pointer is an inreg argument.
; The intrinsic is called with size 12, offset 32 and aux 2; the checks expect
; the s[0:1] base with a zeroed VGPR offset and "offset:32 nt".
define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:32 nt
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 3
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v0, s[0:1] offset:32 nt
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2)
ret void
}

; 12-byte load where the address is an inreg base pointer plus a zero-extended
; i32 %voffset. The intrinsic is called with size 12, offset 48 and aux 16; the
; checks expect the s[0:1] base with v1 as the offset and "offset:48 sc1".
define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
; GFX950-GISEL-NEXT: s_endpgm
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16)
ret void
}

;---------------------------------------------------------------------
; dwordx4
;---------------------------------------------------------------------

; 16-byte load where the global pointer is an ordinary (non-inreg) argument.
; The intrinsic is called with size 16, offset 16 and aux 1; the checks expect
; the v[0:1] address form with "offset:16 sc0".
define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
ret void
}

; 16-byte load where the global pointer is an inreg argument.
; The intrinsic is called with size 16, offset 32 and aux 2; the checks expect
; the s[0:1] base with a zeroed VGPR offset and "offset:32 nt".
define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:32 nt
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 3
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v0, s[0:1] offset:32 nt
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2)
ret void
}

; 16-byte load where the address is an inreg base pointer plus a zero-extended
; i32 %voffset. The intrinsic is called with size 16, offset 48 and aux 16; the
; checks expect the s[0:1] base with v1 as the offset and "offset:48 sc1".
define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
; GFX950-GISEL-NEXT: s_endpgm
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16)
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX950: {{.*}}
37 changes: 37 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]

global_load_lds_dwordx3 v[2:3], off

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx3 v[2:3], off sc0 nt sc1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx3 v[2:3], off offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
global_load_lds_dwordx3 v2, s[4:5] offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off sc0 nt sc1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
global_load_lds_dwordx4 v2, s[4:5] offset:4
25 changes: 25 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s

# GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00
Loading