-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Add support for v_cvt_scalef32_sr instructions #117820
Conversation
aeac3cd
to
ce8c168
Compare
@llvm/pr-subscribers-mc @llvm/pr-subscribers-llvm-ir — Author: Matt Arsenault (arsenm). Changes: Co-authored-by: Shilei Tian <shilei.tian@amd.com>. Patch is 55.68 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/117820.diff — 11 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 2c617a90a4fde9..61039938267feb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -602,5 +602,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2hUifIi", "nc"
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16, "UiUiV2yUifIi", "nc", "fp4-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32, "UiUiV2fUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 691be592e3a4bc..64403f0bf94ebd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1338,3 +1338,65 @@ void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed
*out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2);
*out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3);
}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
+// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32
+// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32
+// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32
+// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32
+// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
+// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2)
+{
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 99a29dadef56de..73f3559ab05a48 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -602,6 +602,10 @@ class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMTy
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
+ [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
@@ -609,6 +613,13 @@ def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i3
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">;
+
class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
[DstTy],
[llvm_i32_ty, // src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ad100f37f8710c..158603a7aff879 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4578,6 +4578,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
case Intrinsic::amdgcn_ashr_pk_i8_i32:
case Intrinsic::amdgcn_ashr_pk_u8_i32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 119a4d63704777..b27e2529f4a807 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1779,25 +1779,27 @@ class getSDWASrcForVT <ValueType VT> {
// given VT.
class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
RegisterOperand ret =
- !cond(!eq(VT, f64) : VSrc_f64,
- !eq(VT, f32) : VSrc_f32,
- !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16),
- !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16),
- !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16),
- !eq(VT, i1) : SSrc_i1,
- !eq(VT, v2f32) : VSrc_v2f32,
- !eq(VT, v2i32) : VSrc_v2b32,
- !eq(VT, v2f16) : VSrc_v2f16,
- !eq(VT, v2bf16) : VSrc_v2bf16,
- !eq(VT, v2i16) : VSrc_v2b16,
- !eq(VT, v4f16) : AVSrc_64,
- !eq(VT, v4bf16) : AVSrc_64,
- !eq(VT.Size, 512) : VRegSrc_512,
- !eq(VT.Size, 192) : VRegSrc_192,
- !eq(VT.Size, 128) : VRegSrc_128,
- !eq(VT.Size, 96) : VRegSrc_96,
- !eq(VT.Size, 64) : VSrc_b64,
- 1 : VSrc_b32);
+ !cond(!eq(VT, f64) : VSrc_f64,
+ !eq(VT, f32) : VSrc_f32,
+ !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16),
+ !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16),
+ !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16),
+ !eq(VT, i1) : SSrc_i1,
+ !eq(VT, v2f32) : VSrc_v2f32,
+ !eq(VT, v2i32) : VSrc_v2b32,
+ !eq(VT, v2f16) : VSrc_v2f16,
+ !eq(VT, v2bf16) : VSrc_v2bf16,
+ !eq(VT, v2i16) : VSrc_v2b16,
+ !eq(VT, v4f16) : AVSrc_64,
+ !eq(VT, v4bf16) : AVSrc_64,
+ !eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VSrc_b64,
+ 1 : VSrc_b32);
}
// Returns the vreg register class to use for sources of VOP3 instructions for the
@@ -2856,6 +2858,10 @@ def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
+def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
+def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
+def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 51fdd4211a5cf6..6a349d2bf06ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1252,6 +1252,7 @@ def VRegSrc_128: SrcReg9<VReg_128, "OPW128">;
def VRegSrc_192: SrcReg9<VReg_192, "OPW192">;
def VRegSrc_256: SrcReg9<VReg_256, "OPW256">;
def VRegSrc_512: SrcReg9<VReg_512, "OPW512">;
+def VRegSrc_1024: SrcReg9<VReg_1024, "OPW1024">;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">;
// True 16 Operands
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c78f5c108e4d53..3a79532cecb917 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1117,6 +1117,12 @@ let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPExcep
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32>;
}
let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in {
@@ -2203,6 +2209,12 @@ defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3_Real_gfx9<0x258, "v_cvt_scalef32_pk32_f
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_fp6_bf16">;
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">;
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25f, "v_cvt_scalef32_sr_pk32_bf6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3_Real_gfx9<0x25e, "v_cvt_scalef32_sr_pk32_bf6_f16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3_Real_gfx9<0x255, "v_cvt_scalef32_sr_pk32_bf6_f32">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3_Real_gfx9<0x25d, "v_cvt_scalef32_sr_pk32_fp6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3_Real_gfx9<0x25c, "v_cvt_scalef32_sr_pk32_fp6_f16">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3_Real_gfx9<0x254, "v_cvt_scalef32_sr_pk32_fp6_f32">;
}
let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
new file mode 100644
index 00000000000000..3e9ac6cbe3ba6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
@@ -0,0 +1,636 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v7
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; ...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu — Author: Matt Arsenault (arsenm). Changes: Co-authored-by: Shilei Tian <shilei.tian@amd.com>. Patch is 55.68 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/117820.diff — 11 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 2c617a90a4fde9..61039938267feb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -602,5 +602,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2hUifIi", "nc"
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16, "UiUiV2yUifIi", "nc", "fp4-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32, "UiUiV2fUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 691be592e3a4bc..64403f0bf94ebd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1338,3 +1338,65 @@ void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed
*out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2);
*out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3);
}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
+// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
+// CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32
+// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32
+// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32
+// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32
+// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
+// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
+// CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2)
+{
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2);
+ *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 99a29dadef56de..73f3559ab05a48 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -602,6 +602,10 @@ class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMTy
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
+ [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
@@ -609,6 +613,13 @@ def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i3
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">;
+
class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
[DstTy],
[llvm_i32_ty, // src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ad100f37f8710c..158603a7aff879 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4578,6 +4578,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
case Intrinsic::amdgcn_ashr_pk_i8_i32:
case Intrinsic::amdgcn_ashr_pk_u8_i32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 119a4d63704777..b27e2529f4a807 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1779,25 +1779,27 @@ class getSDWASrcForVT <ValueType VT> {
// given VT.
class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
RegisterOperand ret =
- !cond(!eq(VT, f64) : VSrc_f64,
- !eq(VT, f32) : VSrc_f32,
- !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16),
- !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16),
- !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16),
- !eq(VT, i1) : SSrc_i1,
- !eq(VT, v2f32) : VSrc_v2f32,
- !eq(VT, v2i32) : VSrc_v2b32,
- !eq(VT, v2f16) : VSrc_v2f16,
- !eq(VT, v2bf16) : VSrc_v2bf16,
- !eq(VT, v2i16) : VSrc_v2b16,
- !eq(VT, v4f16) : AVSrc_64,
- !eq(VT, v4bf16) : AVSrc_64,
- !eq(VT.Size, 512) : VRegSrc_512,
- !eq(VT.Size, 192) : VRegSrc_192,
- !eq(VT.Size, 128) : VRegSrc_128,
- !eq(VT.Size, 96) : VRegSrc_96,
- !eq(VT.Size, 64) : VSrc_b64,
- 1 : VSrc_b32);
+ !cond(!eq(VT, f64) : VSrc_f64,
+ !eq(VT, f32) : VSrc_f32,
+ !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16),
+ !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16),
+ !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16),
+ !eq(VT, i1) : SSrc_i1,
+ !eq(VT, v2f32) : VSrc_v2f32,
+ !eq(VT, v2i32) : VSrc_v2b32,
+ !eq(VT, v2f16) : VSrc_v2f16,
+ !eq(VT, v2bf16) : VSrc_v2bf16,
+ !eq(VT, v2i16) : VSrc_v2b16,
+ !eq(VT, v4f16) : AVSrc_64,
+ !eq(VT, v4bf16) : AVSrc_64,
+ !eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VSrc_b64,
+ 1 : VSrc_b32);
}
// Returns the vreg register class to use for sources of VOP3 instructions for the
@@ -2856,6 +2858,10 @@ def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
+def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
+def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
+def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 51fdd4211a5cf6..6a349d2bf06ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1252,6 +1252,7 @@ def VRegSrc_128: SrcReg9<VReg_128, "OPW128">;
def VRegSrc_192: SrcReg9<VReg_192, "OPW192">;
def VRegSrc_256: SrcReg9<VReg_256, "OPW256">;
def VRegSrc_512: SrcReg9<VReg_512, "OPW512">;
+def VRegSrc_1024: SrcReg9<VReg_1024, "OPW1024">;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">;
// True 16 Operands
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c78f5c108e4d53..3a79532cecb917 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1117,6 +1117,12 @@ let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPExcep
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32>;
}
let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in {
@@ -2203,6 +2209,12 @@ defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3_Real_gfx9<0x258, "v_cvt_scalef32_pk32_f
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_fp6_bf16">;
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">;
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25f, "v_cvt_scalef32_sr_pk32_bf6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3_Real_gfx9<0x25e, "v_cvt_scalef32_sr_pk32_bf6_f16">;
+defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3_Real_gfx9<0x255, "v_cvt_scalef32_sr_pk32_bf6_f32">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3_Real_gfx9<0x25d, "v_cvt_scalef32_sr_pk32_fp6_bf16">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3_Real_gfx9<0x25c, "v_cvt_scalef32_sr_pk32_fp6_f16">;
+defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3_Real_gfx9<0x254, "v_cvt_scalef32_sr_pk32_fp6_f32">;
}
let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
new file mode 100644
index 00000000000000..3e9ac6cbe3ba6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
@@ -0,0 +1,636 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float %scale)
+declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v7
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; ...
[truncated]
|
ce8c168
to
0b0d694
Compare
Co-authored-by: Shilei Tian <shilei.tian@amd.com>
0b0d694
to
9a90e74
Compare
Co-authored-by: Shilei Tian <shilei.tian@amd.com>
Co-authored-by: Shilei Tian <shilei.tian@amd.com>