-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][ISel] Add more trunc store actions regarding bf16 #90493
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) — Changes: Patch is 34.07 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/90493.diff (3 Files Affected)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7f4a2437f62eb4..3124fb23fb0be7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -315,6 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
@@ -330,6 +331,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
@@ -339,17 +341,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
similarity index 93%
rename from llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll
rename to llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
index 5aaff773689f96..87a5d34b90fd76 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
@@ -27,6 +27,40 @@ entry:
ret void
}
+define void @v3(<3 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v4
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_bfe_u32 v3, v0, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v3, v3, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; CHECK-NEXT: v_bfe_u32 v3, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v3, v3, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v1, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:4
+; CHECK-NEXT: global_store_dword v[4:5], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <3 x float> %num to <3 x bfloat>
+ store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
define void @v4(<4 x float> %num, ptr addrspace(1) %p) {
; CHECK-LABEL: v4:
; CHECK: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
new file mode 100644
index 00000000000000..01e1dfc0ba0c31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
@@ -0,0 +1,632 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @v2(<2 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; CHECK-NEXT: v_and_b32_e32 v9, 1, v8
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v6, v8, v6
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v7, v1, s8, v6
+; CHECK-NEXT: v_bfe_u32 v6, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v6, v6, v7, s9
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v7, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
+; CHECK-NEXT: v_and_b32_e32 v8, 1, v7
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v7, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v0, v6, s4
+; CHECK-NEXT: global_store_dword v[4:5], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <2 x double> %num to <2 x bfloat>
+ store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v3(<3 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v9, v1, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v9, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_cvt_f32_f64_e64 v3, |v[4:5]|
+; CHECK-NEXT: v_perm_b32 v2, v0, v8, s4
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; CHECK-NEXT: v_and_b32_e32 v8, 1, v3
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v3, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v5, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[6:7], v0, off offset:4
+; CHECK-NEXT: global_store_dword v[6:7], v2, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <3 x double> %num to <3 x bfloat>
+ store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v4(<4 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[4:5]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[10:11], v12
+; CHECK-NEXT: v_and_b32_e32 v13, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[10:11]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[10:11]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v10, v12, v10
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v11, v5, s8, v10
+; CHECK-NEXT: v_bfe_u32 v10, v10, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v10, v10, v11, s9
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v11, |v[6:7]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v11
+; CHECK-NEXT: v_and_b32_e32 v12, 1, v11
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v11, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: s_mov_b32 s10, 0x7060302
+; CHECK-NEXT: v_perm_b32 v5, v4, v10, s10
+; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v4
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v6, v4, v6
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT: v_and_or_b32 v6, v1, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v6, s9
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v6, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_perm_b32 v4, v0, v4, s10
+; CHECK-NEXT: global_store_dwordx2 v[8:9], v[4:5], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <4 x double> %num to <4 x bfloat>
+ store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v20, |v[12:13]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[18:19], v20
+; CHECK-NEXT: v_and_b32_e32 v21, 1, v20
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, v[18:19]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[12:13]|, v[18:19]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; CHECK-NEXT: v_cndmask_b32_e64 v18, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v18, v20, v18
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v20, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v19, v13, s8, v18
+; CHECK-NEXT: v_bfe_u32 v18, v18, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v18, v18, v19, s9
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
+; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v19, |v[14:15]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v19
+; CHECK-NEXT: v_and_b32_e32 v20, 1, v19
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v12, v19, v12
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v19, vcc
+; CHECK-NEXT: v_and_or_b32 v13, v15, s8, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v13, s9
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: s_mov_b32 s10, 0x7060302
+; CHECK-NEXT: v_perm_b32 v13, v12, v18, s10
+; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[8:9]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
+; CHECK-NEXT: v_and_b32_e32 v18, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v14, v12, v14
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
+; CHECK-NEXT: v_and_or_b32 v14, v9, s8, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v14, s9
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v14, |v[10:11]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
+; CHECK-NEXT: v_and_b32_e32 v15, 1, v14
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v14, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v11, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[4:5]|
+; CHECK-NEXT: v_perm_b32 v12, v8, v12, s10
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v5, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[6:7]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v9, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT: v_perm_b32 v11, v4, v8, s10
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v1, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. I would also add a scalar test, just to see a single conversion.
Force-pushed from ccc29eb to 670620e (Compare)
Change-Id: Iafcee86aa45b175d1c65fb81d7d788efa3bb0c92
No description provided.