@@ -2090,69 +2090,69 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
20902090; GCN-LABEL: dyn_extract_v16f64_s_s:
20912091; GCN: ; %bb.0: ; %entry
20922092; GCN-NEXT: s_mov_b32 s66, 0
2093+ ; GCN-NEXT: s_mov_b32 s64, 0
2094+ ; GCN-NEXT: s_mov_b32 s62, 0
2095+ ; GCN-NEXT: s_mov_b32 s60, 0
2096+ ; GCN-NEXT: s_mov_b32 s58, 0
2097+ ; GCN-NEXT: s_mov_b32 s56, 0
2098+ ; GCN-NEXT: s_mov_b32 s54, 0
2099+ ; GCN-NEXT: s_mov_b32 s52, 0
2100+ ; GCN-NEXT: s_mov_b32 s50, 0
2101+ ; GCN-NEXT: s_mov_b32 s48, 0
2102+ ; GCN-NEXT: s_mov_b32 s46, 0
2103+ ; GCN-NEXT: s_mov_b32 s44, 0
2104+ ; GCN-NEXT: s_mov_b32 s40, 0
20932105; GCN-NEXT: s_mov_b64 s[36:37], 1.0
20942106; GCN-NEXT: s_mov_b32 m0, s2
20952107; GCN-NEXT: s_mov_b32 s67, 0x40300000
20962108; GCN-NEXT: s_mov_b32 s65, 0x402e0000
2097- ; GCN-NEXT: s_mov_b32 s64, s66
20982109; GCN-NEXT: s_mov_b32 s63, 0x402c0000
2099- ; GCN-NEXT: s_mov_b32 s62, s66
21002110; GCN-NEXT: s_mov_b32 s61, 0x402a0000
2101- ; GCN-NEXT: s_mov_b32 s60, s66
21022111; GCN-NEXT: s_mov_b32 s59, 0x40280000
2103- ; GCN-NEXT: s_mov_b32 s58, s66
21042112; GCN-NEXT: s_mov_b32 s57, 0x40260000
2105- ; GCN-NEXT: s_mov_b32 s56, s66
21062113; GCN-NEXT: s_mov_b32 s55, 0x40240000
2107- ; GCN-NEXT: s_mov_b32 s54, s66
21082114; GCN-NEXT: s_mov_b32 s53, 0x40220000
2109- ; GCN-NEXT: s_mov_b32 s52, s66
21102115; GCN-NEXT: s_mov_b32 s51, 0x40200000
2111- ; GCN-NEXT: s_mov_b32 s50, s66
21122116; GCN-NEXT: s_mov_b32 s49, 0x401c0000
2113- ; GCN-NEXT: s_mov_b32 s48, s66
21142117; GCN-NEXT: s_mov_b32 s47, 0x40180000
2115- ; GCN-NEXT: s_mov_b32 s46, s66
21162118; GCN-NEXT: s_mov_b32 s45, 0x40140000
2117- ; GCN-NEXT: s_mov_b32 s44, s66
21182119; GCN-NEXT: s_mov_b64 s[42:43], 4.0
21192120; GCN-NEXT: s_mov_b32 s41, 0x40080000
2120- ; GCN-NEXT: s_mov_b32 s40, s66
21212121; GCN-NEXT: s_mov_b64 s[38:39], 2.0
21222122; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37]
21232123; GCN-NEXT: ; return to shader part epilog
21242124;
21252125; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s:
21262126; GFX10PLUS: ; %bb.0: ; %entry
2127- ; GFX10PLUS-NEXT: s_mov_b32 s66, 0
21282127; GFX10PLUS-NEXT: s_mov_b64 s[36:37], 1.0
21292128; GFX10PLUS-NEXT: s_mov_b32 m0, s2
2129+ ; GFX10PLUS-NEXT: s_mov_b32 s66, 0
2130+ ; GFX10PLUS-NEXT: s_mov_b32 s64, 0
2131+ ; GFX10PLUS-NEXT: s_mov_b32 s62, 0
2132+ ; GFX10PLUS-NEXT: s_mov_b32 s60, 0
2133+ ; GFX10PLUS-NEXT: s_mov_b32 s58, 0
2134+ ; GFX10PLUS-NEXT: s_mov_b32 s56, 0
2135+ ; GFX10PLUS-NEXT: s_mov_b32 s54, 0
2136+ ; GFX10PLUS-NEXT: s_mov_b32 s52, 0
2137+ ; GFX10PLUS-NEXT: s_mov_b32 s50, 0
2138+ ; GFX10PLUS-NEXT: s_mov_b32 s48, 0
2139+ ; GFX10PLUS-NEXT: s_mov_b32 s46, 0
2140+ ; GFX10PLUS-NEXT: s_mov_b32 s44, 0
2141+ ; GFX10PLUS-NEXT: s_mov_b32 s40, 0
21302142; GFX10PLUS-NEXT: s_mov_b32 s67, 0x40300000
21312143; GFX10PLUS-NEXT: s_mov_b32 s65, 0x402e0000
2132- ; GFX10PLUS-NEXT: s_mov_b32 s64, s66
21332144; GFX10PLUS-NEXT: s_mov_b32 s63, 0x402c0000
2134- ; GFX10PLUS-NEXT: s_mov_b32 s62, s66
21352145; GFX10PLUS-NEXT: s_mov_b32 s61, 0x402a0000
2136- ; GFX10PLUS-NEXT: s_mov_b32 s60, s66
21372146; GFX10PLUS-NEXT: s_mov_b32 s59, 0x40280000
2138- ; GFX10PLUS-NEXT: s_mov_b32 s58, s66
21392147; GFX10PLUS-NEXT: s_mov_b32 s57, 0x40260000
2140- ; GFX10PLUS-NEXT: s_mov_b32 s56, s66
21412148; GFX10PLUS-NEXT: s_mov_b32 s55, 0x40240000
2142- ; GFX10PLUS-NEXT: s_mov_b32 s54, s66
21432149; GFX10PLUS-NEXT: s_mov_b32 s53, 0x40220000
2144- ; GFX10PLUS-NEXT: s_mov_b32 s52, s66
21452150; GFX10PLUS-NEXT: s_mov_b32 s51, 0x40200000
2146- ; GFX10PLUS-NEXT: s_mov_b32 s50, s66
21472151; GFX10PLUS-NEXT: s_mov_b32 s49, 0x401c0000
2148- ; GFX10PLUS-NEXT: s_mov_b32 s48, s66
21492152; GFX10PLUS-NEXT: s_mov_b32 s47, 0x40180000
2150- ; GFX10PLUS-NEXT: s_mov_b32 s46, s66
21512153; GFX10PLUS-NEXT: s_mov_b32 s45, 0x40140000
2152- ; GFX10PLUS-NEXT: s_mov_b32 s44, s66
21532154; GFX10PLUS-NEXT: s_mov_b64 s[42:43], 4.0
21542155; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40080000
2155- ; GFX10PLUS-NEXT: s_mov_b32 s40, s66
21562156; GFX10PLUS-NEXT: s_mov_b64 s[38:39], 2.0
21572157; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[36:37]
21582158; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3085,10 +3085,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
30853085; GPRIDX-NEXT: ; %bb.0: ; %entry
30863086; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
30873087; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
3088+ ; GPRIDX-NEXT: s_mov_b32 s4, 0
3089+ ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
30883090; GPRIDX-NEXT: s_mov_b32 s2, 0
30893091; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000
3090- ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
3091- ; GPRIDX-NEXT: s_mov_b32 s4, s2
30923092; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
30933093; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
30943094; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3176,10 +3176,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
31763176; MOVREL-NEXT: ; %bb.0: ; %entry
31773177; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
31783178; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
3179+ ; MOVREL-NEXT: s_mov_b32 s4, 0
3180+ ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
31793181; MOVREL-NEXT: s_mov_b32 s2, 0
31803182; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
3181- ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
3182- ; MOVREL-NEXT: s_mov_b32 s4, s2
31833183; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
31843184; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
31853185; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32073207; GFX10-NEXT: kernel_code_entry_byte_offset = 256
32083208; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
32093209; GFX10-NEXT: granulated_workitem_vgpr_count = 0
3210- ; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
3210+ ; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
32113211; GFX10-NEXT: priority = 0
32123212; GFX10-NEXT: float_mode = 240
32133213; GFX10-NEXT: priv = 0
@@ -3250,7 +3250,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32503250; GFX10-NEXT: gds_segment_byte_size = 0
32513251; GFX10-NEXT: kernarg_segment_byte_size = 12
32523252; GFX10-NEXT: workgroup_fbarrier_count = 0
3253- ; GFX10-NEXT: wavefront_sgpr_count = 9
3253+ ; GFX10-NEXT: wavefront_sgpr_count = 7
32543254; GFX10-NEXT: workitem_vgpr_count = 3
32553255; GFX10-NEXT: reserved_vgpr_first = 0
32563256; GFX10-NEXT: reserved_vgpr_count = 0
@@ -3267,22 +3267,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32673267; GFX10-NEXT: .end_amd_kernel_code_t
32683268; GFX10-NEXT: ; %bb.0: ; %entry
32693269; GFX10-NEXT: s_clause 0x1
3270- ; GFX10-NEXT: s_load_dword s8 , s[4:5], 0x8
3270+ ; GFX10-NEXT: s_load_dword s6 , s[4:5], 0x8
32713271; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
32723272; GFX10-NEXT: s_mov_b32 s2, 0
3273- ; GFX10-NEXT: s_mov_b32 s3, 0x40140000
3274- ; GFX10-NEXT: s_mov_b32 s5, 0x40080000
3275- ; GFX10-NEXT: s_mov_b32 s4, s2
3273+ ; GFX10-NEXT: s_mov_b32 s3, 0x40080000
32763274; GFX10-NEXT: v_mov_b32_e32 v2, 0
32773275; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3278- ; GFX10-NEXT: s_cmp_eq_u32 s8, 1
3279- ; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
3280- ; GFX10-NEXT: s_cmp_eq_u32 s8, 2
3281- ; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
3282- ; GFX10-NEXT: s_cmp_eq_u32 s8, 3
3283- ; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
3284- ; GFX10-NEXT: s_cmp_eq_u32 s8, 4
3276+ ; GFX10-NEXT: s_cmp_eq_u32 s6, 1
3277+ ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
3278+ ; GFX10-NEXT: s_cmp_eq_u32 s6, 2
32853279; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
3280+ ; GFX10-NEXT: s_cmp_eq_u32 s6, 3
3281+ ; GFX10-NEXT: s_mov_b32 s4, 0
3282+ ; GFX10-NEXT: s_mov_b32 s5, 0x40140000
3283+ ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
3284+ ; GFX10-NEXT: s_cmp_eq_u32 s6, 4
3285+ ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
32863286; GFX10-NEXT: v_mov_b32_e32 v0, s2
32873287; GFX10-NEXT: v_mov_b32_e32 v1, s3
32883288; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3299,7 +3299,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32993299; GFX11-NEXT: kernel_code_entry_byte_offset = 256
33003300; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
33013301; GFX11-NEXT: granulated_workitem_vgpr_count = 0
3302- ; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
3302+ ; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
33033303; GFX11-NEXT: priority = 0
33043304; GFX11-NEXT: float_mode = 240
33053305; GFX11-NEXT: priv = 0
@@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
33423342; GFX11-NEXT: gds_segment_byte_size = 0
33433343; GFX11-NEXT: kernarg_segment_byte_size = 12
33443344; GFX11-NEXT: workgroup_fbarrier_count = 0
3345- ; GFX11-NEXT: wavefront_sgpr_count = 9
3345+ ; GFX11-NEXT: wavefront_sgpr_count = 7
33463346; GFX11-NEXT: workitem_vgpr_count = 3
33473347; GFX11-NEXT: reserved_vgpr_first = 0
33483348; GFX11-NEXT: reserved_vgpr_count = 0
@@ -3359,22 +3359,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
33593359; GFX11-NEXT: .end_amd_kernel_code_t
33603360; GFX11-NEXT: ; %bb.0: ; %entry
33613361; GFX11-NEXT: s_clause 0x1
3362- ; GFX11-NEXT: s_load_b32 s8 , s[0:1], 0x8
3362+ ; GFX11-NEXT: s_load_b32 s6 , s[0:1], 0x8
33633363; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
33643364; GFX11-NEXT: s_mov_b32 s2, 0
3365- ; GFX11-NEXT: s_mov_b32 s3, 0x40140000
3366- ; GFX11-NEXT: s_mov_b32 s5, 0x40080000
3367- ; GFX11-NEXT: s_mov_b32 s4, s2
3365+ ; GFX11-NEXT: s_mov_b32 s3, 0x40080000
33683366; GFX11-NEXT: v_mov_b32_e32 v2, 0
33693367; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3370- ; GFX11-NEXT: s_cmp_eq_u32 s8, 1
3371- ; GFX11-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
3372- ; GFX11-NEXT: s_cmp_eq_u32 s8, 2
3373- ; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
3374- ; GFX11-NEXT: s_cmp_eq_u32 s8, 3
3375- ; GFX11-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
3376- ; GFX11-NEXT: s_cmp_eq_u32 s8, 4
3368+ ; GFX11-NEXT: s_cmp_eq_u32 s6, 1
3369+ ; GFX11-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
3370+ ; GFX11-NEXT: s_cmp_eq_u32 s6, 2
33773371; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
3372+ ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
3373+ ; GFX11-NEXT: s_mov_b32 s4, 0
3374+ ; GFX11-NEXT: s_mov_b32 s5, 0x40140000
3375+ ; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
3376+ ; GFX11-NEXT: s_cmp_eq_u32 s6, 4
3377+ ; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
33783378; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
33793379; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
33803380; GFX11-NEXT: s_nop 0
@@ -4784,11 +4784,8 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
47844784; MOVREL-LABEL: v_extract_v64i32_32:
47854785; MOVREL: ; %bb.0:
47864786; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4787- ; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
4788- ; MOVREL-NEXT: v_mov_b32_e32 v2, s4
4789- ; MOVREL-NEXT: v_mov_b32_e32 v3, s5
4790- ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
4791- ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
4787+ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
4788+ ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
47924789; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
47934790; MOVREL-NEXT: s_waitcnt vmcnt(0)
47944791; MOVREL-NEXT: s_setpc_b64 s[30:31]
@@ -4823,11 +4820,8 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
48234820; MOVREL-LABEL: v_extract_v64i32_33:
48244821; MOVREL: ; %bb.0:
48254822; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4826- ; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
4827- ; MOVREL-NEXT: v_mov_b32_e32 v2, s4
4828- ; MOVREL-NEXT: v_mov_b32_e32 v3, s5
4829- ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
4830- ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
4823+ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
4824+ ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
48314825; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
48324826; MOVREL-NEXT: s_waitcnt vmcnt(0)
48334827; MOVREL-NEXT: v_mov_b32_e32 v0, v1
0 commit comments