@@ -314,8 +314,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
314314; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
315315; SI-NEXT: s_mov_b32 s3, 0xf000
316316; SI-NEXT: s_waitcnt lgkmcnt(0)
317- ; SI-NEXT: s_lshl_b32 s2, s2, 24
318- ; SI-NEXT: s_flbit_i32_b32 s4, s2
317+ ; SI-NEXT: s_and_b32 s2, s2, 0xff
318+ ; SI-NEXT: s_flbit_i32_b32 s2, s2
319+ ; SI-NEXT: s_sub_i32 s4, s2, 24
319320; SI-NEXT: s_mov_b32 s2, -1
320321; SI-NEXT: v_mov_b32_e32 v0, s4
321322; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -326,8 +327,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
326327; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
327328; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
328329; VI-NEXT: s_waitcnt lgkmcnt(0)
329- ; VI-NEXT: s_lshl_b32 s2, s2, 24
330+ ; VI-NEXT: s_and_b32 s2, s2, 0xff
330331; VI-NEXT: s_flbit_i32_b32 s2, s2
332+ ; VI-NEXT: s_sub_i32 s2, s2, 24
331333; VI-NEXT: v_mov_b32_e32 v0, s0
332334; VI-NEXT: v_mov_b32_e32 v1, s1
333335; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -347,13 +349,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
347349; EG-NEXT: ALU clause starting at 8:
348350; EG-NEXT: MOV * T0.X, 0.0,
349351; EG-NEXT: ALU clause starting at 9:
350- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
351- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
352- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
352+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
353353; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
354354; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
355+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
356+ ; EG-NEXT: -24(nan), 0(0.000000e+00)
355357; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
356- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
358+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
357359; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
358360; EG-NEXT: LSHL T0.X, PV.W, PS,
359361; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -389,8 +391,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
389391; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
390392; SI-NEXT: s_mov_b32 s3, 0xf000
391393; SI-NEXT: s_waitcnt lgkmcnt(0)
392- ; SI-NEXT: s_lshl_b32 s2, s2, 16
393- ; SI-NEXT: s_flbit_i32_b32 s4, s2
394+ ; SI-NEXT: s_and_b32 s2, s2, 0xffff
395+ ; SI-NEXT: s_flbit_i32_b32 s2, s2
396+ ; SI-NEXT: s_add_i32 s4, s2, -16
394397; SI-NEXT: s_mov_b32 s2, -1
395398; SI-NEXT: v_mov_b32_e32 v0, s4
396399; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -423,13 +426,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
423426; EG-NEXT: ALU clause starting at 8:
424427; EG-NEXT: MOV * T0.X, 0.0,
425428; EG-NEXT: ALU clause starting at 9:
426- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
427- ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
428- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
429+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
429430; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
430431; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
432+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
433+ ; EG-NEXT: -16(nan), 0(0.000000e+00)
431434; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
432- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
435+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
433436; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
434437; EG-NEXT: LSHL T0.X, PV.W, PS,
435438; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -587,8 +590,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
587590; SI-NEXT: s_mov_b32 s4, s0
588591; SI-NEXT: s_mov_b32 s5, s1
589592; SI-NEXT: s_waitcnt vmcnt(0)
590- ; SI-NEXT: v_lshlrev_b32_e32 v1, 24 , v0
591- ; SI-NEXT: v_ffbh_u32_e32 v1, v1
593+ ; SI-NEXT: v_ffbh_u32_e32 v1, v0
594+ ; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24 , v1
592595; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
593596; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
594597; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -602,8 +605,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
602605; VI-NEXT: v_mov_b32_e32 v1, s3
603606; VI-NEXT: flat_load_ubyte v0, v[0:1]
604607; VI-NEXT: s_waitcnt vmcnt(0)
605- ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
606- ; VI-NEXT: v_ffbh_u32_e32 v1, v1
608+ ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
609+ ; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24 , v1
607610; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
608611; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
609612; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -615,7 +618,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
615618; EG: ; %bb.0:
616619; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
617620; EG-NEXT: TEX 0 @6
618- ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
621+ ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
619622; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
620623; EG-NEXT: CF_END
621624; EG-NEXT: PAD
@@ -624,11 +627,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
624627; EG-NEXT: ALU clause starting at 8:
625628; EG-NEXT: MOV * T0.X, KC0[2].Z,
626629; EG-NEXT: ALU clause starting at 9:
627- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
628- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
629- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
630- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
631- ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
630+ ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
631+ ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
632+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
633+ ; EG-NEXT: -24(nan), 3(4.203895e-45)
632634; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
633635; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
634636; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -683,8 +685,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
683685; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
684686; SI-NEXT: s_waitcnt vmcnt(0)
685687; SI-NEXT: v_or_b32_e32 v0, v0, v1
686- ; SI-NEXT: v_lshlrev_b32_e32 v1, 16 , v0
687- ; SI-NEXT: v_ffbh_u32_e32 v1, v1
688+ ; SI-NEXT: v_ffbh_u32_e32 v1, v0
689+ ; SI-NEXT: v_add_i32_e32 v1, vcc, -16 , v1
688690; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
689691; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
690692; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -719,7 +721,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
719721; EG: ; %bb.0:
720722; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
721723; EG-NEXT: TEX 0 @6
722- ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
724+ ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
723725; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
724726; EG-NEXT: CF_END
725727; EG-NEXT: PAD
@@ -728,11 +730,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
728730; EG-NEXT: ALU clause starting at 8:
729731; EG-NEXT: MOV * T0.X, KC0[2].Z,
730732; EG-NEXT: ALU clause starting at 9:
731- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
732- ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
733- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
734- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
735- ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
733+ ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
734+ ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
735+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
736+ ; EG-NEXT: -16(nan), 3(4.203895e-45)
736737; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
737738; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
738739; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -1101,8 +1102,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11011102; SI-NEXT: s_mov_b32 s4, s0
11021103; SI-NEXT: s_mov_b32 s5, s1
11031104; SI-NEXT: s_waitcnt vmcnt(0)
1104- ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
11051105; SI-NEXT: v_ffbh_u32_e32 v0, v0
1106+ ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
11061107; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
11071108; SI-NEXT: s_endpgm
11081109;
@@ -1115,8 +1116,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11151116; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
11161117; VI-NEXT: flat_load_ubyte v0, v[0:1]
11171118; VI-NEXT: s_waitcnt vmcnt(0)
1118- ; VI-NEXT: v_lshlrev_b32_e32 v0, 24 , v0
1119- ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1119+ ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1120+ ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24 , v0
11201121; VI-NEXT: v_mov_b32_e32 v0, s0
11211122; VI-NEXT: v_mov_b32_e32 v1, s1
11221123; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1135,13 +1136,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11351136; EG-NEXT: ALU clause starting at 8:
11361137; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
11371138; EG-NEXT: ALU clause starting at 9:
1138- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1139- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1140- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1139+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
11411140; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
11421141; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1142+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1143+ ; EG-NEXT: -24(nan), 0(0.000000e+00)
11431144; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1144- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
1145+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
11451146; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
11461147; EG-NEXT: LSHL T0.X, PV.W, PS,
11471148; EG-NEXT: LSHL * T0.W, literal.x, PS,
0 commit comments