@@ -1062,52 +1062,43 @@ entry:
10621062define amdgpu_kernel void @s_mul_i1 (ptr addrspace (1 ) %out , [8 x i32 ], i1 %a , [8 x i32 ], i1 %b ) nounwind {
10631063; SI-LABEL: s_mul_i1:
10641064; SI: ; %bb.0: ; %entry
1065- ; SI-NEXT: s_load_dword s2 , s[0:1], 0x13
1066- ; SI-NEXT: s_load_dwordx2 s[4:5] , s[0:1], 0x9
1067- ; SI-NEXT: s_load_dword s3 , s[0:1], 0x1c
1068- ; SI-NEXT: s_mov_b32 s7 , 0xf000
1069- ; SI-NEXT: s_mov_b32 s6 , -1
1065+ ; SI-NEXT: s_load_dword s4 , s[0:1], 0x13
1066+ ; SI-NEXT: s_load_dword s5 , s[0:1], 0x1c
1067+ ; SI-NEXT: s_load_dwordx2 s[0:1] , s[0:1], 0x9
1068+ ; SI-NEXT: s_mov_b32 s3 , 0xf000
1069+ ; SI-NEXT: s_mov_b32 s2 , -1
10701070; SI-NEXT: s_waitcnt lgkmcnt(0)
1071- ; SI-NEXT: s_bitcmp1_b32 s2, 0
1072- ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
1073- ; SI-NEXT: s_bitcmp1_b32 s3, 0
1074- ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
1075- ; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1076- ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1077- ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1071+ ; SI-NEXT: s_mul_i32 s4, s4, s5
1072+ ; SI-NEXT: s_and_b32 s4, s4, 1
1073+ ; SI-NEXT: v_mov_b32_e32 v0, s4
1074+ ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
10781075; SI-NEXT: s_endpgm
10791076;
10801077; VI-LABEL: s_mul_i1:
10811078; VI: ; %bb.0: ; %entry
1082- ; VI-NEXT: s_load_dword s2 , s[0:1], 0x4c
1083- ; VI-NEXT: s_load_dwordx2 s[4:5] , s[0:1], 0x24
1084- ; VI-NEXT: s_load_dword s3 , s[0:1], 0x70
1085- ; VI-NEXT: s_mov_b32 s7 , 0xf000
1086- ; VI-NEXT: s_mov_b32 s6 , -1
1079+ ; VI-NEXT: s_load_dword s4 , s[0:1], 0x70
1080+ ; VI-NEXT: s_load_dword s5 , s[0:1], 0x4c
1081+ ; VI-NEXT: s_load_dwordx2 s[0:1] , s[0:1], 0x24
1082+ ; VI-NEXT: s_mov_b32 s3 , 0xf000
1083+ ; VI-NEXT: s_mov_b32 s2 , -1
10871084; VI-NEXT: s_waitcnt lgkmcnt(0)
1088- ; VI-NEXT: s_bitcmp1_b32 s2, 0
1089- ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1090- ; VI-NEXT: s_bitcmp1_b32 s3, 0
1091- ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
1092- ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1093- ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1094- ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1085+ ; VI-NEXT: v_mov_b32_e32 v0, s4
1086+ ; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0
1087+ ; VI-NEXT: v_and_b32_e32 v0, 1, v0
1088+ ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
10951089; VI-NEXT: s_endpgm
10961090;
10971091; GFX9-LABEL: s_mul_i1:
10981092; GFX9: ; %bb.0: ; %entry
1099- ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
1093+ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70
1094+ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c
11001095; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1101- ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
11021096; GFX9-NEXT: s_mov_b32 s7, 0xf000
11031097; GFX9-NEXT: s_mov_b32 s6, -1
11041098; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1105- ; GFX9-NEXT: s_bitcmp1_b32 s2, 0
1106- ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1107- ; GFX9-NEXT: s_bitcmp1_b32 s3, 0
1108- ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
1109- ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1110- ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1099+ ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1100+ ; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0
1101+ ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
11111102; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
11121103; GFX9-NEXT: s_endpgm
11131104;
@@ -1120,12 +1111,8 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
11201111; GFX10-NEXT: s_mov_b32 s7, 0x31016000
11211112; GFX10-NEXT: s_mov_b32 s6, -1
11221113; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1123- ; GFX10-NEXT: s_bitcmp1_b32 s2, 0
1124- ; GFX10-NEXT: s_cselect_b32 s0, -1, 0
1125- ; GFX10-NEXT: s_bitcmp1_b32 s3, 0
1126- ; GFX10-NEXT: s_cselect_b32 s1, -1, 0
1127- ; GFX10-NEXT: s_and_b32 s0, s0, s1
1128- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1114+ ; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3
1115+ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
11291116; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
11301117; GFX10-NEXT: s_endpgm
11311118;
@@ -1136,15 +1123,11 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
11361123; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
11371124; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
11381125; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1139- ; GFX11-NEXT: s_bitcmp1_b32 s2, 0
1140- ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
1141- ; GFX11-NEXT: s_bitcmp1_b32 s3, 0
1142- ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
1143- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1144- ; GFX11-NEXT: s_and_b32 s2, s2, s3
1126+ ; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3
11451127; GFX11-NEXT: s_mov_b32 s3, 0x31016000
1146- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
11471128; GFX11-NEXT: s_mov_b32 s2, -1
1129+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1130+ ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
11481131; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
11491132; GFX11-NEXT: s_nop 0
11501133; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1198,14 +1181,9 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
11981181; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
11991182; SI-NEXT: s_mov_b32 s4, s0
12001183; SI-NEXT: s_mov_b32 s5, s1
1201- ; SI-NEXT: s_waitcnt vmcnt(1)
1202- ; SI-NEXT: v_and_b32_e32 v0, 1, v0
12031184; SI-NEXT: s_waitcnt vmcnt(0)
1204- ; SI-NEXT: v_and_b32_e32 v1, 1, v1
1205- ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
1206- ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
1207- ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1208- ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1185+ ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
1186+ ; SI-NEXT: v_and_b32_e32 v0, 1, v0
12091187; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
12101188; SI-NEXT: s_endpgm
12111189;
@@ -1223,14 +1201,9 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
12231201; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
12241202; VI-NEXT: s_mov_b32 s4, s0
12251203; VI-NEXT: s_mov_b32 s5, s1
1226- ; VI-NEXT: s_waitcnt vmcnt(1)
1227- ; VI-NEXT: v_and_b32_e32 v0, 1, v0
12281204; VI-NEXT: s_waitcnt vmcnt(0)
1229- ; VI-NEXT: v_and_b32_e32 v1, 1, v1
1230- ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
1231- ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
1232- ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1233- ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1205+ ; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1
1206+ ; VI-NEXT: v_and_b32_e32 v0, 1, v0
12341207; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
12351208; VI-NEXT: s_endpgm
12361209;
@@ -1248,69 +1221,53 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
12481221; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
12491222; GFX9-NEXT: s_mov_b32 s4, s0
12501223; GFX9-NEXT: s_mov_b32 s5, s1
1251- ; GFX9-NEXT: s_waitcnt vmcnt(1)
1252- ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
12531224; GFX9-NEXT: s_waitcnt vmcnt(0)
1254- ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
1255- ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
1256- ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
1257- ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1258- ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1225+ ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
1226+ ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
12591227; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
12601228; GFX9-NEXT: s_endpgm
12611229;
12621230; GFX10-LABEL: v_mul_i1:
12631231; GFX10: ; %bb.0: ; %entry
1264- ; GFX10-NEXT: s_load_dwordx4 s[4:7 ], s[0:1], 0x24
1265- ; GFX10-NEXT: s_mov_b32 s2 , -1
1266- ; GFX10-NEXT: s_mov_b32 s3 , 0x31016000
1267- ; GFX10-NEXT: s_mov_b32 s10, s2
1268- ; GFX10-NEXT: s_mov_b32 s11, s3
1232+ ; GFX10-NEXT: s_load_dwordx4 s[0:3 ], s[0:1], 0x24
1233+ ; GFX10-NEXT: s_mov_b32 s6 , -1
1234+ ; GFX10-NEXT: s_mov_b32 s7 , 0x31016000
1235+ ; GFX10-NEXT: s_mov_b32 s10, s6
1236+ ; GFX10-NEXT: s_mov_b32 s11, s7
12691237; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1270- ; GFX10-NEXT: s_mov_b32 s8, s6
1271- ; GFX10-NEXT: s_mov_b32 s9, s7
1238+ ; GFX10-NEXT: s_mov_b32 s8, s2
1239+ ; GFX10-NEXT: s_mov_b32 s9, s3
12721240; GFX10-NEXT: s_clause 0x1
12731241; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
12741242; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1275- ; GFX10-NEXT: s_mov_b32 s1, s5
1276- ; GFX10-NEXT: s_waitcnt vmcnt(1)
1277- ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
1243+ ; GFX10-NEXT: s_mov_b32 s4, s0
1244+ ; GFX10-NEXT: s_mov_b32 s5, s1
12781245; GFX10-NEXT: s_waitcnt vmcnt(0)
1279- ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
1280- ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
1281- ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
1282- ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
1283- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1284- ; GFX10-NEXT: s_mov_b32 s0, s4
1285- ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
1246+ ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
1247+ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
1248+ ; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
12861249; GFX10-NEXT: s_endpgm
12871250;
12881251; GFX11-LABEL: v_mul_i1:
12891252; GFX11: ; %bb.0: ; %entry
1290- ; GFX11-NEXT: s_load_b128 s[4:7 ], s[0:1], 0x24
1291- ; GFX11-NEXT: s_mov_b32 s2 , -1
1292- ; GFX11-NEXT: s_mov_b32 s3 , 0x31016000
1293- ; GFX11-NEXT: s_mov_b32 s10, s2
1294- ; GFX11-NEXT: s_mov_b32 s11, s3
1253+ ; GFX11-NEXT: s_load_b128 s[0:3 ], s[0:1], 0x24
1254+ ; GFX11-NEXT: s_mov_b32 s6 , -1
1255+ ; GFX11-NEXT: s_mov_b32 s7 , 0x31016000
1256+ ; GFX11-NEXT: s_mov_b32 s10, s6
1257+ ; GFX11-NEXT: s_mov_b32 s11, s7
12951258; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1296- ; GFX11-NEXT: s_mov_b32 s8, s6
1297- ; GFX11-NEXT: s_mov_b32 s9, s7
1259+ ; GFX11-NEXT: s_mov_b32 s8, s2
1260+ ; GFX11-NEXT: s_mov_b32 s9, s3
12981261; GFX11-NEXT: s_clause 0x1
12991262; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
13001263; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
1301- ; GFX11-NEXT: s_mov_b32 s1, s5
1302- ; GFX11-NEXT: s_waitcnt vmcnt(1)
1303- ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
1264+ ; GFX11-NEXT: s_mov_b32 s4, s0
1265+ ; GFX11-NEXT: s_mov_b32 s5, s1
13041266; GFX11-NEXT: s_waitcnt vmcnt(0)
1305- ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
1306- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1307- ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
1308- ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
1309- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1310- ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
1311- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1312- ; GFX11-NEXT: s_mov_b32 s0, s4
1313- ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
1267+ ; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1
1268+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1269+ ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
1270+ ; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
13141271; GFX11-NEXT: s_nop 0
13151272; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
13161273; GFX11-NEXT: s_endpgm
0 commit comments