Skip to content

Commit 3c0c24e

Browse files
committed
[AArch64] Combine to UMULL if top bits are known zero
Given mul(zext(a), b), we can convert the multiply to a UMULL so long as we know that the top half of the bits of each element of b are zero. This uses MaskedValueIsZero to detect that case for NEON UMULL patterns. Differential Revision: https://reviews.llvm.org/D140287
1 parent ecaab10 commit 3c0c24e

File tree

3 files changed

+74
-72
lines changed

3 files changed

+74
-72
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+34
Original file line numberDiff line numberDiff line change
@@ -4567,8 +4567,42 @@ static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
45674567
return AArch64ISD::SMULL;
45684568
}
45694569
}
4570+
4571+
// Select UMULL if we can replace the other operand with an extend.
4572+
if (IsN0ZExt || IsN1ZExt) {
4573+
EVT VT = N0->getValueType(0);
4574+
APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4575+
VT.getScalarSizeInBits() / 2);
4576+
if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) {
4577+
EVT HalfVT;
4578+
switch (VT.getSimpleVT().SimpleTy) {
4579+
case MVT::v2i64:
4580+
HalfVT = MVT::v2i32;
4581+
break;
4582+
case MVT::v4i32:
4583+
HalfVT = MVT::v4i16;
4584+
break;
4585+
case MVT::v8i16:
4586+
HalfVT = MVT::v8i8;
4587+
break;
4588+
default:
4589+
return 0;
4590+
}
4591+
// Truncate and then extend the result.
4592+
SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
4593+
SDValue(IsN0ZExt ? N1 : N0, 0));
4594+
NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT);
4595+
if (IsN0ZExt)
4596+
N1 = NewExt.getNode();
4597+
else
4598+
N0 = NewExt.getNode();
4599+
return AArch64ISD::UMULL;
4600+
}
4601+
}
4602+
45704603
if (!IsN1SExt && !IsN1ZExt)
45714604
return 0;
4605+
45724606
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
45734607
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
45744608
if (IsN1SExt && isAddSubSExt(N0, DAG)) {

llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll

+8-12
Original file line numberDiff line numberDiff line change
@@ -116,17 +116,13 @@ entry:
116116
define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
117117
; CHECK-LABEL: dupzext_v2i16_v2i64:
118118
; CHECK: // %bb.0: // %entry
119-
; CHECK-NEXT: movi d1, #0x00ffff0000ffff
120119
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
121120
; CHECK-NEXT: and x8, x0, #0xffff
121+
; CHECK-NEXT: movi d1, #0x00ffff0000ffff
122+
; CHECK-NEXT: dup v2.2d, x8
122123
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
123-
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
124-
; CHECK-NEXT: fmov x9, d0
125-
; CHECK-NEXT: mov x10, v0.d[1]
126-
; CHECK-NEXT: mul x9, x8, x9
127-
; CHECK-NEXT: mul x8, x8, x10
128-
; CHECK-NEXT: fmov d0, x9
129-
; CHECK-NEXT: mov v0.d[1], x8
124+
; CHECK-NEXT: xtn v2.2s, v2.2d
125+
; CHECK-NEXT: umull v0.2d, v2.2s, v0.2s
130126
; CHECK-NEXT: ret
131127
entry:
132128
%in = zext i16 %src to i64
@@ -225,12 +221,12 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
225221
define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) {
226222
; CHECK-LABEL: typei1_v8i1_v8i16:
227223
; CHECK: // %bb.0: // %entry
228-
; CHECK-NEXT: movi v1.8b, #1
229224
; CHECK-NEXT: and w8, w0, #0x1
225+
; CHECK-NEXT: movi v1.8b, #1
226+
; CHECK-NEXT: dup v2.8h, w8
230227
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
231-
; CHECK-NEXT: dup v1.8h, w8
232-
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
233-
; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h
228+
; CHECK-NEXT: xtn v2.8b, v2.8h
229+
; CHECK-NEXT: umull v0.8h, v2.8b, v0.8b
234230
; CHECK-NEXT: ret
235231
entry:
236232
%in = zext i1 %src to i16

llvm/test/CodeGen/AArch64/aarch64-smull.ll

+32-60
Original file line numberDiff line numberDiff line change
@@ -932,9 +932,9 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
932932
define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
933933
; CHECK-LABEL: umull_and_v8i16:
934934
; CHECK: // %bb.0: // %entry
935-
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
936935
; CHECK-NEXT: bic v1.8h, #255, lsl #8
937-
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
936+
; CHECK-NEXT: xtn v1.8b, v1.8h
937+
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
938938
; CHECK-NEXT: ret
939939
entry:
940940
%in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -946,9 +946,9 @@ entry:
946946
define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
947947
; CHECK-LABEL: umull_and_v8i16_c:
948948
; CHECK: // %bb.0: // %entry
949-
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
950949
; CHECK-NEXT: bic v1.8h, #255, lsl #8
951-
; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h
950+
; CHECK-NEXT: xtn v1.8b, v1.8h
951+
; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
952952
; CHECK-NEXT: ret
953953
entry:
954954
%in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -989,9 +989,9 @@ define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
989989
; CHECK: // %bb.0: // %entry
990990
; CHECK-NEXT: movi v2.8b, #15
991991
; CHECK-NEXT: bic v1.8h, #255, lsl #8
992+
; CHECK-NEXT: xtn v1.8b, v1.8h
992993
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
993-
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
994-
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
994+
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
995995
; CHECK-NEXT: ret
996996
entry:
997997
%in1 = zext <8 x i4> %src1 to <8 x i16>
@@ -1004,9 +1004,9 @@ define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
10041004
; CHECK-LABEL: umull_and_v4i32:
10051005
; CHECK: // %bb.0: // %entry
10061006
; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
1007-
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
10081007
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
1009-
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
1008+
; CHECK-NEXT: xtn v1.4h, v1.4s
1009+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
10101010
; CHECK-NEXT: ret
10111011
entry:
10121012
%in1 = zext <4 x i16> %src1 to <4 x i32>
@@ -1019,12 +1019,13 @@ define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
10191019
; CHECK-LABEL: umull_and_v8i32:
10201020
; CHECK: // %bb.0: // %entry
10211021
; CHECK-NEXT: movi v3.2d, #0x0000ff000000ff
1022-
; CHECK-NEXT: ushll v4.4s, v0.4h, #0
1023-
; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
1024-
; CHECK-NEXT: and v5.16b, v1.16b, v3.16b
1025-
; CHECK-NEXT: and v1.16b, v2.16b, v3.16b
1026-
; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
1027-
; CHECK-NEXT: mul v0.4s, v4.4s, v5.4s
1022+
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
1023+
; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
1024+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
1025+
; CHECK-NEXT: xtn v1.4h, v1.4s
1026+
; CHECK-NEXT: xtn v2.4h, v2.4s
1027+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
1028+
; CHECK-NEXT: umull v1.4s, v4.4h, v2.4h
10281029
; CHECK-NEXT: ret
10291030
entry:
10301031
%in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1037,11 +1038,11 @@ define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
10371038
; CHECK-LABEL: umull_and_v8i32_dup:
10381039
; CHECK: // %bb.0: // %entry
10391040
; CHECK-NEXT: and w8, w0, #0xff
1040-
; CHECK-NEXT: ushll v1.4s, v0.4h, #0
1041-
; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
1041+
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
10421042
; CHECK-NEXT: dup v2.4s, w8
1043-
; CHECK-NEXT: mul v0.4s, v1.4s, v2.4s
1044-
; CHECK-NEXT: mul v1.4s, v3.4s, v2.4s
1043+
; CHECK-NEXT: xtn v2.4h, v2.4s
1044+
; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
1045+
; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
10451046
; CHECK-NEXT: ret
10461047
entry:
10471048
%in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1056,16 +1057,9 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
10561057
; CHECK-LABEL: umull_and_v2i64:
10571058
; CHECK: // %bb.0: // %entry
10581059
; CHECK-NEXT: movi v2.2d, #0x000000000000ff
1059-
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1060-
; CHECK-NEXT: fmov x10, d0
10611060
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
1062-
; CHECK-NEXT: fmov x9, d1
1063-
; CHECK-NEXT: mov x8, v1.d[1]
1064-
; CHECK-NEXT: mov x11, v0.d[1]
1065-
; CHECK-NEXT: mul x9, x10, x9
1066-
; CHECK-NEXT: mul x8, x11, x8
1067-
; CHECK-NEXT: fmov d0, x9
1068-
; CHECK-NEXT: mov v0.d[1], x8
1061+
; CHECK-NEXT: xtn v1.2s, v1.2d
1062+
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
10691063
; CHECK-NEXT: ret
10701064
entry:
10711065
%in1 = zext <2 x i32> %src1 to <2 x i64>
@@ -1078,26 +1072,13 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
10781072
; CHECK-LABEL: umull_and_v4i64:
10791073
; CHECK: // %bb.0: // %entry
10801074
; CHECK-NEXT: movi v3.2d, #0x000000000000ff
1081-
; CHECK-NEXT: ushll v4.2d, v0.2s, #0
1082-
; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
1083-
; CHECK-NEXT: fmov x14, d4
1075+
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
10841076
; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
1085-
; CHECK-NEXT: fmov x11, d0
1086-
; CHECK-NEXT: mov x9, v0.d[1]
1087-
; CHECK-NEXT: and v0.16b, v1.16b, v3.16b
1088-
; CHECK-NEXT: fmov x10, d2
1089-
; CHECK-NEXT: fmov x13, d0
1090-
; CHECK-NEXT: mov x8, v2.d[1]
1091-
; CHECK-NEXT: mov x12, v0.d[1]
1092-
; CHECK-NEXT: mul x10, x11, x10
1093-
; CHECK-NEXT: mov x15, v4.d[1]
1094-
; CHECK-NEXT: mul x11, x14, x13
1095-
; CHECK-NEXT: mul x8, x9, x8
1096-
; CHECK-NEXT: fmov d1, x10
1097-
; CHECK-NEXT: mul x9, x15, x12
1098-
; CHECK-NEXT: fmov d0, x11
1099-
; CHECK-NEXT: mov v1.d[1], x8
1100-
; CHECK-NEXT: mov v0.d[1], x9
1077+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
1078+
; CHECK-NEXT: xtn v1.2s, v1.2d
1079+
; CHECK-NEXT: xtn v2.2s, v2.2d
1080+
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
1081+
; CHECK-NEXT: umull v1.2d, v4.2s, v2.2s
11011082
; CHECK-NEXT: ret
11021083
entry:
11031084
%in1 = zext <4 x i32> %src1 to <4 x i64>
@@ -1109,21 +1090,12 @@ entry:
11091090
define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
11101091
; CHECK-LABEL: umull_and_v4i64_dup:
11111092
; CHECK: // %bb.0: // %entry
1112-
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
11131093
; CHECK-NEXT: and x8, x0, #0xff
1114-
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1115-
; CHECK-NEXT: fmov x9, d1
1116-
; CHECK-NEXT: fmov x11, d0
1117-
; CHECK-NEXT: mov x10, v1.d[1]
1118-
; CHECK-NEXT: mov x12, v0.d[1]
1119-
; CHECK-NEXT: mul x9, x9, x8
1120-
; CHECK-NEXT: mul x11, x11, x8
1121-
; CHECK-NEXT: mul x10, x10, x8
1122-
; CHECK-NEXT: mul x8, x12, x8
1123-
; CHECK-NEXT: fmov d1, x9
1124-
; CHECK-NEXT: fmov d0, x11
1125-
; CHECK-NEXT: mov v1.d[1], x10
1126-
; CHECK-NEXT: mov v0.d[1], x8
1094+
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
1095+
; CHECK-NEXT: dup v2.2d, x8
1096+
; CHECK-NEXT: xtn v2.2s, v2.2d
1097+
; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
1098+
; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
11271099
; CHECK-NEXT: ret
11281100
entry:
11291101
%in1 = zext <4 x i32> %src1 to <4 x i64>

0 commit comments

Comments
 (0)