Commit ac2291d
[X86] Allow handling of i128/256/512 AND/OR/XOR bitlogic on the FPU (#171616)
If the scalar integer sources are freely transferable to the FPU, then perform the bitlogic op as an SSE/AVX operation. Uses the mayFoldIntoVector helper added in #171589.
1 parent 9bcba9d commit ac2291d
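To illustrate the fold, here is a minimal IR sketch (hypothetical function, not taken from the commit's tests): an i128 bitlogic op whose operands are plain loads no longer needs to be split into two 64-bit GPR operations.

define i128 @xor_i128(ptr %p0, ptr %p1) {
  ; Both operands are loads, so they are freely transferable to the FPU;
  ; on SSE2+ targets this can now lower to a single pxor on an XMM register
  ; instead of a pair of 64-bit xorq instructions.
  %a = load i128, ptr %p0
  %b = load i128, ptr %p1
  %r = xor i128 %a, %b
  ret i128 %r
}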

File tree

6 files changed: +909 -749 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 0 deletions
@@ -1142,6 +1142,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
 
+    setOperationAction(ISD::AND, MVT::i128, Custom);
+    setOperationAction(ISD::OR, MVT::i128, Custom);
+    setOperationAction(ISD::XOR, MVT::i128, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1481,6 +1485,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
 
+    setOperationAction(ISD::AND, MVT::i256, Custom);
+    setOperationAction(ISD::OR, MVT::i256, Custom);
+    setOperationAction(ISD::XOR, MVT::i256, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1836,6 +1844,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasDQI())
       setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
+    setOperationAction(ISD::AND, MVT::i512, Custom);
+    setOperationAction(ISD::OR, MVT::i512, Custom);
+    setOperationAction(ISD::XOR, MVT::i512, Custom);
+
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
       setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
@@ -33926,6 +33938,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case X86ISD::CVTPS2PH:
     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
     return;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    EVT VT = N->getValueType(0);
+    assert((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
+           "Unexpected VT!");
+    // See if this is free to perform on the FPU to avoid splitting.
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    if (!mayFoldIntoVector(N0, Subtarget) || !mayFoldIntoVector(N1, Subtarget))
+      return;
+    SDValue Op = DAG.getNode(Opc, dl, VecVT, DAG.getBitcast(VecVT, N0),
+                             DAG.getBitcast(VecVT, N1));
+    Results.push_back(DAG.getBitcast(VT, Op));
+    return;
+  }
   case ISD::CTPOP: {
     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
     // If we have at most 32 active bits, then perform as i32 CTPOP.
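Conceptually, the new ReplaceNodeResults case performs the rewrite sketched below (an equivalent IR-level view, not code from the commit): when both operands fold into vector registers, the wide integer op is bitcast to an i64 vector of matching width, the bitlogic is done there, and the result is bitcast back. For i128 the vector type is <2 x i64>; i256 and i512 use <4 x i64> and <8 x i64>.

  ; %r = and i128 %a, %b becomes, in effect:
  %va = bitcast i128 %a to <2 x i64>
  %vb = bitcast i128 %b to <2 x i64>
  %vr = and <2 x i64> %va, %vb
  %r2 = bitcast <2 x i64> %vr to i128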

llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll

Lines changed: 194 additions & 64 deletions
@@ -621,17 +621,41 @@ define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec256_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec256_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec256_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec256_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec256_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec256_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1034,19 +1058,46 @@ define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec384_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec384_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec384_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec384_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec384_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec384_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1559,21 +1610,60 @@ define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1588,25 +1678,71 @@ define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i256:
-; ALL: # %bb.0:
-; ALL-NEXT: movq 16(%rdi), %rax
-; ALL-NEXT: movq 24(%rdi), %rcx
-; ALL-NEXT: movq (%rdi), %rdx
-; ALL-NEXT: movq 8(%rdi), %rdi
-; ALL-NEXT: notq %rdi
-; ALL-NEXT: notq %rdx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rdx, (%rsi)
-; ALL-NEXT: movq %rdi, 8(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rdx, 32(%rsi)
-; ALL-NEXT: movq %rdi, 40(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i256:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq 16(%rdi), %rax
+; SCALAR-NEXT: movq 24(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rdx
+; SCALAR-NEXT: movq 8(%rdi), %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rdx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rdx, (%rsi)
+; SCALAR-NEXT: movq %rdi, 8(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rdx, 32(%rsi)
+; SCALAR-NEXT: movq %rdi, 40(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pxor 16(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm1, (%rsi)
+; SSE-NEXT: movdqa %xmm1, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, 16(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vextractf128 $1, %ymm0, 48(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %in.elt.not = load i256, ptr %in.elt.ptr, align 64
   %in.elt = xor i256 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i256, ptr %out.vec.ptr, i64 0
@@ -1616,14 +1752,8 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; AVX512: {{.*}}
 ; AVX512BW: {{.*}}
 ; AVX512F: {{.*}}
-; SCALAR: {{.*}}
-; SSE: {{.*}}
 ; SSE2: {{.*}}
 ; SSE2-ONLY: {{.*}}
 ; SSE3: {{.*}}
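These CHECK blocks, and the trailing list of unused prefixes, are autogenerated. If they need refreshing after a codegen change, the usual route is LLVM's update script, along these lines (assuming a built llc is on PATH):

  python3 llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll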

llvm/test/CodeGen/X86/pr166744.ll

Lines changed: 8 additions & 22 deletions
@@ -14,18 +14,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; POSTRA-NEXT: btrl %esi, %ecx
 ; POSTRA-NEXT: orl %ecx, %edx
 ; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
-; POSTRA-NEXT: movq 16(%rdi), %rax
-; POSTRA-NEXT: movq (%rdi), %rcx
-; POSTRA-NEXT: movq 24(%rdi), %rdx
-; POSTRA-NEXT: movq 8(%rdi), %rsi
-; POSTRA-NEXT: orq 56(%rdi), %rdx
-; POSTRA-NEXT: orq 40(%rdi), %rsi
-; POSTRA-NEXT: orq 48(%rdi), %rax
-; POSTRA-NEXT: orq 32(%rdi), %rcx
-; POSTRA-NEXT: orq %rdx, %rsi
-; POSTRA-NEXT: orq %rax, %rcx
-; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; POSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; POSTRA-NEXT: vptest %ymm0, %ymm0
 ; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: vzeroupper
 ; POSTRA-NEXT: retq
 ;
 ; NOPOSTRA-LABEL: PR166744:
@@ -38,18 +31,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
 ; NOPOSTRA-NEXT: orl %ecx, %eax
 ; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
-; NOPOSTRA-NEXT: movq 16(%rdi), %rax
-; NOPOSTRA-NEXT: movq (%rdi), %rcx
-; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
-; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
-; NOPOSTRA-NEXT: orq 48(%rdi), %rax
-; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
-; NOPOSTRA-NEXT: orq %rsi, %rdx
-; NOPOSTRA-NEXT: orq %rax, %rcx
-; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; NOPOSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; NOPOSTRA-NEXT: vptest %ymm0, %ymm0
 ; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: vzeroupper
 ; NOPOSTRA-NEXT: retq
   %rem = and i64 %idx, 511
   %sh_prom = zext nneg i64 %rem to i512
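The improvement here comes from the same custom lowering: the i512 OR-reduction that previously took eight quadword loads and a chain of orq instructions is now two 256-bit halves combined with vpor and tested with vptest. A hedged sketch of the reduction shape being exercised (simplified; the test first updates one bit of the 512-bit bitmap via the masked index above):

  %bits = load i512, ptr %v
  %nonzero = icmp ne i512 %bits, 0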
