[X86] Allow handling of i128/256/512 AND/OR/XOR bitlogic on the FPU #171616
Conversation
If the scalar integer sources are freely transferable to the FPU, then perform the bitlogic op as an SSE/AVX operation.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the scalar integer sources are freely transferable to the FPU, then perform the bitlogic op as an SSE/AVX operation.

Uses the mayFoldIntoVector helper added at #171589. (A minimal standalone example of the targeted pattern is shown after the truncated diff below.)

Patch is 69.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171616.diff

6 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3b3b20edbbe84..67f46c61cbeac 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1142,6 +1142,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::AND, MVT::i128, Custom);
+ setOperationAction(ISD::OR, MVT::i128, Custom);
+ setOperationAction(ISD::XOR, MVT::i128, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1481,6 +1485,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
+ setOperationAction(ISD::AND, MVT::i256, Custom);
+ setOperationAction(ISD::OR, MVT::i256, Custom);
+ setOperationAction(ISD::XOR, MVT::i256, Custom);
+
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1836,6 +1844,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI())
setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::AND, MVT::i512, Custom);
+ setOperationAction(ISD::OR, MVT::i512, Custom);
+ setOperationAction(ISD::XOR, MVT::i512, Custom);
+
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
@@ -33919,6 +33931,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case X86ISD::CVTPS2PH:
Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
return;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
+ "Unexpected VT!");
+ // See if this is free to perform on the FPU to avoid splitting.
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ if (!mayFoldIntoVector(N0, Subtarget) || !mayFoldIntoVector(N1, Subtarget))
+ return;
+ SDValue Op = DAG.getNode(Opc, dl, VecVT, DAG.getBitcast(VecVT, N0),
+ DAG.getBitcast(VecVT, N1));
+ Results.push_back(DAG.getBitcast(VT, Op));
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
index 6d4be7dbe6349..d9158c4af18fa 100644
--- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
+++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
@@ -621,17 +621,41 @@ define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec256_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec256_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec256_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec256_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec256_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec256_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1034,19 +1058,46 @@ define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec384_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec384_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec384_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec384_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec384_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec384_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1559,21 +1610,60 @@ define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1588,25 +1678,71 @@ define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i256:
-; ALL: # %bb.0:
-; ALL-NEXT: movq 16(%rdi), %rax
-; ALL-NEXT: movq 24(%rdi), %rcx
-; ALL-NEXT: movq (%rdi), %rdx
-; ALL-NEXT: movq 8(%rdi), %rdi
-; ALL-NEXT: notq %rdi
-; ALL-NEXT: notq %rdx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rdx, (%rsi)
-; ALL-NEXT: movq %rdi, 8(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rdx, 32(%rsi)
-; ALL-NEXT: movq %rdi, 40(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i256:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq 16(%rdi), %rax
+; SCALAR-NEXT: movq 24(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rdx
+; SCALAR-NEXT: movq 8(%rdi), %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rdx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rdx, (%rsi)
+; SCALAR-NEXT: movq %rdi, 8(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rdx, 32(%rsi)
+; SCALAR-NEXT: movq %rdi, 40(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pxor 16(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm1, (%rsi)
+; SSE-NEXT: movdqa %xmm1, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, 16(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vextractf128 $1, %ymm0, 48(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%in.elt.not = load i256, ptr %in.elt.ptr, align 64
%in.elt = xor i256 %in.elt.not, -1
%out.elt0.ptr = getelementptr i256, ptr %out.vec.ptr, i64 0
@@ -1616,14 +1752,8 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; AVX512: {{.*}}
; AVX512BW: {{.*}}
; AVX512F: {{.*}}
-; SCALAR: {{.*}}
-; SSE: {{.*}}
; SSE2: {{.*}}
; SSE2-ONLY: {{.*}}
; SSE3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
index ffdb68c7a6c01..8ecdc064e4dfb 100644
--- a/llvm/test/CodeGen/X86/pr166744.ll
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -14,18 +14,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
; POSTRA-NEXT: btrl %esi, %ecx
; POSTRA-NEXT: orl %ecx, %edx
; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
-; POSTRA-NEXT: movq 16(%rdi), %rax
-; POSTRA-NEXT: movq (%rdi), %rcx
-; POSTRA-NEXT: movq 24(%rdi), %rdx
-; POSTRA-NEXT: movq 8(%rdi), %rsi
-; POSTRA-NEXT: orq 56(%rdi), %rdx
-; POSTRA-NEXT: orq 40(%rdi), %rsi
-; POSTRA-NEXT: orq 48(%rdi), %rax
-; POSTRA-NEXT: orq 32(%rdi), %rcx
-; POSTRA-NEXT: orq %rdx, %rsi
-; POSTRA-NEXT: orq %rax, %rcx
-; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; POSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; POSTRA-NEXT: vptest %ymm0, %ymm0
; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: vzeroupper
; POSTRA-NEXT: retq
;
; NOPOSTRA-LABEL: PR166744:
@@ -38,18 +31,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
; NOPOSTRA-NEXT: orl %ecx, %eax
; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
-; NOPOSTRA-NEXT: movq 16(%rdi), %rax
-; NOPOSTRA-NEXT: movq (%rdi), %rcx
-; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
-; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
-; NOPOSTRA-NEXT: orq 48(%rdi), %rax
-; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
-; NOPOSTRA-NEXT: orq %rsi, %rdx
-; NOPOSTRA-NEXT: orq %rax, %rcx
-; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; NOPOSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; NOPOSTRA-NEXT: vptest %ymm0, %ymm0
; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: vzeroupper
; NOPOSTRA-NEXT: retq
%rem = and i64 %idx, 511
%sh_prom = zext nneg i64 %rem to i512
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 6e43b897caef1..166b7abc9e053 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -433,15 +433,23 @@ define i1 @vecmp_load64x4(ptr %p0) {
}
define i1 @vecmp_load128x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: orq 24(%rdi), %rcx
-; CHECK-NEXT: orq 16(%rdi), %rax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: orq 24(%rdi), %rcx
+; SSE-NEXT: orq 16(%rdi), %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vecmp_load128x2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%p1 = getelementptr i8, ptr %p0, i64 16
%i0 = load i128, ptr %p0, align 1
%i1 = load i128, ptr %p1, align 1
@@ -453,21 +461,42 @@ define i1 @vecmp_load128x2(ptr %p0) {
}
define i1 @vecmp_load128x4(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq 24(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rcx
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq %rax, %rsi
-; CHECK-NEXT: orq 56(%rdi), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq %rsi, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x4:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rcx
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq 56(%rdi), %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load128x4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpor 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vptest %xmm0, %xmm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load128x4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX512-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm1 | mem
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: retq
%p1 = getelementptr i8, ptr %p0, i64 16
%p2 = getelementptr i8, ptr %p0, i64 32
%p3 = getelementptr i8, ptr %p0, i64 48
@@ -486,21 +515,39 @@ define i1 @vecmp_load128x4(ptr %p0) {
; PR144861
define i1 @vecmp_load256x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load256x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rdi), %rax
-; CHECK-NEXT: movq (%rdi), %rcx
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rcx
-; CHECK-NEXT: orq %rsi, %rcx
-; CHECK-NEXT: orq 56(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rdx
-; CHECK-NEXT: orq %rax, %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load256x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 24(%rdi), %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: orq 56(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rdx
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load256x2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vorps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load256x2:
+; AVX512: # %bb.0:
+; A...
[truncated]
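For reference, here is a minimal standalone reproducer of the pattern this lowering targets (not taken from the patch's test suite; the function name, file name, and flags are illustrative). With both i128 operands coming straight from memory, the xor should now be able to stay in the SIMD domain as a single 128-bit operation instead of being split into two 64-bit GPR ops, though the exact output depends on the subtarget and surrounding code:

; Illustrative example; compile with something like:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 wide-xor.ll
; Both loads are freely transferable to a vector register, so the i128 xor
; can be performed as one vector op rather than two scalar 64-bit xors.
define void @xor_i128(ptr %a, ptr %b, ptr %dst) {
  %x = load i128, ptr %a, align 16
  %y = load i128, ptr %b, align 16
  %r = xor i128 %x, %y
  store i128 %r, ptr %dst, align 16
  ret void
}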
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm1
; AVX-NEXT: vpxor 16(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
; AVX-NEXT: vmovdqa %xmm1, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
Why not ymm here?
It'd involve adding a subvector splat, which store combining won't attempt.
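For illustration, the ymm alternative being asked about would look roughly like the following (a hand-written sketch, not actual codegen output). Producing it would require first splatting the 32-byte xmm1:xmm0 pair into a ymm register, and the store-merging combine does not introduce that kind of subvector splat on its own:

; Hypothetical ymm form (sketch only):
;   vinsertf128 $1, %xmm0, %ymm1, %ymm2
;   vmovdqa %ymm2, (%rdx)
;   vmovdqa %ymm2, 32(%rdx)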
phoebewang left a comment:
LGTM.
…n vector types if the new type is legal
Prevents us from attempting to store illegal types like <2 x i128> that will force scalarization/splitting. Noticed while trying to avoid some split stores mentioned in llvm#171616.