diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ec746843f8ea8..f8730a3de11c5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1142,6 +1142,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v2i32, Custom);

+    setOperationAction(ISD::AND, MVT::i128, Custom);
+    setOperationAction(ISD::OR, MVT::i128, Custom);
+    setOperationAction(ISD::XOR, MVT::i128, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1481,6 +1485,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v4f64, Custom);

+    setOperationAction(ISD::AND, MVT::i256, Custom);
+    setOperationAction(ISD::OR, MVT::i256, Custom);
+    setOperationAction(ISD::XOR, MVT::i256, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1836,6 +1844,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasDQI())
       setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);

+    setOperationAction(ISD::AND, MVT::i512, Custom);
+    setOperationAction(ISD::OR, MVT::i512, Custom);
+    setOperationAction(ISD::XOR, MVT::i512, Custom);
+
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
       setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
@@ -33926,6 +33938,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case X86ISD::CVTPS2PH:
     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
     return;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    EVT VT = N->getValueType(0);
+    assert((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
+           "Unexpected VT!");
+    // See if this is free to perform on the FPU to avoid splitting.
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    if (!mayFoldIntoVector(N0, Subtarget) || !mayFoldIntoVector(N1, Subtarget))
+      return;
+    SDValue Op = DAG.getNode(Opc, dl, VecVT, DAG.getBitcast(VecVT, N0),
+                             DAG.getBitcast(VecVT, N1));
+    Results.push_back(DAG.getBitcast(VT, Op));
+    return;
+  }
  case ISD::CTPOP: {
    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
    // If we have at most 32 active bits, then perform as i32 CTPOP.
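The new ReplaceNodeResults handler runs when type legalization meets one of these illegal wide scalar types: rather than letting an i128/i256/i512 AND/OR/XOR be split into a chain of i64 operations, it bitcasts both operands to v2i64/v4i64/v8i64, performs a single vector op, and bitcasts the result back. This is sound because bitwise logic is bit-parallel, so regrouping the bits into 64-bit lanes cannot change the result. The mayFoldIntoVector guard (defined elsewhere in this patch, not shown in this hunk) appears to limit the fold to operands that are already cheap to treat as vectors, such as loads, so the transform never forces values out of GPRs. A reduced LLVM IR example of the shape this targets, adapted from the vecmp_load256x2 test updated below (the function and value names here are illustrative, not part of the patch):

define i1 @or_reduce_i256(ptr %p0) {
  ; two adjacent unaligned i256 loads, OR-reduced and compared to zero
  %p1 = getelementptr i8, ptr %p0, i64 32
  %i0 = load i256, ptr %p0, align 1
  %i1 = load i256, ptr %p1, align 1
  %or = or i256 %i0, %i1
  %cmp = icmp eq i256 %or, 0
  ret i1 %cmp
}

Compiled with llc -mtriple=x86_64-unknown-unknown -mattr=+avx, this should now select a 256-bit load / vector-or / vptest sequence instead of the previous chain of scalar movq/orq instructions, as the regenerated checks in ptest.ll below show.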
diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
index 6d4be7dbe6349..d9158c4af18fa 100644
--- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
+++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
@@ -621,17 +621,41 @@ define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }

 define void @vec256_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec256_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec256_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec256_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec256_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec256_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1034,19 +1058,46 @@ define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }

 define void @vec384_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec384_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec384_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec384_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec384_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec384_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1559,21 +1610,60 @@ define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }

 define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1588,25 +1678,71 @@ define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }

 define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i256:
-; ALL: # %bb.0:
-; ALL-NEXT: movq 16(%rdi), %rax
-; ALL-NEXT: movq 24(%rdi), %rcx
-; ALL-NEXT: movq (%rdi), %rdx
-; ALL-NEXT: movq 8(%rdi), %rdi
-; ALL-NEXT: notq %rdi
-; ALL-NEXT: notq %rdx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rdx, (%rsi)
-; ALL-NEXT: movq %rdi, 8(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rdx, 32(%rsi)
-; ALL-NEXT: movq %rdi, 40(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i256:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq 16(%rdi), %rax
+; SCALAR-NEXT: movq 24(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rdx
+; SCALAR-NEXT: movq 8(%rdi), %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rdx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rdx, (%rsi)
+; SCALAR-NEXT: movq %rdi, 8(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rdx, 32(%rsi)
+; SCALAR-NEXT: movq %rdi, 40(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pxor 16(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm1, (%rsi)
+; SSE-NEXT: movdqa %xmm1, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, 16(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vextractf128 $1, %ymm0, 48(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %in.elt.not = load i256, ptr %in.elt.ptr, align 64
   %in.elt = xor i256 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i256, ptr %out.vec.ptr, i64 0
@@ -1616,14 +1752,8 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; AVX512: {{.*}}
 ; AVX512BW: {{.*}}
 ; AVX512F: {{.*}}
-; SCALAR: {{.*}}
-; SSE: {{.*}}
 ; SSE2: {{.*}}
 ; SSE2-ONLY: {{.*}}
 ; SSE3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
index ffdb68c7a6c01..8ecdc064e4dfb 100644
--- a/llvm/test/CodeGen/X86/pr166744.ll
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -14,18 +14,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; POSTRA-NEXT: btrl %esi, %ecx
 ; POSTRA-NEXT: orl %ecx, %edx
 ; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
-; POSTRA-NEXT: movq 16(%rdi), %rax
-; POSTRA-NEXT: movq (%rdi), %rcx
-; POSTRA-NEXT: movq 24(%rdi), %rdx
-; POSTRA-NEXT: movq 8(%rdi), %rsi
-; POSTRA-NEXT: orq 56(%rdi), %rdx
-; POSTRA-NEXT: orq 40(%rdi), %rsi
-; POSTRA-NEXT: orq 48(%rdi), %rax
-; POSTRA-NEXT: orq 32(%rdi), %rcx
-; POSTRA-NEXT: orq %rdx, %rsi
-; POSTRA-NEXT: orq %rax, %rcx
-; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; POSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; POSTRA-NEXT: vptest %ymm0, %ymm0
 ; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: vzeroupper
 ; POSTRA-NEXT: retq
 ;
 ; NOPOSTRA-LABEL: PR166744:
@@ -38,18 +31,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
 ; NOPOSTRA-NEXT: orl %ecx, %eax
 ; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
-; NOPOSTRA-NEXT: movq 16(%rdi), %rax
-; NOPOSTRA-NEXT: movq (%rdi), %rcx
-; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
-; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
-; NOPOSTRA-NEXT: orq 48(%rdi), %rax
-; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
-; NOPOSTRA-NEXT: orq %rsi, %rdx
-; NOPOSTRA-NEXT: orq %rax, %rcx
-; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; NOPOSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; NOPOSTRA-NEXT: vptest %ymm0, %ymm0
 ; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: vzeroupper
 ; NOPOSTRA-NEXT: retq
   %rem = and i64 %idx, 511
   %sh_prom = zext nneg i64 %rem to i512
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 6e43b897caef1..166b7abc9e053 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

@@ -433,15 +433,23 @@
 }

 define i1 @vecmp_load128x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: orq 24(%rdi), %rcx
-; CHECK-NEXT: orq 16(%rdi), %rax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: orq 24(%rdi), %rcx
+; SSE-NEXT: orq 16(%rdi), %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vecmp_load128x2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
   %p1 = getelementptr i8, ptr %p0, i64 16
   %i0 = load i128, ptr %p0, align 1
   %i1 = load i128, ptr %p1, align 1
@@ -453,21 +461,42 @@
 }

 define i1 @vecmp_load128x4(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq 24(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rcx
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq %rax, %rsi
-; CHECK-NEXT: orq 56(%rdi), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq %rsi, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x4:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rcx
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq 56(%rdi), %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load128x4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpor 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vptest %xmm0, %xmm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load128x4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX512-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm1 | mem
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: retq
   %p1 = getelementptr i8, ptr %p0, i64 16
   %p2 = getelementptr i8, ptr %p0, i64 32
   %p3 = getelementptr i8, ptr %p0, i64 48
@@ -486,21 +515,39 @@

 ; PR144861
 define i1 @vecmp_load256x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load256x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rdi), %rax
-; CHECK-NEXT: movq (%rdi), %rcx
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rcx
-; CHECK-NEXT: orq %rsi, %rcx
-; CHECK-NEXT: orq 56(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rdx
-; CHECK-NEXT: orq %rax, %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load256x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 24(%rdi), %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: orq 56(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rdx
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load256x2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vorps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load256x2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %p1 = getelementptr i8, ptr %p0, i64 32
   %i0 = load i256, ptr %p0, align 1
   %i1 = load i256, ptr %p1, align 1
@@ -512,33 +559,55 @@
 }

 define i1 @vecmp_load512x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load512x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rdi), %rax
-; CHECK-NEXT: movq 56(%rdi), %rdx
-; CHECK-NEXT: movq 40(%rdi), %rsi
-; CHECK-NEXT: movq 16(%rdi), %rcx
-; CHECK-NEXT: movq 48(%rdi), %r8
-; CHECK-NEXT: movq (%rdi), %r9
-; CHECK-NEXT: movq 8(%rdi), %r10
-; CHECK-NEXT: movq 32(%rdi), %r11
-; CHECK-NEXT: orq 96(%rdi), %r11
-; CHECK-NEXT: orq 64(%rdi), %r9
-; CHECK-NEXT: orq %r11, %r9
-; CHECK-NEXT: orq 112(%rdi), %r8
-; CHECK-NEXT: orq 80(%rdi), %rcx
-; CHECK-NEXT: orq %r8, %rcx
-; CHECK-NEXT: orq %r9, %rcx
-; CHECK-NEXT: orq 104(%rdi), %rsi
-; CHECK-NEXT: orq 72(%rdi), %r10
-; CHECK-NEXT: orq %rsi, %r10
-; CHECK-NEXT: orq 120(%rdi), %rdx
-; CHECK-NEXT: orq 88(%rdi), %rax
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: orq %r10, %rax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load512x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 24(%rdi), %rax
+; SSE-NEXT: movq 56(%rdi), %rdx
+; SSE-NEXT: movq 40(%rdi), %rsi
+; SSE-NEXT: movq 16(%rdi), %rcx
+; SSE-NEXT: movq 48(%rdi), %r8
+; SSE-NEXT: movq (%rdi), %r9
+; SSE-NEXT: movq 8(%rdi), %r10
+; SSE-NEXT: movq 32(%rdi), %r11
+; SSE-NEXT: orq 96(%rdi), %r11
+; SSE-NEXT: orq 64(%rdi), %r9
+; SSE-NEXT: orq %r11, %r9
+; SSE-NEXT: orq 112(%rdi), %r8
+; SSE-NEXT: orq 80(%rdi), %rcx
+; SSE-NEXT: orq %r8, %rcx
+; SSE-NEXT: orq %r9, %rcx
+; SSE-NEXT: orq 104(%rdi), %rsi
+; SSE-NEXT: orq 72(%rdi), %r10
+; SSE-NEXT: orq %rsi, %r10
+; SSE-NEXT: orq 120(%rdi), %rdx
+; SSE-NEXT: orq 88(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load512x2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vorps 96(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vorps 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load512x2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: vporq 64(%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %p1 = getelementptr i8, ptr %p0, i64 64
   %i0 = load i512, ptr %p0, align 1
   %i1 = load i512, ptr %p1, align 1
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 23c3e845f2276..d27b032058bc7 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW

@@ -75,47 +75,24 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
 define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
 ; SSE2-LABEL: ne_i256:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %r8
-; SSE2-NEXT: xorq %rcx, %r8
-; SSE2-NEXT: orq %rdi, %r8
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorq %rdx, %rax
-; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: xorl $15, %ecx
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: ne_i256:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: movq %xmm1, %rcx
-; SSE41-NEXT: pextrq $1, %xmm0, %rdx
-; SSE41-NEXT: pextrq $1, %xmm1, %rsi
-; SSE41-NEXT: movq %xmm2, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: movq %xmm3, %r8
-; SSE41-NEXT: xorq %rcx, %r8
-; SSE41-NEXT: orq %rdi, %r8
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: xorq %rdx, %rax
-; SSE41-NEXT: pextrq $1, %xmm3, %rcx
-; SSE41-NEXT: xorq %rsi, %rcx
-; SSE41-NEXT: orq %rax, %rcx
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %r8, %rcx
+; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
@@ -155,47 +132,24 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
 ; SSE2-LABEL: eq_i256:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %r8
-; SSE2-NEXT: xorq %rcx, %r8
-; SSE2-NEXT: orq %rdi, %r8
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorq %rdx, %rax
-; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: xorl $15, %ecx
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: eq_i256:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: movq %xmm1, %rcx
-; SSE41-NEXT: pextrq $1, %xmm0, %rdx
-; SSE41-NEXT: pextrq $1, %xmm1, %rsi
-; SSE41-NEXT: movq %xmm2, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: movq %xmm3, %r8
-; SSE41-NEXT: xorq %rcx, %r8
-; SSE41-NEXT: orq %rdi, %r8
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: xorq %rdx, %rax
-; SSE41-NEXT: pextrq $1, %xmm3, %rcx
-; SSE41-NEXT: xorq %rsi, %rcx
-; SSE41-NEXT: orq %rax, %rcx
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %r8, %rcx
+; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: sete %al
 ; SSE41-NEXT: retq
 ;
@@ -235,166 +189,53 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-LABEL: ne_i512:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %r8
-; SSE2-NEXT: movq %xmm0, %r9
-; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %xmm1, %rcx
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %r11
-; SSE2-NEXT: xorq %rdx, %r11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: orq %r11, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rdi, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rsi, %rdi
-; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: movq %xmm4, %rdx
-; SSE2-NEXT: xorq %r9, %rdx
-; SSE2-NEXT: movq %xmm6, %rsi
-; SSE2-NEXT: xorq %r10, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: movq %xmm5, %rdx
-; SSE2-NEXT: xorq %rcx, %rdx
-; SSE2-NEXT: movq %xmm7, %rcx
-; SSE2-NEXT: xorq %rax, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rcx
+; SSE2-NEXT: xorl $15, %ecx
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: ne_i512:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: movq %xmm2, %rdx
-; SSE41-NEXT: movq %xmm1, %rsi
-; SSE41-NEXT: movq %xmm3, %rdi
-; SSE41-NEXT: pextrq $1, %xmm0, %r8
-; SSE41-NEXT: pextrq $1, %xmm2, %r9
-; SSE41-NEXT: pextrq $1, %xmm1, %r10
-; SSE41-NEXT: pextrq $1, %xmm3, %rax
-; SSE41-NEXT: movq %xmm4, %r11
-; SSE41-NEXT: xorq %rcx, %r11
-; SSE41-NEXT: movq %xmm6, %rcx
-; SSE41-NEXT: xorq %rdx, %rcx
-; SSE41-NEXT: orq %r11, %rcx
-; SSE41-NEXT: movq %xmm5, %rdx
-; SSE41-NEXT: xorq %rsi, %rdx
-; SSE41-NEXT: movq %xmm7, %rsi
-; SSE41-NEXT: xorq %rdi, %rsi
-; SSE41-NEXT: orq %rdx, %rsi
-; SSE41-NEXT: orq %rcx, %rsi
-; SSE41-NEXT: pextrq $1, %xmm4, %rcx
-; SSE41-NEXT: xorq %r8, %rcx
-; SSE41-NEXT: pextrq $1, %xmm6, %rdx
-; SSE41-NEXT: xorq %r9, %rdx
-; SSE41-NEXT: orq %rcx, %rdx
-; SSE41-NEXT: pextrq $1, %xmm5, %rcx
-; SSE41-NEXT: xorq %r10, %rcx
-; SSE41-NEXT: pextrq $1, %xmm7, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: orq %rcx, %rdi
-; SSE41-NEXT: orq %rdx, %rdi
+; SSE41-NEXT: pxor %xmm7, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rsi, %rdi
+; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: ne_i512:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: vmovq %xmm1, %rsi
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rdi
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %r8
-; AVX1-NEXT: vpextrq $1, %xmm0, %r9
-; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm5, %rax
-; AVX1-NEXT: vmovq %xmm2, %r11
-; AVX1-NEXT: xorq %rdx, %r11
-; AVX1-NEXT: vmovq %xmm3, %rdx
-; AVX1-NEXT: xorq %rsi, %rdx
-; AVX1-NEXT: orq %r11, %rdx
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rsi
-; AVX1-NEXT: xorq %rdi, %rsi
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vmovq %xmm1, %rdi
-; AVX1-NEXT: xorq %r8, %rdi
-; AVX1-NEXT: orq %rsi, %rdi
-; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX1-NEXT: xorq %r9, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX1-NEXT: xorq %r10, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: xorq %rcx, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: orq %rdx, %rcx
-; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: vptest %ymm0, %ymm0
 ; AVX1-NEXT: setne %al
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: ne_i512:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: vmovq %xmm1, %rsi
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rdi
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vmovq %xmm5, %r8
-; AVX2-NEXT: vpextrq $1, %xmm0, %r9
-; AVX2-NEXT: vpextrq $1, %xmm1, %r10
-; AVX2-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm5, %rax
-; AVX2-NEXT: vmovq %xmm2, %r11
-; AVX2-NEXT: xorq %rdx, %r11
-; AVX2-NEXT: vmovq %xmm3, %rdx
-; AVX2-NEXT: xorq %rsi, %rdx
-; AVX2-NEXT: orq %r11, %rdx
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rsi
-; AVX2-NEXT: xorq %rdi, %rsi
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: xorq %r8, %rdi
-; AVX2-NEXT: orq %rsi, %rdi
-; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX2-NEXT: xorq %r9, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX2-NEXT: xorq %r10, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: xorq %rcx, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: xorq %rax, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rcx
+; AVX2-NEXT: vptest %ymm0, %ymm0
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -417,166 +258,53 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-LABEL: eq_i512:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %r8
-; SSE2-NEXT: movq %xmm0, %r9
-; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %xmm1, %rcx
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %r11
-; SSE2-NEXT: xorq %rdx, %r11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: orq %r11, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rdi, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rsi, %rdi
-; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: movq %xmm4, %rdx
-; SSE2-NEXT: xorq %r9, %rdx
-; SSE2-NEXT: movq %xmm6, %rsi
-; SSE2-NEXT: xorq %r10, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: movq %xmm5, %rdx
-; SSE2-NEXT: xorq %rcx, %rdx
-; SSE2-NEXT: movq %xmm7, %rcx
-; SSE2-NEXT: xorq %rax, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rcx
+; SSE2-NEXT: xorl $15, %ecx
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: eq_i512:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: movq %xmm2, %rdx
-; SSE41-NEXT: movq %xmm1, %rsi
-; SSE41-NEXT: movq %xmm3, %rdi
-; SSE41-NEXT: pextrq $1, %xmm0, %r8
-; SSE41-NEXT: pextrq $1, %xmm2, %r9
-; SSE41-NEXT: pextrq $1, %xmm1, %r10
-; SSE41-NEXT: pextrq $1, %xmm3, %rax
-; SSE41-NEXT: movq %xmm4, %r11
-; SSE41-NEXT: xorq %rcx, %r11
-; SSE41-NEXT: movq %xmm6, %rcx
-; SSE41-NEXT: xorq %rdx, %rcx
-; SSE41-NEXT: orq %r11, %rcx
-; SSE41-NEXT: movq %xmm5, %rdx
-; SSE41-NEXT: xorq %rsi, %rdx
-; SSE41-NEXT: movq %xmm7, %rsi
-; SSE41-NEXT: xorq %rdi, %rsi
-; SSE41-NEXT: orq %rdx, %rsi
-; SSE41-NEXT: orq %rcx, %rsi
-; SSE41-NEXT: pextrq $1, %xmm4, %rcx
-; SSE41-NEXT: xorq %r8, %rcx
-; SSE41-NEXT: pextrq $1, %xmm6, %rdx
-; SSE41-NEXT: xorq %r9, %rdx
-; SSE41-NEXT: orq %rcx, %rdx
-; SSE41-NEXT: pextrq $1, %xmm5, %rcx
-; SSE41-NEXT: xorq %r10, %rcx
-; SSE41-NEXT: pextrq $1, %xmm7, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: orq %rcx, %rdi
-; SSE41-NEXT: orq %rdx, %rdi
+; SSE41-NEXT: pxor %xmm7, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rsi, %rdi
+; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: sete %al
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: eq_i512:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: vmovq %xmm1, %rsi
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rdi
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %r8
-; AVX1-NEXT: vpextrq $1, %xmm0, %r9
-; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm5, %rax
-; AVX1-NEXT: vmovq %xmm2, %r11
-; AVX1-NEXT: xorq %rdx, %r11
-; AVX1-NEXT: vmovq %xmm3, %rdx
-; AVX1-NEXT: xorq %rsi, %rdx
-; AVX1-NEXT: orq %r11, %rdx
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rsi
-; AVX1-NEXT: xorq %rdi, %rsi
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vmovq %xmm1, %rdi
-; AVX1-NEXT: xorq %r8, %rdi
-; AVX1-NEXT: orq %rsi, %rdi
-; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX1-NEXT: xorq %r9, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX1-NEXT: xorq %r10, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: xorq %rcx, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: orq %rdx, %rcx
-; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: vptest %ymm0, %ymm0
 ; AVX1-NEXT: sete %al
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: eq_i512:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: vmovq %xmm1, %rsi
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rdi
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vmovq %xmm5, %r8
-; AVX2-NEXT: vpextrq $1, %xmm0, %r9
-; AVX2-NEXT: vpextrq $1, %xmm1, %r10
-; AVX2-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm5, %rax
-; AVX2-NEXT: vmovq %xmm2, %r11
-; AVX2-NEXT: xorq %rdx, %r11
-; AVX2-NEXT: vmovq %xmm3, %rdx
-; AVX2-NEXT: xorq %rsi, %rdx
-; AVX2-NEXT: orq %r11, %rdx
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rsi
-; AVX2-NEXT: xorq %rdi, %rsi
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: xorq %r8, %rdi
-; AVX2-NEXT: orq %rsi, %rdi
-; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX2-NEXT: xorq %r9, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX2-NEXT: xorq %r10, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: xorq %rcx, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: xorq %rax, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rcx
+; AVX2-NEXT: vptest %ymm0, %ymm0
 ; AVX2-NEXT: sete %al
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -868,34 +596,44 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) {
 ; if we allowed 2 pairs of 32-byte loads per block.

 define i32 @ne_i256_pair(ptr %a, ptr %b) {
-; SSE-LABEL: ne_i256_pair:
-; SSE: # %bb.0:
-; SSE-NEXT: movq 16(%rdi), %rax
-; SSE-NEXT: movq 24(%rdi), %rcx
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: xorq 8(%rsi), %r8
-; SSE-NEXT: xorq 24(%rsi), %rcx
-; SSE-NEXT: xorq (%rsi), %rdx
-; SSE-NEXT: xorq 16(%rsi), %rax
-; SSE-NEXT: movq 48(%rdi), %r9
-; SSE-NEXT: movq 32(%rdi), %r10
-; SSE-NEXT: movq 56(%rdi), %r11
-; SSE-NEXT: movq 40(%rdi), %rdi
-; SSE-NEXT: xorq 40(%rsi), %rdi
-; SSE-NEXT: orq %r8, %rdi
-; SSE-NEXT: xorq 56(%rsi), %r11
-; SSE-NEXT: orq %rcx, %r11
-; SSE-NEXT: orq %rdi, %r11
-; SSE-NEXT: xorq 32(%rsi), %r10
-; SSE-NEXT: orq %rdx, %r10
-; SSE-NEXT: xorq 48(%rsi), %r9
-; SSE-NEXT: orq %rax, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: orq %r11, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
+; SSE2-LABEL: ne_i256_pair:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pxor (%rsi), %xmm0
+; SSE2-NEXT: pxor 16(%rsi), %xmm1
+; SSE2-NEXT: pxor 32(%rsi), %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor 48(%rsi), %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %ecx
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ne_i256_pair:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor 32(%rsi), %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor 48(%rsi), %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm3, %xmm3
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: ne_i256_pair:
 ; AVX1: # %bb.0:
@@ -953,34 +691,44 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) {
 ; if we allowed 2 pairs of 32-byte loads per block.

 define i32 @eq_i256_pair(ptr %a, ptr %b) {
-; SSE-LABEL: eq_i256_pair:
-; SSE: # %bb.0:
-; SSE-NEXT: movq 16(%rdi), %rax
-; SSE-NEXT: movq 24(%rdi), %rcx
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: xorq 8(%rsi), %r8
-; SSE-NEXT: xorq 24(%rsi), %rcx
-; SSE-NEXT: xorq (%rsi), %rdx
-; SSE-NEXT: xorq 16(%rsi), %rax
-; SSE-NEXT: movq 48(%rdi), %r9
-; SSE-NEXT: movq 32(%rdi), %r10
-; SSE-NEXT: movq 56(%rdi), %r11
-; SSE-NEXT: movq 40(%rdi), %rdi
-; SSE-NEXT: xorq 40(%rsi), %rdi
-; SSE-NEXT: orq %r8, %rdi
-; SSE-NEXT: xorq 56(%rsi), %r11
-; SSE-NEXT: orq %rcx, %r11
-; SSE-NEXT: orq %rdi, %r11
-; SSE-NEXT: xorq 32(%rsi), %r10
-; SSE-NEXT: orq %rdx, %r10
-; SSE-NEXT: xorq 48(%rsi), %r9
-; SSE-NEXT: orq %rax, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: orq %r11, %r9
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: eq_i256_pair:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pxor (%rsi), %xmm0
+; SSE2-NEXT: pxor 16(%rsi), %xmm1
+; SSE2-NEXT: pxor 32(%rsi), %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor 48(%rsi), %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %ecx
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: eq_i256_pair:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor 32(%rsi), %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor 48(%rsi), %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm3, %xmm3
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: eq_i256_pair:
 ; AVX1: # %bb.0:
@@ -1038,58 +786,106 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) {
 ; if we allowed 2 pairs of 64-byte loads per block.

 define i32 @ne_i512_pair(ptr %a, ptr %b) {
-; NO512-LABEL: ne_i512_pair:
-; NO512: # %bb.0:
-; NO512-NEXT: movq 40(%rdi), %rax
-; NO512-NEXT: movq 56(%rdi), %rcx
-; NO512-NEXT: movq 24(%rdi), %rdx
-; NO512-NEXT: xorq 24(%rsi), %rdx
-; NO512-NEXT: xorq 56(%rsi), %rcx
-; NO512-NEXT: movq 88(%rdi), %r8
-; NO512-NEXT: xorq 88(%rsi), %r8
-; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 120(%rdi), %rdx
-; NO512-NEXT: xorq 120(%rsi), %rdx
-; NO512-NEXT: orq %rcx, %rdx
-; NO512-NEXT: movq 8(%rdi), %rcx
-; NO512-NEXT: xorq 8(%rsi), %rcx
-; NO512-NEXT: xorq 40(%rsi), %rax
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: movq 72(%rdi), %r8
-; NO512-NEXT: xorq 72(%rsi), %r8
-; NO512-NEXT: orq %rcx, %r8
-; NO512-NEXT: movq 104(%rdi), %rcx
-; NO512-NEXT: xorq 104(%rsi), %rcx
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 48(%rdi), %rax
-; NO512-NEXT: orq %r8, %rcx
-; NO512-NEXT: movq 16(%rdi), %r8
-; NO512-NEXT: xorq 16(%rsi), %r8
-; NO512-NEXT: xorq 48(%rsi), %rax
-; NO512-NEXT: orq %rdx, %rcx
-; NO512-NEXT: movq 80(%rdi), %rdx
-; NO512-NEXT: xorq 80(%rsi), %rdx
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: movq 112(%rdi), %r8
-; NO512-NEXT: xorq 112(%rsi), %r8
-; NO512-NEXT: orq %rax, %r8
-; NO512-NEXT: movq (%rdi), %rax
-; NO512-NEXT: xorq (%rsi), %rax
-; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 64(%rdi), %rdx
-; NO512-NEXT: xorq 64(%rsi), %rdx
-; NO512-NEXT: orq %rax, %rdx
-; NO512-NEXT: movq 32(%rdi), %rax
-; NO512-NEXT: xorq 32(%rsi), %rax
-; NO512-NEXT: movq 96(%rdi), %rdi
-; NO512-NEXT: xorq 96(%rsi), %rdi
-; NO512-NEXT: orq %rax, %rdi
-; NO512-NEXT: orq %rdx, %rdi
-; NO512-NEXT: orq %r8, %rdi
-; NO512-NEXT: xorl %eax, %eax
-; NO512-NEXT: orq %rcx, %rdi
-; NO512-NEXT: setne %al
-; NO512-NEXT: retq
+; SSE2-LABEL: ne_i512_pair:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pxor 16(%rsi), %xmm1
+; SSE2-NEXT: pxor 48(%rsi), %xmm3
+; SSE2-NEXT: pxor (%rsi), %xmm0
+; SSE2-NEXT: pxor 32(%rsi), %xmm2
+; SSE2-NEXT: movdqa 96(%rdi), %xmm4
+; SSE2-NEXT: movdqa 64(%rdi), %xmm5
+; SSE2-NEXT: movdqa 112(%rdi), %xmm6
+; SSE2-NEXT: movdqa 80(%rdi), %xmm7
+; SSE2-NEXT: pxor 80(%rsi), %xmm7
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pxor 112(%rsi), %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: pxor 64(%rsi), %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pxor 96(%rsi), %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %ecx
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ne_i512_pair:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor 48(%rsi), %xmm3
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: pxor 32(%rsi), %xmm2
+; SSE41-NEXT: movdqa 96(%rdi), %xmm4
+; SSE41-NEXT: movdqa 64(%rdi), %xmm5
+; SSE41-NEXT: movdqa 112(%rdi), %xmm6
+; SSE41-NEXT: movdqa 80(%rdi), %xmm7
+; SSE41-NEXT: pxor 80(%rsi), %xmm7
+; SSE41-NEXT: por %xmm1, %xmm7
+; SSE41-NEXT: pxor 112(%rsi), %xmm6
+; SSE41-NEXT: por %xmm3, %xmm6
+; SSE41-NEXT: por %xmm7, %xmm6
+; SSE41-NEXT: pxor 64(%rsi), %xmm5
+; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pxor 96(%rsi), %xmm4
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: por %xmm5, %xmm4
+; SSE41-NEXT: por %xmm6, %xmm4
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm4, %xmm4
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ne_i512_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vmovups 64(%rdi), %ymm2
+; AVX1-NEXT: vmovups 96(%rdi), %ymm3
+; AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
+; AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
+; AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps 96(%rsi), %ymm3, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ne_i512_pair:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
+; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
+; AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor 96(%rsi), %ymm3, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: ne_i512_pair:
 ; AVX512F: # %bb.0:
@@ -1132,58 +928,106 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) {
 ; if we allowed 2 pairs of 64-byte loads per block.

 define i32 @eq_i512_pair(ptr %a, ptr %b) {
-; NO512-LABEL: eq_i512_pair:
-; NO512: # %bb.0:
-; NO512-NEXT: movq 40(%rdi), %rax
-; NO512-NEXT: movq 56(%rdi), %rcx
-; NO512-NEXT: movq 24(%rdi), %rdx
-; NO512-NEXT: xorq 24(%rsi), %rdx
-; NO512-NEXT: xorq 56(%rsi), %rcx
-; NO512-NEXT: movq 88(%rdi), %r8
-; NO512-NEXT: xorq 88(%rsi), %r8
-; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 120(%rdi), %rdx
-; NO512-NEXT: xorq 120(%rsi), %rdx
-; NO512-NEXT: orq %rcx, %rdx
-; NO512-NEXT: movq 8(%rdi), %rcx
-; NO512-NEXT: xorq 8(%rsi), %rcx
-; NO512-NEXT: xorq 40(%rsi), %rax
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: movq 72(%rdi), %r8
-; NO512-NEXT: xorq 72(%rsi), %r8
-; NO512-NEXT: orq %rcx, %r8
-; NO512-NEXT: movq 104(%rdi), %rcx
-; NO512-NEXT: xorq 104(%rsi), %rcx
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 48(%rdi), %rax
-; NO512-NEXT: orq %r8, %rcx
-; NO512-NEXT: movq 16(%rdi), %r8
-; NO512-NEXT: xorq 16(%rsi), %r8
-; NO512-NEXT: xorq 48(%rsi), %rax
-; NO512-NEXT: orq %rdx, %rcx
-; NO512-NEXT: movq 80(%rdi), %rdx
-; NO512-NEXT: xorq 80(%rsi), %rdx
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: movq 112(%rdi), %r8
-; NO512-NEXT: xorq 112(%rsi), %r8
-; NO512-NEXT: orq %rax, %r8
-; NO512-NEXT: movq (%rdi), %rax
-; NO512-NEXT: xorq (%rsi), %rax
-; NO512-NEXT: orq %rdx, %r8
-; NO512-NEXT: movq 64(%rdi), %rdx
-; NO512-NEXT: xorq 64(%rsi), %rdx
-; NO512-NEXT: orq %rax, %rdx
-; NO512-NEXT: movq 32(%rdi), %rax
-; NO512-NEXT: xorq 32(%rsi), %rax
-; NO512-NEXT: movq 96(%rdi), %rdi
-; NO512-NEXT: xorq 96(%rsi), %rdi
-; NO512-NEXT: orq %rax, %rdi
-; NO512-NEXT: orq %rdx, %rdi
-; NO512-NEXT: orq %r8, %rdi
-; NO512-NEXT: xorl %eax, %eax
-; NO512-NEXT: orq %rcx, %rdi
-; NO512-NEXT: sete %al
-; NO512-NEXT: retq
+; SSE2-LABEL: eq_i512_pair:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pxor 16(%rsi), %xmm1
+; SSE2-NEXT: pxor 48(%rsi), %xmm3
+; SSE2-NEXT: pxor (%rsi), %xmm0
+; SSE2-NEXT: pxor 32(%rsi), %xmm2
+; SSE2-NEXT: movdqa 96(%rdi), %xmm4
+; SSE2-NEXT: movdqa 64(%rdi), %xmm5
+; SSE2-NEXT: movdqa 112(%rdi), %xmm6
+; SSE2-NEXT: movdqa 80(%rdi), %xmm7
+; SSE2-NEXT: pxor 80(%rsi), %xmm7
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pxor 112(%rsi), %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: pxor 64(%rsi), %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pxor 96(%rsi), %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %ecx
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: eq_i512_pair:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor 48(%rsi), %xmm3
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: pxor 32(%rsi), %xmm2
+; SSE41-NEXT: movdqa 96(%rdi), %xmm4
+; SSE41-NEXT: movdqa 64(%rdi), %xmm5
+; SSE41-NEXT: movdqa 112(%rdi), %xmm6
+; SSE41-NEXT: movdqa 80(%rdi), %xmm7
+; SSE41-NEXT: pxor 80(%rsi), %xmm7
+; SSE41-NEXT: por %xmm1, %xmm7
+; SSE41-NEXT: pxor 112(%rsi), %xmm6
+; SSE41-NEXT: por %xmm3, %xmm6
+; SSE41-NEXT: por %xmm7, %xmm6
+; SSE41-NEXT: pxor 64(%rsi), %xmm5
+; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pxor 96(%rsi), %xmm4
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: por %xmm5, %xmm4
+; SSE41-NEXT: por %xmm6, %xmm4
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm4, %xmm4
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: eq_i512_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vmovups 64(%rdi), %ymm2
+; AVX1-NEXT: vmovups 96(%rdi), %ymm3
+; AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
+; AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
+; AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps 96(%rsi), %ymm3, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: eq_i512_pair:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
+; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
+; AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor 96(%rsi), %ymm3, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: eq_i512_pair:
 ; AVX512F: # %bb.0:
@@ -1481,15 +1325,24 @@ define i1 @allbits_i128_load_arg(ptr %w) {
 }

 define i1 @anybits_i256_load_arg(ptr %w) {
-; SSE-LABEL: anybits_i256_load_arg:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: orq 24(%rdi), %rcx
-; SSE-NEXT: orq 16(%rdi), %rax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
+; SSE2-LABEL: anybits_i256_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: por 16(%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: anybits_i256_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: por 16(%rdi), %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: anybits_i256_load_arg:
 ; AVX: # %bb.0:
@@ -1504,16 +1357,25 @@ define i1 @anybits_i256_load_arg(ptr %w) {
 }

 define i1 @allbits_i256_load_arg(ptr %w) {
-; SSE-LABEL: allbits_i256_load_arg:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: andq 24(%rdi), %rcx
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: andq %rcx, %rax
-; SSE-NEXT: cmpq $-1, %rax
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: allbits_i256_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pand 16(%rdi), %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allbits_i256_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pand 16(%rdi), %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: allbits_i256_load_arg:
 ; AVX1: # %bb.0:
@@ -1548,21 +1410,48 @@ define i1 @allbits_i256_load_arg(ptr %w) {
 }

 define i1 @anybits_i512_load_arg(ptr %w) {
-; NO512-LABEL: anybits_i512_load_arg:
-; NO512: # %bb.0:
-; NO512-NEXT: movq 16(%rdi), %rax
-; NO512-NEXT: movq (%rdi), %rcx
-; NO512-NEXT: movq 8(%rdi), %rdx
-; NO512-NEXT: movq 24(%rdi), %rsi
-; NO512-NEXT: orq 56(%rdi), %rsi
-; NO512-NEXT: orq 40(%rdi), %rdx
-; NO512-NEXT: orq %rsi, %rdx
-; NO512-NEXT: orq 48(%rdi), %rax
-; NO512-NEXT: orq 32(%rdi), %rcx
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: orq %rdx, %rcx
-; NO512-NEXT: setne %al
-; NO512-NEXT: retq
+; SSE2-LABEL: anybits_i512_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: por 48(%rdi), %xmm1
+; SSE2-NEXT: por 32(%rdi), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: anybits_i512_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: por 48(%rdi), %xmm1
+; SSE41-NEXT: por 32(%rdi), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: anybits_i512_load_arg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vorps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: anybits_i512_load_arg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: anybits_i512_load_arg:
 ; AVX512: # %bb.0:
@@ -1578,22 +1467,52 @@ define i1 @anybits_i512_load_arg(ptr %w) {
 }

 define i1 @allbits_i512_load_arg(ptr %w) {
-; NO512-LABEL: allbits_i512_load_arg:
-; NO512: # %bb.0:
-; NO512-NEXT: movq 16(%rdi), %rax
-; NO512-NEXT: movq (%rdi), %rcx
-; NO512-NEXT: movq 8(%rdi), %rdx
-; NO512-NEXT: movq 24(%rdi), %rsi
-; NO512-NEXT: andq 56(%rdi), %rsi
-; NO512-NEXT: andq 40(%rdi), %rdx
-; NO512-NEXT: andq %rsi, %rdx
-; NO512-NEXT: andq 48(%rdi), %rax
-; NO512-NEXT: andq 32(%rdi), %rcx
-; NO512-NEXT: andq %rax, %rcx
-; NO512-NEXT: andq %rdx, %rcx
-; NO512-NEXT: cmpq $-1, %rcx
-; NO512-NEXT: sete %al
-; NO512-NEXT: retq
+; SSE2-LABEL: allbits_i512_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: pand 48(%rdi), %xmm1
+; SSE2-NEXT: pand 32(%rdi), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allbits_i512_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pand 48(%rdi), %xmm1
+; SSE41-NEXT: pand 32(%rdi), %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: allbits_i512_load_arg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vandps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vptest %ymm1, %ymm0
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: allbits_i512_load_arg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpand 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: allbits_i512_load_arg:
 ; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 01fbafb18eb9f..21915a0d50a77 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=SCALAR
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,SSE2-ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE2,SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2,SSSE3,SSSE3-ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE2,SSSE3,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE2,SSSE3,SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=SSE,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=SSE,AVX,AVX2,AVX2-ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=SSE,AVX,AVX2,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=SSE,AVX,AVX2,AVX512,AVX512BW

 define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec32_v2i8:
@@ -5679,29 +5679,56 @@ define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 }

 define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_v2i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq 16(%rdi), %rax
-; ALL-NEXT: movq 24(%rdi), %rcx
-; ALL-NEXT: movq (%rdi), %r8
-; ALL-NEXT: movq 8(%rdi), %rdi
-; ALL-NEXT: notq %rdi
-; ALL-NEXT: notq %r8
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %r8, (%rsi)
-; ALL-NEXT: movq %rdi, 8(%rsi)
-; ALL-NEXT: movq %rax, 16(%rdx)
-; ALL-NEXT: movq %rcx, 24(%rdx)
-; ALL-NEXT: movq %r8, (%rdx)
-; ALL-NEXT: movq %rdi, 8(%rdx)
-; ALL-NEXT: movq %rax, 48(%rdx)
-; ALL-NEXT: movq %rcx, 56(%rdx)
-; ALL-NEXT: movq %r8, 32(%rdx)
-; ALL-NEXT: movq %rdi, 40(%rdx)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_v2i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq 16(%rdi), %rax
+; SCALAR-NEXT: movq 24(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %r8
+; SCALAR-NEXT: movq 8(%rdi), %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %r8
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %r8, (%rsi)
+; SCALAR-NEXT: movq %rdi, 8(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rdx)
+; SCALAR-NEXT: movq %rcx, 24(%rdx)
+; SCALAR-NEXT: movq %r8, (%rdx)
+; SCALAR-NEXT: movq %rdi, 8(%rdx)
+; SCALAR-NEXT: movq %rax, 48(%rdx)
+; SCALAR-NEXT: movq %rcx, 56(%rdx)
+; SCALAR-NEXT: movq %r8, 32(%rdx)
+; SCALAR-NEXT: movq %rdi, 40(%rdx)
+; SCALAR-NEXT: retq
+;
+; SSE2-LABEL: vec512_v2i128:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pxor 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE2-NEXT: movdqa %xmm1, (%rsi)
+; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm1, (%rdx)
+; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
+; SSE2-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: vec512_v2i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm1
+; AVX-NEXT: vpxor 16(%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:
vmovdqa %xmm0, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX-NEXT: retq
 %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64
 %in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1>
 store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64
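
Note (illustrative, not part of the patch): the checks above are autogenerated, so as a minimal sketch of the shape these tests exercise, here is the kind of IR that the new Custom i256 handling is intended to keep on the vector unit (e.g. por + ptest on SSE4.1) instead of splitting into scalar orq chains. The function name @anybits_i256_sketch is hypothetical.

; Minimal sketch, assuming the Custom i128/i256/i512 bitwise lowering above.
define i1 @anybits_i256_sketch(ptr %a, ptr %b) {
  %x = load i256, ptr %a, align 32
  %y = load i256, ptr %b, align 32
  ; Both operands fold into vector loads, so the i256 OR is expected to
  ; lower as a vector op rather than four scalar orq instructions.
  %v = or i256 %x, %y
  %any = icmp ne i256 %v, 0
  ret i1 %any
}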