Conversation

@RKSimon
Collaborator

@RKSimon RKSimon commented Dec 10, 2025

If the scalar integer sources are freely transferable to the FPU, then perform the bitlogic op as an SSE/AVX operation.

Uses the mayFoldIntoVector helper added in #171589
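
As a minimal illustration (function name hypothetical, modeled on the vecmp_load128x2 test in llvm/test/CodeGen/X86/ptest.ll), this is the kind of i128 pattern that can now lower to a single vector OR plus vptest on AVX targets, since both sources are loads and therefore freely transferable to the FPU:

; Hypothetical test function, not part of the patch; mirrors vecmp_load128x2 in ptest.ll.
define i1 @or_i128_loads_eq_zero(ptr %p0, ptr %p1) {
  %a = load i128, ptr %p0, align 1
  %b = load i128, ptr %p1, align 1
  %o = or i128 %a, %b
  %c = icmp eq i128 %o, 0
  ret i1 %c
}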

@llvmbot
Member

llvmbot commented Dec 10, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the scalar integer sources are freely transferable to the FPU, then perform the bitlogic op as an SSE/AVX operation.

Uses the mayFoldIntoVector helper added in #171589


Patch is 69.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171616.diff

6 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+29)
  • (modified) llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll (+194-64)
  • (modified) llvm/test/CodeGen/X86/pr166744.ll (+8-22)
  • (modified) llvm/test/CodeGen/X86/ptest.ll (+137-68)
  • (modified) llvm/test/CodeGen/X86/setcc-wide-types.ll (+481-562)
  • (modified) llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll (+60-33)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3b3b20edbbe84..67f46c61cbeac 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1142,6 +1142,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
 
+    setOperationAction(ISD::AND, MVT::i128, Custom);
+    setOperationAction(ISD::OR, MVT::i128, Custom);
+    setOperationAction(ISD::XOR, MVT::i128, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1481,6 +1485,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
 
+    setOperationAction(ISD::AND, MVT::i256, Custom);
+    setOperationAction(ISD::OR, MVT::i256, Custom);
+    setOperationAction(ISD::XOR, MVT::i256, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
@@ -1836,6 +1844,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasDQI())
       setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
+    setOperationAction(ISD::AND, MVT::i512, Custom);
+    setOperationAction(ISD::OR, MVT::i512, Custom);
+    setOperationAction(ISD::XOR, MVT::i512, Custom);
+
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
       setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
@@ -33919,6 +33931,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case X86ISD::CVTPS2PH:
     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
     return;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    EVT VT = N->getValueType(0);
+    assert((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
+           "Unexpected VT!");
+    // See if this is free to perform on the FPU to avoid splitting.
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    if (!mayFoldIntoVector(N0, Subtarget) || !mayFoldIntoVector(N1, Subtarget))
+      return;
+    SDValue Op = DAG.getNode(Opc, dl, VecVT, DAG.getBitcast(VecVT, N0),
+                             DAG.getBitcast(VecVT, N1));
+    Results.push_back(DAG.getBitcast(VT, Op));
+    return;
+  }
   case ISD::CTPOP: {
     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
     // If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
index 6d4be7dbe6349..d9158c4af18fa 100644
--- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
+++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
@@ -621,17 +621,41 @@ define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec256_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec256_i128:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq (%rdi), %rax
-; ALL-NEXT:    movq 8(%rdi), %rcx
-; ALL-NEXT:    notq %rcx
-; ALL-NEXT:    notq %rax
-; ALL-NEXT:    movq %rax, (%rsi)
-; ALL-NEXT:    movq %rcx, 8(%rsi)
-; ALL-NEXT:    movq %rcx, 24(%rsi)
-; ALL-NEXT:    movq %rax, 16(%rsi)
-; ALL-NEXT:    retq
+; SCALAR-LABEL: vec256_i128:
+; SCALAR:       # %bb.0:
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movq 8(%rdi), %rcx
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, (%rsi)
+; SCALAR-NEXT:    movq %rcx, 8(%rsi)
+; SCALAR-NEXT:    movq %rcx, 24(%rsi)
+; SCALAR-NEXT:    movq %rax, 16(%rsi)
+; SCALAR-NEXT:    retq
+;
+; SSE-LABEL: vec256_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pxor (%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, (%rsi)
+; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vec256_i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: vec256_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1034,19 +1058,46 @@ define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec384_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec384_i128:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq (%rdi), %rax
-; ALL-NEXT:    movq 8(%rdi), %rcx
-; ALL-NEXT:    notq %rcx
-; ALL-NEXT:    notq %rax
-; ALL-NEXT:    movq %rax, (%rsi)
-; ALL-NEXT:    movq %rcx, 8(%rsi)
-; ALL-NEXT:    movq %rcx, 24(%rsi)
-; ALL-NEXT:    movq %rax, 16(%rsi)
-; ALL-NEXT:    movq %rcx, 40(%rsi)
-; ALL-NEXT:    movq %rax, 32(%rsi)
-; ALL-NEXT:    retq
+; SCALAR-LABEL: vec384_i128:
+; SCALAR:       # %bb.0:
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movq 8(%rdi), %rcx
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, (%rsi)
+; SCALAR-NEXT:    movq %rcx, 8(%rsi)
+; SCALAR-NEXT:    movq %rcx, 24(%rsi)
+; SCALAR-NEXT:    movq %rax, 16(%rsi)
+; SCALAR-NEXT:    movq %rcx, 40(%rsi)
+; SCALAR-NEXT:    movq %rax, 32(%rsi)
+; SCALAR-NEXT:    retq
+;
+; SSE-LABEL: vec384_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pxor (%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, (%rsi)
+; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqa %xmm0, 32(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vec384_i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT:    vmovdqa %xmm0, 32(%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: vec384_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT:    vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT:    retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1559,21 +1610,60 @@ define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i128:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq (%rdi), %rax
-; ALL-NEXT:    movq 8(%rdi), %rcx
-; ALL-NEXT:    notq %rcx
-; ALL-NEXT:    notq %rax
-; ALL-NEXT:    movq %rax, (%rsi)
-; ALL-NEXT:    movq %rcx, 8(%rsi)
-; ALL-NEXT:    movq %rcx, 24(%rsi)
-; ALL-NEXT:    movq %rax, 16(%rsi)
-; ALL-NEXT:    movq %rcx, 40(%rsi)
-; ALL-NEXT:    movq %rax, 32(%rsi)
-; ALL-NEXT:    movq %rcx, 56(%rsi)
-; ALL-NEXT:    movq %rax, 48(%rsi)
-; ALL-NEXT:    retq
+; SCALAR-LABEL: vec512_i128:
+; SCALAR:       # %bb.0:
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movq 8(%rdi), %rcx
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, (%rsi)
+; SCALAR-NEXT:    movq %rcx, 8(%rsi)
+; SCALAR-NEXT:    movq %rcx, 24(%rsi)
+; SCALAR-NEXT:    movq %rax, 16(%rsi)
+; SCALAR-NEXT:    movq %rcx, 40(%rsi)
+; SCALAR-NEXT:    movq %rax, 32(%rsi)
+; SCALAR-NEXT:    movq %rcx, 56(%rsi)
+; SCALAR-NEXT:    movq %rax, 48(%rsi)
+; SCALAR-NEXT:    retq
+;
+; SSE-LABEL: vec512_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pxor (%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, (%rsi)
+; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqa %xmm0, 32(%rsi)
+; SSE-NEXT:    movdqa %xmm0, 48(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vec512_i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX1-NEXT:    vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vec512_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT:    vmovdqa %ymm0, 32(%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: vec512_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %in.elt.not = load i128, ptr %in.elt.ptr, align 64
   %in.elt = xor i128 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1588,25 +1678,71 @@ define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
 }
 
 define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i256:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq 16(%rdi), %rax
-; ALL-NEXT:    movq 24(%rdi), %rcx
-; ALL-NEXT:    movq (%rdi), %rdx
-; ALL-NEXT:    movq 8(%rdi), %rdi
-; ALL-NEXT:    notq %rdi
-; ALL-NEXT:    notq %rdx
-; ALL-NEXT:    notq %rcx
-; ALL-NEXT:    notq %rax
-; ALL-NEXT:    movq %rax, 16(%rsi)
-; ALL-NEXT:    movq %rcx, 24(%rsi)
-; ALL-NEXT:    movq %rdx, (%rsi)
-; ALL-NEXT:    movq %rdi, 8(%rsi)
-; ALL-NEXT:    movq %rax, 48(%rsi)
-; ALL-NEXT:    movq %rcx, 56(%rsi)
-; ALL-NEXT:    movq %rdx, 32(%rsi)
-; ALL-NEXT:    movq %rdi, 40(%rsi)
-; ALL-NEXT:    retq
+; SCALAR-LABEL: vec512_i256:
+; SCALAR:       # %bb.0:
+; SCALAR-NEXT:    movq 16(%rdi), %rax
+; SCALAR-NEXT:    movq 24(%rdi), %rcx
+; SCALAR-NEXT:    movq (%rdi), %rdx
+; SCALAR-NEXT:    movq 8(%rdi), %rdi
+; SCALAR-NEXT:    notq %rdi
+; SCALAR-NEXT:    notq %rdx
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, 16(%rsi)
+; SCALAR-NEXT:    movq %rcx, 24(%rsi)
+; SCALAR-NEXT:    movq %rdx, (%rsi)
+; SCALAR-NEXT:    movq %rdi, 8(%rsi)
+; SCALAR-NEXT:    movq %rax, 48(%rsi)
+; SCALAR-NEXT:    movq %rcx, 56(%rsi)
+; SCALAR-NEXT:    movq %rdx, 32(%rsi)
+; SCALAR-NEXT:    movq %rdi, 40(%rsi)
+; SCALAR-NEXT:    retq
+;
+; SSE-LABEL: vec512_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pxor 16(%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqa %xmm1, (%rsi)
+; SSE-NEXT:    movdqa %xmm1, 32(%rsi)
+; SSE-NEXT:    movdqa %xmm0, 48(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vec512_i256:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, 16(%rsi)
+; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX1-NEXT:    vextractf128 $1, %ymm0, 48(%rsi)
+; AVX1-NEXT:    vmovaps %xmm0, 32(%rsi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vec512_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, 16(%rsi)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    vextracti128 $1, %ymm0, 48(%rsi)
+; AVX2-NEXT:    vmovdqa %xmm0, 32(%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: vec512_i256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor (%rdi), %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, 16(%rsi)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    vextracti128 $1, %ymm0, 48(%rsi)
+; AVX512-NEXT:    vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %in.elt.not = load i256, ptr %in.elt.ptr, align 64
   %in.elt = xor i256 %in.elt.not, -1
   %out.elt0.ptr = getelementptr i256, ptr %out.vec.ptr, i64 0
@@ -1616,14 +1752,8 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; AVX512: {{.*}}
 ; AVX512BW: {{.*}}
 ; AVX512F: {{.*}}
-; SCALAR: {{.*}}
-; SSE: {{.*}}
 ; SSE2: {{.*}}
 ; SSE2-ONLY: {{.*}}
 ; SSE3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
index ffdb68c7a6c01..8ecdc064e4dfb 100644
--- a/llvm/test/CodeGen/X86/pr166744.ll
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -14,18 +14,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; POSTRA-NEXT:    btrl %esi, %ecx
 ; POSTRA-NEXT:    orl %ecx, %edx
 ; POSTRA-NEXT:    movl %edx, (%rdi,%rax,4)
-; POSTRA-NEXT:    movq 16(%rdi), %rax
-; POSTRA-NEXT:    movq (%rdi), %rcx
-; POSTRA-NEXT:    movq 24(%rdi), %rdx
-; POSTRA-NEXT:    movq 8(%rdi), %rsi
-; POSTRA-NEXT:    orq 56(%rdi), %rdx
-; POSTRA-NEXT:    orq 40(%rdi), %rsi
-; POSTRA-NEXT:    orq 48(%rdi), %rax
-; POSTRA-NEXT:    orq 32(%rdi), %rcx
-; POSTRA-NEXT:    orq %rdx, %rsi
-; POSTRA-NEXT:    orq %rax, %rcx
-; POSTRA-NEXT:    orq %rsi, %rcx
+; POSTRA-NEXT:    vmovdqu (%rdi), %ymm0
+; POSTRA-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; POSTRA-NEXT:    vptest %ymm0, %ymm0
 ; POSTRA-NEXT:    setne %al
+; POSTRA-NEXT:    vzeroupper
 ; POSTRA-NEXT:    retq
 ;
 ; NOPOSTRA-LABEL: PR166744:
@@ -38,18 +31,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; NOPOSTRA-NEXT:    shlxl %eax, %edx, %eax
 ; NOPOSTRA-NEXT:    orl %ecx, %eax
 ; NOPOSTRA-NEXT:    movl %eax, (%rdi,%rsi)
-; NOPOSTRA-NEXT:    movq 16(%rdi), %rax
-; NOPOSTRA-NEXT:    movq (%rdi), %rcx
-; NOPOSTRA-NEXT:    movq 8(%rdi), %rdx
-; NOPOSTRA-NEXT:    movq 24(%rdi), %rsi
-; NOPOSTRA-NEXT:    orq 56(%rdi), %rsi
-; NOPOSTRA-NEXT:    orq 40(%rdi), %rdx
-; NOPOSTRA-NEXT:    orq 48(%rdi), %rax
-; NOPOSTRA-NEXT:    orq 32(%rdi), %rcx
-; NOPOSTRA-NEXT:    orq %rsi, %rdx
-; NOPOSTRA-NEXT:    orq %rax, %rcx
-; NOPOSTRA-NEXT:    orq %rdx, %rcx
+; NOPOSTRA-NEXT:    vmovdqu (%rdi), %ymm0
+; NOPOSTRA-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; NOPOSTRA-NEXT:    vptest %ymm0, %ymm0
 ; NOPOSTRA-NEXT:    setne %al
+; NOPOSTRA-NEXT:    vzeroupper
 ; NOPOSTRA-NEXT:    retq
   %rem = and i64 %idx, 511
   %sh_prom = zext nneg i64 %rem to i512
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 6e43b897caef1..166b7abc9e053 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2   | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
@@ -433,15 +433,23 @@ define i1 @vecmp_load64x4(ptr %p0) {
 }
 
 define i1 @vecmp_load128x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rcx
-; CHECK-NEXT:    orq 24(%rdi), %rcx
-; CHECK-NEXT:    orq 16(%rdi), %rax
-; CHECK-NEXT:    orq %rcx, %rax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; SSE-LABEL: vecmp_load128x2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    orq 24(%rdi), %rcx
+; SSE-NEXT:    orq 16(%rdi), %rax
+; SSE-NEXT:    orq %rcx, %rax
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vecmp_load128x2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpor 16(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    retq
   %p1 = getelementptr i8, ptr %p0, i64 16
   %i0 = load i128, ptr %p0, align 1
   %i1 = load i128, ptr %p1, align 1
@@ -453,21 +461,42 @@ define i1 @vecmp_load128x2(ptr %p0) {
 }
 
 define i1 @vecmp_load128x4(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rcx
-; CHECK-NEXT:    movq 24(%rdi), %rdx
-; CHECK-NEXT:    movq 16(%rdi), %rsi
-; CHECK-NEXT:    orq 32(%rdi), %rax
-; CHECK-NEXT:    orq 40(%rdi), %rcx
-; CHECK-NEXT:    orq 48(%rdi), %rsi
-; CHECK-NEXT:    orq %rax, %rsi
-; CHECK-NEXT:    orq 56(%rdi), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    orq %rsi, %rdx
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; SSE-LABEL: vecmp_load128x4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    movq 24(%rdi), %rdx
+; SSE-NEXT:    movq 16(%rdi), %rsi
+; SSE-NEXT:    orq 32(%rdi), %rax
+; SSE-NEXT:    orq 40(%rdi), %rcx
+; SSE-NEXT:    orq 48(%rdi), %rsi
+; SSE-NEXT:    orq %rax, %rsi
+; SSE-NEXT:    orq 56(%rdi), %rdx
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    orq %rsi, %rdx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vecmp_load128x4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vpor 32(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpor 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vptest %xmm0, %xmm0
+; AVX1-NEXT:    sete %al
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: vecmp_load128x4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX512-NEXT:    vpor 32(%rdi), %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | xmm1 | mem
+; AVX512-NEXT:    vptest %xmm0, %xmm0
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    retq
   %p1 = getelementptr i8, ptr %p0, i64 16
   %p2 = getelementptr i8, ptr %p0, i64 32
   %p3 = getelementptr i8, ptr %p0, i64 48
@@ -486,21 +515,39 @@ define i1 @vecmp_load128x4(ptr %p0) {
 
 ; PR144861
 define i1 @vecmp_load256x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load256x2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq 24(%rdi), %rax
-; CHECK-NEXT:    movq (%rdi), %rcx
-; CHECK-NEXT:    movq 8(%rdi), %rdx
-; CHECK-NEXT:    movq 16(%rdi), %rsi
-; CHECK-NEXT:    orq 48(%rdi), %rsi
-; CHECK-NEXT:    orq 32(%rdi), %rcx
-; CHECK-NEXT:    orq %rsi, %rcx
-; CHECK-NEXT:    orq 56(%rdi), %rax
-; CHECK-NEXT:    orq 40(%rdi), %rdx
-; CHECK-NEXT:    orq %rax, %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; SSE-LABEL: vecmp_load256x2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq 24(%rdi), %rax
+; SSE-NEXT:    movq (%rdi), %rcx
+; SSE-NEXT:    movq 8(%rdi), %rdx
+; SSE-NEXT:    movq 16(%rdi), %rsi
+; SSE-NEXT:    orq 48(%rdi), %rsi
+; SSE-NEXT:    orq 32(%rdi), %rcx
+; SSE-NEXT:    orq %rsi, %rcx
+; SSE-NEXT:    orq 56(%rdi), %rax
+; SSE-NEXT:    orq 40(%rdi), %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vecmp_load256x2:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vorps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    vptest %ymm0, %ymm0
+; AVX1-NEXT:    sete %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: vecmp_load256x2:
+; AVX512:       # %bb.0:
+; A...
[truncated]

Comment on lines +5723 to +5730
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm1
; AVX-NEXT: vpxor 16(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
; AVX-NEXT: vmovdqa %xmm1, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
Contributor

Why not ymm here?

Collaborator Author

It'd involve adding a subvector splat which store combining won't attempt

Contributor

@phoebewang phoebewang left a comment

LGTM.

@RKSimon RKSimon enabled auto-merge (squash) December 11, 2025 11:21
@RKSimon RKSimon merged commit ac2291d into llvm:main Dec 11, 2025
9 of 10 checks passed
@RKSimon RKSimon deleted the x86-bitlogic-bitint branch December 11, 2025 11:51
RKSimon added a commit to RKSimon/llvm-project that referenced this pull request Dec 11, 2025
…n vector types if the new type is legal

Prevents us from attempting to store illegal types like <2 x i128> that will force scalarization/splitting

Noticed while trying to avoid some split stores mentioned in llvm#171616
RKSimon added a commit that referenced this pull request Dec 11, 2025
…n vector types if the new type is legal (#171813)

Prevents us from attempting to store illegal types like <2 x i128> that will force scalarization/splitting

Noticed while trying to avoid some split stores mentioned in #171616
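
For context, a hypothetical LLVM IR sketch (function name and types chosen here for illustration, not taken from #171813) of the kind of illegal-typed store that the follow-up avoids forming, since <2 x i128> is not a legal vector type on x86-64 and such a store would be split or scalarized:

; Hypothetical illustration only; not from the follow-up patch's test files.
define void @store_v2i128(ptr %dst, i128 %a, i128 %b) {
  %v0 = insertelement <2 x i128> poison, i128 %a, i64 0
  %v1 = insertelement <2 x i128> %v0, i128 %b, i64 1
  store <2 x i128> %v1, ptr %dst, align 32
  ret void
}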