[X86] Use vectorized i256 bit counts when we know the source originated from the vector unit #171589
Conversation
Currently we only permit i256 CTTZ/CTLZ AVX512 lowering when the source is loadable, as the GPR->FPU transition costs would otherwise outweigh the vectorization benefit. This patch checks for other cases where the source can avoid the GPR: a mayFoldToVector helper checks for a bitcast originating from a vector type, as well as constant values, in addition to the original mayFoldLoad check. There will be other uses for the mayFoldToVector helper, but I've only used it for CTTZ/CTLZ initially.
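For reference, the pattern this enables is an i256 bit count whose operand was bitcast from a vector value. Below is a minimal IR sketch modelled on the existing vector_ctlz_i256 test in bitcnt-big-integer.ll; the function name is illustrative and not part of the patch:

; i256 ctlz whose source never needs to leave the vector unit:
; the operand is a bitcast of an <8 x i32> vector argument.
define i32 @ctlz_from_vector(<8 x i32> %v0) nounwind {
  %a0 = bitcast <8 x i32> %v0 to i256
  %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 false)
  %res = trunc i256 %cnt to i32
  ret i32 %res
}
declare i256 @llvm.ctlz.i256(i256, i1)

With the patch applied, functions of this shape lower to the masked vplzcntq/vpcompressq sequences shown in the updated CHECK lines below rather than scalarizing through GPR lzcnt/cmov chains.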
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes:

Currently we only permit i256 CTTZ/CTLZ AVX512 lowering when the source is loadable, as the GPR->FPU transition costs would otherwise outweigh the vectorization benefit. This patch checks for other cases where the source can avoid the GPR: a mayFoldToVector helper checks for a bitcast originating from a vector type, as well as constant values, in addition to the original mayFoldLoad check. There will be other uses for the mayFoldToVector helper, but I've only used it for CTTZ/CTLZ initially.

Full diff: https://github.com/llvm/llvm-project/pull/171589.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fbd875a93fd4a..b4ad7465d612e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2846,6 +2846,15 @@ bool X86::mayFoldIntoZeroExtend(SDValue Op) {
return false;
}
+// Return true if it's cheap to bitcast this to a vector type.
+static bool mayFoldToVector(SDValue Op, const X86Subtarget &Subtarget) {
+ if (peekThroughBitcasts(Op).getValueType().isVector())
+ return true;
+ if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
+ return true;
+ return X86::mayFoldLoad(Op, Subtarget);
+}
+
static bool isLogicOp(unsigned Opcode) {
// TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
@@ -33958,7 +33967,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT VT = N->getValueType(0);
assert(Subtarget.hasCDI() && "AVX512CD required");
assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
- if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget))
+ if (VT == MVT::i256 && !mayFoldToVector(N0, Subtarget))
return;
unsigned SizeInBits = VT.getSizeInBits();
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 749b3ddc96d0d..06ccbf4daa1e8 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -1567,72 +1567,38 @@ define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
;
; AVX512F-LABEL: vector_ctlz_i256:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: lzcntq %rsi, %rdi
-; AVX512F-NEXT: lzcntq %rdx, %r8
-; AVX512F-NEXT: addl $64, %r8d
-; AVX512F-NEXT: testq %rsi, %rsi
-; AVX512F-NEXT: cmovnel %edi, %r8d
-; AVX512F-NEXT: lzcntq %rcx, %rdi
-; AVX512F-NEXT: lzcntq %rax, %rax
-; AVX512F-NEXT: addl $64, %eax
-; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: cmovnel %edi, %eax
-; AVX512F-NEXT: subl $-128, %eax
-; AVX512F-NEXT: orq %rsi, %rdx
-; AVX512F-NEXT: cmovnel %r8d, %eax
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm2
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vector_ctlz_i256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rdx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT: lzcntq %rsi, %rdi
-; AVX512VL-NEXT: lzcntq %rdx, %r8
-; AVX512VL-NEXT: addl $64, %r8d
-; AVX512VL-NEXT: testq %rsi, %rsi
-; AVX512VL-NEXT: cmovnel %edi, %r8d
-; AVX512VL-NEXT: lzcntq %rcx, %rdi
-; AVX512VL-NEXT: lzcntq %rax, %rax
-; AVX512VL-NEXT: addl $64, %eax
-; AVX512VL-NEXT: testq %rcx, %rcx
-; AVX512VL-NEXT: cmovnel %edi, %eax
-; AVX512VL-NEXT: subl $-128, %eax
-; AVX512VL-NEXT: orq %rsi, %rdx
-; AVX512VL-NEXT: cmovnel %r8d, %eax
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512VL-NEXT: vplzcntq %ymm0, %ymm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512POPCNT-LABEL: vector_ctlz_i256:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
-; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
-; AVX512POPCNT-NEXT: addl $64, %r8d
-; AVX512POPCNT-NEXT: testq %rsi, %rsi
-; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT: lzcntq %rax, %rax
-; AVX512POPCNT-NEXT: addl $64, %eax
-; AVX512POPCNT-NEXT: testq %rcx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %edi, %eax
-; AVX512POPCNT-NEXT: subl $-128, %eax
-; AVX512POPCNT-NEXT: orq %rsi, %rdx
-; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
%a0 = bitcast <8 x i32> %v0 to i256
@@ -3246,72 +3212,35 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind {
;
; AVX512F-LABEL: vector_ctlz_undef_i256:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: lzcntq %rsi, %rdi
-; AVX512F-NEXT: lzcntq %rdx, %r8
-; AVX512F-NEXT: addl $64, %r8d
-; AVX512F-NEXT: testq %rsi, %rsi
-; AVX512F-NEXT: cmovnel %edi, %r8d
-; AVX512F-NEXT: lzcntq %rcx, %rdi
-; AVX512F-NEXT: lzcntq %rax, %rax
-; AVX512F-NEXT: addl $64, %eax
-; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: cmovnel %edi, %eax
-; AVX512F-NEXT: subl $-128, %eax
-; AVX512F-NEXT: orq %rsi, %rdx
-; AVX512F-NEXT: cmovnel %r8d, %eax
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vector_ctlz_undef_i256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rdx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT: lzcntq %rsi, %rdi
-; AVX512VL-NEXT: lzcntq %rdx, %r8
-; AVX512VL-NEXT: addl $64, %r8d
-; AVX512VL-NEXT: testq %rsi, %rsi
-; AVX512VL-NEXT: cmovnel %edi, %r8d
-; AVX512VL-NEXT: lzcntq %rcx, %rdi
-; AVX512VL-NEXT: lzcntq %rax, %rax
-; AVX512VL-NEXT: addl $64, %eax
-; AVX512VL-NEXT: testq %rcx, %rcx
-; AVX512VL-NEXT: cmovnel %edi, %eax
-; AVX512VL-NEXT: subl $-128, %eax
-; AVX512VL-NEXT: orq %rsi, %rdx
-; AVX512VL-NEXT: cmovnel %r8d, %eax
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512POPCNT-LABEL: vector_ctlz_undef_i256:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
-; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
-; AVX512POPCNT-NEXT: addl $64, %r8d
-; AVX512POPCNT-NEXT: testq %rsi, %rsi
-; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT: lzcntq %rax, %rax
-; AVX512POPCNT-NEXT: addl $64, %eax
-; AVX512POPCNT-NEXT: testq %rcx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %edi, %eax
-; AVX512POPCNT-NEXT: subl $-128, %eax
-; AVX512POPCNT-NEXT: orq %rsi, %rdx
-; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
%a0 = bitcast <8 x i32> %v0 to i256
@@ -4887,72 +4816,47 @@ define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind {
;
; AVX512F-LABEL: vector_cttz_i256:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovq %xmm1, %rcx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: tzcntq %rsi, %rdi
-; AVX512F-NEXT: tzcntq %rdx, %r8
-; AVX512F-NEXT: addl $64, %r8d
-; AVX512F-NEXT: testq %rsi, %rsi
-; AVX512F-NEXT: cmovnel %edi, %r8d
-; AVX512F-NEXT: tzcntq %rcx, %rdi
-; AVX512F-NEXT: tzcntq %rax, %rax
-; AVX512F-NEXT: addl $64, %eax
-; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: cmovnel %edi, %eax
-; AVX512F-NEXT: subl $-128, %eax
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: cmovnel %r8d, %eax
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256]
+; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [64,128,192,256]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vector_cttz_i256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vmovq %xmm1, %rcx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: vmovq %xmm0, %rsi
-; AVX512VL-NEXT: tzcntq %rsi, %rdi
-; AVX512VL-NEXT: tzcntq %rdx, %r8
-; AVX512VL-NEXT: addl $64, %r8d
-; AVX512VL-NEXT: testq %rsi, %rsi
-; AVX512VL-NEXT: cmovnel %edi, %r8d
-; AVX512VL-NEXT: tzcntq %rcx, %rdi
-; AVX512VL-NEXT: tzcntq %rax, %rax
-; AVX512VL-NEXT: addl $64, %eax
-; AVX512VL-NEXT: testq %rcx, %rcx
-; AVX512VL-NEXT: cmovnel %edi, %eax
-; AVX512VL-NEXT: subl $-128, %eax
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: cmovnel %r8d, %eax
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512POPCNT-LABEL: vector_cttz_i256:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
-; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
-; AVX512POPCNT-NEXT: addl $64, %r8d
-; AVX512POPCNT-NEXT: testq %rsi, %rsi
-; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT: tzcntq %rax, %rax
-; AVX512POPCNT-NEXT: addl $64, %eax
-; AVX512POPCNT-NEXT: testq %rcx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %edi, %eax
-; AVX512POPCNT-NEXT: subl $-128, %eax
-; AVX512POPCNT-NEXT: orq %rdx, %rsi
-; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
%a0 = bitcast <8 x i32> %v0 to i256
@@ -6484,72 +6388,44 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind {
;
; AVX512F-LABEL: vector_cttz_undef_i256:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovq %xmm1, %rcx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: tzcntq %rsi, %rdi
-; AVX512F-NEXT: tzcntq %rdx, %r8
-; AVX512F-NEXT: addl $64, %r8d
-; AVX512F-NEXT: testq %rsi, %rsi
-; AVX512F-NEXT: cmovnel %edi, %r8d
-; AVX512F-NEXT: tzcntq %rcx, %rdi
-; AVX512F-NEXT: tzcntq %rax, %rax
-; AVX512F-NEXT: addl $64, %eax
-; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: cmovnel %edi, %eax
-; AVX512F-NEXT: subl $-128, %eax
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: cmovnel %r8d, %eax
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vector_cttz_undef_i256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vmovq %xmm1, %rcx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: vmovq %xmm0, %rsi
-; AVX512VL-NEXT: tzcntq %rsi, %rdi
-; AVX512VL-NEXT: tzcntq %rdx, %r8
-; AVX512VL-NEXT: addl $64, %r8d
-; AVX512VL-NEXT: testq %rsi, %rsi
-; AVX512VL-NEXT: cmovnel %edi, %r8d
-; AVX512VL-NEXT: tzcntq %rcx, %rdi
-; AVX512VL-NEXT: tzcntq %rax, %rax
-; AVX512VL-NEXT: addl $64, %eax
-; AVX512VL-NEXT: testq %rcx, %rcx
-; AVX512VL-NEXT: cmovnel %edi, %eax
-; AVX512VL-NEXT: subl $-128, %eax
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: cmovnel %r8d, %eax
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512POPCNT-LABEL: vector_cttz_undef_i256:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
-; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
-; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
-; AVX512POPCNT-NEXT: addl $64, %r8d
-; AVX512POPCNT-NEXT: testq %rsi, %rsi
-; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT: tzcntq %rax, %rax
-; AVX512POPCNT-NEXT: addl $64, %eax
-; AVX512POPCNT-NEXT: testq %rcx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %edi, %eax
-; AVX512POPCNT-NEXT: subl $-128, %eax
-; AVX512POPCNT-NEXT: orq %rdx, %rsi
-; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
%a0 = bitcast <8 x i32> %v0 to i256
Inline comment on llvm/lib/Target/X86/X86ISelLowering.cpp:

// Return true if it's cheap to bitcast this to a vector type.
static bool mayFoldToVector(SDValue Op, const X86Subtarget &Subtarget) {
mayFoldIntoVector to match above?
phoebewang left a comment:
LGTM.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/169/builds/17882