diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 0fd555991ae29..749b3ddc96d0d 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -52,6 +52,63 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: popcntq %rcx, %rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctpop.i128(i128 %a0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i256(i256 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i256: ; CHECK: # %bb.0: @@ -183,6 +240,107 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: popcntq %rax, %rsi +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: popcntq %rcx, %rax +; SSE-NEXT: addl %esi, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: popcntq %rdx, %rdx +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: addl %edx, %esi +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: popcntq %rdx, %rdx +; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: addl %esi, %edx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: popcntq %rax, %rsi +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax +; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: addl %esi, %edx +; AVX512POPCNT-NEXT: xorl %esi, %esi +; AVX512POPCNT-NEXT: popcntq %rax, %rsi +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctpop.i256(i256 %a0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i512(i512 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i512: ; CHECK: # %bb.0: @@ -404,6 +562,166 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %r8 +; SSE-NEXT: movq %xmm3, %r9 +; SSE-NEXT: pextrq $1, %xmm3, %r10 +; SSE-NEXT: popcntq %r10, %r10 +; SSE-NEXT: popcntq %r9, %r9 +; SSE-NEXT: addl %r10d, %r9d +; SSE-NEXT: popcntq %rdi, %rdi +; SSE-NEXT: popcntq %r8, %r8 +; SSE-NEXT: addl %edi, %r8d +; SSE-NEXT: addl %r9d, %r8d +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: addl %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vpextrq $1, %xmm1, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %r9 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: popcntq %r9, %r9 +; AVX2-NEXT: popcntq %r10, %r10 +; AVX2-NEXT: addl %r9d, %r10d +; AVX2-NEXT: popcntq %rdi, %rdi +; AVX2-NEXT: popcntq %r8, %r8 +; AVX2-NEXT: addl %edi, %r8d +; AVX2-NEXT: addl %r10d, %r8d +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: popcntq %rdx, %rdx +; AVX2-NEXT: addl %esi, %edx +; AVX2-NEXT: popcntq %rcx, %rcx +; AVX2-NEXT: popcntq %rax, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi +; AVX512F-NEXT: vmovq %xmm1, %r8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 +; AVX512F-NEXT: vmovq %xmm0, %r10 +; AVX512F-NEXT: popcntq %r9, %r9 +; AVX512F-NEXT: popcntq %r10, %r10 +; AVX512F-NEXT: addl %r9d, %r10d +; AVX512F-NEXT: popcntq %rdi, %rdi +; AVX512F-NEXT: popcntq %r8, %r8 +; AVX512F-NEXT: addl %edi, %r8d +; AVX512F-NEXT: addl %r10d, %r8d +; AVX512F-NEXT: popcntq %rdx, %rdx +; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rcx, %rcx +; AVX512F-NEXT: popcntq %rax, %rax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rdi +; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %r9 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512VL-NEXT: popcntq %r10, %r10 +; AVX512VL-NEXT: popcntq %r9, %r9 +; AVX512VL-NEXT: addl %r10d, %r9d +; AVX512VL-NEXT: popcntq %r8, %r8 +; AVX512VL-NEXT: popcntq %rdi, %rdi +; AVX512VL-NEXT: addl %r8d, %edi +; AVX512VL-NEXT: addl %r9d, %edi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: addl %edx, %esi +; AVX512VL-NEXT: popcntq %rcx, %rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edi, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %r9 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512POPCNT-NEXT: popcntq %r10, %r10 +; AVX512POPCNT-NEXT: popcntq %r9, %r9 +; AVX512POPCNT-NEXT: addl %r10d, %r9d +; AVX512POPCNT-NEXT: popcntq %r8, %r8 +; AVX512POPCNT-NEXT: popcntq %rdi, %rdi +; AVX512POPCNT-NEXT: addl %r8d, %edi +; AVX512POPCNT-NEXT: addl %r9d, %edi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: addl %edx, %esi +; AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edi, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctpop.i512(i512 %a0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctpop_i1024: ; SSE: # %bb.0: @@ -969,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_i256: ; SSE: # %bb.0: @@ -1125,6 +1512,135 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_i512: ; SSE: # %bb.0: @@ -1423,10 +1939,155 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_i512: +; AVX512F-LABEL: load_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: movq %xmm3, %r8 +; SSE-NEXT: pextrq $1, %xmm3, %r9 +; SSE-NEXT: bsrq %r9, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: bsrq %rdi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rsi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %r8d, %edi +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 @@ -1435,10 +2096,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL-LABEL: vector_ctlz_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1448,10 +2109,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT-LABEL: vector_ctlz_i512: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1460,7 +2121,7 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i512, ptr %p0 + %a0 = bitcast <16 x i32> %v0 to i512 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -2312,6 +2973,74 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i256: ; SSE: # %bb.0: @@ -2463,6 +3192,134 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rcx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i512: ; SSE: # %bb.0: @@ -2796,6 +3653,147 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rsi +; SSE-NEXT: movq %xmm2, %rdx +; SSE-NEXT: movq %xmm3, %rdi +; SSE-NEXT: pextrq $1, %xmm3, %r8 +; SSE-NEXT: bsrq %r8, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: bsrq %rsi, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %edx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: subl $-128, %edx +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: movq %xmm1, %r8 +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rax, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rcx, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i1024: ; SSE: # %bb.0: @@ -3636,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_i256: ; SSE: # %bb.0: @@ -3775,21 +4816,146 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: rep bsfq %rax, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %edi +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 -; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] -; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} -; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i256, ptr %p0 + %a0 = bitcast <8 x i32> %v0 to i256 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 ret i32 %res @@ -4128,6 +5294,148 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rdx +; SSE-NEXT: movq %xmm3, %rcx +; SSE-NEXT: pextrq $1, %xmm2, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: pextrq $1, %xmm0, %r8 +; SSE-NEXT: movq %xmm0, %r9 +; SSE-NEXT: rep bsfq %r9, %r10 +; SSE-NEXT: rep bsfq %r8, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: rep bsfq %rdi, %r9 +; SSE-NEXT: rep bsfq %rsi, %rsi +; SSE-NEXT: addl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm2, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: rep bsfq %rdi, %r8 +; SSE-NEXT: rep bsfq %rax, %r9 +; SSE-NEXT: addl $64, %r9d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r8d, %r9d +; SSE-NEXT: rep bsfq %rcx, %rdi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_i1024: ; SSE: # %bb.0: @@ -4930,6 +6238,48 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i256: ; SSE: # %bb.0: @@ -5084,6 +6434,130 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: rep bsfq %rcx, %rsi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i512: ; SSE: # %bb.0: @@ -5409,6 +6883,144 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm0, %rdi +; SSE-NEXT: movq %xmm0, %r8 +; SSE-NEXT: rep bsfq %r8, %r9 +; SSE-NEXT: rep bsfq %rdi, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: rep bsfq %rcx, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %ecx +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %ecx +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: movq %xmm3, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i1024: ; SSE: # %bb.0: