diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dd666f15bf57f..f8f7d3e367896 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40090,8 +40090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   bool AllowBWIVPERMV3 =
       (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
 
-  // If root was a VPERMV3 node, always allow a variable shuffle.
-  if (Root.getOpcode() == X86ISD::VPERMV3)
+  // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
+  if ((UnaryShuffle && Root.getOpcode() == X86ISD::VPERMV) ||
+      Root.getOpcode() == X86ISD::VPERMV3)
     AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
 
   bool MaskContainsZeros = isAnyZero(Mask);
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 351d98540c2a5..3f8f061f359f9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -588,20 +588,19 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
-; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,5,3,7,1,5,3,7]
+; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,7,1,3,7,0,0,0]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm5[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3],zero,zero,ymm4[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6]
 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u]
@@ -670,17 +669,16 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7]
 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
@@ -753,17 +751,16 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7]
 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 6476c3139daa7..34100adacbeb9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -95,17 +95,17 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
-; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,3,0,1,5,3,0,1]
 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vmovlps %xmm0, 48(%rax)
@@ -130,17 +130,17 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
-; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
+; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7]
+; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,3,0,1,5,3,0,1]
 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX2-FP-NEXT: vmovlps %xmm0, 48(%rax)
@@ -165,23 +165,22 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [3,5,7,u]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
-; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0]
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [5,3,0,1,5,3,0,1]
+; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,4,2,0,0,4,2,0]
 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX2-FCP-NEXT: vmovlps %xmm2, 48(%rax)
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovlps %xmm1, 48(%rax)
 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-FCP-NEXT: vmovaps %xmm1, 32(%rax)
+; AVX2-FCP-NEXT: vmovaps %xmm3, 32(%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index f7a44fea5b02b..01a2f7f46f939 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -5576,8 +5576,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
@@ -5585,15 +5585,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
 ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm9, %ymm22
+; AVX512BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
@@ -5611,9 +5611,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
 ; AVX512BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm16, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
 ; AVX512BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
@@ -5633,66 +5633,65 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k4
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
+; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm18, %xmm17
-; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
+; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k4
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4}
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm13
+; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7
 ; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm14, %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm19
-; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm19, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17
+; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm7
 ; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16
-; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
+; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18
+; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15
 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9
 ; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1}
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
-; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
 ; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1}
 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,ymm13[26],zero,ymm13[28],zero,ymm13[30],zero,zero,ymm13[29],zero,ymm13[31],zero,zero
 ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
 ; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3}
@@ -5899,8 +5898,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
@@ -5908,15 +5907,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm9, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
@@ -5934,9 +5933,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm16, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
@@ -5956,66 +5955,65 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm18, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512DQ-BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7
 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm14, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm19, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm7
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15
 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1}
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm13, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1}
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,ymm13[26],zero,ymm13[28],zero,ymm13[30],zero,zero,ymm13[29],zero,ymm13[31],zero,zero
 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 89ed0040a71c2..c1dba071b4353 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1000,20 +1000,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,5,3,0,3,1,5,0]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[1,5,9,13],zero,zero,zero,ymm5[2,6,10,14],zero,zero,zero,ymm5[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6]
 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
@@ -1082,16 +1081,15 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
@@ -1165,16 +1163,15 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
@@ -1244,30 +1241,29 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
+; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6]
 ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, 48(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -1323,30 +1319,29 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6]
 ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 48(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 75c470a6d40c6..bb236b375778c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2122,44 +2122,48 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,6,4,6,4,6,4,6]
+; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm7 {%k1}
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm5 {%k2}
 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
-; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm5 {%k3}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,7,5,7,5,7,5,7]
+; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm3
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,3,1,3,1,3,1,3]
+; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm6, %zmm2
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
+; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm6, %zmm0
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -2226,44 +2230,48 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,6,4,6,4,6,4,6]
+; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm5 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
-; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm5 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,7,5,7,5,7,5,7]
+; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,3,1,3,1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm6, %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm6, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64