[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets

On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure.

If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert this to a broadcast instruction for us.

Non-VLX targets are still seeing some regressions due to these being implicitly widened to 512-bit ops in isel patterns and not in the DAG, so I've limited this to just 512-bit vectors for now.
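A hedged sketch of the effect (hypothetical function, not a test from this commit; opcodes are indicative only):

define <16 x i32> @add7(<16 x i32> %x) {
  %r = add <16 x i32> %x, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i32> %r
}
; Before: the constant build vector was lowered to a broadcast, tying up a register:
;   vpbroadcastd .LCPI0_0(%rip), %zmm1
;   vpaddd %zmm1, %zmm0, %zmm0
; After: the full-width constant load folds straight into the add (and a load
; that survives unfolded is still shrunk back to a broadcast by
; X86FixupVectorConstantsPass, possibly as an embedded broadcast, e.g. {1to16}):
;   vpaddd .LCPI0_0(%rip), %zmm0, %zmm0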
RKSimon committed Apr 18, 2024
1 parent 44713f1 commit 27c0a8a
Showing 98 changed files with 92,997 additions and 96,561 deletions.
226 changes: 206 additions & 20 deletions llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -424,6 +424,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
128, 1);
case X86::VMOVAPDZ128rmk:
case X86::VMOVUPDZ128rmk:
return FixupConstant({{X86::VMOVSDZrmk, 1, 64, rebuildZeroUpperCst},
{X86::VMOVDDUPZ128rmk, 1, 64, rebuildSplatCst}},
128, 3);
case X86::VMOVAPDZ128rmkz:
case X86::VMOVUPDZ128rmkz:
return FixupConstant({{X86::VMOVSDZrmkz, 1, 64, rebuildZeroUpperCst},
{X86::VMOVDDUPZ128rmkz, 1, 64, rebuildSplatCst}},
128, 2);
case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ128rmk:
return FixupConstant({{X86::VMOVSSZrmk, 1, 32, rebuildZeroUpperCst},
{X86::VBROADCASTSSZ128rmk, 1, 32, rebuildSplatCst}},
128, 3);
case X86::VMOVAPSZ128rmkz:
case X86::VMOVUPSZ128rmkz:
return FixupConstant({{X86::VMOVSSZrmkz, 1, 32, rebuildZeroUpperCst},
{X86::VBROADCASTSSZ128rmkz, 1, 32, rebuildSplatCst}},
128, 2);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
@@ -433,6 +453,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
256, 1);
case X86::VMOVAPDZ256rmk:
case X86::VMOVUPDZ256rmk:
return FixupConstant({{X86::VBROADCASTSDZ256rmk, 1, 64, rebuildSplatCst}},
256, 3);
case X86::VMOVAPDZ256rmkz:
case X86::VMOVUPDZ256rmkz:
return FixupConstant({{X86::VBROADCASTSDZ256rmkz, 1, 64, rebuildSplatCst}},
256, 2);
case X86::VMOVAPSZ256rmk:
case X86::VMOVUPSZ256rmk:
return FixupConstant(
{{X86::VBROADCASTSSZ256rmk, 1, 32, rebuildSplatCst},
{X86::VBROADCASTF32X4Z256rmk, 1, 128, rebuildSplatCst}},
256, 3);
case X86::VMOVAPSZ256rmkz:
case X86::VMOVUPSZ256rmkz:
return FixupConstant(
{{X86::VBROADCASTSSZ256rmkz, 1, 32, rebuildSplatCst},
{X86::VBROADCASTF32X4Z256rmkz, 1, 128, rebuildSplatCst}},
256, 2);
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
@@ -442,6 +482,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst},
{X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}},
512, 1);
case X86::VMOVAPDZrmk:
case X86::VMOVUPDZrmk:
return FixupConstant({{X86::VBROADCASTSDZrmk, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF64X4rmk, 1, 256, rebuildSplatCst}},
512, 3);
case X86::VMOVAPDZrmkz:
case X86::VMOVUPDZrmkz:
return FixupConstant({{X86::VBROADCASTSDZrmkz, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF64X4rmkz, 1, 256, rebuildSplatCst}},
512, 2);
case X86::VMOVAPSZrmk:
case X86::VMOVUPSZrmk:
return FixupConstant({{X86::VBROADCASTSSZrmk, 1, 32, rebuildSplatCst},
{X86::VBROADCASTF32X4rmk, 1, 128, rebuildSplatCst}},
512, 3);
case X86::VMOVAPSZrmkz:
case X86::VMOVUPSZrmkz:
return FixupConstant({{X86::VBROADCASTSSZrmkz, 1, 32, rebuildSplatCst},
{X86::VBROADCASTF32X4rmkz, 1, 128, rebuildSplatCst}},
512, 2);
/* Integer Loads */
case X86::MOVDQArm:
case X86::MOVDQUrm: {
@@ -537,6 +597,42 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 128, 1);
}
case X86::VMOVDQA32Z128rmk:
case X86::VMOVDQU32Z128rmk:
return FixupConstant({{X86::VPBROADCASTDZ128rmk, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBDZ128rmk, 4, 8, rebuildSExtCst},
{X86::VPMOVZXBDZ128rmk, 4, 8, rebuildZExtCst},
{X86::VPMOVSXWDZ128rmk, 4, 16, rebuildSExtCst},
{X86::VPMOVZXWDZ128rmk, 4, 16, rebuildZExtCst}},
128, 3);
case X86::VMOVDQA32Z128rmkz:
case X86::VMOVDQU32Z128rmkz:
return FixupConstant({{X86::VPBROADCASTDZ128rmkz, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBDZ128rmkz, 4, 8, rebuildSExtCst},
{X86::VPMOVZXBDZ128rmkz, 4, 8, rebuildZExtCst},
{X86::VPMOVSXWDZ128rmkz, 4, 16, rebuildSExtCst},
{X86::VPMOVZXWDZ128rmkz, 4, 16, rebuildZExtCst}},
128, 2);
case X86::VMOVDQA64Z128rmk:
case X86::VMOVDQU64Z128rmk:
return FixupConstant({{X86::VPMOVSXBQZ128rmk, 2, 8, rebuildSExtCst},
{X86::VPMOVZXBQZ128rmk, 2, 8, rebuildZExtCst},
{X86::VPMOVSXWQZ128rmk, 2, 16, rebuildSExtCst},
{X86::VPMOVZXWQZ128rmk, 2, 16, rebuildZExtCst},
{X86::VPBROADCASTQZ128rmk, 1, 64, rebuildSplatCst},
{X86::VPMOVSXDQZ128rmk, 2, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ128rmk, 2, 32, rebuildZExtCst}},
128, 3);
case X86::VMOVDQA64Z128rmkz:
case X86::VMOVDQU64Z128rmkz:
return FixupConstant({{X86::VPMOVSXBQZ128rmkz, 2, 8, rebuildSExtCst},
{X86::VPMOVZXBQZ128rmkz, 2, 8, rebuildZExtCst},
{X86::VPMOVSXWQZ128rmkz, 2, 16, rebuildSExtCst},
{X86::VPMOVZXWQZ128rmkz, 2, 16, rebuildZExtCst},
{X86::VPBROADCASTQZ128rmkz, 1, 64, rebuildSplatCst},
{X86::VPMOVSXDQZ128rmkz, 2, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ128rmkz, 2, 32, rebuildZExtCst}},
128, 2);
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU32Z256rm:
@@ -561,6 +657,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 256, 1);
}
case X86::VMOVDQA32Z256rmk:
case X86::VMOVDQU32Z256rmk:
return FixupConstant(
{{X86::VPBROADCASTDZ256rmk, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBDZ256rmk, 8, 8, rebuildSExtCst},
{X86::VPMOVZXBDZ256rmk, 8, 8, rebuildZExtCst},
{X86::VBROADCASTI32X4Z256rmk, 1, 128, rebuildSplatCst},
{X86::VPMOVSXWDZ256rmk, 8, 16, rebuildSExtCst},
{X86::VPMOVZXWDZ256rmk, 8, 16, rebuildZExtCst}},
256, 3);
case X86::VMOVDQA32Z256rmkz:
case X86::VMOVDQU32Z256rmkz:
return FixupConstant(
{{X86::VPBROADCASTDZ256rmkz, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBDZ256rmkz, 8, 8, rebuildSExtCst},
{X86::VPMOVZXBDZ256rmkz, 8, 8, rebuildZExtCst},
{X86::VBROADCASTI32X4Z256rmkz, 1, 128, rebuildSplatCst},
{X86::VPMOVSXWDZ256rmkz, 8, 16, rebuildSExtCst},
{X86::VPMOVZXWDZ256rmkz, 8, 16, rebuildZExtCst}},
256, 2);
case X86::VMOVDQA64Z256rmk:
case X86::VMOVDQU64Z256rmk:
return FixupConstant({{X86::VPMOVSXBQZ256rmk, 4, 8, rebuildSExtCst},
{X86::VPMOVZXBQZ256rmk, 4, 8, rebuildZExtCst},
{X86::VPBROADCASTQZ256rmk, 1, 64, rebuildSplatCst},
{X86::VPMOVSXWQZ256rmk, 4, 16, rebuildSExtCst},
{X86::VPMOVZXWQZ256rmk, 4, 16, rebuildZExtCst},
{X86::VPMOVSXDQZ256rmk, 4, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ256rmk, 4, 32, rebuildZExtCst}},
256, 3);
case X86::VMOVDQA64Z256rmkz:
case X86::VMOVDQU64Z256rmkz:
return FixupConstant({{X86::VPMOVSXBQZ256rmkz, 4, 8, rebuildSExtCst},
{X86::VPMOVZXBQZ256rmkz, 4, 8, rebuildZExtCst},
{X86::VPBROADCASTQZ256rmkz, 1, 64, rebuildSplatCst},
{X86::VPMOVSXWQZ256rmkz, 4, 16, rebuildSExtCst},
{X86::VPMOVZXWQZ256rmkz, 4, 16, rebuildZExtCst},
{X86::VPMOVSXDQZ256rmkz, 4, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ256rmkz, 4, 32, rebuildZExtCst}},
256, 2);
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU32Zrm:
@@ -586,43 +722,93 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 512, 1);
}
case X86::VMOVDQA32Zrmk:
case X86::VMOVDQU32Zrmk:
return FixupConstant({{X86::VPBROADCASTDZrmk, 1, 32, rebuildSplatCst},
{X86::VBROADCASTI32X4rmk, 1, 128, rebuildSplatCst},
{X86::VPMOVSXBDZrmk, 16, 8, rebuildSExtCst},
{X86::VPMOVZXBDZrmk, 16, 8, rebuildZExtCst},
{X86::VPMOVSXWDZrmk, 16, 16, rebuildSExtCst},
{X86::VPMOVZXWDZrmk, 16, 16, rebuildZExtCst}},
512, 3);
case X86::VMOVDQA32Zrmkz:
case X86::VMOVDQU32Zrmkz:
return FixupConstant({{X86::VPBROADCASTDZrmkz, 1, 32, rebuildSplatCst},
{X86::VBROADCASTI32X4rmkz, 1, 128, rebuildSplatCst},
{X86::VPMOVSXBDZrmkz, 16, 8, rebuildSExtCst},
{X86::VPMOVZXBDZrmkz, 16, 8, rebuildZExtCst},
{X86::VPMOVSXWDZrmkz, 16, 16, rebuildSExtCst},
{X86::VPMOVZXWDZrmkz, 16, 16, rebuildZExtCst}},
512, 2);
case X86::VMOVDQA64Zrmk:
case X86::VMOVDQU64Zrmk:
return FixupConstant({{X86::VPBROADCASTQZrmk, 1, 64, rebuildSplatCst},
{X86::VPMOVSXBQZrmk, 8, 8, rebuildSExtCst},
{X86::VPMOVZXBQZrmk, 8, 8, rebuildZExtCst},
{X86::VPMOVSXWQZrmk, 8, 16, rebuildSExtCst},
{X86::VPMOVZXWQZrmk, 8, 16, rebuildZExtCst},
{X86::VBROADCASTI64X4rmk, 1, 256, rebuildSplatCst},
{X86::VPMOVSXDQZrmk, 8, 32, rebuildSExtCst},
{X86::VPMOVZXDQZrmk, 8, 32, rebuildZExtCst}},
512, 3);
case X86::VMOVDQA64Zrmkz:
case X86::VMOVDQU64Zrmkz:
return FixupConstant({{X86::VPBROADCASTQZrmkz, 1, 64, rebuildSplatCst},
{X86::VPMOVSXBQZrmkz, 8, 8, rebuildSExtCst},
{X86::VPMOVZXBQZrmkz, 8, 8, rebuildZExtCst},
{X86::VPMOVSXWQZrmkz, 8, 16, rebuildSExtCst},
{X86::VPMOVZXWQZrmkz, 8, 16, rebuildZExtCst},
{X86::VBROADCASTI64X4rmkz, 1, 256, rebuildSplatCst},
{X86::VPMOVSXDQZrmkz, 8, 32, rebuildSExtCst},
{X86::VPMOVZXDQZrmkz, 8, 32, rebuildZExtCst}},
512, 2);
}

auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
unsigned OpBcst32 = 0, OpBcst64 = 0;
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc16, unsigned OpSrc32,
unsigned OpSrc64) {
if (OpSrc16) {
if (const X86FoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTableBySize(OpSrc16, 16)) {
unsigned OpBcst16 = Mem2Bcst->DstOp;
unsigned OpNoBcst16 = Mem2Bcst->Flags & TB_INDEX_MASK;
FixupEntry Fixups[] = {{(int)OpBcst16, 1, 16, rebuildSplatCst}};
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
if (FixupConstant(Fixups, 0, OpNoBcst16))
return true;
}
}
if (OpSrc32) {
if (const X86FoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
OpBcst32 = Mem2Bcst->DstOp;
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
unsigned OpBcst32 = Mem2Bcst->DstOp;
unsigned OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
FixupEntry Fixups[] = {{(int)OpBcst32, 1, 32, rebuildSplatCst}};
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
if (FixupConstant(Fixups, 0, OpNoBcst32))
return true;
}
}
if (OpSrc64) {
if (const X86FoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
OpBcst64 = Mem2Bcst->DstOp;
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
unsigned OpBcst64 = Mem2Bcst->DstOp;
unsigned OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
FixupEntry Fixups[] = {{(int)OpBcst64, 1, 64, rebuildSplatCst}};
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
if (FixupConstant(Fixups, 0, OpNoBcst64))
return true;
}
}
assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
"OperandNo mismatch");

if (OpBcst32 || OpBcst64) {
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
{(int)OpBcst64, 64, 64, rebuildSplatCst}};
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
return FixupConstant(Fixups, 0, OpNo);
}
return false;
};

// Attempt to find an AVX512 mapping from a full width memory-fold instruction
// to a broadcast-fold instruction variant.
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
return ConvertToBroadcastAVX512(Opc, Opc);
return ConvertToBroadcastAVX512(Opc, Opc, Opc);

// Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
// conversion to see if we can convert to a broadcasted (integer) logic op.
Expand Down Expand Up @@ -679,7 +865,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
break;
}
if (OpSrc32 || OpSrc64)
return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
return ConvertToBroadcastAVX512(0, OpSrc32, OpSrc64);
}

return false;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7243,6 +7243,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");

// On AVX512VL targets we're better off keeping the full width constant load
// and letting X86FixupVectorConstantsPass handle conversion to
// broadcast/broadcast-fold.
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
BVOp->isConstant())
return SDValue();

// See if the build vector is a repeating sequence of scalars (inc. splat).
SDValue Ld;
BitVector UndefElements;
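A hedged illustration of the guard above (hypothetical function; codegen indicative only): a 256-bit constant splat still takes the broadcast-lowering path on an AVX512 target without VLX, while with AVX512VL the constant is kept at full width so it can be folded (and possibly re-broadcast later by the fixup pass):

define <8 x i32> @and15(<8 x i32> %x) {
  %r = and <8 x i32> %x, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  ret <8 x i32> %r
}
; AVX512F only:  vpbroadcastd .LCPI0_0(%rip), %ymm1
;                vpand %ymm1, %ymm0, %ymm0
; AVX512VL:      vpandd .LCPI0_0(%rip){1to8}, %ymm0, %ymm0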
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1480,10 +1480,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1495,10 +1493,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -3260,10 +3256,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3277,10 +3271,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
