From 1552b91162bbb410971e2d4e5ec7afd1c7cc932f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 21 Nov 2023 17:38:49 +0000 Subject: [PATCH] [X86] X86FixupVectorConstantsPass - attempt to match VEX logic ops back to EVEX if we can create a broadcast fold On non-DQI AVX512 targets, X86InstrInfo::setExecutionDomainCustom will convert EVEX int-domain instructions to VEX fp-domain instructions. But, if we have the chance to use a broadcast fold we're better off using a EVEX instruction, so handle a reverse fold. --- .../Target/X86/X86FixupVectorConstants.cpp | 89 ++++++++++++++++--- llvm/test/CodeGen/X86/combine-abs.ll | 18 +++- .../CodeGen/X86/vector-shuffle-128-v16.ll | 13 ++- .../CodeGen/X86/vector-shuffle-256-v32.ll | 23 ++++- 4 files changed, 120 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index d4d5cd8c3e16a4..326e09a1254a0b 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -233,6 +233,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, bool HasAVX2 = ST->hasAVX2(); bool HasDQI = ST->hasDQI(); bool HasBWI = ST->hasBWI(); + bool HasVLX = ST->hasVLX(); auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64, unsigned OpBcst32, @@ -352,20 +353,22 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, 1); } - // Attempt to find a AVX512 mapping from a full width memory-fold instruction - // to a broadcast-fold instruction variant. 
- if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) { + auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) { unsigned OpBcst32 = 0, OpBcst64 = 0; unsigned OpNoBcst32 = 0, OpNoBcst64 = 0; - if (const X86MemoryFoldTableEntry *Mem2Bcst = - llvm::lookupBroadcastFoldTable(Opc, 32)) { - OpBcst32 = Mem2Bcst->DstOp; - OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK; + if (OpSrc32) { + if (const X86MemoryFoldTableEntry *Mem2Bcst = + llvm::lookupBroadcastFoldTable(OpSrc32, 32)) { + OpBcst32 = Mem2Bcst->DstOp; + OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK; + } } - if (const X86MemoryFoldTableEntry *Mem2Bcst = - llvm::lookupBroadcastFoldTable(Opc, 64)) { - OpBcst64 = Mem2Bcst->DstOp; - OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK; + if (OpSrc64) { + if (const X86MemoryFoldTableEntry *Mem2Bcst = + llvm::lookupBroadcastFoldTable(OpSrc64, 64)) { + OpBcst64 = Mem2Bcst->DstOp; + OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK; + } } assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) && "OperandNo mismatch"); @@ -374,6 +377,70 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32; return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo); } + return false; + }; + + // Attempt to find a AVX512 mapping from a full width memory-fold instruction + // to a broadcast-fold instruction variant. + if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) + return ConvertToBroadcastAVX512(Opc, Opc); + + // Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic + // conversion to see if we can convert to a broadcasted (integer) logic op. 
+ if (HasVLX && !HasDQI) { + unsigned OpSrc32 = 0, OpSrc64 = 0; + switch (Opc) { + case X86::VANDPDrm: + case X86::VANDPSrm: + case X86::VPANDrm: + OpSrc32 = X86::VPANDDZ128rm; + OpSrc64 = X86::VPANDQZ128rm; + break; + case X86::VANDPDYrm: + case X86::VANDPSYrm: + case X86::VPANDYrm: + OpSrc32 = X86::VPANDDZ256rm; + OpSrc64 = X86::VPANDQZ256rm; + break; + case X86::VANDNPDrm: + case X86::VANDNPSrm: + case X86::VPANDNrm: + OpSrc32 = X86::VPANDNDZ128rm; + OpSrc64 = X86::VPANDNQZ128rm; + break; + case X86::VANDNPDYrm: + case X86::VANDNPSYrm: + case X86::VPANDNYrm: + OpSrc32 = X86::VPANDNDZ256rm; + OpSrc64 = X86::VPANDNQZ256rm; + break; + case X86::VORPDrm: + case X86::VORPSrm: + case X86::VPORrm: + OpSrc32 = X86::VPORDZ128rm; + OpSrc64 = X86::VPORQZ128rm; + break; + case X86::VORPDYrm: + case X86::VORPSYrm: + case X86::VPORYrm: + OpSrc32 = X86::VPORDZ256rm; + OpSrc64 = X86::VPORQZ256rm; + break; + case X86::VXORPDrm: + case X86::VXORPSrm: + case X86::VPXORrm: + OpSrc32 = X86::VPXORDZ128rm; + OpSrc64 = X86::VPXORQZ128rm; + break; + case X86::VXORPDYrm: + case X86::VXORPSYrm: + case X86::VPXORYrm: + OpSrc32 = X86::VPXORDZ256rm; + OpSrc64 = X86::VPXORQZ256rm; + break; + } + if (OpSrc32 || OpSrc64) + return ConvertToBroadcastAVX512(OpSrc32, OpSrc64); } return false; diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll index 410218b33eb9c9..202c88109eaeb2 100644 --- a/llvm/test/CodeGen/X86/combine-abs.ll +++ b/llvm/test/CodeGen/X86/combine-abs.ll @@ -164,10 +164,20 @@ define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) { ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_v16i8_abs_constant: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: combine_v16i8_abs_constant: +; AVX2: # %bb.0: +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: 
combine_v16i8_abs_constant: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: combine_v16i8_abs_constant: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: retq %1 = insertelement <16 x i8> undef, i8 15, i32 0 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer %3 = and <16 x i8> %a, %2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 57a3c95f31717f..ec75631a9b5ed2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -711,10 +711,15 @@ define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz( ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> ret <16 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 1e7745a4b8836b..f5c5ba66317504 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2362,10 +2362,25 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) { } define <32 x i8> 
@shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { -; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; ALL: # %bb.0: -; ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; XOP-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; XOP: # %bb.0: +; XOP-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; XOP-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> ret <32 x i8> %shuffle }